In [1]:
# import necessary libraries
import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler
# Load the Telco customer-churn CSV from the working directory into `df`.
csv_path = 'Telco_Customer_Churn_Dataset  (1).csv'
df = pd.read_csv(csv_path)

# Preview the first five rows to sanity-check the columns that were read.
print(df.head(5))

   customerID  gender  SeniorCitizen Partner Dependents  tenure PhoneService  \
0  7590-VHVEG  Female              0     Yes         No       1           No   
1  5575-GNVDE    Male              0      No         No      34          Yes   
2  3668-QPYBK    Male              0      No         No       2          Yes   
3  7795-CFOCW    Male              0      No         No      45           No   
4  9237-HQITU  Female              0      No         No       2          Yes   

      MultipleLines InternetService OnlineSecurity  ... DeviceProtection  \
0  No phone service             DSL             No  ...               No   
1                No             DSL            Yes  ...              Yes   
2                No             DSL            Yes  ...               No   
3  No phone service             DSL            Yes  ...              Yes   
4                No     Fiber optic             No  ...               No   

  TechSupport StreamingTV StreamingMovies        Contract Pape

In [2]:
# Check for missing values: count NaN/None entries per column.
# NOTE: isnull() only detects NaN/None. Non-numeric placeholders stored as text
# (e.g. blank strings) are not counted here; such rows are filtered in a later cell.
print(df.isnull().sum())

# Handle missing values.
# Numeric columns: impute with the column mean.
nc = df.select_dtypes(include=['float64', 'int32', 'int64']).columns
df[nc] = df[nc].fillna(df[nc].mean())

# Categorical columns: impute each column with its OWN most frequent value.
# The fill values must be computed from the categorical columns (cc); a Series of
# numeric-column modes is indexed by numeric column names, so it would never match
# any categorical column and the fill would silently do nothing.
cc = df.select_dtypes(include=['object']).columns
cat_modes = df[cc].mode()
if not cat_modes.empty:  # mode() is empty when there are no rows or no such columns
    df[cc] = df[cc].fillna(cat_modes.iloc[0])

customerID          0
gender              0
SeniorCitizen       0
Partner             0
Dependents          0
tenure              0
PhoneService        0
MultipleLines       0
InternetService     0
OnlineSecurity      0
OnlineBackup        0
DeviceProtection    0
TechSupport         0
StreamingTV         0
StreamingMovies     0
Contract            0
PaperlessBilling    0
PaymentMethod       0
MonthlyCharges      0
TotalCharges        0
Churn               0
dtype: int64


In [3]:
# Drop rows where any of the selected columns have invalid (non-numeric) values.
# Each column is parsed with errors='coerce', which turns unparseable entries into NaN.
# Rows containing any NaN are removed, and the parsed values are written back so the
# columns carry a real numeric dtype instead of relying on the scaler to parse strings.
num_cols = ['tenure', 'MonthlyCharges', 'TotalCharges']
num_values = df[num_cols].apply(pd.to_numeric, errors='coerce')
valid_rows = num_values.notna().all(axis=1)
# Explicit copy so the column assignments below modify an independent frame
# (avoids SettingWithCopyWarning); the original row index is kept as-is.
df = df.loc[valid_rows].copy()
for col in num_cols:
    df[col] = num_values.loc[valid_rows, col]

# Encode categorical variables.
# Label encoding for binary columns; fit_transform re-fits the encoder on every column,
# so after the loop `lb` only remembers the classes of the last column in `bcol`.
lb = LabelEncoder()
bcol = ['gender', 'SeniorCitizen', 'Partner', 'Dependents', 'PhoneService', 'PaperlessBilling']
for col in bcol:
    df[col] = lb.fit_transform(df[col])

# One-hot encoding for multi-class columns; drop_first=True drops the first level of
# each column to avoid perfectly collinear indicator columns.
# `customerID` and `Churn` are intentionally not listed here and pass through unchanged.
df = pd.get_dummies(df, columns=['MultipleLines', 'InternetService',
       'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport',
       'StreamingTV', 'StreamingMovies', 'Contract',
       'PaymentMethod'], drop_first=True)

# Scale numerical features to zero mean and unit variance.
scaler = StandardScaler()
df[num_cols] = scaler.fit_transform(df[num_cols])

print(df)
# info() prints its report itself and returns None, so it must not be wrapped in print().
df.info()

      customerID  gender  SeniorCitizen  Partner  Dependents    tenure  \
0     7590-VHVEG       0              0        1           0 -1.280248   
1     5575-GNVDE       1              0        0           0  0.064303   
2     3668-QPYBK       1              0        0           0 -1.239504   
3     7795-CFOCW       1              0        0           0  0.512486   
4     9237-HQITU       0              0        0           0 -1.239504   
...          ...     ...            ...      ...         ...       ...   
7038  6840-RESVB       1              0        1           1 -0.343137   
7039  2234-XADUH       0              0        1           1  1.612573   
7040  4801-JZAZL       0              0        1           1 -0.872808   
7041  8361-LTMKD       1              1        1           0 -1.158016   
7042  3186-AJIEK       1              0        0           0  1.368109   

      PhoneService  PaperlessBilling  MonthlyCharges  TotalCharges  ...  \
0                0                 1

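# Conclusion:
# 1. The dataset was preprocessed by handling missing values, dropping rows with non-numeric `tenure`, `MonthlyCharges` or `TotalCharges`, encoding the categorical features, and standardizing the numerical features.
# 2. The dataset is clean and consistent, and is ready for machine learning tasks once the `customerID` identifier is dropped and the `Churn` target is encoded.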