# Task - 1 
# Data Cleaning and Preprocessing

In [1]:
import pandas as pd
import numpy as np

## Importing DataSet

In [3]:
df=pd.read_csv('Telco-Customer-Churn.csv')
df.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [4]:
df.columns

Index(['customerID', 'gender', 'SeniorCitizen', 'Partner', 'Dependents',
       'tenure', 'PhoneService', 'MultipleLines', 'InternetService',
       'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport',
       'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling',
       'PaymentMethod', 'MonthlyCharges', 'TotalCharges', 'Churn'],
      dtype='object')

## Checking Missing Values

In [5]:
missing_values=df.isnull().sum()
print("Missing values column-wise:")
print(missing_values)

Missing values column-wise:
customerID          0
gender              0
SeniorCitizen       0
Partner             0
Dependents          0
tenure              0
PhoneService        0
MultipleLines       0
InternetService     0
OnlineSecurity      0
OnlineBackup        0
DeviceProtection    0
TechSupport         0
StreamingTV         0
StreamingMovies     0
Contract            0
PaperlessBilling    0
PaymentMethod       0
MonthlyCharges      0
TotalCharges        0
Churn               0
dtype: int64


## Display rows with Missing Values

In [6]:
print(df[df.isnull().any(axis=1)])

Empty DataFrame
Columns: [customerID, gender, SeniorCitizen, Partner, Dependents, tenure, PhoneService, MultipleLines, InternetService, OnlineSecurity, OnlineBackup, DeviceProtection, TechSupport, StreamingTV, StreamingMovies, Contract, PaperlessBilling, PaymentMethod, MonthlyCharges, TotalCharges, Churn]
Index: []

[0 rows x 21 columns]


## Handling missing values for Numerical columns:
### Ex: Replace missing Numerical values with the mean

In [8]:
df['TotalCharges']=pd.to_numeric(df['TotalCharges'],errors='coerce') #convert to numeric
df['TotalCharges'].fillna(df['TotalCharges'].mean(),inplace=True)

## Handling missing values for Categorical Columns:
### Ex: Replace missing Categorical values with the most frequent value(mode)

In [9]:
df['InternetService'].fillna(df['InternetService'].mode()[0],inplace=True)

## Handling Categorical variables

In [11]:
categorical_columns = df.select_dtypes(include=['object']).columns
print("Categorical Variables:", categorical_columns)

Categorical Variables: Index(['customerID', 'gender', 'Partner', 'Dependents', 'PhoneService',
       'MultipleLines', 'InternetService', 'OnlineSecurity', 'OnlineBackup',
       'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies',
       'Contract', 'PaperlessBilling', 'PaymentMethod', 'Churn'],
      dtype='object')


## One-Hot Encoding for Categorical Variables

In [12]:
df = pd.get_dummies(df, columns=categorical_columns, drop_first=True)

## Handling Numerical variables

In [13]:
from sklearn.preprocessing import StandardScaler

numerical_columns=['tenure','MonthlyCharges','TotalCharges']
scaler = StandardScaler()
df[numerical_columns]=scaler.fit_transform(df[numerical_columns])

## Verify Preprocessing
### Check if all missing values are handled:

In [14]:
print(df.isnull().sum())

SeniorCitizen                            0
tenure                                   0
MonthlyCharges                           0
TotalCharges                             0
customerID_0003-MKNFE                    0
                                        ..
PaperlessBilling_Yes                     0
PaymentMethod_Credit card (automatic)    0
PaymentMethod_Electronic check           0
PaymentMethod_Mailed check               0
Churn_Yes                                0
Length: 7073, dtype: int64


In [15]:
print(df.head())

   SeniorCitizen    tenure  MonthlyCharges  TotalCharges  \
0              0 -1.277445       -1.160323     -0.994971   
1              0  0.066327       -0.259629     -0.173876   
2              0 -1.236724       -0.362660     -0.960399   
3              0  0.514251       -0.746535     -0.195400   
4              0 -1.236724        0.197365     -0.941193   

   customerID_0003-MKNFE  customerID_0004-TLHLJ  customerID_0011-IGKFF  \
0                  False                  False                  False   
1                  False                  False                  False   
2                  False                  False                  False   
3                  False                  False                  False   
4                  False                  False                  False   

   customerID_0013-EXCHZ  customerID_0013-MHZWF  customerID_0013-SMEOE  ...  \
0                  False                  False                  False  ...   
1                  False            