In [1]:
# Customer Churn Prediction - Data Wrangling

# Import necessary libraries
import os

import numpy as np
import pandas as pd

In [4]:
# Load the dataset
# The CSV location defaults to the original author's local path. To run the
# notebook on another machine, set the TELCO_CHURN_CSV environment variable to
# the file's location instead of editing this cell.
DEFAULT_CSV_PATH = r"C:\Users\User\Desktop\bootcamp-fresh\capstone 3\WA_Fn-UseC_-Telco-Customer-Churn.csv"
# `or` (rather than a .get default) also falls back when the variable is set but empty
file_path = os.environ.get("TELCO_CHURN_CSV") or DEFAULT_CSV_PATH
churn_df = pd.read_csv(file_path)
# Last expression of the cell: preview the first five rows
churn_df.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [5]:
# Display dataset information
# Prints a header line, then the DataFrame summary produced by .info():
# index range, column names, non-null counts and dtypes.
print("\nDataset Info:")
churn_df.info()



Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        7043 non-null   object 
 1   gender            7043 non-null   object 
 2   SeniorCitizen     7043 non-null   int64  
 3   Partner           7043 non-null   object 
 4   Dependents        7043 non-null   object 
 5   tenure            7043 non-null   int64  
 6   PhoneService      7043 non-null   object 
 7   MultipleLines     7043 non-null   object 
 8   InternetService   7043 non-null   object 
 9   OnlineSecurity    7043 non-null   object 
 10  OnlineBackup      7043 non-null   object 
 11  DeviceProtection  7043 non-null   object 
 12  TechSupport       7043 non-null   object 
 13  StreamingTV       7043 non-null   object 
 14  StreamingMovies   7043 non-null   object 
 15  Contract          7043 non-null   object 
 16  PaperlessBilling  7043 non-

In [6]:
# Convert blank 'TotalCharges' entries (empty or whitespace-only strings) to NaN.
# The anchored regex matches a cell made up solely of whitespace (including the
# empty string), not just the exact single-space string ' ', so the code covers
# every case the comment describes. Non-blank values are left untouched.
churn_df['TotalCharges'] = churn_df['TotalCharges'].replace(r'^\s*$', np.nan, regex=True)

In [7]:
# Parse 'TotalCharges' as numbers; errors='coerce' turns any value that
# cannot be parsed into NaN instead of raising.
churn_df['TotalCharges'] = pd.to_numeric(
    churn_df['TotalCharges'],
    errors='coerce',
)

# Report the per-column count of missing values after the conversion
missing_per_column = churn_df.isna().sum()
print("\nMissing Values After Conversion:\n", missing_per_column)



Missing Values After Conversion:
 customerID           0
gender               0
SeniorCitizen        0
Partner              0
Dependents           0
tenure               0
PhoneService         0
MultipleLines        0
InternetService      0
OnlineSecurity       0
OnlineBackup         0
DeviceProtection     0
TechSupport          0
StreamingTV          0
StreamingMovies      0
Contract             0
PaperlessBilling     0
PaymentMethod        0
MonthlyCharges       0
TotalCharges        11
Churn                0
dtype: int64


In [8]:

# Fill the missing 'TotalCharges' entries with the column median.
# NOTE(review): the median is fitted on the full dataset; if a train/test split
# happens later, consider fitting the imputer on the training portion only.
from sklearn.impute import SimpleImputer

imputer = SimpleImputer(strategy='median')
# Double brackets select a one-column DataFrame, the 2-D input the imputer is given
total_charges_imputed = imputer.fit_transform(churn_df[['TotalCharges']])
churn_df['TotalCharges'] = total_charges_imputed

In [9]:
# Checking for duplicates
# duplicated() compares all columns present in the frame at the time it runs
# and the sum counts rows that repeat an earlier row.
print("\nChecking for Duplicates:")
print(churn_df.duplicated().sum())

# Drop 'customerID' as it is not relevant for analysis.
# errors='ignore' keeps this cell re-runnable: once the column is gone, a second
# run is a no-op instead of raising KeyError. The frame is rebound rather than
# mutated with inplace=True.
churn_df = churn_df.drop(columns='customerID', errors='ignore')



Checking for Duplicates:
0


In [12]:
# Display unique values in categorical columns
# Only object-dtype columns are listed; one line per column.
print("\nUnique Values in Categorical Columns:")
for col in churn_df.select_dtypes(include=['object']).columns:
    print(f"{col}: {churn_df[col].unique()}")

# Save the cleansed dataset
# The filename is defined once so the file written and the confirmation message
# cannot drift apart. index=False leaves the row index out of the CSV.
cleansed_csv_path = 'Cleansed_Telco_Customer_Churn.csv'
churn_df.to_csv(cleansed_csv_path, index=False)
# The original message was missing the closing quote after the filename
print(f"Data Wrangling Completed Successfully and Dataset Saved as '{cleansed_csv_path}'")


Unique Values in Categorical Columns:
gender: ['Female' 'Male']
Partner: ['Yes' 'No']
Dependents: ['No' 'Yes']
PhoneService: ['No' 'Yes']
MultipleLines: ['No phone service' 'No' 'Yes']
InternetService: ['DSL' 'Fiber optic' 'No']
OnlineSecurity: ['No' 'Yes' 'No internet service']
OnlineBackup: ['Yes' 'No' 'No internet service']
DeviceProtection: ['No' 'Yes' 'No internet service']
TechSupport: ['No' 'Yes' 'No internet service']
StreamingTV: ['No' 'Yes' 'No internet service']
StreamingMovies: ['No' 'Yes' 'No internet service']
Contract: ['Month-to-month' 'One year' 'Two year']
PaperlessBilling: ['Yes' 'No']
PaymentMethod: ['Electronic check' 'Mailed check' 'Bank transfer (automatic)'
 'Credit card (automatic)']
Churn: ['No' 'Yes']
Data Wrangling Completed Successfully and Dataset Saved as 'Cleansed_Telco_Customer_Churn.csv
