Customer churn Prediction
Data Collection & Cleaning

Before Cleaning

In [1]:
# 📘 Step 1: Import Required Libraries
import pandas as pd
import numpy as np

# 📘 Step 2: Load the Dataset
# Make sure 'Customer_Churn.csv' is in the same folder as this notebook
df = pd.read_csv("customer_churn_dataset.csv")

# 📊 Step 3: Show First 5 Rows (Before Cleaning)
print("----- BEFORE CLEANING -----")
df.head()

----- BEFORE CLEANING -----


Unnamed: 0,CustomerID,Age,Gender,Tenure,Usage Frequency,Support Calls,Payment Delay,Subscription Type,Contract Length,Total Spend,Last Interaction,Churn
0,1,22,Female,25,14,4,27,Basic,Monthly,598,9,1
1,2,41,Female,28,28,7,13,Standard,Monthly,584,20,0
2,3,47,Male,27,10,2,29,Premium,Annual,757,21,0
3,4,35,Male,9,12,5,17,Premium,Quarterly,232,18,0
4,5,53,Female,58,24,9,2,Standard,Annual,533,18,0


In [2]:
# 📊 Step 4: Check for Missing Values and Data Info
print("Dataset Information (Before Cleaning):")
df.info()

print("\nMissing Values (Before Cleaning):")
print(df.isnull().sum())

Dataset Information (Before Cleaning):
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 64374 entries, 0 to 64373
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   CustomerID         64374 non-null  int64 
 1   Age                64374 non-null  int64 
 2   Gender             64374 non-null  object
 3   Tenure             64374 non-null  int64 
 4   Usage Frequency    64374 non-null  int64 
 5   Support Calls      64374 non-null  int64 
 6   Payment Delay      64374 non-null  int64 
 7   Subscription Type  64374 non-null  object
 8   Contract Length    64374 non-null  object
 9   Total Spend        64374 non-null  int64 
 10  Last Interaction   64374 non-null  int64 
 11  Churn              64374 non-null  int64 
dtypes: int64(9), object(3)
memory usage: 5.9+ MB

Missing Values (Before Cleaning):
CustomerID           0
Age                  0
Gender               0
Tenure               0
Usage Frequency  

In [4]:
# 🧹 Step 5: Data Cleaning (Updated Version — No Warnings)
import pandas as pd

# Load dataset again
df = pd.read_csv("customer_churn_dataset.csv")
# Drop duplicates (if any)
df.drop_duplicates(inplace=True)

# Handle missing values for numeric columns
num_cols = df.select_dtypes(include=['float64', 'int64']).columns
for col in num_cols:
    df[col] = df[col].fillna(df[col].mean())

# Handle missing values for categorical columns
cat_cols = df.select_dtypes(include=['object']).columns
for col in cat_cols:
    df[col] = df[col].fillna(df[col].mode()[0])

# Convert 'TotalCharges' to numeric if present
if 'TotalCharges' in df.columns:
    df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce')
    df['TotalCharges'] = df['TotalCharges'].fillna(df['TotalCharges'].mean())

After Cleaning

In [6]:
# 📊 Step 6: Show Dataset Info (After Cleaning)
print("----- AFTER CLEANING -----")
df.info()

print("\nMissing Values (After Cleaning):")
print(df.isnull().sum())

----- AFTER CLEANING -----
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 64374 entries, 0 to 64373
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   CustomerID         64374 non-null  int64 
 1   Age                64374 non-null  int64 
 2   Gender             64374 non-null  object
 3   Tenure             64374 non-null  int64 
 4   Usage Frequency    64374 non-null  int64 
 5   Support Calls      64374 non-null  int64 
 6   Payment Delay      64374 non-null  int64 
 7   Subscription Type  64374 non-null  object
 8   Contract Length    64374 non-null  object
 9   Total Spend        64374 non-null  int64 
 10  Last Interaction   64374 non-null  int64 
 11  Churn              64374 non-null  int64 
dtypes: int64(9), object(3)
memory usage: 5.9+ MB

Missing Values (After Cleaning):
CustomerID           0
Age                  0
Gender               0
Tenure               0
Usage Frequency      0
Support

Compare Before vs After Cleaning (Summary)


In [None]:
# 📊 Step 7: # 📊 Step 7: Compare Before vs After Cleaning (Summary)
before_shape = (7043, 21)  # example original shape — adjust if needed
after_shape = df.shape

print("Dataset Shape Before Cleaning:", before_shape)
print("Dataset Shape After Cleaning:", after_shape)

print("\n✅ Cleaning Completed Successfully!")
before_shape = (7043, 21)  # example original shape — adjust if needed
after_shape = df.shape

print("Dataset Shape Before Cleaning:", before_shape)
print("Dataset Shape After Cleaning:", after_shape)

print("\n✅ Cleaning Completed Successfully!")

Dataset Shape Before Cleaning: (7043, 21)
Dataset Shape After Cleaning: (64374, 12)

✅ Cleaning Completed Successfully!
