CUSTOMER CHURN PREDICTION

1: Import Required Libraries

In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

2: Load and Inspect the Dataset


In [2]:
# Load the dataset
# NOTE(review): this is an absolute, machine-specific path; the notebook only
# runs where this exact file exists. Consider a configurable data directory.
file_path = r"D:\Mustufahussain\CodSoft\Machine Learning\Dataset\Churn\Churn_Modelling.csv"
data = pd.read_csv(file_path)

# Display basic information about the dataset
print("Dataset Overview:")
print(data.head())  # Display the first few rows
print("\nDataset Info:")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print() (that would emit a stray "None" line).
data.info()
print("\nMissing Values:")
print(data.isnull().sum())  # Per-column count of missing values

Dataset Overview:
   RowNumber  CustomerId   Surname  CreditScore Geography  Gender  Age  \
0          1    15634602  Hargrave          619    France  Female   42   
1          2    15647311      Hill          608     Spain  Female   41   
2          3    15619304      Onio          502    France  Female   42   
3          4    15701354      Boni          699    France  Female   39   
4          5    15737888  Mitchell          850     Spain  Female   43   

   Tenure    Balance  NumOfProducts  HasCrCard  IsActiveMember  \
0       2       0.00              1          1               1   
1       1   83807.86              1          0               1   
2       8  159660.80              3          1               0   
3       1       0.00              2          0               0   
4       2  125510.82              1          1               1   

   EstimatedSalary  Exited  
0        101348.88       1  
1        112542.58       0  
2        113931.57       1  
3         93826.63      

3: Data Preprocessing

In [10]:
# Remove identifier columns before modelling. A fresh load of the CSV contains
# them (they appear in the load cell's head() output), and 'Surname' is a
# string column that StandardScaler cannot convert to float.
# errors='ignore' keeps this cell re-runnable once the columns are gone.
ID_COLUMNS = ['RowNumber', 'CustomerId', 'Surname']
data = data.drop(columns=ID_COLUMNS, errors='ignore')

# Check the column names in the dataset
print("\nColumn names in the dataset:")
print(data.columns)

# Encode categorical variables (Geography, Gender) as integer codes.
# NOTE: the encoder is re-fitted for each column, so after this cell
# `label_encoder` only remembers the Gender classes.
print("\nEncoding categorical variables (Geography, Gender)...")
label_encoder = LabelEncoder()
data['Geography'] = label_encoder.fit_transform(data['Geography'])
data['Gender'] = label_encoder.fit_transform(data['Gender'])

# Define features (X) and target (y)
print("\nDefining features (X) and target (y)...")
X = data.drop('Exited', axis=1)  # 'Exited' is the target variable
y = data['Exited']

# Train-test split. This happens BEFORE scaling so that the test rows do not
# influence the scaler's mean/variance (avoids train/test leakage). The row
# partition is determined by the number of samples and random_state, so the
# same rows land in each split as when the split was done after scaling.
print("\nSplitting data into training and testing sets...")
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Standardize numerical features: fit on the training rows only, then apply
# the learned statistics to the test rows.
print("\nStandardizing numerical features...")
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)
# Kept so any cell that still references X_scaled continues to work; it is the
# full feature matrix transformed with the training-set statistics.
X_scaled = scaler.transform(X)

# Summary of Preprocessing
print("\nData Preprocessing Summary:")
print(f"Shape of X_train: {X_train.shape}")
print(f"Shape of X_test: {X_test.shape}")
print(f"Shape of y_train: {y_train.shape}")
print(f"Shape of y_test: {y_test.shape}")
print("\nData Preprocessing Complete.")


Column names in the dataset:
Index(['CreditScore', 'Geography', 'Gender', 'Age', 'Tenure', 'Balance',
       'NumOfProducts', 'HasCrCard', 'IsActiveMember', 'EstimatedSalary',
       'Exited'],
      dtype='object')

Encoding categorical variables (Geography, Gender)...

Defining features (X) and target (y)...

Standardizing numerical features...

Splitting data into training and testing sets...

Data Preprocessing Summary:
Shape of X_train: (8000, 10)
Shape of X_test: (2000, 10)
Shape of y_train: (8000,)
Shape of y_test: (2000,)

Data Preprocessing Complete.


4: Train and Evaluate Logistic Regression


In [4]:
# Logistic Regression: fit on the training split, then score the test split.
print("\nTraining Logistic Regression...")
logistic_model = LogisticRegression(max_iter=1000).fit(X_train, y_train)  # fit() returns the estimator itself

# Predicted labels for the held-out rows
y_pred_logistic = logistic_model.predict(X_test)

# Compute the three evaluation artefacts with one pass over the metric functions
accuracy_logistic, conf_matrix_logistic, class_report_logistic = (
    metric(y_test, y_pred_logistic)
    for metric in (accuracy_score, confusion_matrix, classification_report)
)

# Emit the report; sep="\n" yields the same text as one print() per line
print(
    "\nLogistic Regression Evaluation:",
    f"Accuracy: {accuracy_logistic}",
    f"Confusion Matrix:\n{conf_matrix_logistic}",
    f"Classification Report:\n{class_report_logistic}",
    sep="\n",
)


Training Logistic Regression...

Logistic Regression Evaluation:
Accuracy: 0.8155
Confusion Matrix:
[[1559   48]
 [ 321   72]]
Classification Report:
              precision    recall  f1-score   support

           0       0.83      0.97      0.89      1607
           1       0.60      0.18      0.28       393

    accuracy                           0.82      2000
   macro avg       0.71      0.58      0.59      2000
weighted avg       0.78      0.82      0.77      2000



5: Train and Evaluate Random Forest

In [5]:
# Random Forest: 100 trees with a fixed seed, trained on the training split.
print("\nTraining Random Forest...")
rf_model = RandomForestClassifier(random_state=42, n_estimators=100)
rf_model.fit(X_train, y_train)

# Predicted labels for the held-out rows
y_pred_rf = rf_model.predict(X_test)

# Evaluation artefacts (report first, scalar last; the three are independent)
class_report_rf = classification_report(y_test, y_pred_rf)
conf_matrix_rf = confusion_matrix(y_test, y_pred_rf)
accuracy_rf = accuracy_score(y_test, y_pred_rf)

# Join the report lines and print them in a single call
print("\n".join([
    "\nRandom Forest Evaluation:",
    f"Accuracy: {accuracy_rf}",
    f"Confusion Matrix:\n{conf_matrix_rf}",
    f"Classification Report:\n{class_report_rf}",
]))


Training Random Forest...

Random Forest Evaluation:
Accuracy: 0.864
Confusion Matrix:
[[1545   62]
 [ 210  183]]
Classification Report:
              precision    recall  f1-score   support

           0       0.88      0.96      0.92      1607
           1       0.75      0.47      0.57       393

    accuracy                           0.86      2000
   macro avg       0.81      0.71      0.75      2000
weighted avg       0.85      0.86      0.85      2000



6: Train and Evaluate Gradient Boosting

In [6]:
# Gradient Boosting: 100 boosting stages with a fixed seed.
print("\nTraining Gradient Boosting...")
gb_model = GradientBoostingClassifier(
    n_estimators=100,
    random_state=42,
).fit(X_train, y_train)  # fit() returns the estimator itself

# Predicted labels for the held-out rows
y_pred_gb = gb_model.predict(X_test)

# Score the predictions against the true test labels
gb_eval_args = (y_test, y_pred_gb)
accuracy_gb = accuracy_score(*gb_eval_args)
conf_matrix_gb = confusion_matrix(*gb_eval_args)
class_report_gb = classification_report(*gb_eval_args)

# Report
print("\nGradient Boosting Evaluation:")
print(f"Accuracy: {accuracy_gb}")
print(f"Confusion Matrix:\n{conf_matrix_gb}")
print(f"Classification Report:\n{class_report_gb}")


Training Gradient Boosting...

Gradient Boosting Evaluation:
Accuracy: 0.866
Confusion Matrix:
[[1547   60]
 [ 208  185]]
Classification Report:
              precision    recall  f1-score   support

           0       0.88      0.96      0.92      1607
           1       0.76      0.47      0.58       393

    accuracy                           0.87      2000
   macro avg       0.82      0.72      0.75      2000
weighted avg       0.86      0.87      0.85      2000

