importing dependencies

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import accuracy_score, classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from imblearn.over_sampling import SMOTE

loading datasets

In [2]:
data = pd.read_csv('Churn_Modelling.csv')

In [3]:
data = data.drop(columns=['RowNumber', 'CustomerId', 'Surname'])

In [10]:
print(data.head())
print(data.tail())
print(data.describe())
print(data.isnull().sum)
print(data.count())
print(data.shape)

   CreditScore Geography  Gender  Age  Tenure    Balance  NumOfProducts  \
0          619    France  Female   42       2       0.00              1   
1          608     Spain  Female   41       1   83807.86              1   
2          502    France  Female   42       8  159660.80              3   
3          699    France  Female   39       1       0.00              2   
4          850     Spain  Female   43       2  125510.82              1   

   HasCrCard  IsActiveMember  EstimatedSalary  Exited  
0          1               1        101348.88       1  
1          0               1        112542.58       0  
2          1               0        113931.57       1  
3          0               0         93826.63       0  
4          1               1         79084.10       0  
      CreditScore Geography  Gender  Age  Tenure    Balance  NumOfProducts  \
9995          771    France    Male   39       5       0.00              2   
9996          516    France    Male   35      10   57369.

encoding variables

In [11]:
label_encoders = {}
for column in ['Geography', 'Gender']:
    le = LabelEncoder()
    data[column] = le.fit_transform(data[column])
    label_encoders[column] = le

splitting features from target

In [12]:
X = data.drop(columns=['Exited'])
y = data['Exited']

normalising data

In [14]:
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [15]:
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

handling imbalance


In [16]:
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)



logistic regression



In [17]:
model_lr = LogisticRegression(random_state=42)
model_lr.fit(X_train, y_train)

predictions_lr = model_lr.predict(X_test)

accuracy_lr = accuracy_score(y_test, predictions_lr)
report_lr = classification_report(y_test, predictions_lr, zero_division=0)

print("\nLogistic Regression:")
print(f"Accuracy: {accuracy_lr:.2f}")
print("Classification Report:")
print(report_lr)


Logistic Regression:
Accuracy: 0.82
Classification Report:
              precision    recall  f1-score   support

           0       0.83      0.97      0.89      1607
           1       0.60      0.18      0.28       393

    accuracy                           0.82      2000
   macro avg       0.71      0.58      0.59      2000
weighted avg       0.78      0.82      0.77      2000



random forest

In [18]:
model_rf = RandomForestClassifier(random_state=42)
model_rf.fit(X_train, y_train)

predictions_rf = model_rf.predict(X_test)

accuracy_rf = accuracy_score(y_test, predictions_rf)
report_rf = classification_report(y_test, predictions_rf, zero_division=0)

print("\nRandom Forest:")
print(f"Accuracy: {accuracy_rf:.2f}")
print("Classification Report:")
print(report_rf)


Random Forest:
Accuracy: 0.86
Classification Report:
              precision    recall  f1-score   support

           0       0.88      0.96      0.92      1607
           1       0.75      0.47      0.57       393

    accuracy                           0.86      2000
   macro avg       0.81      0.71      0.75      2000
weighted avg       0.85      0.86      0.85      2000



gradient boosting

In [19]:
model_gb = GradientBoostingClassifier(random_state=42)
model_gb.fit(X_train, y_train)

predictions_gb = model_gb.predict(X_test)

accuracy_gb = accuracy_score(y_test, predictions_gb)
report_gb = classification_report(y_test, predictions_gb, zero_division=0)

print("\nGradient Boosting:")
print(f"Accuracy: {accuracy_gb:.2f}")
print("Classification Report:")
print(report_gb)


Gradient Boosting:
Accuracy: 0.87
Classification Report:
              precision    recall  f1-score   support

           0       0.88      0.96      0.92      1607
           1       0.76      0.47      0.58       393

    accuracy                           0.87      2000
   macro avg       0.82      0.72      0.75      2000
weighted avg       0.86      0.87      0.85      2000

