Data Preprocessing

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
# Location of the churn CSV; "/content" is presumably a Colab working
# directory, so adjust this path when running elsewhere.
DATA_PATH = "/content/Churn_Modelling.csv"
dataset = pd.read_csv(DATA_PATH)

In [None]:
# Preview the first rows of the freshly loaded data.
dataset.head()

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,3,15619304,Onio,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,4,15701354,Boni,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,5,15737888,Mitchell,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


In [None]:
# Remove the row-number, customer-id and surname columns before modelling.
# errors='ignore' keeps this cell safe to re-run: `dataset` is rebound here,
# so on a second execution the columns are already gone and a plain drop
# would raise KeyError.
dataset = dataset.drop(
    columns=['RowNumber', 'CustomerId', 'Surname'],
    errors='ignore',
)

In [None]:
# Preview again to confirm the identifier columns are gone.
dataset.head()

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


In [None]:
from sklearn.preprocessing import LabelEncoder, StandardScaler

# One encoder per categorical column; both stay available under their own
# names after this cell.
labelencoder_geography, labelencoder_gender = LabelEncoder(), LabelEncoder()

# Replace each text column with the integer codes learned by its encoder.
# NOTE(review): integer codes put an arbitrary order on Geography; consider
# one-hot encoding if an order-sensitive model is ever used.
for column, encoder in (
    ('Geography', labelencoder_geography),
    ('Gender', labelencoder_gender),
):
    dataset[column] = encoder.fit_transform(dataset[column])

In [None]:
# Preview again: Geography and Gender now hold integer codes.
dataset.head()

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,619,0,0,42,2,0.0,1,1,1,101348.88,1
1,608,2,0,41,1,83807.86,1,0,1,112542.58,0
2,502,0,0,42,8,159660.8,3,1,0,113931.57,1
3,699,0,0,39,1,0.0,2,0,0,93826.63,0
4,850,2,0,43,2,125510.82,1,1,1,79084.1,0


In [None]:
# Separate the label column from the feature columns.
target_column = 'Exited'
y = dataset[target_column]
X = dataset.drop(columns=[target_column])

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for testing; the fixed seed keeps the split
# repeatable between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

In [None]:
# Report (rows, columns) for the training and test feature sets, in that order.
for features in (X_train, X_test):
    print(features.shape)

(8000, 10)
(2000, 10)


In [None]:
from sklearn.preprocessing import StandardScaler
# Fit the scaler on the training split only, then apply that same fitted
# scaler to both splits so the test data never influences the statistics.
# NOTE(review): X_train / X_test are rebound to the scaled versions, so
# re-running this cell alone would scale already-scaled data.
sc = StandardScaler().fit(X_train)
X_train, X_test = sc.transform(X_train), sc.transform(X_test)

In [None]:
from sklearn.ensemble import GradientBoostingClassifier
# Baseline gradient-boosting model with default hyperparameters.
# The seed makes the fit reproducible across kernel restarts and matches the
# random_state used for the estimator inside the grid search, so the baseline
# and tuned results are comparable.
classifier = GradientBoostingClassifier(random_state=42)
classifier.fit(X_train, y_train)

In [None]:
# Disabled alternative: uncommenting these lines would bind a random forest
# to the same `classifier` name used by the prediction cells.
# from sklearn.ensemble import RandomForestClassifier
# classifier = RandomForestClassifier(n_estimators=10, criterion='entropy', random_state=0)
# classifier.fit(X_train, y_train)

In [None]:
# Disabled alternative: uncommenting these lines would bind a logistic
# regression to the same `classifier` name used by the prediction cells.
# from sklearn.linear_model import LogisticRegression
# classifier = LogisticRegression(random_state=0)
# classifier.fit(X_train, y_train)

In [None]:
# Predict labels for the held-out test features with the fitted classifier.
y_pred = classifier.predict(X_test)

In [None]:
# Quick look at the predicted labels (long arrays are printed abbreviated).
print(y_pred)

[0 0 0 ... 0 0 0]


In [None]:
from sklearn.metrics import accuracy_score,confusion_matrix
# Score the baseline predictions against the true test labels.
conf_matrix = confusion_matrix(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)

print("Accuracy:", accuracy)
print("\nConfusion Matrix:\n", conf_matrix)

Accuracy: 0.8645

Confusion Matrix:
 [[1527   68]
 [ 203  202]]


Fine tuning

In [None]:
from sklearn.model_selection import GridSearchCV

# Candidate hyperparameter values for the gradient-boosting search:
# 3 options for each of 5 parameters, i.e. 243 combinations.
param_grid = dict(
    n_estimators=[100, 200, 300],
    learning_rate=[0.01, 0.05, 0.1],
    max_depth=[3, 4, 5],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 5],
)

In [None]:
# Exhaustive 5-fold cross-validated search over param_grid, ranking the
# candidates by accuracy. Nothing is fitted until .fit() is called.
base_estimator = GradientBoostingClassifier(random_state=42)
grid_search = GridSearchCV(
    base_estimator,
    param_grid,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,  # run the fits in parallel on all available cores
    verbose=2,
)

In [None]:
# Run the search on the training split only. This is the expensive cell:
# 243 parameter combinations x 5 folds = 1215 model fits.
grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 243 candidates, totalling 1215 fits


In [None]:
# Keep the estimator the search selected, then report its hyperparameters.
best_classifier = grid_search.best_estimator_
print("Best Parameters:", grid_search.best_params_)

Best Parameters: {'learning_rate': 0.05, 'max_depth': 3, 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 300}


In [None]:
# Predict on the same held-out test features with the tuned model.
y_pred_tuned = best_classifier.predict(X_test)

In [None]:
# Quick look at the tuned model's predicted labels.
print(y_pred_tuned)

[0 0 0 ... 0 0 0]


In [None]:
# Score the tuned model's predictions against the true test labels.
accuracy_tuned = accuracy_score(y_test, y_pred_tuned)
conf_matrix_tuned = confusion_matrix(y_test, y_pred_tuned)

# Neutral label: tuning is not guaranteed to raise test accuracy, so the
# output must not announce an improvement unconditionally.
print("\nAccuracy after Tuning:", accuracy_tuned)
print("\nConfusion Matrix after Tuning:\n", conf_matrix_tuned)


Improved Accuracy after Tuning: 0.8645

Confusion Matrix after Tuning:
 [[1524   71]
 [ 200  205]]
