In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier
from sklearn.metrics import accuracy_score , precision_score , recall_score , f1_score
from sklearn.model_selection import GridSearchCV


In [2]:
# Read the churn dataset; the path is relative to the notebook's working directory.
data = pd.read_csv(filepath_or_buffer='Churn_Modelling.csv')

In [3]:
# Print the column labels of the loaded frame.
print(data.columns)

Index(['RowNumber', 'CustomerId', 'Surname', 'CreditScore', 'Geography',
       'Gender', 'Age', 'Tenure', 'Balance', 'NumOfProducts', 'HasCrCard',
       'IsActiveMember', 'EstimatedSalary', 'Exited'],
      dtype='object')


In [5]:
# Remove the three identifier-style columns so they are not part of the
# feature matrix built later.
data = data.drop(
    labels=['RowNumber', 'CustomerId', 'Surname'],
    axis='columns',
)

In [6]:
# Print the column labels again to confirm which columns remain after the drop.
print(data.columns)

Index(['CreditScore', 'Geography', 'Gender', 'Age', 'Tenure', 'Balance',
       'NumOfProducts', 'HasCrCard', 'IsActiveMember', 'EstimatedSalary',
       'Exited'],
      dtype='object')


In [7]:
# Integer-encode the two categorical text columns.
# Each column gets its own fitted LabelEncoder, stored in `label_encoders`,
# so the code -> category mapping of every column stays available afterwards
# (e.g. for `inverse_transform`). Refitting one shared encoder would keep only
# the mapping of the last column it was fitted on.
# `label_encoder` still ends up bound to the 'Gender' encoder, as before.
# NOTE(review): the integer codes impose an arbitrary order on 'Geography';
# consider one-hot encoding it if that matters for the linear model.
label_encoders = {}
for column in ('Geography', 'Gender'):
    label_encoder = LabelEncoder()
    data[column] = label_encoder.fit_transform(data[column])
    label_encoders[column] = label_encoder

In [8]:
# Separate the target column from the predictors.
y = data['Exited']
X = data.drop('Exited', axis=1)

In [10]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [11]:
# Learn the scaling statistics from the training split only, then apply the
# same fitted transform to both splits.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [12]:
# The three candidate classifiers, all created with the same fixed seed.
log_reg, rf_clf, gb_clf = (
    LogisticRegression(random_state=42),
    RandomForestClassifier(random_state=42),
    GradientBoostingClassifier(random_state=42),
)

In [16]:

# Train the gradient-boosting classifier on the scaled training split.
gb_clf.fit(X=X_train, y=y_train)

In [14]:
# Train the logistic-regression classifier on the scaled training split.
log_reg.fit(X=X_train, y=y_train)


In [15]:
# Train the random-forest classifier on the scaled training split.
rf_clf.fit(X=X_train, y=y_train)


In [29]:
# Predict the held-out split with each fitted classifier, in a fixed order:
# logistic regression, random forest, gradient boosting.
y_pred_log_reg, y_pred_rf, y_pred_gb = (
    fitted_model.predict(X_test)
    for fitted_model in (log_reg, rf_clf, gb_clf)
)

In [23]:
def evaluate_model(y_test, y_pred):
    """Score predictions against the true labels with four metrics.

    Parameters
    ----------
    y_test : array-like
        Ground-truth labels.
    y_pred : array-like
        Labels predicted for the same samples.

    Returns
    -------
    tuple
        ``(accuracy, precision, recall, f1)``, each computed by the
        corresponding scorer called as ``scorer(y_test, y_pred)``.
    """
    # Order of this tuple fixes the order of the returned values.
    scorers = (accuracy_score, precision_score, recall_score, f1_score)
    return tuple(scorer(y_test, y_pred) for scorer in scorers)

In [25]:
from sklearn.ensemble import GradientBoostingClassifier


In [26]:
# Second gradient-boosting model, fitted on the same training split.
# It is seeded like the other estimators in this notebook so that repeated
# runs produce the same model; an unseeded estimator draws a fresh random
# state on every run.
model_gb = GradientBoostingClassifier(random_state=42)
model_gb.fit(X_train, y_train)

In [27]:
# Predict the held-out split with `model_gb`; this rebinds `y_pred_gb`.
y_pred_gb = model_gb.predict(X=X_test)


In [28]:
# Compute the (accuracy, precision, recall, f1) tuple for each set of
# predictions, in the order: logistic regression, random forest, gradient boosting.
log_reg_metrics, rf_metrics, gb_metrics = (
    evaluate_model(y_test, predictions)
    for predictions in (y_pred_log_reg, y_pred_rf, y_pred_gb)
)

In [30]:
# Report the metric tuple of the logistic-regression model.
print("logistic regressionn metrics:{}".format(log_reg_metrics))

logistic regressionn metrics:(0.815, 0.5966386554621849, 0.1806615776081425, 0.27734375)


In [32]:
# Report the metric tuple of the gradient-boosting model.
print("gradient boosting metrices:{}".format(gb_metrics))

gradient boosting metrices:(0.8655, 0.7540983606557377, 0.4681933842239186, 0.5777080062794349)


In [33]:
# Hyperparameter values to try for the random forest
# (3 x 4 x 3 = 36 combinations).
param_grid = dict(
    n_estimators=[100, 200, 300],
    max_depth=[None, 10, 20, 30],
    min_samples_split=[2, 5, 10],
)

In [34]:
# Exhaustive search over `param_grid` for the random forest, evaluated with
# 3-fold cross-validation; n_jobs=-1 runs the fits in parallel on all cores.
grid_search = GridSearchCV(
    estimator=rf_clf,
    param_grid=param_grid,
    cv=3,
    n_jobs=-1,
    verbose=2,
)
# The search must be fitted before `best_params_` / `best_score_` exist;
# without this call the cells that read them fail with AttributeError on a
# fresh kernel.
grid_search.fit(X_train, y_train)

In [36]:
# Parameter combination selected by the grid search.
# `best_params_` is only available once `grid_search` has been fitted.
best_params = grid_search.best_params_

In [37]:
# Cross-validation score of the selected parameter combination.
# `best_score_` is only available once `grid_search` has been fitted.
best_score = grid_search.best_score_

In [38]:
# Report the outcome of the hyperparameter search.
print("best parameters:{}".format(best_params))
print("best cross-validation score :{}".format(best_score))

best parameters:{'max_depth': 10, 'min_samples_split': 5, 'n_estimators': 100}
best cross-validation score :0.861499636251395


In [39]:
# Build a random forest from the selected hyperparameters (same fixed seed as
# the other estimators) and train it on the training split.
best_rf_clf = RandomForestClassifier(random_state=42, **best_params)
best_rf_clf.fit(X=X_train, y=y_train)

In [40]:
# Predict the held-out split with the tuned forest and score it with the same
# four metrics used for the other models.
y_pred_best_rf = best_rf_clf.predict(X=X_test)
best_rf_metrics = evaluate_model(y_test=y_test, y_pred=y_pred_best_rf)


In [42]:
# Report the metric tuple of the tuned random forest.
print("best random forest metrics:{}".format(best_rf_metrics))

best random forest metrics:(0.8655, 0.7924528301886793, 0.42748091603053434, 0.5553719008264463)
