## Model Training

#### 1.1 Import Data and Required Packages
##### Importing Pandas, Numpy, Matplotlib, Seaborn and Warings Library.

In [90]:
# Basic Import
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
import seaborn as sns
# Modelling
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import warnings

#### Import the CSV Data as Pandas DataFrame

In [91]:
df = pd.read_csv('data/Churn_Modelling.csv')
df.shape

(10000, 14)

#### Show Top 5 Records

In [92]:
df.head()

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,3,15619304,Onio,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,4,15701354,Boni,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,5,15737888,Mitchell,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


#### Preparing X and Y variables

In [93]:
X = df.drop(columns=['CustomerId','RowNumber','Surname','Exited'],axis=1)

In [94]:
X.head()

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary
0,619,France,Female,42,2,0.0,1,1,1,101348.88
1,608,Spain,Female,41,1,83807.86,1,0,1,112542.58
2,502,France,Female,42,8,159660.8,3,1,0,113931.57
3,699,France,Female,39,1,0.0,2,0,0,93826.63
4,850,Spain,Female,43,2,125510.82,1,1,1,79084.1


In [95]:
print("Categories in 'Geography' variable:     ",end=" " )
print(df['Geography'].unique())

print("Categories in 'Gender' variable:  ",end=" ")
print(df['Gender'].unique())

Categories in 'Geography' variable:      ['France' 'Spain' 'Germany']
Categories in 'Gender' variable:   ['Female' 'Male']


In [96]:
df.columns

Index(['RowNumber', 'CustomerId', 'Surname', 'CreditScore', 'Geography',
       'Gender', 'Age', 'Tenure', 'Balance', 'NumOfProducts', 'HasCrCard',
       'IsActiveMember', 'EstimatedSalary', 'Exited'],
      dtype='object')

In [97]:
y = df['Exited']

In [98]:
# Create Column Transformer with 3 types of transformers
num_features = X.select_dtypes(exclude="object").columns
cat_features = X.select_dtypes(include="object").columns

from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer

numeric_transformer = StandardScaler()
oh_transformer = OneHotEncoder()

preprocessor = ColumnTransformer(
    [
        ("OneHotEncoder", oh_transformer, cat_features),
         ("StandardScaler", numeric_transformer, num_features),        
    ]
)

In [99]:
num_features

Index(['CreditScore', 'Age', 'Tenure', 'Balance', 'NumOfProducts', 'HasCrCard',
       'IsActiveMember', 'EstimatedSalary'],
      dtype='object')

In [100]:
X = preprocessor.fit_transform(X)


In [101]:
X

array([[ 1.        ,  0.        ,  0.        , ...,  0.64609167,
         0.97024255,  0.02188649],
       [ 0.        ,  0.        ,  1.        , ..., -1.54776799,
         0.97024255,  0.21653375],
       [ 1.        ,  0.        ,  0.        , ...,  0.64609167,
        -1.03067011,  0.2406869 ],
       ...,
       [ 1.        ,  0.        ,  0.        , ..., -1.54776799,
         0.97024255, -1.00864308],
       [ 0.        ,  1.        ,  0.        , ...,  0.64609167,
        -1.03067011, -0.12523071],
       [ 1.        ,  0.        ,  0.        , ...,  0.64609167,
        -1.03067011, -1.07636976]])

In [102]:
X.shape

(10000, 13)

In [103]:
# separate dataset into train and test
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.3,random_state=42)
X_train.shape, X_test.shape

((7000, 13), (3000, 13))

#### Create an Evaluate Function to give all metrics after model Training

In [104]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

def evaluate_model(true, predicted):
    accuracy = accuracy_score(true, predicted)
    precision = precision_score(true, predicted)
    recall = recall_score(true, predicted)
    f1 = f1_score(true, predicted)
    return accuracy, precision, recall, f1


In [105]:

# Define models for classification
models = {
    "Logistic Regression": LogisticRegression(),
    "K-Neighbors Classifier": KNeighborsClassifier(),
    "Decision Tree Classifier": DecisionTreeClassifier(),
    "Random Forest Classifier": RandomForestClassifier(),
    "AdaBoost Classifier": AdaBoostClassifier()
}

model_list = []
accuracy_list = []

# Train and evaluate each model
for model_name, model in models.items():
    model.fit(X_train, y_train)  # Train model
    
    # Make predictions
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)
    
    # Evaluate model
    train_accuracy = accuracy_score(y_train, y_train_pred)
    test_accuracy = accuracy_score(y_test, y_test_pred)
    
    model_list.append(model_name)
    accuracy_list.append(test_accuracy)
    
    print(model_name)
    print('Model performance for Training set')
    print("- Accuracy: {:.4f}".format(train_accuracy))
    print('\n')
    print('Model performance for Test set')
    print("- Accuracy: {:.4f}".format(test_accuracy))
    print('\n')
    print('='*35)
    print('\n')

# Find the best performing model
best_model_idx = accuracy_list.index(max(accuracy_list))
best_model_name = model_list[best_model_idx]
print("Best performing model:", best_model_name)


Logistic Regression
Model performance for Training set
- Accuracy: 0.8103


Model performance for Test set
- Accuracy: 0.8113




K-Neighbors Classifier
Model performance for Training set
- Accuracy: 0.8764


Model performance for Test set
- Accuracy: 0.8403




Decision Tree Classifier
Model performance for Training set
- Accuracy: 1.0000


Model performance for Test set
- Accuracy: 0.8053




Random Forest Classifier
Model performance for Training set
- Accuracy: 1.0000


Model performance for Test set
- Accuracy: 0.8680




AdaBoost Classifier
Model performance for Training set
- Accuracy: 0.8553


Model performance for Test set
- Accuracy: 0.8620




Best performing model: Random Forest Classifier


In [106]:
# Define models for classification
models = {
    "Logistic Regression": LogisticRegression(),
    "K-Neighbors Classifier": KNeighborsClassifier(),
    "Decision Tree Classifier": DecisionTreeClassifier(),
    "Random Forest Classifier": RandomForestClassifier(),
    "AdaBoost Classifier": AdaBoostClassifier()
}

accuracy_dict = {}  # Dictionary to store model accuracies

# Train and evaluate each model
for model_name, model in models.items():
    model.fit(X_train, y_train)  # Train model
    
    # Make predictions
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)
    
    # Evaluate model
    train_accuracy = accuracy_score(y_train, y_train_pred)
    test_accuracy = accuracy_score(y_test, y_test_pred)
    
    accuracy_dict[model_name] = test_accuracy  # Store test accuracy in dictionary
    
    # print(model_name)
    # print('Model performance for Training set')
    # print("- Accuracy: {:.4f}".format(train_accuracy))
    # print('\n')
    # print('Model performance for Test set')
    # print("- Accuracy: {:.4f}".format(test_accuracy))
    # print('\n')
    # print('='*35)
    # print('\n')

accuracy_dict

# Find the best performing model
best_model_name = max(accuracy_dict, key=accuracy_dict.get)
best_model_accuracy = accuracy_dict[best_model_name]
print("Best performing model:", best_model_name)
print("Accuracy of best performing model:", best_model_accuracy)


Best performing model: Random Forest Classifier
Accuracy of best performing model: 0.868


In [107]:
# best_model_name

# model = models[best_model_name]
# model.accura


### Results

In [108]:
pd.DataFrame(list(zip(model_list, accuracy_list)), columns=['Model Name', 'accuracy']).sort_values(by=["accuracy"],ascending=False)

# model_list

Unnamed: 0,Model Name,accuracy
3,Random Forest Classifier,0.868
4,AdaBoost Classifier,0.862
1,K-Neighbors Classifier,0.840333
0,Logistic Regression,0.811333
2,Decision Tree Classifier,0.805333


## Logistic Regression

In [109]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# 1. Train the logistic regression model
model = LogisticRegression()
model.fit(X_train, y_train)

# 2. Use the trained model to predict labels for the test data
y_pred = model.predict(X_test)

# 3. Calculate the accuracy of the model
accuracy = accuracy_score(y_test, y_pred)

print("Logistic Regression Accuracy:", accuracy)


Logistic Regression Accuracy: 0.8113333333333334


In [110]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# 1. Train the Random Forest model
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

# 2. Use the trained model to predict labels for the test data
y_pred = model.predict(X_test)

# 3. Calculate the accuracy of the model
accuracy = accuracy_score(y_test, y_pred)

print("Random Forest Accuracy:", accuracy)


Random Forest Accuracy: 0.87


In [111]:
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score

# 1. Train the XGBoost model
model_xgb = XGBClassifier()
model_xgb.fit(X_train, y_train)

# 2. Use the trained model to predict labels for the test data
y_pred_xgb = model_xgb.predict(X_test)

# 3. Calculate the accuracy of the XGBoost model
accuracy_xgb = accuracy_score(y_test, y_pred_xgb)

print("XGBoost Accuracy:", accuracy_xgb)


from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score

precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, model.predict_proba(X_test)[:, 1])

print(f'Precision: {precision:.2f}')
print(f'Recall: {recall:.2f}')
print(f'F1 Score: {f1:.2f}')
print(f'ROC AUC Score: {roc_auc:.2f}')

XGBoost Accuracy: 0.866
Precision: 0.77
Recall: 0.47
F1 Score: 0.58
ROC AUC Score: 0.85


In [112]:
from catboost import CatBoostClassifier
from sklearn.metrics import accuracy_score

# 1. Train the CatBoost model
model_cat = CatBoostClassifier()
model_cat.fit(X_train, y_train)

# 2. Use the trained model to predict labels for the test data
y_pred_cat = model_cat.predict(X_test)

# 3. Calculate the accuracy of the CatBoost model
accuracy_cat = accuracy_score(y_test, y_pred_cat)

print("CatBoost Accuracy:", accuracy_cat)


from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score

precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, model.predict_proba(X_test)[:, 1])

print(f'Precision: {precision:.2f}')
print(f'Recall: {recall:.2f}')
print(f'F1 Score: {f1:.2f}')
print(f'ROC AUC Score: {roc_auc:.2f}')


Learning rate set to 0.023648
0:	learn: 0.6745132	total: 4.16ms	remaining: 4.15s
1:	learn: 0.6596611	total: 6.71ms	remaining: 3.35s
2:	learn: 0.6420350	total: 9.96ms	remaining: 3.31s
3:	learn: 0.6259242	total: 13.5ms	remaining: 3.36s
4:	learn: 0.6105638	total: 18ms	remaining: 3.58s
5:	learn: 0.5962061	total: 20.8ms	remaining: 3.45s
6:	learn: 0.5819227	total: 23.7ms	remaining: 3.36s
7:	learn: 0.5684988	total: 26.5ms	remaining: 3.28s
8:	learn: 0.5560580	total: 29.3ms	remaining: 3.22s
9:	learn: 0.5461308	total: 33.7ms	remaining: 3.34s
10:	learn: 0.5353059	total: 38.1ms	remaining: 3.43s
11:	learn: 0.5249049	total: 41.2ms	remaining: 3.39s
12:	learn: 0.5168687	total: 44ms	remaining: 3.34s
13:	learn: 0.5094566	total: 47.4ms	remaining: 3.34s
14:	learn: 0.5027829	total: 50.3ms	remaining: 3.3s
15:	learn: 0.4963175	total: 53.1ms	remaining: 3.27s
16:	learn: 0.4875542	total: 55.9ms	remaining: 3.23s
17:	learn: 0.4815962	total: 58.7ms	remaining: 3.2s
18:	learn: 0.4763628	total: 61.5ms	remaining: 3.17

In [113]:
# from sklearn.ensemble import RandomForestClassifier
# from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, classification_report
# from sklearn.model_selection import GridSearchCV
# from sklearn.pipeline import Pipeline
# from sklearn.preprocessing import StandardScaler
# from sklearn.impute import SimpleImputer
# import numpy as np

# # Pipeline with preprocessing and model
# pipeline = Pipeline([
#     ('imputer', SimpleImputer(strategy='mean')),
#     ('scaler', StandardScaler()),
#     ('classifier', RandomForestClassifier(random_state=42))
# ])

# # Hyperparameter tuning using GridSearchCV
# param_grid = {
#     'classifier__n_estimators': [100, 200, 300],
#     'classifier__max_depth': [None, 10, 20, 30],
#     'classifier__min_samples_split': [2, 5, 10],
#     'classifier__min_samples_leaf': [1, 2, 4]
# }

# grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='f1', n_jobs=-1)
# grid_search.fit(X_train, y_train)

# # Best model
# best_model = grid_search.best_estimator_

# # Predict probabilities for adjusting threshold
# y_prob = best_model.predict_proba(X_test)[:, 1]

# # Adjust classification threshold
# threshold = 0.3  # Example threshold, you can tune this
# y_pred_adjusted = (y_prob >= threshold).astype(int)

# # Calculate metrics
# accuracy = accuracy_score(y_test, y_pred_adjusted)
# precision = precision_score(y_test, y_pred_adjusted)
# recall = recall_score(y_test, y_pred_adjusted)
# f1 = f1_score(y_test, y_pred_adjusted)
# roc_auc = roc_auc_score(y_test, y_prob)

# # Print metrics
# print("Adjusted Random Forest Model Performance:")
# print(f"Accuracy: {accuracy:.2f}")
# print(f"Precision: {precision:.2f}")
# print(f"Recall: {recall:.2f}")
# print(f"F1 Score: {f1:.2f}")
# print(f"ROC AUC Score: {roc_auc:.2f}")

# # Detailed classification report
# print("\nClassification Report:")
# print(classification_report(y_test, y_pred_adjusted))


In [114]:
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score

precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, model.predict_proba(X_test)[:, 1])

print(f'Precision: {precision:.2f}')
print(f'Recall: {recall:.2f}')
print(f'F1 Score: {f1:.2f}')
print(f'ROC AUC Score: {roc_auc:.2f}')


Precision: 0.77
Recall: 0.47
F1 Score: 0.58
ROC AUC Score: 0.85


In [115]:
# import pickle
# file_path = 'model.pkl'
# # Load the pickled model
# with open(file_path, 'rb') as f:
#     model = pickle.load(f)

# # Example usage: make predictions
# # X_test = ...  # Load or create your test data
# # y_pred = model.predict(X_test)
# y_pred = model.predict(X_test)

# # Example: print some information about the loaded model
# print("Model type:", type(model))
# print("Model details:", model)

# from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score

# precision = precision_score(y_test, y_pred)
# recall = recall_score(y_test, y_pred)
# f1 = f1_score(y_test, y_pred)
# roc_auc = roc_auc_score(y_test, model.predict_proba(X_test)[:, 1])

# print(f'Precision: {precision:.2f}')
# print(f'Recall: {recall:.2f}')
# print(f'F1 Score: {f1:.2f}')
# print(f'ROC AUC Score: {roc_auc:.2f}')
