## Model Training

#### 1.1 Import Data and Required Packages
##### Importing Pandas, Numpy, Matplotlib, Seaborn and Warings Library.

In [1]:
# Basic Import
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
import seaborn as sns
# Modelling
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, f1_score,classification_report
import warnings

#### Import the CSV Data as Pandas DataFrame

In [2]:
df = pd.read_csv('data/Churn_Modelling.csv')
df.shape

(10000, 14)

#### Show Top 5 Records

In [3]:
df.head()

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,3,15619304,Onio,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,4,15701354,Boni,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,5,15737888,Mitchell,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


#### Preparing X and Y variables

In [4]:
X = df.drop(columns=['CustomerId','RowNumber','Surname','Exited'],axis=1)

In [5]:
X.head()

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary
0,619,France,Female,42,2,0.0,1,1,1,101348.88
1,608,Spain,Female,41,1,83807.86,1,0,1,112542.58
2,502,France,Female,42,8,159660.8,3,1,0,113931.57
3,699,France,Female,39,1,0.0,2,0,0,93826.63
4,850,Spain,Female,43,2,125510.82,1,1,1,79084.1


In [6]:
print("Categories in 'Geography' variable:     ",end=" " )
print(df['Geography'].unique())

print("Categories in 'Gender' variable:  ",end=" ")
print(df['Gender'].unique())

Categories in 'Geography' variable:      ['France' 'Spain' 'Germany']
Categories in 'Gender' variable:   ['Female' 'Male']


In [7]:
df.columns

Index(['RowNumber', 'CustomerId', 'Surname', 'CreditScore', 'Geography',
       'Gender', 'Age', 'Tenure', 'Balance', 'NumOfProducts', 'HasCrCard',
       'IsActiveMember', 'EstimatedSalary', 'Exited'],
      dtype='object')

In [8]:
y = df['Exited']

In [9]:
# Create Column Transformer with 3 types of transformers
num_features = X.select_dtypes(exclude="object").columns
cat_features = X.select_dtypes(include="object").columns

from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer

numeric_transformer = StandardScaler()
oh_transformer = OneHotEncoder()

preprocessor = ColumnTransformer(
    [
        ("OneHotEncoder", oh_transformer, cat_features),
         ("StandardScaler", numeric_transformer, num_features),        
    ]
)

In [10]:
num_features

Index(['CreditScore', 'Age', 'Tenure', 'Balance', 'NumOfProducts', 'HasCrCard',
       'IsActiveMember', 'EstimatedSalary'],
      dtype='object')

In [11]:
X = preprocessor.fit_transform(X)


In [12]:
X

array([[ 1.        ,  0.        ,  0.        , ...,  0.64609167,
         0.97024255,  0.02188649],
       [ 0.        ,  0.        ,  1.        , ..., -1.54776799,
         0.97024255,  0.21653375],
       [ 1.        ,  0.        ,  0.        , ...,  0.64609167,
        -1.03067011,  0.2406869 ],
       ...,
       [ 1.        ,  0.        ,  0.        , ..., -1.54776799,
         0.97024255, -1.00864308],
       [ 0.        ,  1.        ,  0.        , ...,  0.64609167,
        -1.03067011, -0.12523071],
       [ 1.        ,  0.        ,  0.        , ...,  0.64609167,
        -1.03067011, -1.07636976]])

In [13]:
X.shape

(10000, 13)

In [14]:
# separate dataset into train and test
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.3,random_state=42)
X_train.shape, X_test.shape

((7000, 13), (3000, 13))

In [15]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

def evaluate_model(true, predicted):
    accuracy = accuracy_score(true, predicted)
    precision = precision_score(true, predicted)
    recall = recall_score(true, predicted)
    f1 = f1_score(true, predicted)
    return accuracy, precision, recall, f1

### RandomOverSampler for handeling imbalanced data

In [16]:
from imblearn.over_sampling import RandomOverSampler

oversampler = RandomOverSampler(random_state=42)
X_train, y_train = oversampler.fit_resample(X_train, y_train)


#### Create an Evaluate Function to give all metrics after model Training

In [17]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

def evaluate_model(true, predicted):
    accuracy = accuracy_score(true, predicted)
    precision = precision_score(true, predicted)
    recall = recall_score(true, predicted)
    f1 = f1_score(true, predicted)
    return accuracy, precision, recall, f1


In [18]:

# Define models for classification
models = {
    "Logistic Regression": LogisticRegression(),
    "K-Neighbors Classifier": KNeighborsClassifier(),
    "Decision Tree Classifier": DecisionTreeClassifier(),
    "Random Forest Classifier": RandomForestClassifier(n_estimators=300),
    # "model_xgb" : XGBClassifier(),
    "AdaBoost Classifier": AdaBoostClassifier()
}

model_list = []
accuracy_list = []

# Train and evaluate each model
for model_name, model in models.items():
    model.fit(X_train, y_train)  # Train model
    
    # Make predictions
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)
    
    # Evaluate model
    train_accuracy = accuracy_score(y_train, y_train_pred)
    test_accuracy = accuracy_score(y_test, y_test_pred)
    
    model_list.append(model_name)
    accuracy_list.append(test_accuracy)
    
    print(model_name)
    print('Model performance for Training set')
    print("- Accuracy: {:.4f}".format(train_accuracy))
    print('\n')
    print('Model performance for Test set')
    print("- Accuracy: {:.4f}".format(test_accuracy))
    print('\n')
    print('='*35)
    print('\n')

# Find the best performing model
best_model_idx = accuracy_list.index(max(accuracy_list))
best_model_name = model_list[best_model_idx]
print("Best performing model:", best_model_name)


Logistic Regression
Model performance for Training set
- Accuracy: 0.7061


Model performance for Test set
- Accuracy: 0.7043




K-Neighbors Classifier
Model performance for Training set
- Accuracy: 0.8959


Model performance for Test set
- Accuracy: 0.7280




Decision Tree Classifier
Model performance for Training set
- Accuracy: 1.0000


Model performance for Test set
- Accuracy: 0.7953




Random Forest Classifier
Model performance for Training set
- Accuracy: 1.0000


Model performance for Test set
- Accuracy: 0.8603




AdaBoost Classifier
Model performance for Training set
- Accuracy: 0.7743


Model performance for Test set
- Accuracy: 0.7930




Best performing model: Random Forest Classifier


In [19]:
# Define models for classification
models = {
    "Logistic Regression": LogisticRegression(),
    "K-Neighbors Classifier": KNeighborsClassifier(),
    "Decision Tree Classifier": DecisionTreeClassifier(),
    "Random Forest Classifier": RandomForestClassifier(n_estimators=300),
    # "model_xgb" : XGBClassifier(),
    "AdaBoost Classifier": AdaBoostClassifier()

    
}

accuracy_dict = {}  # Dictionary to store model accuracies

# Train and evaluate each model
for model_name, model in models.items():
    model.fit(X_train, y_train)  # Train model
    
    # Make predictions
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)
    
    # Evaluate model
    train_accuracy = accuracy_score(y_train, y_train_pred)
    test_accuracy = accuracy_score(y_test, y_test_pred)
    
    accuracy_dict[model_name] = test_accuracy  # Store test accuracy in dictionary
    

accuracy_dict



{'Logistic Regression': 0.7043333333333334,
 'K-Neighbors Classifier': 0.728,
 'Decision Tree Classifier': 0.791,
 'Random Forest Classifier': 0.859,
 'AdaBoost Classifier': 0.793}

In [20]:
# Find the best performing model
best_model_name = max(accuracy_dict, key=accuracy_dict.get)
best_model_accuracy = accuracy_dict[best_model_name]
print("Best performing model:", best_model_name)
print("Accuracy of best performing model:", best_model_accuracy)

Best performing model: Random Forest Classifier
Accuracy of best performing model: 0.859


### Results

In [21]:
pd.DataFrame(list(zip(model_list, accuracy_list)), columns=['Model Name', 'accuracy']).sort_values(by=["accuracy"],ascending=False)

# model_list

Unnamed: 0,Model Name,accuracy
3,Random Forest Classifier,0.860333
2,Decision Tree Classifier,0.795333
4,AdaBoost Classifier,0.793
1,K-Neighbors Classifier,0.728
0,Logistic Regression,0.704333


In [22]:
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, f1_score,classification_report

rf = RandomForestClassifier()
lr = LogisticRegression()
ad = AdaBoostClassifier()
dt = DecisionTreeClassifier()
kn = KNeighborsClassifier()


models = [rf, lr, ad, dt, kn]
scores = []

for m in models:
    m.fit(X_train, y_train)
    y_pred = m.predict(X_test)
    acc = accuracy_score(y_test, y_pred) * 100
    prec = precision_score(y_test, y_pred, average='macro') * 100
    rec = recall_score(y_test, y_pred, average='macro') * 100
    f1 = f1_score(y_test, y_pred, average='macro') * 100
    scores.append([acc, prec, rec, f1])



In [23]:
scores_df = pd.DataFrame(columns=['Model'], data=['Random Forest','Logistic Regression', 'adaboost', 'Decision Tree',"k-nearest"])
scores_df = pd.concat([scores_df, pd.DataFrame(scores, columns=['Accuracy', 'Precision', 'Recall', 'F1'])], axis=1)
scores_df

Unnamed: 0,Model,Accuracy,Precision,Recall,F1
0,Random Forest,85.533333,77.509689,73.424601,75.135589
1,Logistic Regression,70.433333,63.653333,70.411866,63.713565
2,adaboost,79.3,70.489013,77.669702,72.363114
3,Decision Tree,79.533333,67.44158,67.621961,67.530545
4,k-nearest,72.8,64.294773,70.323131,65.087922


In [24]:
models 

report = classification_report(y_test, rf.predict(X_test), output_dict=True)
print(report)
report  = pd.DataFrame(report).transpose()
obj  = report.to_json()
obj

{'0': {'precision': 0.8926307448494454, 'recall': 0.9325331125827815, 'f1-score': 0.9121457489878543, 'support': 2416.0}, '1': {'precision': 0.657563025210084, 'recall': 0.535958904109589, 'f1-score': 0.590566037735849, 'support': 584.0}, 'accuracy': 0.8553333333333333, 'macro avg': {'precision': 0.7750968850297647, 'recall': 0.7342460083461853, 'f1-score': 0.7513558933618516, 'support': 3000.0}, 'weighted avg': {'precision': 0.8468708954263164, 'recall': 0.8553333333333333, 'f1-score': 0.8495448985307973, 'support': 3000.0}}


'{"precision":{"0":0.8926307448,"1":0.6575630252,"accuracy":0.8553333333,"macro avg":0.775096885,"weighted avg":0.8468708954},"recall":{"0":0.9325331126,"1":0.5359589041,"accuracy":0.8553333333,"macro avg":0.7342460083,"weighted avg":0.8553333333},"f1-score":{"0":0.912145749,"1":0.5905660377,"accuracy":0.8553333333,"macro avg":0.7513558934,"weighted avg":0.8495448985},"support":{"0":2416.0,"1":584.0,"accuracy":0.8553333333,"macro avg":3000.0,"weighted avg":3000.0}}'

In [25]:
rep = pd.read_json(obj)
print(rep)

              precision    recall  f1-score      support
0              0.892631  0.932533  0.912146  2416.000000
1              0.657563  0.535959  0.590566   584.000000
accuracy       0.855333  0.855333  0.855333     0.855333
macro avg      0.775097  0.734246  0.751356  3000.000000
weighted avg   0.846871  0.855333  0.849545  3000.000000


  rep = pd.read_json(obj)
  rep = pd.read_json(obj)
  rep = pd.read_json(obj)
  rep = pd.read_json(obj)


In [26]:
import pandas as pd
from sklearn.metrics import classification_report
import os

# List of models
models = [rf, lr, ad, dt, kn]

# Path to the 'artifacts' folder relative to the current script
artifacts_folder = '../artifacts'
# Ensure the 'artifacts' folder exists
if not os.path.exists(artifacts_folder):
    os.makedirs(artifacts_folder)

# Assuming y_test and X_test are already defined
for model in models:
    model_name = type(model).__name__  # Get the name of the model class
    report = classification_report(y_test, model.predict(X_test), output_dict=True)
    report_df = pd.DataFrame(report).transpose()

    # Convert DataFrame to JSON object
    report_json = report_df.to_json()

    # Write JSON object to a file named after the model in the 'artifacts' folder
    file_path = os.path.join(artifacts_folder, f'{model_name}_classification_report.json')
    with open(file_path, 'w') as json_file:
        json_file.write(report_json)

    print(f'Report for {model_name} saved to {file_path}')


Report for RandomForestClassifier saved to ../artifacts\RandomForestClassifier_classification_report.json
Report for LogisticRegression saved to ../artifacts\LogisticRegression_classification_report.json
Report for AdaBoostClassifier saved to ../artifacts\AdaBoostClassifier_classification_report.json
Report for DecisionTreeClassifier saved to ../artifacts\DecisionTreeClassifier_classification_report.json
Report for KNeighborsClassifier saved to ../artifacts\KNeighborsClassifier_classification_report.json


In [27]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

# 1. Train the AdaBoost model
model = AdaBoostClassifier(random_state=42, n_estimators=50, learning_rate=1.0)  # Example hyperparameters
model.fit(X_train, y_train)

# 2. Use the trained model to predict labels for the test data
y_pred = model.predict(X_test)

# 3. Calculate the evaluation metrics of the model
accuracy = accuracy_score(y_test, y_pred)
print("AdaBoost Accuracy:", accuracy)



AdaBoost Accuracy: 0.793


### SVM Classifier

In [28]:
# from sklearn.svm import SVC
# from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

# # 1. Train the SVM model
# # Use probability=True to enable probability estimates which are required for ROC AUC score
# model = SVC(random_state=42, kernel='rbf', probability=True)
# model.fit(X_train, y_train)

# # 2. Use the trained model to predict labels for the test data
# y_pred = model.predict(X_test)

# # 3. Calculate the accuracy of the model
# accuracy = accuracy_score(y_test, y_pred)

# print("SVM Accuracy:", accuracy)

# # Calculate other evaluation metrics
# precision = precision_score(y_test, y_pred)
# recall = recall_score(y_test, y_pred)
# f1 = f1_score(y_test, y_pred)
# roc_auc = roc_auc_score(y_test, model.predict_proba(X_test)[:, 1])

# print(f'Precision: {precision:.2f}')
# print(f'Recall: {recall:.2f}')
# print(f'F1 Score: {f1:.2f}')
# print(f'ROC AUC Score: {roc_auc:.2f}')


### XGB Classifier

In [29]:
# from xgboost import XGBClassifier
# from sklearn.metrics import accuracy_score

# # 1. Train the XGBoost model
# model_xgb = XGBClassifier()
# model_xgb.fit(X_train, y_train)

# # 2. Use the trained model to predict labels for the test data
# y_pred_xgb = model_xgb.predict(X_test)

# # 3. Calculate the accuracy of the XGBoost model
# accuracy_xgb = accuracy_score(y_test, y_pred_xgb)

# print("XGBoost Accuracy:", accuracy_xgb)


# from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score

# precision = precision_score(y_test, y_pred)
# recall = recall_score(y_test, y_pred)
# f1 = f1_score(y_test, y_pred)
# roc_auc = roc_auc_score(y_test, model.predict_proba(X_test)[:, 1])

# print(f'Precision: {precision:.2f}')
# print(f'Recall: {recall:.2f}')
# print(f'F1 Score: {f1:.2f}')
# print(f'ROC AUC Score: {roc_auc:.2f}')