## Model Training

#### 1.1 Import Data and Required Packages
##### Importing Pandas, Numpy, Matplotlib, Seaborn and Warings Library.

In [1]:
# Basic Import
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
import seaborn as sns
# Modelling
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import warnings

#### Import the CSV Data as Pandas DataFrame

In [2]:
df = pd.read_csv('data/Churn_Modelling.csv')
df.shape

(10000, 14)

#### Show Top 5 Records

In [3]:
df.head()

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,3,15619304,Onio,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,4,15701354,Boni,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,5,15737888,Mitchell,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


#### Preparing X and Y variables

In [4]:
X = df.drop(columns=['CustomerId','RowNumber','Surname','Exited'],axis=1)

In [5]:
X.head()

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary
0,619,France,Female,42,2,0.0,1,1,1,101348.88
1,608,Spain,Female,41,1,83807.86,1,0,1,112542.58
2,502,France,Female,42,8,159660.8,3,1,0,113931.57
3,699,France,Female,39,1,0.0,2,0,0,93826.63
4,850,Spain,Female,43,2,125510.82,1,1,1,79084.1


In [6]:
print("Categories in 'Geography' variable:     ",end=" " )
print(df['Geography'].unique())

print("Categories in 'Gender' variable:  ",end=" ")
print(df['Gender'].unique())

Categories in 'Geography' variable:      ['France' 'Spain' 'Germany']
Categories in 'Gender' variable:   ['Female' 'Male']


In [7]:
df.columns

Index(['RowNumber', 'CustomerId', 'Surname', 'CreditScore', 'Geography',
       'Gender', 'Age', 'Tenure', 'Balance', 'NumOfProducts', 'HasCrCard',
       'IsActiveMember', 'EstimatedSalary', 'Exited'],
      dtype='object')

In [8]:
y = df['Exited']

In [9]:
# Create Column Transformer with 3 types of transformers
num_features = X.select_dtypes(exclude="object").columns
cat_features = X.select_dtypes(include="object").columns

from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer

numeric_transformer = StandardScaler()
oh_transformer = OneHotEncoder()

preprocessor = ColumnTransformer(
    [
        ("OneHotEncoder", oh_transformer, cat_features),
         ("StandardScaler", numeric_transformer, num_features),        
    ]
)

In [10]:
num_features

Index(['CreditScore', 'Age', 'Tenure', 'Balance', 'NumOfProducts', 'HasCrCard',
       'IsActiveMember', 'EstimatedSalary'],
      dtype='object')

In [11]:
X = preprocessor.fit_transform(X)


In [12]:
X

array([[ 1.        ,  0.        ,  0.        , ...,  0.64609167,
         0.97024255,  0.02188649],
       [ 0.        ,  0.        ,  1.        , ..., -1.54776799,
         0.97024255,  0.21653375],
       [ 1.        ,  0.        ,  0.        , ...,  0.64609167,
        -1.03067011,  0.2406869 ],
       ...,
       [ 1.        ,  0.        ,  0.        , ..., -1.54776799,
         0.97024255, -1.00864308],
       [ 0.        ,  1.        ,  0.        , ...,  0.64609167,
        -1.03067011, -0.12523071],
       [ 1.        ,  0.        ,  0.        , ...,  0.64609167,
        -1.03067011, -1.07636976]])

In [13]:
X.shape

(10000, 13)

In [14]:
# separate dataset into train and test
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.3,random_state=42)
X_train.shape, X_test.shape

((7000, 13), (3000, 13))

#### Create an Evaluate Function to give all metrics after model Training

In [15]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

def evaluate_model(true, predicted):
    accuracy = accuracy_score(true, predicted)
    precision = precision_score(true, predicted)
    recall = recall_score(true, predicted)
    f1 = f1_score(true, predicted)
    return accuracy, precision, recall, f1


In [16]:

# Define models for classification
models = {
    "Logistic Regression": LogisticRegression(),
    "K-Neighbors Classifier": KNeighborsClassifier(),
    "Decision Tree Classifier": DecisionTreeClassifier(),
    "Random Forest Classifier": RandomForestClassifier(),
    "AdaBoost Classifier": AdaBoostClassifier()
}

model_list = []
accuracy_list = []

# Train and evaluate each model
for model_name, model in models.items():
    model.fit(X_train, y_train)  # Train model
    
    # Make predictions
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)
    
    # Evaluate model
    train_accuracy = accuracy_score(y_train, y_train_pred)
    test_accuracy = accuracy_score(y_test, y_test_pred)
    
    model_list.append(model_name)
    accuracy_list.append(test_accuracy)
    
    print(model_name)
    print('Model performance for Training set')
    print("- Accuracy: {:.4f}".format(train_accuracy))
    print('\n')
    print('Model performance for Test set')
    print("- Accuracy: {:.4f}".format(test_accuracy))
    print('\n')
    print('='*35)
    print('\n')

# Find the best performing model
best_model_idx = accuracy_list.index(max(accuracy_list))
best_model_name = model_list[best_model_idx]
print("Best performing model:", best_model_name)


Logistic Regression
Model performance for Training set
- Accuracy: 0.8103


Model performance for Test set
- Accuracy: 0.8113






K-Neighbors Classifier
Model performance for Training set
- Accuracy: 0.8764


Model performance for Test set
- Accuracy: 0.8403




Decision Tree Classifier
Model performance for Training set
- Accuracy: 1.0000


Model performance for Test set
- Accuracy: 0.8080




Random Forest Classifier
Model performance for Training set
- Accuracy: 1.0000


Model performance for Test set
- Accuracy: 0.8687




AdaBoost Classifier
Model performance for Training set
- Accuracy: 0.8553


Model performance for Test set
- Accuracy: 0.8620




Best performing model: Random Forest Classifier


In [17]:
# Define models for classification
models = {
    "Logistic Regression": LogisticRegression(),
    "K-Neighbors Classifier": KNeighborsClassifier(),
    "Decision Tree Classifier": DecisionTreeClassifier(),
    "Random Forest Classifier": RandomForestClassifier(),
    "AdaBoost Classifier": AdaBoostClassifier()
}

accuracy_dict = {}  # Dictionary to store model accuracies

# Train and evaluate each model
for model_name, model in models.items():
    model.fit(X_train, y_train)  # Train model
    
    # Make predictions
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)
    
    # Evaluate model
    train_accuracy = accuracy_score(y_train, y_train_pred)
    test_accuracy = accuracy_score(y_test, y_test_pred)
    
    accuracy_dict[model_name] = test_accuracy  # Store test accuracy in dictionary
    
    # print(model_name)
    # print('Model performance for Training set')
    # print("- Accuracy: {:.4f}".format(train_accuracy))
    # print('\n')
    # print('Model performance for Test set')
    # print("- Accuracy: {:.4f}".format(test_accuracy))
    # print('\n')
    # print('='*35)
    # print('\n')

accuracy_dict

# Find the best performing model
best_model_name = max(accuracy_dict, key=accuracy_dict.get)
best_model_accuracy = accuracy_dict[best_model_name]
print("Best performing model:", best_model_name)
print("Accuracy of best performing model:", best_model_accuracy)


Best performing model: Random Forest Classifier
Accuracy of best performing model: 0.8703333333333333


In [18]:
best_model_name

model = models[best_model_name]
model.accura


AttributeError: 'RandomForestClassifier' object has no attribute 'accura'

### Results

In [None]:
pd.DataFrame(list(zip(model_list, accuracy_list)), columns=['Model Name', 'accuracy']).sort_values(by=["accuracy"],ascending=False)

# model_list

Unnamed: 0,Model Name,accuracy
3,Random Forest Classifier,0.865333
4,AdaBoost Classifier,0.862
1,K-Neighbors Classifier,0.840333
0,Logistic Regression,0.811333
2,Decision Tree Classifier,0.804667


## Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# 1. Train the logistic regression model
model = LogisticRegression()
model.fit(X_train, y_train)

# 2. Use the trained model to predict labels for the test data
y_pred = model.predict(X_test)

# 3. Calculate the accuracy of the model
accuracy = accuracy_score(y_test, y_pred)

print("Logistic Regression Accuracy:", accuracy)


Logistic Regression Accuracy: 0.8113333333333334
