##  Prediction for Credit Card Approval 

### Importing Required Libraries

In [173]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

### Reading Data

In [174]:
# Reading data
# The encoding is latin1 and we need to specify it
train_data = pd.read_csv("train_data.csv", encoding='latin1')

test_data = pd.read_csv("test_data.csv", encoding='latin1')

print("Training Data is:\n")

print(train_data)

print("\nTesting Data is:\n")

print(test_data)

Training Data is:

            ID Gender Has a car Has a property  Children count    Income  \
0      5037048      M         Y              Y               0  135000.0   
1      5044630      F         Y              N               1  135000.0   
2      5079079      F         N              Y               2  180000.0   
3      5112872      F         Y              Y               0  360000.0   
4      5105858      F         N              N               0  270000.0   
...        ...    ...       ...            ...             ...       ...   
29160  5067139      F         N              Y               0  112500.0   
29161  5029193      F         N              Y               1  135000.0   
29162  5047710      F         N              Y               0   76500.0   
29163  5009886      F         N              Y               0  157500.0   
29164  5062632      F         N              Y               0  585000.0   

          Employment status                Education level  \
0     

## 1. Exploratory Data Analysis (EDA)

### Seeing Columns of the Dataset

In [175]:
print("Columns of Dataset are:\n")

train_data.columns

Columns of Dataset are:



Index(['ID', 'Gender', 'Has a car', 'Has a property', 'Children count',
       'Income', 'Employment status', 'Education level', 'Marital status',
       'Dwelling', 'Age', 'Employment length', 'Has a mobile phone',
       'Has a work phone', 'Has a phone', 'Has an email', 'Job title',
       'Family member count', 'Account age', 'Is high risk'],
      dtype='object')

### Data Preprocessing

### Seeing Null values in the dataset

## Only 1 column has null value we need to address this

In [176]:
# Seeing Unique Categories in dataset
print("Unique Categories in dataset are:\n")
print(train_data['Job title'].value_counts())

# First we need to find if this is present in both training and testing data or only in testing data

print("\nShape of Training Dataset is:", train_data.shape)
print("\nTraining Data Null value Count:\n")
print(train_data.isna().sum())

print("\nShape of Testing Dataset is:", test_data.shape)
print("\nTesting Data Null value Count:\n")
print(test_data.isna().sum())

Unique Categories in dataset are:

Laborers                 5004
Core staff               2866
Sales staff              2773
Managers                 2422
Drivers                  1722
High skill tech staff    1133
Accountants               998
Medicine staff            956
Cooking staff             521
Security staff            464
Cleaning staff            425
Private service staff     287
Low-skill Laborers        138
Waiters/barmen staff      127
Secretaries               122
HR staff                   72
Realty agents              60
IT staff                   48
Name: Job title, dtype: int64

Shape of Training Dataset is: (29165, 20)

Training Data Null value Count:

ID                        0
Gender                    0
Has a car                 0
Has a property            0
Children count            0
Income                    0
Employment status         0
Education level           0
Marital status            0
Dwelling                  0
Age                       0
Employment

### Since the missing values are less in comparison to actual shape of dataset we will drop them

In [177]:
train_data.dropna(inplace = True)

test_data.dropna(inplace = True)

### Seeing the description of output class

### Training Dataset

In [178]:
print("Shape of Training Dataset is:", train_data.shape)

print("\nOutput Class Value Count in training dataset is:\n")
print(train_data['Is high risk'].value_counts())

Shape of Training Dataset is: (20138, 20)

Output Class Value Count in training dataset is:

0    19794
1      344
Name: Is high risk, dtype: int64


In [179]:
# Balancing the dataset
# First we will extract the risk and Not risk Data
Not_risk_data = train_data[train_data['Is high risk'] == 0]
risk_data = train_data[train_data['Is high risk'] == 1]

# Checking the Shape of this Separated Data
print("Shape of Not Fraud data after separating is: ",Not_risk_data.shape)
print("\nShape of Fraud data after separating is: ",risk_data.shape)

Shape of Not Fraud data after separating is:  (19794, 20)

Shape of Fraud data after separating is:  (344, 20)


In [180]:
# We are using the sample function for extracting data
# 344 rows are selected because the number of risk data is 492
# To ensure correct fitting of model we will take the same number of rows for both risk and Not risk Data
Not_risk_Sample = Not_risk_data.sample(n=344)

# Now finally we will concatenate both the samples for Fraud and Not Fraud data
Updated_train_data = pd.concat([Not_risk_Sample, risk_data], axis=0)

# Now seeing the counts in the new dataframe
print("Number/Count of risk and Not risk Data values in Updated Dataframe is:")
Updated_train_data['Is high risk'].value_counts()

Number/Count of risk and Not risk Data values in Updated Dataframe is:


0    344
1    344
Name: Is high risk, dtype: int64

### For Testing Set

In [181]:
print("Shape of Testing Dataset is:", test_data.shape)

print("\nOutput Class Value Count in testing dataset is:\n")
print(test_data['Is high risk'].value_counts())

Shape of Testing Dataset is: (4996, 20)

Output Class Value Count in testing dataset is:

0    4918
1      78
Name: Is high risk, dtype: int64


In [182]:
# Balancing the dataset
# First we will extract the risk and Not risk Data
Not_risk_data = test_data[test_data['Is high risk'] == 0]
risk_data = test_data[test_data['Is high risk'] == 1]

# Checking the Shape of this Separated Data
print("Shape of Not Fraud data after separating is: ",Not_risk_data.shape)
print("\nShape of Fraud data after separating is: ",risk_data.shape)

Shape of Not Fraud data after separating is:  (4918, 20)

Shape of Fraud data after separating is:  (78, 20)


In [183]:
# We are using the sample function for extracting data
# 78 rows are selected because the number of risk data is 78
# To ensure correct fitting of model we will take the same number of rows for both risk and Not risk Data
Not_risk_Sample = Not_risk_data.sample(n=78)

# Now finally we will concatenate both the samples for Risk and Not Risk data
Updated_test_data = pd.concat([Not_risk_Sample, risk_data], axis=0)

# Now seeing the counts in the new dataframe
print("Number/Count of risk and Not risk Data values in Updated Dataframe is:")
Updated_test_data['Is high risk'].value_counts()

Number/Count of risk and Not risk Data values in Updated Dataframe is:


0    78
1    78
Name: Is high risk, dtype: int64

### Seeing Descriptive Statistics of Dataset

In [184]:
print("Descriptive Statistics of Training Dataset are:\n")

Updated_train_data.describe()

Descriptive Statistics of Training Dataset are:



Unnamed: 0,ID,Children count,Income,Age,Employment length,Has a mobile phone,Has a work phone,Has a phone,Has an email,Family member count,Account age,Is high risk
count,688.0,688.0,688.0,688.0,688.0,688.0,688.0,688.0,688.0,688.0,688.0,688.0
mean,5078940.0,0.49564,196811.7,-14685.588663,-2248.677326,1.0,0.280523,0.306686,0.101744,2.264535,-30.792151,0.5
std,42617.25,0.766841,110146.2,3451.669405,1995.826957,0.0,0.449582,0.461453,0.302532,0.931305,15.941545,0.500364
min,5008839.0,0.0,45000.0,-23510.0,-12278.0,1.0,0.0,0.0,0.0,1.0,-60.0,0.0
25%,5045272.0,0.0,135000.0,-17557.25,-3026.75,1.0,0.0,0.0,0.0,2.0,-44.0,0.0
50%,5078804.0,0.0,180000.0,-14301.0,-1697.0,1.0,0.0,0.0,0.0,2.0,-31.5,0.5
75%,5115900.0,1.0,225000.0,-11830.5,-859.25,1.0,1.0,1.0,0.0,3.0,-17.0,1.0
max,5150337.0,4.0,1125000.0,-8168.0,-73.0,1.0,1.0,1.0,1.0,6.0,0.0,1.0


In [185]:
print("\nDescriptive Statistics of Testing Dataset are:\n")

Updated_test_data.describe()


Descriptive Statistics of Testing Dataset are:



Unnamed: 0,ID,Children count,Income,Age,Employment length,Has a mobile phone,Has a work phone,Has a phone,Has an email,Family member count,Account age,Is high risk
count,156.0,156.0,156.0,156.0,156.0,156.0,156.0,156.0,156.0,156.0,156.0,156.0
mean,5087002.0,0.50641,198920.192308,-14854.730769,-2550.948718,1.0,0.294872,0.262821,0.102564,2.326923,-29.525641,0.5
std,40336.05,0.766195,114381.12829,3500.006908,2535.857149,0.0,0.457454,0.441583,0.304366,0.923971,18.176003,0.50161
min,5008945.0,0.0,36000.0,-22800.0,-14810.0,1.0,0.0,0.0,0.0,1.0,-60.0,0.0
25%,5053203.0,0.0,130500.0,-17312.0,-3402.0,1.0,0.0,0.0,0.0,2.0,-46.25,0.0
50%,5091524.0,0.0,180000.0,-14803.0,-1723.0,1.0,0.0,0.0,0.0,2.0,-29.0,0.5
75%,5117792.0,1.0,225000.0,-12225.5,-692.0,1.0,1.0,1.0,0.0,3.0,-12.0,1.0
max,5150353.0,5.0,900000.0,-7489.0,-65.0,1.0,1.0,1.0,1.0,7.0,0.0,1.0


### Seeing Information of Dataset

In [186]:
print("Information of Training Dataset are:\n")

Updated_train_data.info()

Information of Training Dataset are:

<class 'pandas.core.frame.DataFrame'>
Int64Index: 688 entries, 16336 to 28800
Data columns (total 20 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   ID                   688 non-null    int64  
 1   Gender               688 non-null    object 
 2   Has a car            688 non-null    object 
 3   Has a property       688 non-null    object 
 4   Children count       688 non-null    int64  
 5   Income               688 non-null    float64
 6   Employment status    688 non-null    object 
 7   Education level      688 non-null    object 
 8   Marital status       688 non-null    object 
 9   Dwelling             688 non-null    object 
 10  Age                  688 non-null    int64  
 11  Employment length    688 non-null    int64  
 12  Has a mobile phone   688 non-null    int64  
 13  Has a work phone     688 non-null    int64  
 14  Has a phone          688 non-null    int64  
 

In [187]:
print("Information of Testing Dataset are:\n")

Updated_test_data.info()

Information of Testing Dataset are:

<class 'pandas.core.frame.DataFrame'>
Int64Index: 156 entries, 5970 to 7070
Data columns (total 20 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   ID                   156 non-null    int64  
 1   Gender               156 non-null    object 
 2   Has a car            156 non-null    object 
 3   Has a property       156 non-null    object 
 4   Children count       156 non-null    int64  
 5   Income               156 non-null    float64
 6   Employment status    156 non-null    object 
 7   Education level      156 non-null    object 
 8   Marital status       156 non-null    object 
 9   Dwelling             156 non-null    object 
 10  Age                  156 non-null    int64  
 11  Employment length    156 non-null    int64  
 12  Has a mobile phone   156 non-null    int64  
 13  Has a work phone     156 non-null    int64  
 14  Has a phone          156 non-null    int64  
 15 

### Categorical Encoding

In [188]:
from sklearn.preprocessing import LabelEncoder

# Identify object-type columns
object_cols = Updated_train_data.select_dtypes(include=['object']).columns

# Create a LabelEncoder instance
label_encoder = LabelEncoder()

# Loop through each object-type column and encode
for col in object_cols:
    Updated_train_data[col] = label_encoder.fit_transform(Updated_train_data[col])

# Display the encoded DataFrame
Updated_train_data

Unnamed: 0,ID,Gender,Has a car,Has a property,Children count,Income,Employment status,Education level,Marital status,Dwelling,Age,Employment length,Has a mobile phone,Has a work phone,Has a phone,Has an email,Job title,Family member count,Account age,Is high risk
16336,5074548,1,1,1,0,54000.0,3,4,1,1,-19986,-891,1,1,1,0,16,2.0,-28.0,0
20605,5044600,0,1,1,0,202500.0,3,1,1,1,-21227,-829,1,0,0,0,8,2.0,-22.0,0
9253,5114113,1,1,0,0,157500.0,3,4,2,1,-17054,-712,1,0,1,0,16,1.0,-17.0,0
582,5024182,0,0,1,0,81000.0,0,4,1,1,-15580,-3503,1,0,0,0,14,2.0,-56.0,0
24006,5125391,0,0,1,0,135000.0,2,4,3,1,-10066,-3026,1,1,1,0,6,1.0,-8.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
28179,5142740,1,0,0,1,360000.0,3,1,1,1,-11112,-469,1,1,1,0,10,3.0,-45.0,1
28575,5095839,1,0,0,0,225000.0,0,4,1,1,-14882,-1132,1,1,1,0,8,2.0,-39.0,1
28579,5125832,1,0,1,0,270000.0,0,2,3,1,-11625,-195,1,0,0,1,14,1.0,-10.0,1
28718,5061132,1,0,1,0,135000.0,2,1,1,1,-19113,-2816,1,0,1,0,3,2.0,-46.0,1


In [189]:
from sklearn.preprocessing import LabelEncoder

# Identify object-type columns
object_cols = Updated_test_data.select_dtypes(include=['object']).columns

# Create a LabelEncoder instance
label_encoder = LabelEncoder()

# Loop through each object-type column and encode
for col in object_cols:
    Updated_test_data[col] = label_encoder.fit_transform(Updated_test_data[col])

# Display the encoded DataFrame
Updated_test_data

Unnamed: 0,ID,Gender,Has a car,Has a property,Children count,Income,Employment status,Education level,Marital status,Dwelling,Age,Employment length,Has a mobile phone,Has a work phone,Has a phone,Has an email,Job title,Family member count,Account age,Is high risk
5970,5008945,0,0,0,0,157500.0,0,0,1,1,-13642,-4846,1,0,1,0,5,2.0,-30.0,0
5197,5116088,0,0,1,1,360000.0,0,2,1,1,-15812,-565,1,0,0,0,0,3.0,-43.0,0
1175,5091503,0,0,0,0,135000.0,3,2,1,1,-18757,-2057,1,0,0,0,3,2.0,-54.0,0
31,5026054,1,1,1,1,315000.0,3,0,1,1,-10580,-2566,1,0,0,0,3,3.0,-11.0,0
1940,5065767,0,0,1,0,189000.0,3,2,1,1,-14402,-1059,1,0,0,1,12,2.0,-9.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6557,5069216,0,1,1,0,256500.0,0,0,1,1,-11210,-1724,1,1,0,0,0,2.0,-60.0,1
6622,5096209,1,0,0,0,81000.0,3,2,0,1,-14211,-158,1,1,0,0,7,2.0,-12.0,1
6901,5068779,0,0,0,0,180000.0,3,2,3,5,-13396,-3853,1,1,1,0,10,1.0,-11.0,1
6958,5095815,0,0,0,3,112500.0,3,1,1,2,-15447,-2470,1,1,0,0,12,5.0,-51.0,1


### Normalizing Features

In [190]:
Updated_test_data.columns

Index(['ID', 'Gender', 'Has a car', 'Has a property', 'Children count',
       'Income', 'Employment status', 'Education level', 'Marital status',
       'Dwelling', 'Age', 'Employment length', 'Has a mobile phone',
       'Has a work phone', 'Has a phone', 'Has an email', 'Job title',
       'Family member count', 'Account age', 'Is high risk'],
      dtype='object')

In [191]:
from sklearn.preprocessing import MinMaxScaler

# Identify numeric columns
numeric_cols = ['Income', 'Employment length', 'Account age', 'Age']

# Create a MinMaxScaler instance
min_max_scaler = MinMaxScaler()

# Loop through each numeric column and normalize
for col in numeric_cols:
    Updated_train_data[col] = min_max_scaler.fit_transform(Updated_train_data[[col]])

# Display the normalized DataFrame
Updated_train_data

Unnamed: 0,ID,Gender,Has a car,Has a property,Children count,Income,Employment status,Education level,Marital status,Dwelling,Age,Employment length,Has a mobile phone,Has a work phone,Has a phone,Has an email,Job title,Family member count,Account age,Is high risk
16336,5074548,1,1,1,0,0.008333,3,4,1,1,0.229696,0.932978,1,1,1,0,16,2.0,0.533333,0
20605,5044600,0,1,1,0,0.145833,3,1,1,1,0.148807,0.938058,1,0,0,0,8,2.0,0.633333,0
9253,5114113,1,1,0,0,0.104167,3,4,2,1,0.420806,0.947644,1,0,1,0,16,1.0,0.716667,0
582,5024182,0,0,1,0,0.033333,0,4,1,1,0.516882,0.718968,1,0,0,0,14,2.0,0.066667,0
24006,5125391,0,0,1,0,0.083333,2,4,3,1,0.876287,0.758050,1,1,1,0,6,1.0,0.866667,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
28179,5142740,1,0,0,1,0.291667,3,1,1,1,0.808108,0.967554,1,1,1,0,10,3.0,0.250000,1
28575,5095839,1,0,0,0,0.166667,0,4,1,1,0.562378,0.913232,1,1,1,0,8,2.0,0.350000,1
28579,5125832,1,0,1,0,0.208333,0,2,3,1,0.774671,0.990004,1,0,0,1,14,1.0,0.833333,1
28718,5061132,1,0,1,0,0.083333,2,1,1,1,0.286599,0.775256,1,0,1,0,3,2.0,0.233333,1


In [192]:
# Loop through each numeric column and normalize
for col in numeric_cols:
    Updated_test_data[col] = min_max_scaler.fit_transform(Updated_test_data[[col]])

# Display the normalized DataFrame
Updated_test_data

Unnamed: 0,ID,Gender,Has a car,Has a property,Children count,Income,Employment status,Education level,Marital status,Dwelling,Age,Employment length,Has a mobile phone,Has a work phone,Has a phone,Has an email,Job title,Family member count,Account age,Is high risk
5970,5008945,0,0,0,0,0.140625,0,0,1,1,0.598132,0.675754,1,0,1,0,5,2.0,0.500000,0
5197,5116088,0,0,1,1,0.375000,0,2,1,1,0.456404,0.966090,1,0,0,0,0,3.0,0.283333,0
1175,5091503,0,0,0,0,0.114583,3,2,1,1,0.264059,0.864903,1,0,0,0,3,2.0,0.100000,0
31,5026054,1,1,1,1,0.322917,3,0,1,1,0.798119,0.830383,1,0,0,0,3,3.0,0.816667,0
1940,5065767,0,0,1,0,0.177083,3,2,1,1,0.548495,0.932587,1,0,0,1,12,2.0,0.850000,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6557,5069216,0,1,1,0,0.255208,0,0,1,1,0.756972,0.887487,1,1,0,0,0,2.0,0.000000,1
6622,5096209,1,0,0,0,0.052083,3,2,0,1,0.560969,0.993693,1,1,0,0,7,2.0,0.800000,1
6901,5068779,0,0,0,0,0.166667,3,2,3,5,0.614199,0.743099,1,1,1,0,10,1.0,0.816667,1
6958,5095815,0,0,0,3,0.088542,3,1,1,2,0.480243,0.836894,1,1,0,0,12,5.0,0.150000,1


## 2. Feature Engineering

### Removing Unnecessary Columns

In [193]:
Updated_train_data.columns

Index(['ID', 'Gender', 'Has a car', 'Has a property', 'Children count',
       'Income', 'Employment status', 'Education level', 'Marital status',
       'Dwelling', 'Age', 'Employment length', 'Has a mobile phone',
       'Has a work phone', 'Has a phone', 'Has an email', 'Job title',
       'Family member count', 'Account age', 'Is high risk'],
      dtype='object')

In [194]:
Updated_train_data = Updated_train_data.drop(columns = ["ID", 'Gender', 'Children count', 'Has a phone', 
                                                        'Has an email', 'Family member count', 'Marital status', 
                                                        'Has a work phone', 'Age', 'Dwelling', 'Education level'
                                                        ,'Has a car'])
 
Updated_test_data = Updated_test_data.drop(columns = ["ID", 'Gender', 'Children count', 'Has a phone', 
                                                        'Has an email', 'Family member count', 'Marital status', 
                                                        'Has a work phone', 'Age', 'Dwelling', 'Education level'
                                                         ,'Has a car'])

In [195]:
# Saving the dataframe as CSV file
Updated_train_data.to_csv("Updated_Train_Data.csv")

# Saving the dataframe as CSV file
Updated_test_data.to_csv("Updated_Test_Data.csv")

### Splitting into Input and Output

### Training Set

In [196]:
x_train, y_train = Updated_train_data.drop(columns = ['Is high risk'], axis = 1), Updated_train_data['Is high risk']

x_train = x_train.values
y_train = y_train.values

print("Shape of Input is:", x_train.shape)
print("\nShape of Output is:", y_train.shape)

Shape of Input is: (688, 7)

Shape of Output is: (688,)


### Testing Set

In [197]:
x_test, y_test = Updated_test_data.drop(columns = ['Is high risk'], axis = 1), Updated_test_data['Is high risk']

x_test = x_test.values
y_test = y_test.values

print("Shape of Input is:", x_test.shape)
print("\nShape of Output is:", y_test.shape)

Shape of Input is: (156, 7)

Shape of Output is: (156,)


## 3. Machine Learning Model Development

In [198]:
# Importing Algorithms
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

# Creating Object for Algorithms
# Create instances of classifiers using a dictionary
classifiers_dict = {
    "K Nearest Neighbor": KNeighborsClassifier(),
    "Support Vector Machine": SVC(),
    "Decision Tree Classifier": DecisionTreeClassifier(),
    "Naive Bayes Classifier": GaussianNB(),
    "Random Forest Classifier": RandomForestClassifier(),
    "Logistic Regression": LogisticRegression()
}

In [199]:
# For Ignoring Warnings
import warnings
warnings.filterwarnings("ignore")

# We have dictionary so we will directly use it
for classifier_name, classifier_instance in classifiers_dict.items():
    classifier_instance.fit(x_train, y_train)
    print(f"Successfully Trained {classifier_name} Classifier.")

Successfully Trained K Nearest Neighbor Classifier.
Successfully Trained Support Vector Machine Classifier.
Successfully Trained Decision Tree Classifier Classifier.
Successfully Trained Naive Bayes Classifier Classifier.
Successfully Trained Random Forest Classifier Classifier.
Successfully Trained Logistic Regression Classifier.


## 5. Model Evaluation

In [200]:
# For Showing Heading in Jupyter Notebook
from IPython.display import display, HTML

# Importing Required Functions
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Create a dictionary to store accuracies for each model
accuracy_dict = {}

for classifier_name, classifier_instance in classifiers_dict.items():

    # Display model name
    display(HTML(f"<h1 style='font-size:20 px;'>Evaluation Measures for Algorithm {classifier_name} are:</h2><br>"))

    # Training phase
    pred_train = classifier_instance.predict(x_train)

    # Training Accuracy and Classification report and Confusion Matrix
    display(HTML(f"<h2 style='font-size: 14 px'>Training Accuracy is: {accuracy_score(y_train, pred_train)*100} </h1><br>"))
    print("\nConfusion Matrix is:\n")
    print(confusion_matrix(y_train, pred_train))
    print("\nClassification Report is:\n")
    print(classification_report(y_train, pred_train))

    # Testing phase
    pred_test = classifier_instance.predict(x_test)

    # Testing Accuracy and Classification report and Confusion Matrix
    testing_accuracy = accuracy_score(y_test, pred_test) * 100
    display(HTML(f"<h2 style='font-size: 14 px'>Testing Accuracy is: {testing_accuracy} </h2><br>"))
    print("\nConfusion Matrix is:\n")
    print(confusion_matrix(y_test, pred_test))
    print("\nClassification Report is:\n")
    print(classification_report(y_test, pred_test))

    # Store the testing accuracy in the dictionary
    accuracy_dict[classifier_name] = testing_accuracy


Confusion Matrix is:

[[245  99]
 [ 85 259]]

Classification Report is:

              precision    recall  f1-score   support

           0       0.74      0.71      0.73       344
           1       0.72      0.75      0.74       344

    accuracy                           0.73       688
   macro avg       0.73      0.73      0.73       688
weighted avg       0.73      0.73      0.73       688




Confusion Matrix is:

[[33 45]
 [27 51]]

Classification Report is:

              precision    recall  f1-score   support

           0       0.55      0.42      0.48        78
           1       0.53      0.65      0.59        78

    accuracy                           0.54       156
   macro avg       0.54      0.54      0.53       156
weighted avg       0.54      0.54      0.53       156




Confusion Matrix is:

[[160 184]
 [122 222]]

Classification Report is:

              precision    recall  f1-score   support

           0       0.57      0.47      0.51       344
           1       0.55      0.65      0.59       344

    accuracy                           0.56       688
   macro avg       0.56      0.56      0.55       688
weighted avg       0.56      0.56      0.55       688




Confusion Matrix is:

[[30 48]
 [16 62]]

Classification Report is:

              precision    recall  f1-score   support

           0       0.65      0.38      0.48        78
           1       0.56      0.79      0.66        78

    accuracy                           0.59       156
   macro avg       0.61      0.59      0.57       156
weighted avg       0.61      0.59      0.57       156




Confusion Matrix is:

[[344   0]
 [  0 344]]

Classification Report is:

              precision    recall  f1-score   support

           0       1.00      1.00      1.00       344
           1       1.00      1.00      1.00       344

    accuracy                           1.00       688
   macro avg       1.00      1.00      1.00       688
weighted avg       1.00      1.00      1.00       688




Confusion Matrix is:

[[46 32]
 [38 40]]

Classification Report is:

              precision    recall  f1-score   support

           0       0.55      0.59      0.57        78
           1       0.56      0.51      0.53        78

    accuracy                           0.55       156
   macro avg       0.55      0.55      0.55       156
weighted avg       0.55      0.55      0.55       156




Confusion Matrix is:

[[180 164]
 [ 96 248]]

Classification Report is:

              precision    recall  f1-score   support

           0       0.65      0.52      0.58       344
           1       0.60      0.72      0.66       344

    accuracy                           0.62       688
   macro avg       0.63      0.62      0.62       688
weighted avg       0.63      0.62      0.62       688




Confusion Matrix is:

[[40 38]
 [27 51]]

Classification Report is:

              precision    recall  f1-score   support

           0       0.60      0.51      0.55        78
           1       0.57      0.65      0.61        78

    accuracy                           0.58       156
   macro avg       0.59      0.58      0.58       156
weighted avg       0.59      0.58      0.58       156




Confusion Matrix is:

[[344   0]
 [  0 344]]

Classification Report is:

              precision    recall  f1-score   support

           0       1.00      1.00      1.00       344
           1       1.00      1.00      1.00       344

    accuracy                           1.00       688
   macro avg       1.00      1.00      1.00       688
weighted avg       1.00      1.00      1.00       688




Confusion Matrix is:

[[51 27]
 [29 49]]

Classification Report is:

              precision    recall  f1-score   support

           0       0.64      0.65      0.65        78
           1       0.64      0.63      0.64        78

    accuracy                           0.64       156
   macro avg       0.64      0.64      0.64       156
weighted avg       0.64      0.64      0.64       156




Confusion Matrix is:

[[206 138]
 [121 223]]

Classification Report is:

              precision    recall  f1-score   support

           0       0.63      0.60      0.61       344
           1       0.62      0.65      0.63       344

    accuracy                           0.62       688
   macro avg       0.62      0.62      0.62       688
weighted avg       0.62      0.62      0.62       688




Confusion Matrix is:

[[45 33]
 [28 50]]

Classification Report is:

              precision    recall  f1-score   support

           0       0.62      0.58      0.60        78
           1       0.60      0.64      0.62        78

    accuracy                           0.61       156
   macro avg       0.61      0.61      0.61       156
weighted avg       0.61      0.61      0.61       156



## Finding the Model with Highest accuracy.

In [201]:
# Finding the model with the maximum testing accuracy
best_model = max(accuracy_dict, key=accuracy_dict.get)
display(HTML(f"<h3 style='font-size: 14 px'>The model with the Maximum Testing accuracy is: {best_model} with accuracy {accuracy_dict[best_model]}</h3>"))

### Implementing Hyperparameter Tuning

### Creating Parameter Grid For each Model
### Since SVC will take much resources we are removing it

In [202]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer

# Create instances of classifiers using a dictionary
classifiers_dict = {
    "K Nearest Neighbor": KNeighborsClassifier(),
    "Decision Tree Classifier": DecisionTreeClassifier(),
    "Naive Bayes Classifier": GaussianNB(),
    "Random Forest Classifier": RandomForestClassifier(),
    "Logistic Regression": LogisticRegression()
}

param_grid = {
    'K Nearest Neighbor': {'n_neighbors': [3, 5, 7], 'weights': ['uniform', 'distance'], 'algorithm': ['ball_tree', 'kd_tree', 'brute']},

    'Decision Tree Classifier': {
        'max_depth': [None, 10, 20],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4],
        'criterion': ['gini', 'entropy'],
    },

    'Naive Bayes Classifier': {},  # Naive Bayes doesn't have hyperparameters

    'Random Forest Classifier': {
        'n_estimators': [50, 100, 200],
        'max_depth': [None, 10, 20],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4],
        'criterion': ['gini', 'entropy'],
    },

    'Logistic Regression': {
        'C': [0.1, 1, 10],
        'penalty': ['l1', 'l2'],
        'solver': ['liblinear', 'saga'],
        'max_iter': [100, 200, 300],
    }
}

# Create a dictionary to store grid search results and best models
grid_search_results = {}

### Looping through each classifier and giving the parameters and evaluating.

In [203]:
# Importing Required Functions
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Create a dictionary to store grid search results and best models
grid_search_results = {}

# Loop through each classifier
for classifier_name, classifier_instance in classifiers_dict.items():

    # Display model name
    display(HTML(f"<h1 style='font-size:20 px;'>Grid Search and Evaluation Measures for Algorithm {classifier_name} are:</h2><br>"))

    # Set up GridSearchCV
    grid_search = GridSearchCV(classifier_instance, param_grid[classifier_name], scoring=make_scorer(accuracy_score), cv=5, n_jobs=-1)

    # Fit the model using GridSearchCV
    grid_search.fit(x_train, y_train)

    # Get the best model from GridSearchCV
    best_model = grid_search.best_estimator_

    # Store grid search results
    grid_search_results[classifier_name] = {
        'best_model': best_model,
        'grid_search': grid_search
    }

    # Display best parameters
    # Display best parameters without brackets
    best_params_str = ', '.join(f'{key}: {value}' for key, value in grid_search.best_params_.items())
    display(HTML(f"<h2 style='font-size: 14 px'>Best Parameters are: {best_params_str} </h2><br>"))


    # Display evaluation metrics for the best model
    pred_train = best_model.predict(x_train)
    display(HTML(f"<h2 style='font-size: 14 px'>Best Model Training Accuracy is: {accuracy_score(y_train, pred_train)*100} </h1><br>"))
    print("\nConfusion Matrix for Best Model on Training Data is:\n")
    print(confusion_matrix(y_train, pred_train))
    print("\nClassification Report for Best Model on Training Data is:\n")
    print(classification_report(y_train, pred_train))

    pred_test = best_model.predict(x_test)

    # Testing Accuracy and Classification report and Confusion Matrix
    testing_accuracy = accuracy_score(y_test, pred_test) * 100
    display(HTML(f"<h2 style='font-size: 14 px'>Best Model Testing Accuracy is: {testing_accuracy} </h2><br>"))
    print("\nConfusion Matrix for Best Model on Testing Data is:\n")
    print(confusion_matrix(y_test, pred_test))
    print("\nClassification Report for Best Model on Testing Data is:\n")
    print(classification_report(y_test, pred_test))



Confusion Matrix for Best Model on Training Data is:

[[344   0]
 [  0 344]]

Classification Report for Best Model on Training Data is:

              precision    recall  f1-score   support

           0       1.00      1.00      1.00       344
           1       1.00      1.00      1.00       344

    accuracy                           1.00       688
   macro avg       1.00      1.00      1.00       688
weighted avg       1.00      1.00      1.00       688




Confusion Matrix for Best Model on Testing Data is:

[[34 44]
 [27 51]]

Classification Report for Best Model on Testing Data is:

              precision    recall  f1-score   support

           0       0.56      0.44      0.49        78
           1       0.54      0.65      0.59        78

    accuracy                           0.54       156
   macro avg       0.55      0.54      0.54       156
weighted avg       0.55      0.54      0.54       156




Confusion Matrix for Best Model on Training Data is:

[[300  44]
 [ 50 294]]

Classification Report for Best Model on Training Data is:

              precision    recall  f1-score   support

           0       0.86      0.87      0.86       344
           1       0.87      0.85      0.86       344

    accuracy                           0.86       688
   macro avg       0.86      0.86      0.86       688
weighted avg       0.86      0.86      0.86       688




Confusion Matrix for Best Model on Testing Data is:

[[53 25]
 [40 38]]

Classification Report for Best Model on Testing Data is:

              precision    recall  f1-score   support

           0       0.57      0.68      0.62        78
           1       0.60      0.49      0.54        78

    accuracy                           0.58       156
   macro avg       0.59      0.58      0.58       156
weighted avg       0.59      0.58      0.58       156




Confusion Matrix for Best Model on Training Data is:

[[180 164]
 [ 96 248]]

Classification Report for Best Model on Training Data is:

              precision    recall  f1-score   support

           0       0.65      0.52      0.58       344
           1       0.60      0.72      0.66       344

    accuracy                           0.62       688
   macro avg       0.63      0.62      0.62       688
weighted avg       0.63      0.62      0.62       688




Confusion Matrix for Best Model on Testing Data is:

[[40 38]
 [27 51]]

Classification Report for Best Model on Testing Data is:

              precision    recall  f1-score   support

           0       0.60      0.51      0.55        78
           1       0.57      0.65      0.61        78

    accuracy                           0.58       156
   macro avg       0.59      0.58      0.58       156
weighted avg       0.59      0.58      0.58       156




Confusion Matrix for Best Model on Training Data is:

[[338   6]
 [ 12 332]]

Classification Report for Best Model on Training Data is:

              precision    recall  f1-score   support

           0       0.97      0.98      0.97       344
           1       0.98      0.97      0.97       344

    accuracy                           0.97       688
   macro avg       0.97      0.97      0.97       688
weighted avg       0.97      0.97      0.97       688




Confusion Matrix for Best Model on Testing Data is:

[[48 30]
 [29 49]]

Classification Report for Best Model on Testing Data is:

              precision    recall  f1-score   support

           0       0.62      0.62      0.62        78
           1       0.62      0.63      0.62        78

    accuracy                           0.62       156
   macro avg       0.62      0.62      0.62       156
weighted avg       0.62      0.62      0.62       156




Confusion Matrix for Best Model on Training Data is:

[[204 140]
 [117 227]]

Classification Report for Best Model on Training Data is:

              precision    recall  f1-score   support

           0       0.64      0.59      0.61       344
           1       0.62      0.66      0.64       344

    accuracy                           0.63       688
   macro avg       0.63      0.63      0.63       688
weighted avg       0.63      0.63      0.63       688




Confusion Matrix for Best Model on Testing Data is:

[[43 35]
 [27 51]]

Classification Report for Best Model on Testing Data is:

              precision    recall  f1-score   support

           0       0.61      0.55      0.58        78
           1       0.59      0.65      0.62        78

    accuracy                           0.60       156
   macro avg       0.60      0.60      0.60       156
weighted avg       0.60      0.60      0.60       156



## Finding the Model with Highest accuracy after Hyperparameter Tuning

In [204]:
best_model_name = max(grid_search_results, key=lambda k: accuracy_score(y_test, grid_search_results[k]['best_model'].predict(x_test)))
best_model = grid_search_results[best_model_name]['best_model']
display(HTML(f"<h3 style='font-size: 14 px'>The model with the Maximum Testing accuracy after Hyperparameter Tuning is: {best_model_name} with accuracy {accuracy_score(y_test, best_model.predict(x_test))*100}</h3>"))