In [1]:
"""
Lab: Handling Imbalanced Datasets with Sampling Techniques

Objective: 
In this lab, you will work with an imbalanced dataset (Churn_Modelling.csv) and apply various sampling techniques to improve the model's performance on minority classes.

Tasks:
1. Load and preprocess the dataset
2. Implement and evaluate the baseline model
3. Apply undersampling and evaluate
4. Apply oversampling and evaluate
5. Implement SMOTE and evaluate
6. Implement ADASYN and evaluate
7. Compare the results of different sampling techniques

Instructions:
- Fill in the missing code in the sections marked with TODO comments
- Compare the performance of different sampling techniques

"""

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler, SMOTE, ADASYN

In [2]:
data = pd.read_csv("Churn_Modelling.csv")
data1 = pd.read_csv("Churn_Modelling.csv")
data.head(20)

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,3,15619304,Onio,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,4,15701354,Boni,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,5,15737888,Mitchell,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0
5,6,15574012,Chu,645,Spain,Male,44,8,113755.78,2,1,0,149756.71,1
6,7,15592531,Bartlett,822,France,Male,50,7,0.0,2,1,1,10062.8,0
7,8,15656148,Obinna,376,Germany,Female,29,4,115046.74,4,1,0,119346.88,1
8,9,15792365,He,501,France,Male,44,4,142051.07,2,0,1,74940.5,0
9,10,15592389,H?,684,France,Male,27,2,134603.88,1,1,1,71725.73,0


In [12]:
majority_class_0=data[data['Exited']==0]
majority_class_0

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
1,2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
3,4,15701354,Boni,699,France,Female,39,1,0.00,2,0,0,93826.63,0
4,5,15737888,Mitchell,850,Spain,Female,43,2,125510.82,1,1,1,79084.10,0
6,7,15592531,Bartlett,822,France,Male,50,7,0.00,2,1,1,10062.80,0
8,9,15792365,He,501,France,Male,44,4,142051.07,2,0,1,74940.50,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9993,9994,15569266,Rahman,644,France,Male,28,7,155060.41,1,1,0,29179.52,0
9994,9995,15719294,Wood,800,France,Female,29,2,0.00,2,0,0,167773.55,0
9995,9996,15606229,Obijiaku,771,France,Male,39,5,0.00,2,1,0,96270.64,0
9996,9997,15569892,Johnstone,516,France,Male,35,10,57369.61,1,1,1,101699.77,0


In [13]:
miniority_class_1=data[data['Exited']==1]

In [14]:
miniority_class_1

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2,0.00,1,1,1,101348.88,1
2,3,15619304,Onio,502,France,Female,42,8,159660.80,3,1,0,113931.57,1
5,6,15574012,Chu,645,Spain,Male,44,8,113755.78,2,1,0,149756.71,1
7,8,15656148,Obinna,376,Germany,Female,29,4,115046.74,4,1,0,119346.88,1
16,17,15737452,Romeo,653,Germany,Male,58,1,132602.88,1,1,0,5097.67,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9981,9982,15672754,Burbidge,498,Germany,Male,42,3,152039.70,1,1,1,53445.17,1
9982,9983,15768163,Griffin,655,Germany,Female,46,7,137145.12,1,1,0,115146.40,1
9991,9992,15769959,Ajuluchukwu,597,France,Female,53,4,88381.21,1,1,0,69384.71,1
9997,9998,15584532,Liu,709,France,Female,36,7,0.00,1,0,1,42085.58,1


In [33]:
from sklearn.utils import shuffle

data=shuffle(data)
data.head(30)

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
3054,3055,15791174,Leibius,540,Spain,Male,67,1,88382.01,1,0,1,59457.0,0
6660,6661,15765258,Bochsa,776,France,Female,29,5,0.0,2,1,1,143301.49,0
442,443,15672145,Swift,534,France,Female,34,7,121551.58,2,1,1,70179.0,0
8418,8419,15731026,Han,683,Germany,Female,39,2,100062.16,2,1,0,109201.43,0
5101,5102,15721462,Shubin,622,Spain,Female,58,2,0.0,2,1,1,33277.31,0
6287,6288,15668289,McWilliams,690,Spain,Male,32,2,76087.98,1,0,1,151822.66,0
3422,3423,15654346,Poninski,679,Germany,Male,35,1,130463.55,2,1,1,37341.17,0
2189,2190,15798310,Palerma,480,France,Male,35,2,165692.91,1,1,1,197984.58,0
9340,9341,15605804,Watson,737,France,Male,45,10,0.0,2,1,0,1364.54,0
8451,8452,15580043,Murray,575,Spain,Female,22,8,105229.34,1,1,1,34397.08,0


In [34]:

# Prepare the features and target
x = data.drop(['RowNumber', 'CustomerId', 'Surname', 'Exited'], axis=1)
y = data['Exited'] # the churn data

In [14]:
# Perform one-hot encoding for categorical variables
x = pd.get_dummies(x, drop_first=True, dtype=int)
x.head(10)
x.shape

(10000, 11)

In [30]:
groups=[]
x_train=[]
x_test_holdout=[]
y_test_holdout=miniority_class_1.iloc[1500:,:]
y_train=miniority_class_1.iloc[:1500,:]

N=5

splittingratio=int(len(majority_class_0)/N)

for n in range(N):
    # x_train.rename({"x_train":f"x_train{n}"}),x_test,y_train,y_test

    # x_train.rename({"x_train":f"x_train{n}"})=train_test_split(x, y, test_size= 0.2, random_state=42)

    x__train=majority_class_0.iloc[(n*splittingratio):(n+1)*splittingratio,:]
    
    x_train.append(x__train.rename({"x__train":f"x_train{n}"}))

if len(x_train)==N:
    x_test_holdout.append(x_train[-1])

In [9]:
# Function to train and evaluate the model
def train_and_evaluate(x_train, y_train, x_test, y_test):
    for x in range(len(x_train)):
        
    model = RandomForestClassifier(random_state=42)
    model.fit(x_train, y_train)
    return model.predict(x_test)

In [10]:
from sklearn.metrics import confusion_matrix

confusion_matrix(y_test, train_and_evaluate(x_train, y_train, x_test, y_test))

array([[1548,   59],
       [ 208,  185]], dtype=int64)

In [11]:
#print class-wise accuracies from the confusion matrix, print it

from sklearn.metrics import accuracy_score

accuracy_score(y_test, train_and_evaluate(x_train, y_train, x_test, y_test))

0.8665

In [12]:
# TODO: Implement undersampling on train data and evaluate on test data 
# Undersampling
undersampler = RandomUnderSampler(random_state=42)
x_under, y_under = undersampler.fit_resample(x_train, y_train)
print("\nAfter undersampling:")
train_and_evaluate(x_under, y_under,x_test,y_test)# complete this............



After undersampling:


array([0, 0, 0, ..., 1, 0, 1], dtype=int64)

# Checking accuracy scores classwise, so y_test_1 for 1, and y_test_0 for 0
 

In [37]:
y_test_1=y_test.loc[lambda x:x ==1]
y_test_0=y_test.loc[lambda x:x ==0]

In [13]:
confusion_matrix(y_test, train_and_evaluate(x_under, y_under, x_test, y_test))

array([[1288,  319],
       [  93,  300]], dtype=int64)

In [41]:
accuracy_score(y_test, train_and_evaluate(x_under, y_under, x_test, y_test))

0.794

In [15]:

# TODO: Implement oversampling
# Use RandomOverSampler, go to the documentation and see how to do

oversampler=RandomOverSampler(random_state=42)
x_over,y_over=oversampler.fit_resample(x_train,y_train)
print("\nAfter oversampling:")
train_and_evaluate(x_over,y_over,x_test,y_test)




After oversampling:


array([0, 0, 0, ..., 1, 0, 1], dtype=int64)

In [21]:
confusion_matrix(y_test, train_and_evaluate(x_over, y_over, x_test, y_test))

array([[1507,  100],
       [ 177,  216]], dtype=int64)

In [22]:
accuracy_score(y_test, train_and_evaluate(x_over, y_over, x_test, y_test))

0.8615

In [18]:
# TODO: Implement SMOTE
# Use SMOTE

smt=SMOTE()
x_smote,y_smote=smt.fit_resample(x_train,y_train)
print("\nAfter applying SMOTE:")
train_and_evaluate(x_smote,y_smote,x_test,y_test)



After applying SMOTE:


array([0, 0, 0, ..., 1, 0, 0], dtype=int64)

In [19]:
confusion_matrix(y_test, train_and_evaluate(x_smote, y_smote, x_test, y_test))

array([[1395,  212],
       [ 142,  251]], dtype=int64)

In [20]:
accuracy_score(y_test, train_and_evaluate(x_smote, y_smote, x_test, y_test))

0.823

In [24]:
# TODO: Implement ADASYN
# Use ADASYN

ada = ADASYN(random_state=42)
x_ada, y_ada = ada.fit_resample(x_train, y_train)
print("\nAfter applying ADASYN:")
train_and_evaluate(X_ada,y_ada,x_train,y_train)



After applying ADASYN:


array([0, 0, 1, ..., 1, 1, 0], dtype=int64)

In [25]:
confusion_matrix(y_test, train_and_evaluate(x_ada, y_ada, x_test, y_test))

array([[1379,  228],
       [ 142,  251]], dtype=int64)

In [26]:
accuracy_score(y_test, train_and_evaluate(x_ada, y_ada, x_test, y_test))

0.815


# TODO: Compare and analyze the results
# Write your analysis here

Analysis:
1. Which sampling technique performed the best? Why do you think that is the case?

---> Over sampling performed with 86.15% of accuracy

2. How did the class-wise accuracies change with different sampling techniques?

---> 
    
    1. for Under sampling: less accuracy of 79.15%

    2. for over smapling: 86.15% highest

    3. for SMOTE: 82.3% 2nd highest
    
    4. for ADASYN: 81.5 % 3rd highest