# Loading Dataset

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
# Upload the dataset through the Google Colab upload helper; `uploaded` holds
# whatever files.upload() returns. The cell output shows the file being saved
# as churn_prediction.csv.
from google.colab import files
uploaded = files.upload()
import io
from sklearn.metrics import roc_auc_score, accuracy_score, confusion_matrix, roc_curve, precision_score, recall_score, precision_recall_curve
import warnings
# Ignore FutureWarning and UserWarning messages from this point on.
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.simplefilter(action='ignore', category=UserWarning)

Saving churn_prediction.csv to churn_prediction.csv


In [None]:
# Read the uploaded CSV from the working directory into the main frame.
df = pd.read_csv(filepath_or_buffer='churn_prediction.csv')

In [None]:
# Preview the first rows of the raw frame.
df.head()

Unnamed: 0,customer_id,vintage,age,gender,dependents,occupation,city,customer_nw_category,branch_code,days_since_last_transaction,current_balance,previous_month_end_balance,average_monthly_balance_prevQ,average_monthly_balance_prevQ2,current_month_credit,previous_month_credit,current_month_debit,previous_month_debit,current_month_balance,previous_month_balance,churn
0,1,3135,66,Male,0.0,self_employed,187.0,2,755,224.0,1458.71,1458.71,1458.71,1449.07,0.2,0.2,0.2,0.2,1458.71,1458.71,0
1,2,310,35,Male,0.0,self_employed,,2,3214,60.0,5390.37,8704.66,7799.26,12419.41,0.56,0.56,5486.27,100.56,6496.78,8787.61,0
2,4,2356,31,Male,0.0,salaried,146.0,2,41,,3913.16,5815.29,4910.17,2815.94,0.61,0.61,6046.73,259.23,5006.28,5070.14,0
3,5,478,90,,,self_employed,1020.0,2,582,147.0,2291.91,2291.91,2084.54,1006.54,0.47,0.47,0.47,2143.33,2291.91,1669.79,1
4,6,2531,42,Male,2.0,self_employed,1494.0,3,388,58.0,927.72,1401.72,1643.31,1871.12,0.33,714.61,588.62,1538.06,1157.15,1677.16,1


# Data Cleaning - Missing Values

In [None]:
# Numeric encoding for gender: Male -> 1, Female -> 0.
dict_gender = {'Male': 1, 'Female': 0}

# Apply the mapping, then mark missing gender with the sentinel -1.
df['gender'] = df['gender'].replace(dict_gender).fillna(-1)

In [None]:
# Missing dependents are treated as 0; missing occupation as 'self_employed'.
for column, filler in (('dependents', 0), ('occupation', 'self_employed')):
    df[column] = df[column].fillna(filler)

In [None]:
# Rows without a city code get the placeholder code 1020.
df['city'] = df['city'].fillna(value=1020)

In [None]:
# Missing "days since last transaction" is replaced by the sentinel 999.
df['days_since_last_transaction'] = df['days_since_last_transaction'].fillna(value=999)

# Adding Class Labels Column

In [None]:
# Quartile labels: 1 is the lowest current-month-debit bucket, 4 the highest.
labels = list(range(1, 5))
# Split customers into four equal-frequency bins of current-month debit.
df['class_labels'] = pd.qcut(x=df['current_month_debit'], q=4, labels=labels)

In [None]:
# Preview the frame after missing-value handling and class labelling.
df.head()

Unnamed: 0,customer_id,vintage,age,gender,dependents,occupation,city,customer_nw_category,branch_code,days_since_last_transaction,current_balance,previous_month_end_balance,average_monthly_balance_prevQ,average_monthly_balance_prevQ2,current_month_credit,previous_month_credit,current_month_debit,previous_month_debit,current_month_balance,previous_month_balance,churn,class_labels
0,1,3135,66,1.0,0.0,self_employed,187.0,2,755,224.0,1458.71,1458.71,1458.71,1449.07,0.2,0.2,0.2,0.2,1458.71,1458.71,0,1
1,2,310,35,1.0,0.0,self_employed,1020.0,2,3214,60.0,5390.37,8704.66,7799.26,12419.41,0.56,0.56,5486.27,100.56,6496.78,8787.61,0,4
2,4,2356,31,1.0,0.0,salaried,146.0,2,41,999.0,3913.16,5815.29,4910.17,2815.94,0.61,0.61,6046.73,259.23,5006.28,5070.14,0,4
3,5,478,90,-1.0,0.0,self_employed,1020.0,2,582,147.0,2291.91,2291.91,2084.54,1006.54,0.47,0.47,0.47,2143.33,2291.91,1669.79,1,2
4,6,2531,42,1.0,2.0,self_employed,1494.0,3,388,58.0,927.72,1401.72,1643.31,1871.12,0.33,714.61,588.62,1538.06,1157.15,1677.16,1,3


# Pre-processing

In [None]:
# One-hot encode occupation and append the indicator columns
# (occupation_<value>) to the frame; the original column is kept.
occupation_dummies = pd.get_dummies(df['occupation'], prefix='occupation', prefix_sep='_')
df = pd.concat([df, occupation_dummies], axis=1)

In [None]:
# Columns that receive a shifted log transform before standardization.
num_cols = ['customer_nw_category', 'current_balance',
            'previous_month_end_balance', 'average_monthly_balance_prevQ2', 'average_monthly_balance_prevQ',
            'current_month_credit','previous_month_credit', 'current_month_debit',
            'previous_month_debit','current_month_balance', 'previous_month_balance']

# Every column that gets standardized: the log-transformed ones plus three
# columns that are scaled as-is.
all_cols = num_cols + ['vintage', 'age', 'dependents']

# Constant added before taking the log. Presumably chosen so that negative
# balances stay inside the log's domain (TODO confirm against the data);
# values at or below -LOG_SHIFT would still yield -inf/NaN.
LOG_SHIFT = 17000

# NOTE: this overwrites the columns in df, so re-running the cell applies the
# log a second time. Re-run from the data-loading cell instead.
for col in num_cols:
    df[col] = np.log(df[col] + LOG_SHIFT)

# NOTE(review): the scaler is fitted on the full dataset before any train/test
# split, so test rows influence the scaling statistics.
std = StandardScaler()
# Reuse df's index so that the later left_index/right_index merges line up by
# row label even if df's index is not the default 0..n-1 range.
scaled = pd.DataFrame(std.fit_transform(df[all_cols]), columns=all_cols, index=df.index)

# Dividing into Data Frames - Classes

In [None]:
# Customers in the lowest current-month-debit quartile (class label 1).
df1 = df.loc[df['class_labels'] == 1]

In [None]:
# Preview the class-1 slice.
df1.head()

Unnamed: 0,customer_id,vintage,age,gender,dependents,occupation,city,customer_nw_category,branch_code,days_since_last_transaction,current_balance,previous_month_end_balance,average_monthly_balance_prevQ,average_monthly_balance_prevQ2,current_month_credit,previous_month_credit,current_month_debit,previous_month_debit,current_month_balance,previous_month_balance,churn,class_labels,occupation_company,occupation_retired,occupation_salaried,occupation_self_employed,occupation_student
0,1,3135,66,1.0,0.0,self_employed,187.0,9.741086,755,224.0,9.823292,9.823292,9.823292,9.822769,9.74098,9.74098,9.74098,9.74098,9.823292,9.823292,0,1,0,0,0,1,0
17,20,6111,52,0.0,0.0,self_employed,1096.0,9.741086,32,3.0,10.011989,9.942903,9.94975,9.94903,9.948313,9.740985,9.740985,9.74195,9.959805,9.943235,0,1,0,0,0,1,0
22,25,3101,41,0.0,0.0,self_employed,905.0,9.741086,1388,13.0,10.140868,10.11519,10.119367,10.113211,9.778108,9.740992,9.740992,9.740992,10.127669,10.11519,0,1,0,0,0,1,0
29,32,2204,33,0.0,0.0,salaried,834.0,9.741086,14,999.0,10.049865,10.050254,10.050317,10.050021,9.740975,9.740975,9.740975,9.740975,10.049879,10.050417,0,1,0,0,1,0,0
32,35,1124,54,1.0,0.0,self_employed,1366.0,9.741086,797,999.0,9.918101,9.918101,9.918101,9.917079,9.74099,9.74099,9.74099,9.74099,9.918101,9.918101,0,1,0,0,0,1,0


In [None]:
# Customers in the second current-month-debit quartile (class label 2).
df2 = df.loc[df['class_labels'] == 2]

In [None]:
# Preview the class-2 slice.
df2.head()

Unnamed: 0,customer_id,vintage,age,gender,dependents,occupation,city,customer_nw_category,branch_code,days_since_last_transaction,current_balance,previous_month_end_balance,average_monthly_balance_prevQ,average_monthly_balance_prevQ2,current_month_credit,previous_month_credit,current_month_debit,previous_month_debit,current_month_balance,previous_month_balance,churn,class_labels,occupation_company,occupation_retired,occupation_salaried,occupation_self_employed,occupation_student
3,5,478,90,-1.0,0.0,self_employed,1020.0,9.741086,582,147.0,9.867441,9.867441,9.856634,9.79849,9.740996,9.740996,9.740996,9.85971,9.867441,9.834662,1,2,0,0,0,1,0
10,12,661,68,1.0,0.0,retired,409.0,9.741145,709,5.0,9.848438,9.829382,9.837487,9.784009,9.766717,9.741,9.741,9.753176,9.831268,9.84162,0,2,0,1,0,0,0
14,16,2314,48,0.0,0.0,self_employed,665.0,9.741086,569,52.0,10.018218,10.020788,10.066159,10.115382,9.741007,9.779155,9.741007,9.894019,10.018468,10.070439,1,2,0,0,0,1,0
18,21,5821,47,0.0,1.0,self_employed,146.0,9.741027,490,69.0,9.928556,9.950622,9.969117,9.934172,9.741002,9.741002,9.745194,10.15037,9.933308,9.953327,0,2,0,0,0,1,0
25,28,606,76,1.0,0.0,self_employed,1533.0,9.741145,881,999.0,10.35103,10.35103,10.35103,10.355945,9.741006,9.741006,9.741006,9.741006,10.35103,10.35103,0,2,0,0,0,1,0


In [None]:
# Customers in the third current-month-debit quartile (class label 3).
df3 = df.loc[df['class_labels'] == 3]

In [None]:
# Preview the class-3 slice.
df3.head()

Unnamed: 0,customer_id,vintage,age,gender,dependents,occupation,city,customer_nw_category,branch_code,days_since_last_transaction,current_balance,previous_month_end_balance,average_monthly_balance_prevQ,average_monthly_balance_prevQ2,current_month_credit,previous_month_credit,current_month_debit,previous_month_debit,current_month_balance,previous_month_balance,churn,class_labels,occupation_company,occupation_retired,occupation_salaried,occupation_self_employed,occupation_student
4,6,2531,42,1.0,2.0,self_employed,1494.0,9.741145,388,58.0,9.794103,9.820199,9.833243,9.845388,9.740988,9.782145,9.775007,9.827581,9.80682,9.835057,1,3,0,0,0,1,0
5,7,263,42,0.0,0.0,self_employed,1096.0,9.741086,1666,60.0,10.37979,10.406059,10.380072,10.335232,9.74099,9.74099,9.790179,9.757656,10.395725,10.384362,0,3,0,0,0,1,0
6,8,5922,72,1.0,0.0,retired,1020.0,9.741027,1,98.0,10.086098,10.115133,10.121005,10.248224,9.741006,9.741006,9.814637,9.766479,10.088973,10.116822,0,3,0,1,0,0,0
7,9,1145,46,1.0,0.0,self_employed,623.0,9.741086,317,172.0,10.207163,10.147199,10.065259,10.413738,9.740985,9.740985,9.766701,10.029611,10.148934,10.013106,0,3,0,0,0,1,0
8,10,2132,31,1.0,0.0,salaried,1096.0,9.741086,4110,19.0,9.817704,9.817704,9.836409,9.80511,9.782139,9.748066,9.782139,9.761892,9.808267,9.844662,0,3,0,0,1,0,0


In [None]:
# Customers in the highest current-month-debit quartile (class label 4).
df4 = df.loc[df['class_labels'] == 4]

In [None]:
# Preview the class-4 slice.
df4.head()

Unnamed: 0,customer_id,vintage,age,gender,dependents,occupation,city,customer_nw_category,branch_code,days_since_last_transaction,current_balance,previous_month_end_balance,average_monthly_balance_prevQ,average_monthly_balance_prevQ2,current_month_credit,previous_month_credit,current_month_debit,previous_month_debit,current_month_balance,previous_month_balance,churn,class_labels,occupation_company,occupation_retired,occupation_salaried,occupation_self_employed,occupation_student
1,2,310,35,1.0,0.0,self_employed,1020.0,9.741086,3214,60.0,10.016386,10.154428,10.118569,10.28941,9.741002,9.741002,10.02066,9.746866,10.064619,10.157649,0,4,0,0,0,1,0
2,4,2356,31,1.0,0.0,salaried,146.0,9.741086,41,999.0,9.948134,10.035186,9.994706,9.894242,9.741005,9.741005,10.045279,9.756102,9.999083,10.001981,0,4,0,0,1,0,0
9,11,3379,40,1.0,3.0,self_employed,1020.0,9.741086,38,0.0,9.996885,9.963848,9.861025,9.845028,10.082148,9.962046,9.846245,9.817322,9.842843,9.873888,0,4,0,0,0,1,0
11,13,7108,32,1.0,0.0,salaried,1096.0,9.741027,89,20.0,10.081065,10.141427,10.171583,10.545539,10.109059,9.978142,10.200176,10.006133,10.147726,10.12952,0,4,0,0,1,0,0
12,14,2438,73,1.0,0.0,retired,44.0,9.741145,409,47.0,9.773059,10.031382,9.841314,9.74901,9.740988,9.769975,10.003283,9.770443,9.880281,9.883927,1,4,0,1,0,0,0



# C1 - Decision Tree

In [None]:
# Keep a copy of the class-1 slice as it was before the column swap.
df1_org = df1.copy()

# Replace the log-transformed columns with their standardized versions,
# matched to each row through the shared index.
df1 = df1.drop(columns=all_cols).merge(scaled, how="left", left_index=True, right_index=True)

# Target is the churn flag; identifiers and the raw occupation text are
# excluded from the features (the one-hot occupation columns remain).
y1_all = df1['churn']
x1_all = df1.drop(columns=['churn', 'customer_id', 'occupation'])

In [None]:
from sklearn.model_selection import train_test_split as tts

# Seeded split that keeps the churn ratio equal in both parts (stratified).
trainX1, testX1, trainY1, testY1 = tts(
    x1_all, y1_all, stratify=y1_all, random_state=50
)

# Shapes of the four pieces, in train/test feature then train/test target order.
tuple(part.shape for part in (trainX1, testX1, trainY1, testY1))

((5448, 24), (1816, 24), (5448,), (1816,))

In [None]:
from sklearn.tree import DecisionTreeClassifier as DTC

In [None]:
# Depth-limited decision tree for the class-1 customers. random_state is
# pinned because scikit-learn permutes the features at every split, so an
# unseeded tree (and any clone of it) may differ from run to run.
Dtc_clf=DTC(max_depth=4, random_state=0)
Dtc_clf.fit(trainX1,trainY1)

# Mean accuracy on the training and held-out parts.
Dtc_score_train=Dtc_clf.score(trainX1,trainY1)
Dtc_score_test=Dtc_clf.score(testX1,testY1)

# Hard predictions and probability of the second class (column 1) on the test part.
Dtc_pred=Dtc_clf.predict(testX1)
Dtc_predP = Dtc_clf.predict_proba(testX1)[:,1]
Dtc_score_train, Dtc_score_test

(0.9208883994126285, 0.9091409691629956)

# C2 - Support Vector Machine

In [None]:
# Keep a copy of the class-2 slice as it was before the column swap.
df2_org = df2.copy()

# Replace the log-transformed columns with their standardized versions,
# matched to each row through the shared index.
df2 = df2.drop(columns=all_cols).merge(scaled, how="left", left_index=True, right_index=True)

# Target is the churn flag; identifiers and the raw occupation text are
# excluded from the features (the one-hot occupation columns remain).
y2_all = df2['churn']
x2_all = df2.drop(columns=['churn', 'customer_id', 'occupation'])

In [None]:
from sklearn.model_selection import train_test_split as tts

# Seeded split that keeps the churn ratio equal in both parts (stratified).
trainX2, testX2, trainY2, testY2 = tts(
    x2_all, y2_all, stratify=y2_all, random_state=50
)

# Shapes of the four pieces, in train/test feature then train/test target order.
tuple(part.shape for part in (trainX2, testX2, trainY2, testY2))

((5195, 24), (1732, 24), (5195,), (1732,))

In [None]:
from sklearn.svm import SVC

In [None]:
# Support vector classifier for the class-2 customers. probability=True turns
# on predict_proba (needed for soft voting); the probability calibration
# shuffles data internally, so random_state is pinned to keep those
# probabilities repeatable between runs.
SVM_clf=SVC(probability=True, random_state=0)
SVM_clf.fit(trainX2,trainY2)

# Mean accuracy on the training and held-out parts.
SVM_score_train=SVM_clf.score(trainX2,trainY2)
SVM_score_test=SVM_clf.score(testX2,testY2)

# Hard predictions and probability of the second class (column 1) on the test
# part, mirroring what the other per-class model cells compute.
SVM_pred=SVM_clf.predict(testX2)
SVM_predP = SVM_clf.predict_proba(testX2)[:,1]
SVM_score_train, SVM_score_test

(0.8843118383060635, 0.8839491916859122)

# C3 - Random Forest

In [None]:
# Keep a copy of the class-3 slice as it was before the column swap.
df3_org = df3.copy()

# Replace the log-transformed columns with their standardized versions,
# matched to each row through the shared index.
df3 = df3.drop(columns=all_cols).merge(scaled, how="left", left_index=True, right_index=True)

# Target is the churn flag; identifiers and the raw occupation text are
# excluded from the features (the one-hot occupation columns remain).
y3_all = df3['churn']
x3_all = df3.drop(columns=['churn', 'customer_id', 'occupation'])

In [None]:
from sklearn.model_selection import train_test_split as tts

# Seeded split that keeps the churn ratio equal in both parts (stratified).
trainX3, testX3, trainY3, testY3 = tts(
    x3_all, y3_all, stratify=y3_all, random_state=50
)

# Shapes of the four pieces, in train/test feature then train/test target order.
tuple(part.shape for part in (trainX3, testX3, trainY3, testY3))

((5321, 24), (1774, 24), (5321,), (1774,))

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Random forest for the class-3 customers: each tree is capped at depth 5,
# and the fixed seed makes repeated runs produce the same forest.
RFC_clf = RandomForestClassifier(random_state=0, max_depth=5)
RFC_clf.fit(trainX3, trainY3)

# Mean accuracy on the training and held-out parts.
RFC_score_train = RFC_clf.score(trainX3, trainY3)
RFC_score_test = RFC_clf.score(testX3, testY3)

# Hard predictions and probability of the second class (column 1) on the test part.
RFC_pred = RFC_clf.predict(testX3)
RFC_predP = RFC_clf.predict_proba(testX3)[:, 1]

(RFC_score_train, RFC_score_test)

(0.8654388272881037, 0.8607666290868095)

# C4 - Logistic Regression

In [None]:
# Keep a copy of the class-4 slice as it was before the column swap.
df4_org = df4.copy()

# Replace the log-transformed columns with their standardized versions,
# matched to each row through the shared index.
df4 = df4.drop(columns=all_cols).merge(scaled, how="left", left_index=True, right_index=True)

# Target is the churn flag; identifiers and the raw occupation text are
# excluded from the features (the one-hot occupation columns remain).
y4_all = df4['churn']
x4_all = df4.drop(columns=['churn', 'customer_id', 'occupation'])

In [None]:
from sklearn.model_selection import train_test_split as tts

# Seeded split that keeps the churn ratio equal in both parts (stratified).
trainX4, testX4, trainY4, testY4 = tts(
    x4_all, y4_all, stratify=y4_all, random_state=50
)

# Shapes of the four pieces, in train/test feature then train/test target order.
tuple(part.shape for part in (trainX4, testX4, trainY4, testY4))

((5322, 24), (1774, 24), (5322,), (1774,))

In [None]:
from sklearn.linear_model import LogisticRegression as LR

In [None]:
# Logistic regression for the class-4 customers, with the solver's iteration
# cap raised to 1000.
LR_clf = LR(max_iter=1000)
LR_clf.fit(trainX4, trainY4)

# Mean accuracy on the training (LR_str) and held-out (LR_ste) parts.
LR_str = LR_clf.score(trainX4, trainY4)
LR_ste = LR_clf.score(testX4, testY4)

# Hard predictions and probability of the second class (column 1) on the test part.
LR_pred = LR_clf.predict(testX4)
LR_predP = LR_clf.predict_proba(testX4)[:, 1]

(LR_str, LR_ste)

(0.7482149567831642, 0.7519729425028185)

# Overall Accuracy

In [None]:
# Combine the four per-class test accuracies into one figure, weighting each
# by the number of rows in its test split.
n_test1, n_test2, n_test3, n_test4 = (len(part) for part in (testX1, testX2, testX3, testX4))
weighted_correct = (
    n_test1 * Dtc_score_test
    + n_test2 * SVM_score_test
    + n_test3 * RFC_score_test
    + n_test4 * LR_ste
)
wavg_accuracy = weighted_correct / (n_test1 + n_test2 + n_test3 + n_test4)
wavg_accuracy

0.8716065388951522

# Voting Classifier on the entire dataset - Soft

In [None]:
# Keep a copy of the full frame as it was before the column swap.
df_org = df.copy()

# Replace the log-transformed columns with their standardized versions,
# matched to each row through the shared index.
df = df.drop(columns=all_cols).merge(scaled, how="left", left_index=True, right_index=True)

# Target is the churn flag; identifiers and the raw occupation text are
# excluded from the features (the one-hot occupation columns remain).
y_all = df['churn']
x_all = df.drop(columns=['churn', 'customer_id', 'occupation'])

In [None]:
from sklearn.model_selection import train_test_split as tts

# Seeded split of the whole dataset that keeps the churn ratio equal in both
# parts (stratified).
trainX, testX, trainY, testY = tts(
    x_all, y_all, stratify=y_all, random_state=50
)

# Shapes of the four pieces, in train/test feature then train/test target order.
tuple(part.shape for part in (trainX, testX, trainY, testY))

((21286, 24), (7096, 24), (21286,), (7096,))

In [None]:
# Preview the feature columns of the full-dataset test split.
testX.head()

Unnamed: 0,gender,city,branch_code,days_since_last_transaction,class_labels,occupation_company,occupation_retired,occupation_salaried,occupation_self_employed,occupation_student,customer_nw_category,current_balance,previous_month_end_balance,average_monthly_balance_prevQ2,average_monthly_balance_prevQ,current_month_credit,previous_month_credit,current_month_debit,previous_month_debit,current_month_balance,previous_month_balance,vintage,age,dependents
14139,1.0,656.0,46,63.0,3,0,1,0,0,0,-0.34147,-0.682758,-0.650725,-0.560995,-0.650881,-0.313944,-0.3245,-0.308119,-0.346533,-0.705405,-0.678803,2.338782,1.448413,-0.330877
27991,1.0,253.0,1960,277.0,3,0,0,0,1,0,1.172658,0.661156,0.687631,0.853421,0.700483,-0.313878,-0.324436,-0.302866,-0.31055,0.662177,0.698108,-1.108217,-0.573281,4.886317
24435,1.0,1020.0,1211,48.0,4,0,1,0,0,0,1.172658,1.19645,1.878724,0.733826,1.018393,-0.313828,-0.321502,0.961089,-0.377288,1.41383,1.590816,-0.534339,1.055305,-0.330877
23372,0.0,1020.0,544,152.0,1,0,0,0,1,0,-0.34147,-0.447748,-0.472355,-0.302354,-0.515445,-0.313961,-0.324517,-0.36828,-0.377421,-0.487967,-0.502952,0.600997,-0.236332,-0.330877
1799,1.0,1477.0,1739,14.0,4,0,0,1,0,0,-0.34147,0.361305,0.458624,-0.762899,-0.020409,0.59991,1.280509,0.059573,-0.153286,0.522185,0.11495,-0.378447,0.662198,-0.330877


In [None]:
from sklearn.ensemble import VotingClassifier

# The per-class test accuracies serve as soft-voting weights, listed in the
# same order as the estimators below.
scores = [Dtc_score_test, SVM_score_test, RFC_score_test, LR_ste]
named_estimators = [
    ('Dtc_clf', Dtc_clf),
    ('SVM_clf', SVM_clf),
    ('RFC_clf', RFC_clf),
    ('LR_clf', LR_clf),
]

# Soft voting averages the members' predicted probabilities using the weights.
# NOTE: VotingClassifier.fit trains fresh clones of the listed estimators on
# trainX/trainY, so the fits done on the per-class subsets are not reused.
v_clf = VotingClassifier(estimators=named_estimators, voting='soft', weights=scores)
v_clf.fit(trainX, trainY)

# Mean accuracy on the training and held-out parts, plus test predictions.
score_train = v_clf.score(trainX, trainY)
score_test = v_clf.score(testX, testY)
pred = v_clf.predict(testX)

In [None]:
# Show the voting classifier's train and test accuracy.
score_train, score_test

(0.8677402987879358, 0.8665332581736189)