In [1]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold
from sklearn.metrics import accuracy_score, confusion_matrix, roc_curve, roc_auc_score, precision_score, recall_score
import warnings
# Silence only FutureWarning noise. UserWarning is deliberately NOT
# ignored: scikit-learn's ConvergenceWarning subclasses UserWarning, so a
# blanket filter would hide a LogisticRegression fit that stopped before
# converging.
warnings.simplefilter(action='ignore', category=FutureWarning)

In [2]:
# Read the raw churn dataset from the working directory.
df = pd.read_csv(filepath_or_buffer='churn_prediction.csv')

In [3]:
# Count missing values per column before any imputation.
df.isna().sum()

customer_id                          0
vintage                              0
age                                  0
gender                             525
dependents                        2463
occupation                          80
city                               803
customer_nw_category                 0
branch_code                          0
days_since_last_transaction       3223
current_balance                      0
previous_month_end_balance           0
average_monthly_balance_prevQ        0
average_monthly_balance_prevQ2       0
current_month_credit                 0
previous_month_credit                0
current_month_debit                  0
previous_month_debit                 0
current_month_balance                0
previous_month_balance               0
churn                                0
dtype: int64

In [4]:
# Frequency of each gender label (missing values are excluded by default).
df.loc[:, 'gender'].value_counts()

Male      16548
Female    11309
Name: gender, dtype: int64

In [5]:
# Encode gender numerically: Male -> 1, Female -> 0, and use -1 as an
# explicit "unknown" code for rows where gender is missing.
dict_gender = {'Male': 1, 'Female': 0}
df['gender'] = df['gender'].replace(dict_gender).fillna(-1)

In [6]:
# Verify the encoding: expect only the codes 1, 0 and -1.
df['gender'].value_counts(dropna=True)

 1.0    16548
 0.0    11309
-1.0      525
Name: gender, dtype: int64

In [7]:
# Distribution of the number of dependents per customer.
df.loc[:, 'dependents'].value_counts()

0.0     21435
2.0      2150
1.0      1395
3.0       701
4.0       179
5.0        41
6.0         8
7.0         3
52.0        1
25.0        1
8.0         1
50.0        1
9.0         1
36.0        1
32.0        1
Name: dependents, dtype: int64

In [8]:
# Missing dependents are imputed with 0, the most frequent value in the
# counts shown above.
df['dependents'] = df['dependents'].fillna(value=0)

In [9]:
# Confirm no missing dependents remain.
df['dependents'].isna().sum()

0

In [10]:
# Frequency of each occupation category.
df.loc[:, 'occupation'].value_counts()

self_employed    17476
salaried          6704
student           2058
retired           2024
company             40
Name: occupation, dtype: int64

In [11]:
# Missing occupations are assigned 'self_employed', the most frequent
# category in the counts shown above.
df['occupation'] = df['occupation'].where(df['occupation'].notna(), 'self_employed')

In [12]:
# Confirm no missing occupations remain.
df.loc[:, 'occupation'].isna().sum()

0

In [13]:
# Frequency of each city code.
df['city'].value_counts(dropna=True)

1020.0    3479
1096.0    2016
409.0     1334
146.0     1291
834.0     1138
          ... 
1601.0       1
1093.0       1
1313.0       1
1391.0       1
1174.0       1
Name: city, Length: 1604, dtype: int64

In [14]:
# Missing city codes are imputed with 1020, the most frequent code in the
# counts shown above.
df['city'] = df['city'].fillna(value=1020)

In [15]:
# Confirm no missing city codes remain.
df['city'].isna().sum()

0

In [16]:
# Missing recency values are replaced with the sentinel 999.
# NOTE(review): presumably meant as "no recent transaction" -- confirm that
# 999 cannot collide with a genuine observation.
df['days_since_last_transaction'] = df['days_since_last_transaction'].fillna(value=999)

In [17]:
# Re-check missing values per column after imputation; all should be 0.
df.isna().sum(axis=0)

customer_id                       0
vintage                           0
age                               0
gender                            0
dependents                        0
occupation                        0
city                              0
customer_nw_category              0
branch_code                       0
days_since_last_transaction       0
current_balance                   0
previous_month_end_balance        0
average_monthly_balance_prevQ     0
average_monthly_balance_prevQ2    0
current_month_credit              0
previous_month_credit             0
current_month_debit               0
previous_month_debit              0
current_month_balance             0
previous_month_balance            0
churn                             0
dtype: int64

In [18]:
# Preview the first five rows after imputation.
df.head(n=5)

Unnamed: 0,customer_id,vintage,age,gender,dependents,occupation,city,customer_nw_category,branch_code,days_since_last_transaction,...,previous_month_end_balance,average_monthly_balance_prevQ,average_monthly_balance_prevQ2,current_month_credit,previous_month_credit,current_month_debit,previous_month_debit,current_month_balance,previous_month_balance,churn
0,1,3135,66,1.0,0.0,self_employed,187.0,2,755,224.0,...,1458.71,1458.71,1449.07,0.2,0.2,0.2,0.2,1458.71,1458.71,0
1,2,310,35,1.0,0.0,self_employed,1020.0,2,3214,60.0,...,8704.66,7799.26,12419.41,0.56,0.56,5486.27,100.56,6496.78,8787.61,0
2,4,2356,31,1.0,0.0,salaried,146.0,2,41,999.0,...,5815.29,4910.17,2815.94,0.61,0.61,6046.73,259.23,5006.28,5070.14,0
3,5,478,90,-1.0,0.0,self_employed,1020.0,2,582,147.0,...,2291.91,2084.54,1006.54,0.47,0.47,0.47,2143.33,2291.91,1669.79,1
4,6,2531,42,1.0,2.0,self_employed,1494.0,3,388,58.0,...,1401.72,1643.31,1871.12,0.33,714.61,588.62,1538.06,1157.15,1677.16,1


In [19]:
# Inspect column dtypes to spot columns that still need encoding.
df.dtypes

customer_id                         int64
vintage                             int64
age                                 int64
gender                            float64
dependents                        float64
occupation                         object
city                              float64
customer_nw_category                int64
branch_code                         int64
days_since_last_transaction       float64
current_balance                   float64
previous_month_end_balance        float64
average_monthly_balance_prevQ     float64
average_monthly_balance_prevQ2    float64
current_month_credit              float64
previous_month_credit             float64
current_month_debit               float64
previous_month_debit              float64
current_month_balance             float64
previous_month_balance            float64
churn                               int64
dtype: object

In [20]:
# One-hot encode occupation and append the indicator columns to df.
# dtype is pinned to uint8 so the indicators are 0/1 integers on every
# pandas version (the default became bool in pandas 2.0, which would
# change what describe() and head() show further down).
occupation_dummies = pd.get_dummies(
    df['occupation'],
    prefix='occupation',
    prefix_sep='_',
    dtype=np.uint8,
)
df = pd.concat([df, occupation_dummies], axis=1)

In [21]:
# Preview the first five rows, now including the occupation_* indicators.
df.iloc[:5]

Unnamed: 0,customer_id,vintage,age,gender,dependents,occupation,city,customer_nw_category,branch_code,days_since_last_transaction,...,current_month_debit,previous_month_debit,current_month_balance,previous_month_balance,churn,occupation_company,occupation_retired,occupation_salaried,occupation_self_employed,occupation_student
0,1,3135,66,1.0,0.0,self_employed,187.0,2,755,224.0,...,0.2,0.2,1458.71,1458.71,0,0,0,0,1,0
1,2,310,35,1.0,0.0,self_employed,1020.0,2,3214,60.0,...,5486.27,100.56,6496.78,8787.61,0,0,0,0,1,0
2,4,2356,31,1.0,0.0,salaried,146.0,2,41,999.0,...,6046.73,259.23,5006.28,5070.14,0,0,0,1,0,0
3,5,478,90,-1.0,0.0,self_employed,1020.0,2,582,147.0,...,0.47,2143.33,2291.91,1669.79,1,0,0,0,1,0
4,6,2531,42,1.0,2.0,self_employed,1494.0,3,388,58.0,...,588.62,1538.06,1157.15,1677.16,1,0,0,0,1,0


In [22]:
# The raw occupation column is redundant once the indicators exist.
df = df.drop(columns=['occupation'])

In [23]:
# List the remaining column names after dropping the raw occupation column.
df.columns

Index(['customer_id', 'vintage', 'age', 'gender', 'dependents', 'city',
       'customer_nw_category', 'branch_code', 'days_since_last_transaction',
       'current_balance', 'previous_month_end_balance',
       'average_monthly_balance_prevQ', 'average_monthly_balance_prevQ2',
       'current_month_credit', 'previous_month_credit', 'current_month_debit',
       'previous_month_debit', 'current_month_balance',
       'previous_month_balance', 'churn', 'occupation_company',
       'occupation_retired', 'occupation_salaried', 'occupation_self_employed',
       'occupation_student'],
      dtype='object')

In [24]:
# Summary statistics (count, mean, std, min, quartiles, max) per numeric column.
df.describe(percentiles=[0.25, 0.5, 0.75])

Unnamed: 0,customer_id,vintage,age,gender,dependents,city,customer_nw_category,branch_code,days_since_last_transaction,current_balance,...,current_month_debit,previous_month_debit,current_month_balance,previous_month_balance,churn,occupation_company,occupation_retired,occupation_salaried,occupation_self_employed,occupation_student
count,28382.0,28382.0,28382.0,28382.0,28382.0,28382.0,28382.0,28382.0,28382.0,28382.0,...,28382.0,28382.0,28382.0,28382.0,28382.0,28382.0,28382.0,28382.0,28382.0,28382.0
mean,15143.508667,2364.336446,48.208336,0.564548,0.317102,802.444014,2.22553,925.975019,175.493341,7380.552,...,3658.745,3339.761,7451.133,7495.177,0.185329,0.001409,0.071313,0.236206,0.618561,0.072511
std,8746.454456,1610.124506,17.807163,0.531826,0.958386,428.316238,0.660443,937.799129,305.757315,42598.71,...,51985.42,24301.11,42033.94,42431.98,0.388571,0.037515,0.257351,0.424758,0.485748,0.259336
min,1.0,180.0,1.0,-1.0,0.0,0.0,1.0,1.0,0.0,-5503.96,...,0.01,0.01,-3374.18,-5171.92,0.0,0.0,0.0,0.0,0.0,0.0
25%,7557.25,1121.0,36.0,0.0,0.0,409.0,2.0,176.0,13.0,1784.47,...,0.41,0.41,1996.765,2074.407,0.0,0.0,0.0,0.0,0.0,0.0
50%,15150.5,2018.0,46.0,1.0,0.0,879.0,2.0,572.0,41.0,3281.255,...,91.93,109.96,3447.995,3465.235,0.0,0.0,0.0,0.0,1.0,0.0
75%,22706.75,3176.0,60.0,1.0,0.0,1096.0,3.0,1440.0,154.0,6635.82,...,1360.435,1357.553,6667.958,6654.693,0.0,0.0,0.0,0.0,1.0,0.0
max,30301.0,12899.0,90.0,1.0,52.0,1649.0,3.0,4782.0,999.0,5905904.0,...,7637857.0,1414168.0,5778185.0,5720144.0,1.0,1.0,1.0,1.0,1.0,1.0


In [25]:
# Columns that receive the shifted log transform (and are standardized in
# the next cell).
num_cols = ['customer_nw_category', 'current_balance',
            'previous_month_end_balance', 'average_monthly_balance_prevQ2', 'average_monthly_balance_prevQ',
            'current_month_credit','previous_month_credit', 'current_month_debit',
            'previous_month_debit','current_month_balance', 'previous_month_balance']

# Balances can be negative (see the min row of describe()), so a constant is
# added before taking the log.
# NOTE(review): 17000 is presumably chosen to exceed the most negative
# balance in the data -- confirm against the raw minimums.
LOG_SHIFT = 17000

shifted = df[num_cols] + LOG_SHIFT
# np.log silently returns NaN / -inf for non-positive input; fail loudly instead.
assert (shifted > 0).all().all(), "LOG_SHIFT is too small: some shifted values are <= 0"

# NOTE: this cell is not idempotent -- re-running it applies the log twice.
df[num_cols] = np.log(shifted)


In [26]:
# Standardize the log-transformed columns to zero mean / unit variance.
# NOTE(review): the scaler is fit on every row, before the train/test split
# performed later in the notebook, so test-set statistics leak into the
# training features.
std = StandardScaler()
# Carry df's index over explicitly: the next cell joins scaled_df back onto
# df by index, and a fresh RangeIndex would misalign rows whenever df's
# index is not 0..n-1.
scaled_df = pd.DataFrame(
    std.fit_transform(df[num_cols]),
    columns=num_cols,
    index=df.index,
)

In [27]:
# Replace the unscaled numeric columns with their standardized versions:
# drop the originals, then left-join the scaled frame on the row index.
# NOTE: not idempotent -- re-running fails because num_cols are already replaced.
df = df.drop(columns=num_cols).join(scaled_df, how='left')

In [28]:
# Separate the target from the features; customer_id is an identifier and
# is dropped along with the target.
y = df.loc[:, 'churn']
df = df.drop(columns=['churn', 'customer_id'])

In [29]:
# Preview the feature frame (target and id removed, numeric columns scaled).
df.head(n=5)

Unnamed: 0,vintage,age,gender,dependents,city,branch_code,days_since_last_transaction,occupation_company,occupation_retired,occupation_salaried,...,current_balance,previous_month_end_balance,average_monthly_balance_prevQ2,average_monthly_balance_prevQ,current_month_credit,previous_month_credit,current_month_debit,previous_month_debit,current_month_balance,previous_month_balance
0,3135,66,1.0,0.0,187.0,755,224.0,0,0,0,...,-0.613738,-0.632367,-0.630646,-0.697612,-0.313931,-0.324487,-0.368251,-0.37739,-0.653455,-0.666207
1,310,35,1.0,0.0,1020.0,3214,60.0,0,0,0,...,0.011267,0.446458,0.949414,0.317595,-0.313852,-0.32441,0.640986,-0.355677,0.14944,0.45568
2,2356,31,1.0,0.0,146.0,41,999.0,0,0,1,...,-0.209651,0.057975,-0.388637,-0.108263,-0.313841,-0.3244,0.729825,-0.321607,-0.068597,-0.066642
3,478,90,-1.0,0.0,1020.0,582,147.0,0,0,0,...,-0.470836,-0.48853,-0.712855,-0.582976,-0.313872,-0.324429,-0.368193,0.060593,-0.50657,-0.628056
4,2531,42,1.0,2.0,1494.0,388,58.0,0,0,0,...,-0.708214,-0.642441,-0.554058,-0.663399,-0.313902,-0.175104,-0.245463,-0.057927,-0.708257,-0.626732


In [30]:
# Preview the first five churn labels.
y.iloc[:5]

0    0
1    0
2    0
3    1
4    1
Name: churn, dtype: int64

In [31]:
# Pairwise Pearson correlations between all feature columns.
df.corr(method='pearson')

Unnamed: 0,vintage,age,gender,dependents,city,branch_code,days_since_last_transaction,occupation_company,occupation_retired,occupation_salaried,...,current_balance,previous_month_end_balance,average_monthly_balance_prevQ2,average_monthly_balance_prevQ,current_month_credit,previous_month_credit,current_month_debit,previous_month_debit,current_month_balance,previous_month_balance
vintage,1.0,0.22739,0.067497,0.068588,0.04172,-0.380714,-0.034482,0.035027,0.057813,0.041924,...,0.044727,0.043424,0.05729,0.04704,0.042409,0.056022,0.039448,0.05103,0.043306,0.044127
age,0.22739,1.0,-0.07859,0.001261,0.014564,-0.05899,-0.020005,0.045605,0.372942,-0.129008,...,0.109658,0.112509,0.126854,0.128603,0.045523,0.054294,0.043507,0.046206,0.117852,0.121759
gender,0.067497,-0.07859,1.0,0.185064,0.000875,0.072436,-0.066755,-0.001028,0.126237,0.16803,...,-0.00117,0.003504,0.010135,0.001434,0.033498,0.036754,0.035937,0.039821,-0.000936,0.001765
dependents,0.068588,0.001261,0.185064,1.0,0.003287,-0.001704,-0.088345,-0.00459,-0.054831,0.044848,...,0.004143,0.00582,0.005923,0.009285,0.057506,0.056666,0.064352,0.063384,0.006545,0.007149
city,0.04172,0.014564,0.000875,0.003287,1.0,-0.055779,-0.004962,0.002009,-0.004232,0.003649,...,-0.008158,-0.008918,-0.004918,-0.008007,-0.001284,0.002009,-0.006923,-0.001424,-0.00904,-0.007256
branch_code,-0.380714,-0.05899,0.072436,-0.001704,-0.055779,1.0,-0.022233,-0.008184,-0.004395,-0.03318,...,-0.014419,-0.010512,-0.010794,-0.012047,-0.030728,-0.041511,-0.024769,-0.028065,-0.014148,-0.008413
days_since_last_transaction,-0.034482,-0.020005,-0.066755,-0.088345,-0.004962,-0.022233,1.0,0.004372,0.002313,-0.042554,...,-0.047392,-0.039362,-0.011705,-0.049151,-0.170574,-0.165961,-0.154007,-0.15427,-0.051445,-0.040958
occupation_company,0.035027,0.045605,-0.001028,-0.00459,0.002009,-0.008184,0.004372,1.0,-0.01041,-0.020892,...,0.003582,-0.002964,-0.007821,0.000906,0.004386,0.001742,-0.003411,-0.008765,-0.000535,-0.001969
occupation_retired,0.057813,0.372942,0.126237,-0.054831,-0.004232,-0.004395,0.002313,-0.01041,1.0,-0.154101,...,-0.008067,-0.007571,-0.005258,-0.008849,-0.017102,-0.008453,-0.023293,-0.017127,-0.007817,-0.008177
occupation_salaried,0.041924,-0.129008,0.16803,0.044848,0.003649,-0.03318,-0.042554,-0.020892,-0.154101,1.0,...,0.020842,0.02155,0.019243,0.014934,-0.00687,0.003152,-0.003491,0.002199,0.014916,0.016192


In [35]:
# Feature subset used for the model. occupation_company is left out, so it
# acts as the reference level for the occupation indicators.
x_cols = [
    'current_month_debit',
    'previous_month_debit',
    'current_balance',
    'previous_month_end_balance',
    'vintage',
    'occupation_retired',
    'occupation_salaried',
    'occupation_self_employed',
    'occupation_student',
]

In [39]:
# Build the model input matrix from the selected feature columns.
x = df.loc[:, x_cols]

In [40]:
# Preview the model inputs; note that vintage is still on its raw scale.
x.head(n=5)

Unnamed: 0,current_month_debit,previous_month_debit,current_balance,previous_month_end_balance,vintage,occupation_retired,occupation_salaried,occupation_self_employed,occupation_student
0,-0.368251,-0.37739,-0.613738,-0.632367,3135,0,0,1,0
1,0.640986,-0.355677,0.011267,0.446458,310,0,0,1,0
2,0.729825,-0.321607,-0.209651,0.057975,2356,0,1,0,0
3,-0.368193,0.060593,-0.470836,-0.48853,478,0,0,1,0
4,-0.245463,-0.057927,-0.708214,-0.642441,2531,0,0,1,0


In [42]:
# Hold out 25% of the rows for testing. Stratifying on y keeps the churn
# rate equal in both partitions; the fixed seed makes the split repeatable.
split_parts = train_test_split(
    x,
    y,
    test_size=0.25,
    random_state=11,
    stratify=y,
)
train_x, test_x, train_y, test_y = split_parts

In [45]:
# Fit a logistic-regression churn classifier on the training partition.
# vintage is left unscaled (values in the thousands, see x.head()) next to
# standardized features, so the solver may need more than the default 100
# iterations; max_iter is raised so the fit is not cut short.
LRModel = LogisticRegression(max_iter=1000)
LRModel.fit(train_x, train_y)

# Class-membership probabilities for the held-out rows (one column per class).
pred = LRModel.predict_proba(test_x)

In [48]:
# Hard class predictions for the held-out rows.
pred_val = LRModel.predict(X=test_x)

In [49]:
# Recall on the held-out set: the share of actual churners the model flags.
recall_score(y_true=test_y, y_pred=pred_val)

0.11406844106463879