## Churn Prediction using Logistic Regression

In [90]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, roc_curve

In [91]:
df = pd.read_csv('churn_prediction.csv')

In [92]:
df.head()

Unnamed: 0,customer_id,vintage,age,gender,dependents,occupation,city,customer_nw_category,branch_code,days_since_last_transaction,...,previous_month_end_balance,average_monthly_balance_prevQ,average_monthly_balance_prevQ2,current_month_credit,previous_month_credit,current_month_debit,previous_month_debit,current_month_balance,previous_month_balance,churn
0,1,3135,66,Male,0.0,self_employed,187.0,2,755,224.0,...,1458.71,1458.71,1449.07,0.2,0.2,0.2,0.2,1458.71,1458.71,0
1,2,310,35,Male,0.0,self_employed,,2,3214,60.0,...,8704.66,7799.26,12419.41,0.56,0.56,5486.27,100.56,6496.78,8787.61,0
2,4,2356,31,Male,0.0,salaried,146.0,2,41,,...,5815.29,4910.17,2815.94,0.61,0.61,6046.73,259.23,5006.28,5070.14,0
3,5,478,90,,,self_employed,1020.0,2,582,147.0,...,2291.91,2084.54,1006.54,0.47,0.47,0.47,2143.33,2291.91,1669.79,1
4,6,2531,42,Male,2.0,self_employed,1494.0,3,388,58.0,...,1401.72,1643.31,1871.12,0.33,714.61,588.62,1538.06,1157.15,1677.16,1


In [93]:
df.describe(include='all')

Unnamed: 0,customer_id,vintage,age,gender,dependents,occupation,city,customer_nw_category,branch_code,days_since_last_transaction,...,previous_month_end_balance,average_monthly_balance_prevQ,average_monthly_balance_prevQ2,current_month_credit,previous_month_credit,current_month_debit,previous_month_debit,current_month_balance,previous_month_balance,churn
count,28382.0,28382.0,28382.0,27857,25919.0,28302,27579.0,28382.0,28382.0,25159.0,...,28382.0,28382.0,28382.0,28382.0,28382.0,28382.0,28382.0,28382.0,28382.0,28382.0
unique,,,,2,,5,,,,,...,,,,,,,,,,
top,,,,Male,,self_employed,,,,,...,,,,,,,,,,
freq,,,,16548,,17476,,,,,...,,,,,,,,,,
mean,15143.508667,2364.336446,48.208336,,0.347236,,796.109576,2.22553,925.975019,69.997814,...,7495.771,7496.78,7124.209,3433.252,3261.694,3658.745,3339.761,7451.133,7495.177,0.185329
std,8746.454456,1610.124506,17.807163,,0.997661,,432.872102,0.660443,937.799129,86.341098,...,42529.35,41726.22,44575.81,77071.45,29688.89,51985.42,24301.11,42033.94,42431.98,0.388571
min,1.0,180.0,1.0,,0.0,,0.0,1.0,1.0,0.0,...,-3149.57,1428.69,-16506.1,0.01,0.01,0.01,0.01,-3374.18,-5171.92,0.0
25%,7557.25,1121.0,36.0,,0.0,,409.0,2.0,176.0,11.0,...,1906.0,2180.945,1832.507,0.31,0.33,0.41,0.41,1996.765,2074.408,0.0
50%,15150.5,2018.0,46.0,,0.0,,834.0,2.0,572.0,30.0,...,3379.915,3542.865,3359.6,0.61,0.63,91.93,109.96,3447.995,3465.235,0.0
75%,22706.75,3176.0,60.0,,0.0,,1096.0,3.0,1440.0,95.0,...,6656.535,6666.887,6517.96,707.2725,749.235,1360.435,1357.553,6667.958,6654.693,0.0


In [94]:
df.isnull().sum()

customer_id                          0
vintage                              0
age                                  0
gender                             525
dependents                        2463
occupation                          80
city                               803
customer_nw_category                 0
branch_code                          0
days_since_last_transaction       3223
current_balance                      0
previous_month_end_balance           0
average_monthly_balance_prevQ        0
average_monthly_balance_prevQ2       0
current_month_credit                 0
previous_month_credit                0
current_month_debit                  0
previous_month_debit                 0
current_month_balance                0
previous_month_balance               0
churn                                0
dtype: int64

In [95]:
df['gender'].value_counts()

Male      16548
Female    11309
Name: gender, dtype: int64

In [96]:
gender_dict = {'Male': 1, 'Female':0}
df.replace({'gender': gender_dict}, inplace=True)

df['gender']  = df['gender'].fillna(-1)

In [97]:
df['occupation'] = df['occupation'].fillna(value = df['occupation'].mode()[0])
df['dependents'].fillna(0, inplace=True)

In [98]:
df['city'].fillna(1020, inplace=True)

In [99]:
df.head()

Unnamed: 0,customer_id,vintage,age,gender,dependents,occupation,city,customer_nw_category,branch_code,days_since_last_transaction,...,previous_month_end_balance,average_monthly_balance_prevQ,average_monthly_balance_prevQ2,current_month_credit,previous_month_credit,current_month_debit,previous_month_debit,current_month_balance,previous_month_balance,churn
0,1,3135,66,1.0,0.0,self_employed,187.0,2,755,224.0,...,1458.71,1458.71,1449.07,0.2,0.2,0.2,0.2,1458.71,1458.71,0
1,2,310,35,1.0,0.0,self_employed,1020.0,2,3214,60.0,...,8704.66,7799.26,12419.41,0.56,0.56,5486.27,100.56,6496.78,8787.61,0
2,4,2356,31,1.0,0.0,salaried,146.0,2,41,,...,5815.29,4910.17,2815.94,0.61,0.61,6046.73,259.23,5006.28,5070.14,0
3,5,478,90,-1.0,0.0,self_employed,1020.0,2,582,147.0,...,2291.91,2084.54,1006.54,0.47,0.47,0.47,2143.33,2291.91,1669.79,1
4,6,2531,42,1.0,2.0,self_employed,1494.0,3,388,58.0,...,1401.72,1643.31,1871.12,0.33,714.61,588.62,1538.06,1157.15,1677.16,1


In [100]:
df['days_since_last_transaction'] = df['days_since_last_transaction'].fillna(999)

### Dummies and scaling

In [101]:
pd.get_dummies(df['occupation']).head()

Unnamed: 0,company,retired,salaried,self_employed,student
0,0,0,0,1,0
1,0,0,0,1,0
2,0,0,1,0,0
3,0,0,0,1,0
4,0,0,0,1,0


In [102]:
df.drop(['customer_id'], axis=1, inplace=True)

In [103]:
df.nunique()

vintage                            5473
age                                  90
gender                                3
dependents                           15
occupation                            5
city                               1604
customer_nw_category                  3
branch_code                        3185
days_since_last_transaction         361
current_balance                   27903
previous_month_end_balance        27922
average_monthly_balance_prevQ     27801
average_monthly_balance_prevQ2    27940
current_month_credit              10411
previous_month_credit             10711
current_month_debit               13704
previous_month_debit              14010
current_month_balance             27944
previous_month_balance            27913
churn                                 2
dtype: int64

In [104]:
df['occupation'].value_counts()

self_employed    17556
salaried          6704
student           2058
retired           2024
company             40
Name: occupation, dtype: int64

In [105]:
le = LabelEncoder()
le.fit_transform(['self_employed','salaried','student','retired','company'])

array([3, 2, 4, 1, 0], dtype=int64)

In [106]:
df['occupation'] = df['occupation'].map({'self_employed':0,
                                        'salaried': 1,
                                        'student': 2,
                                        'retired': 3,
                                        'company': 4})

In [107]:
df['occupation'].head()

0    0
1    0
2    1
3    0
4    0
Name: occupation, dtype: int64

In [108]:
df.head()

Unnamed: 0,vintage,age,gender,dependents,occupation,city,customer_nw_category,branch_code,days_since_last_transaction,current_balance,previous_month_end_balance,average_monthly_balance_prevQ,average_monthly_balance_prevQ2,current_month_credit,previous_month_credit,current_month_debit,previous_month_debit,current_month_balance,previous_month_balance,churn
0,3135,66,1.0,0.0,0,187.0,2,755,224.0,1458.71,1458.71,1458.71,1449.07,0.2,0.2,0.2,0.2,1458.71,1458.71,0
1,310,35,1.0,0.0,0,1020.0,2,3214,60.0,5390.37,8704.66,7799.26,12419.41,0.56,0.56,5486.27,100.56,6496.78,8787.61,0
2,2356,31,1.0,0.0,1,146.0,2,41,999.0,3913.16,5815.29,4910.17,2815.94,0.61,0.61,6046.73,259.23,5006.28,5070.14,0
3,478,90,-1.0,0.0,0,1020.0,2,582,147.0,2291.91,2291.91,2084.54,1006.54,0.47,0.47,0.47,2143.33,2291.91,1669.79,1
4,2531,42,1.0,2.0,0,1494.0,3,388,58.0,927.72,1401.72,1643.31,1871.12,0.33,714.61,588.62,1538.06,1157.15,1677.16,1


In [109]:
df.dtypes

vintage                             int64
age                                 int64
gender                            float64
dependents                        float64
occupation                          int64
city                              float64
customer_nw_category                int64
branch_code                         int64
days_since_last_transaction       float64
current_balance                   float64
previous_month_end_balance        float64
average_monthly_balance_prevQ     float64
average_monthly_balance_prevQ2    float64
current_month_credit              float64
previous_month_credit             float64
current_month_debit               float64
previous_month_debit              float64
current_month_balance             float64
previous_month_balance            float64
churn                               int64
dtype: object

In [110]:
num_cols = ['customer_nw_category','current_balance','previous_month_end_balance',
           'average_monthly_balance_prevQ2','average_monthly_balance_prevQ',
           'current_month_credit','previous_month_credit','current_month_debit',
           'previous_month_debit','current_month_balance','previous_month_balance']

# scaling the required columns separately

for i in num_cols:
    df[i] = np.log(df[i] + 17000)

std = StandardScaler()
scaled = std.fit_transform(df[num_cols])
scaled

array([[-0.34147009, -0.61373821, -0.63236691, ..., -0.37739037,
        -0.65345493, -0.66620743],
       [-0.34147009,  0.01126671,  0.44645798, ..., -0.355677  ,
         0.14943965,  0.45568047],
       [-0.34147009, -0.20965112,  0.05797536, ..., -0.32160663,
        -0.0685969 , -0.06664232],
       ...,
       [-0.34147009,  4.23304006,  4.06363619, ..., -0.36188631,
         4.14466958,  4.0182815 ],
       [ 1.17265812, -0.58461382, -0.60305203, ..., -0.37739037,
        -0.62351878, -0.63601616],
       [-0.34147009, -0.5020012 , -0.40032652, ..., -0.14734825,
        -0.48753168, -0.35368739]])

In [111]:
scaled = pd.DataFrame(scaled, columns = num_cols)

In [112]:
scaled

Unnamed: 0,customer_nw_category,current_balance,previous_month_end_balance,average_monthly_balance_prevQ2,average_monthly_balance_prevQ,current_month_credit,previous_month_credit,current_month_debit,previous_month_debit,current_month_balance,previous_month_balance
0,-0.341470,-0.613738,-0.632367,-0.630646,-0.697612,-0.313931,-0.324487,-0.368251,-0.377390,-0.653455,-0.666207
1,-0.341470,0.011267,0.446458,0.949414,0.317595,-0.313852,-0.324410,0.640986,-0.355677,0.149440,0.455680
2,-0.341470,-0.209651,0.057975,-0.388637,-0.108263,-0.313841,-0.324400,0.729825,-0.321607,-0.068597,-0.066642
3,-0.341470,-0.470836,-0.488530,-0.712855,-0.582976,-0.313872,-0.324429,-0.368193,0.060593,-0.506570,-0.628056
4,1.172658,-0.708214,-0.642441,-0.554058,-0.663399,-0.313902,-0.175104,-0.245463,-0.057927,-0.708257,-0.626732
...,...,...,...,...,...,...,...,...,...,...,...
28377,-0.341470,-0.681476,-0.700548,-0.393466,-0.547552,-0.313909,-0.324466,-0.368230,-0.377369,-0.723081,-0.736426
28378,-0.341470,-0.220357,-0.201417,-0.213875,-0.308790,-0.313601,-0.324041,-0.181936,-0.163698,-0.266001,-0.283262
28379,-0.341470,4.233040,4.063636,1.897512,3.907073,0.585389,0.422031,-0.332759,-0.361886,4.144670,4.018282
28380,1.172658,-0.584614,-0.603052,-0.556517,-0.656050,-0.313931,-0.324487,-0.368251,-0.377390,-0.623519,-0.636016


In [113]:
df_orig = df.copy()

In [114]:
df = df.drop(columns = num_cols, axis = 1)
df = df.merge(scaled, left_index = True, right_index = True, how = 'left')
df

Unnamed: 0,vintage,age,gender,dependents,occupation,city,branch_code,days_since_last_transaction,churn,customer_nw_category,current_balance,previous_month_end_balance,average_monthly_balance_prevQ2,average_monthly_balance_prevQ,current_month_credit,previous_month_credit,current_month_debit,previous_month_debit,current_month_balance,previous_month_balance
0,3135,66,1.0,0.0,0,187.0,755,224.0,0,-0.341470,-0.613738,-0.632367,-0.630646,-0.697612,-0.313931,-0.324487,-0.368251,-0.377390,-0.653455,-0.666207
1,310,35,1.0,0.0,0,1020.0,3214,60.0,0,-0.341470,0.011267,0.446458,0.949414,0.317595,-0.313852,-0.324410,0.640986,-0.355677,0.149440,0.455680
2,2356,31,1.0,0.0,1,146.0,41,999.0,0,-0.341470,-0.209651,0.057975,-0.388637,-0.108263,-0.313841,-0.324400,0.729825,-0.321607,-0.068597,-0.066642
3,478,90,-1.0,0.0,0,1020.0,582,147.0,1,-0.341470,-0.470836,-0.488530,-0.712855,-0.582976,-0.313872,-0.324429,-0.368193,0.060593,-0.506570,-0.628056
4,2531,42,1.0,2.0,0,1494.0,388,58.0,1,1.172658,-0.708214,-0.642441,-0.554058,-0.663399,-0.313902,-0.175104,-0.245463,-0.057927,-0.708257,-0.626732
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
28377,1845,10,0.0,0.0,2,1020.0,1207,70.0,0,-0.341470,-0.681476,-0.700548,-0.393466,-0.547552,-0.313909,-0.324466,-0.368230,-0.377369,-0.723081,-0.736426
28378,4919,34,0.0,0.0,0,1046.0,223,14.0,0,-0.341470,-0.220357,-0.201417,-0.213875,-0.308790,-0.313601,-0.324041,-0.181936,-0.163698,-0.266001,-0.283262
28379,297,47,1.0,0.0,1,1096.0,588,0.0,1,-0.341470,4.233040,4.063636,1.897512,3.907073,0.585389,0.422031,-0.332759,-0.361886,4.144670,4.018282
28380,2585,50,1.0,3.0,0,1219.0,274,999.0,0,1.172658,-0.584614,-0.603052,-0.556517,-0.656050,-0.313931,-0.324487,-0.368251,-0.377390,-0.623519,-0.636016


In [115]:
target = df['churn']
df = df.drop(['churn'], axis = 1)

### Model Building

## Baseline Model

In [116]:
from sklearn.metrics import recall_score

In [117]:
baseline_cols = ['current_month_debit', 'previous_month_debit','current_balance','previous_month_end_balance','vintage','occupation']

In [118]:
df_baseline = df[baseline_cols]

In [125]:
# Splitting the data into Train and Test sets
xtrain, xtest, ytrain, ytest = train_test_split(df_baseline, target,test_size=0.6, random_state=42, stratify = target)

In [126]:
model = LogisticRegression()
model.fit(xtrain, ytrain)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [127]:
train_predict = model.predict(xtrain)
train_predict

array([0, 0, 0, ..., 1, 0, 0], dtype=int64)

In [131]:
score = f1_score(train_predict, ytrain)
print('Training f1_score', score )

Training f1_score 0.21505376344086022


In [129]:
# Now, trying on original data
# Splitting the data into Train and Test sets
xtrain, xtest, ytrain, ytest = train_test_split(df_orig, target,test_size=0.6, random_state=42, stratify = target)

In [130]:
model.fit(xtrain, ytrain)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [132]:
train_predict = model.predict(xtrain)
train_predict

array([0, 0, 0, ..., 1, 0, 0], dtype=int64)

In [133]:
score = f1_score(train_predict, ytrain)
print('Training f1_score on original data', score )

Training f1_score on original data 0.9992865636147443


In [134]:
# Prediction on test data
test_predict = model.predict(xtest)
test_predict
test_score = f1_score(test_predict, ytest)
print('Test f1_score on original data', test_score)

Test f1_score on original data 0.9996830427892235


## Train and Test scores on different test_size values

In [148]:
i = 0.9
while(i>0.1):
    xtrain, xtest, ytrain, ytest = train_test_split(df_orig, target, test_size = i , random_state = 42, stratify = target)

    print('test_size = ', i)
    model.fit(xtrain, ytrain)

    train_predict = model.predict(xtrain)
    score_train = f1_score(train_predict, ytrain)
    print('Training f1_score', score_train)

    test_predict = model.predict(xtest)
    score_test = f1_score(test_predict, ytest)
    print('Test f1_score', score_test)

    print('*********************************')
    
    i -= 0.1

test_size =  0.9
Training f1_score 0.9971509971509972
Test f1_score 0.9961945031712472
*********************************
test_size =  0.8


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Training f1_score 0.9952471482889734
Test f1_score 0.9936927287873378
*********************************
test_size =  0.7000000000000001
Training f1_score 0.9933523266856601
Test f1_score 0.9915783754414562
*********************************
test_size =  0.6000000000000001


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Training f1_score 0.9992865636147443
Test f1_score 0.9996830427892235
*********************************
test_size =  0.5000000000000001


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Training f1_score 0.9967699030970929
Test f1_score 0.9977203647416413
*********************************
test_size =  0.40000000000000013


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Training f1_score 0.9973088491372487
Test f1_score 0.997624703087886
*********************************
test_size =  0.30000000000000016


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Training f1_score 0.9975563399402662
Test f1_score 0.9984142086901364
*********************************
test_size =  0.20000000000000015


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Training f1_score 0.9976258309591644
Test f1_score 0.997624703087886
*********************************
test_size =  0.10000000000000014
Training f1_score 0.9971497941517997
Test f1_score 0.9980988593155894
*********************************


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
