In [None]:
import matplotlib.pyplot as plt
import missingno as msno
import pandas as pd
import seaborn as sns

# Truncate displayed frames/series that exceed 25 rows.
pd.set_option('display.max_rows', 25)

# Read the churn sample; the first CSV column becomes the row index.
portfolio = pd.read_csv(
    "churn_sample.csv",
    index_col=0,
)

In [None]:
# Exploring the portfolio using pandas tools to see how many unique values per column.
# (Only the last expression of a cell is rendered, so these two lookups show
# nothing unless they are run on their own.)
portfolio.loc[portfolio['EDUCATION'].isnull()]
portfolio['DATUM'].unique()


# Turning months into a sequence of factors: every distinct DATUM value is
# assigned its 0-based position in sorted order.

months = sorted(portfolio['DATUM'].unique().tolist())
mon_dict = {month: position for position, month in enumerate(months)}

# `map` instead of `replace`: every DATUM value is a key of `mon_dict`, so the
# lookup covers all rows and produces an integer column directly, without
# depending on `replace` downcasting the result (deprecated in pandas 2.2).
portfolio['DATUM_m'] = portfolio['DATUM'].map(mon_dict)

In [None]:
# Explore correlation between columns/variables. Looking for a handful of columns that have apparent correlation with the sought variable.

# Only numeric and boolean columns can be correlated. Text columns such as
# DATUM make `DataFrame.corr()` raise on pandas >= 2.0 (older versions dropped
# them silently), so they are excluded explicitly.
correlation = portfolio.select_dtypes(include=['number', 'bool']).corr()

fig, ax = plt.subplots(figsize=(15, 15))
sns.heatmap(correlation, vmax=1, square=True, annot=True, cmap='cubehelix', ax=ax)
ax.set_title('Pairwise correlation of numeric portfolio columns');

In [None]:
# Check the fullness of the dataset.

# `portfolio_m` is bound here because no earlier cell defines it; without this
# the cell raises NameError on a fresh kernel. It aliases `portfolio` so that
# the fill below reaches the frame the model cell trains on.
# NOTE(review): confirm that no separate `portfolio_m` frame was intended.
portfolio_m = portfolio

msno.matrix(portfolio_m)

# The key columns that lack values are filled with neutral values (NaN values are not processed by algorithms).

fill_columns = ['N_OUTFLOW', 'LIM_APPROVED', 'OUTFLOW_BUSS', 'AGE']
portfolio_m[fill_columns] = portfolio_m[fill_columns].fillna(0)

# Other variables used in the training of the algorithms have sufficient fullness.

In [None]:
from time import time
import numpy as np

from sklearn.linear_model import LogisticRegression as LogReg
from sklearn.ensemble import RandomForestClassifier as RFC
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

# Columns fed to both models; shared by the training and the test split.
FEATURE_COLUMNS = ['ID_CLIENT', 'DATUM_m', 'SEX', 'NATIONALITY', 'AGE',
                   'N_OUTFLOW', 'LIM_APPROVED', 'OUTFLOW_BUSS']
TARGET_COLUMN = 'INACTIVE'
RANDOM_STATE = 42  # fixed seed so the random forest gives the same result on every run


def _fit_and_evaluate(model, train_x, train_y, test_x, test_y):
    """Fit ``model`` on the training split and evaluate it on the test split.

    Parameters
    ----------
    model : scikit-learn classifier exposing ``fit``, ``score``, ``predict``
        and ``predict_proba``. It is fitted in place.
    train_x, train_y : training features and labels.
    test_x, test_y : test features and labels.

    Returns
    -------
    tuple
        ``(train_score, prediction, probability, con_mat, acc_score,
        class_report)``: the score on the training data, the test-set
        predictions as a Series, the test-set class probabilities as a
        DataFrame, and the confusion matrix, accuracy score and
        classification report computed on the test set.
    """
    model.fit(train_x, train_y)
    train_score = model.score(train_x, train_y)
    prediction = pd.Series(model.predict(test_x))
    probability = pd.DataFrame(model.predict_proba(test_x))
    con_mat = confusion_matrix(test_y, prediction)
    acc_score = accuracy_score(test_y, prediction)
    class_report = classification_report(test_y, prediction)
    return train_score, prediction, probability, con_mat, acc_score, class_report


mask_fit = portfolio['DATUM']=='2018-11-30' #Date selection; the last month
mask_active = portfolio['INACTIVE']==np.bool_(False) #Mask of only active clients

# The training set includes all clients, all months except the last one.
train_rows = portfolio[~mask_fit]
train_x = train_rows[FEATURE_COLUMNS].reset_index(drop=True)
train_y = train_rows[TARGET_COLUMN].reset_index(drop=True)

# The test set is the last month only.
# NOTE(review): `mask_active` is defined above but not applied, so clients that
# are already inactive are part of the test set — confirm whether the test set
# should be restricted to active clients.
test_rows = portfolio[mask_fit]
test_x = test_rows[FEATURE_COLUMNS].reset_index(drop=True)
test_y = test_rows[TARGET_COLUMN].reset_index(drop=True)


# The elapsed time printed at the end covers both models.
start = time()

model_1 = LogReg()
(score_1, prediction_1, probability_1,
 con_mat_1, acc_score_1, class_report_1) = _fit_and_evaluate(
    model_1, train_x, train_y, test_x, test_y)

model_2 = RFC(n_estimators=20, random_state=RANDOM_STATE)
(score_2, prediction_2, probability_2,
 con_mat_2, acc_score_2, class_report_2) = _fit_and_evaluate(
    model_2, train_x, train_y, test_x, test_y)

print("Finished after %s seconds." % ( round(time() - start,2) ) )

In [None]:
# Checking results

# Confusion matrix of the random forest on the test set.
print(con_mat_2)
# Accuracy of the logistic regression, scored against `test_y`, the labels
# that match the rows `prediction_1` was computed from.
accuracy_score(test_y, prediction_1)