In [10]:
# importing required libraries
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

from keras.models import Sequential
from keras.layers import Dense, Activation, Embedding, Merge, Flatten, Dropout, Input, Reshape, merge
from keras.layers.normalization import BatchNormalization
from keras.regularizers import l1, l2, l1_l2
from keras import utils as np_utils

from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
from sklearn.utils import shuffle

from imblearn.over_sampling import RandomOverSampler

from collections import OrderedDict

In [11]:
Production = True
Rebalance = True
Eliminate_Outlier = True
cross_train = False

In [12]:
if Production == True:
    # reading the training dataset
    traindf = pd.read_csv('C:\\Users\\admin\\Downloads\\New folder\\train.csv',encoding='iso8859_2',low_memory=False)
    # creating flag column for train
    traindf['DataType'] = 'Train'
    # reading the test dataset
    testdf = pd.read_csv('C:\\Users\\admin\\Downloads\\New folder\\test.csv',encoding='iso8859_2',low_memory=False)
    # savings list of ids of testdf for prediction in later stages
    test_ids = testdf['UCIC_ID']
    # creating the target variable column in test as 'N'
    testdf['RESPONDERS'] = 'N'
    # creating flag column for test
    testdf['DataType'] = 'Test'
    # combining train and test
    df = (pd.concat([traindf.reset_index(drop = True),testdf.reset_index(drop = True)],axis=0))
    del traindf
    del testdf
else:
    df = pd.read_csv('C:\\Users\\admin\\Downloads\\New folder\\train.csv',encoding='iso8859_2',low_memory=False)

In [13]:
# subsetting variables by continous/categorical datatypes
cont_vars = ['NO_OF_Accs','dependents','C_prev1','D_prev1','ATM_C_prev1','ATM_D_prev1','BRANCH_C_prev1',
             'BRANCH_D_prev1','IB_C_prev1','IB_D_prev1','MB_C_prev1','MB_D_prev1','POS_C_prev1','POS_D_prev1','count_C_prev1',
             'count_D_prev1','COUNT_ATM_C_prev1','COUNT_ATM_D_prev1','COUNT_BRANCH_C_prev1','COUNT_BRANCH_D_prev1',
             'COUNT_IB_C_prev1','COUNT_IB_D_prev1','COUNT_MB_C_prev1','COUNT_MB_D_prev1','COUNT_POS_C_prev1','COUNT_POS_D_prev1',
             'custinit_CR_amt_prev1','custinit_DR_amt_prev1','custinit_CR_cnt_prev1','custinit_DR_cnt_prev1','ATM_amt_prev1',
             'ATM_CW_Amt_prev1','ATM_CW_Cnt_prev1','BRN_CW_Amt_prev1','BRN_CW_Cnt_prev1','BRN_CASH_Dep_Amt_prev1',
             'BRN_CASH_Dep_Cnt_prev1','CNR_prev1','BAL_prev1','EOP_prev1','CR_AMB_Prev1','C_prev2','D_prev2','ATM_C_prev2',
             'ATM_D_prev2','BRANCH_C_prev2','BRANCH_D_prev2','IB_C_prev2','IB_D_prev2','MB_C_prev2','MB_D_prev2','POS_C_prev2',
             'POS_D_prev2','count_C_prev2','count_D_prev2','COUNT_ATM_C_prev2','COUNT_ATM_D_prev2','COUNT_BRANCH_C_prev2',
             'COUNT_BRANCH_D_prev2','COUNT_IB_C_prev2','COUNT_IB_D_prev2','COUNT_MB_C_prev2','COUNT_MB_D_prev2',
             'COUNT_POS_C_prev2','COUNT_POS_D_prev2','custinit_CR_amt_prev2','custinit_DR_amt_prev2','custinit_CR_cnt_prev2',
             'custinit_DR_cnt_prev2','ATM_amt_prev2','ATM_CW_Amt_prev2','ATM_CW_Cnt_prev2','BRN_CW_Amt_prev2','BRN_CW_Cnt_prev2',
             'BRN_CASH_Dep_Amt_prev2','BRN_CASH_Dep_Cnt_prev2','CNR_prev2','BAL_prev2','EOP_prev2','CR_AMB_Prev2','C_prev3',
             'D_prev3','ATM_C_prev3','ATM_D_prev3','BRANCH_C_prev3','BRANCH_D_prev3','IB_C_prev3','IB_D_prev3','MB_C_prev3',
             'MB_D_prev3','POS_C_prev3','POS_D_prev3','count_C_prev3','count_D_prev3','COUNT_ATM_C_prev3','COUNT_ATM_D_prev3',
             'COUNT_BRANCH_C_prev3','COUNT_BRANCH_D_prev3','COUNT_IB_C_prev3','COUNT_IB_D_prev3','COUNT_MB_C_prev3',
             'COUNT_MB_D_prev3','COUNT_POS_C_prev3','COUNT_POS_D_prev3','custinit_CR_amt_prev3','custinit_DR_amt_prev3',
             'custinit_CR_cnt_prev3','custinit_DR_cnt_prev3','ATM_amt_prev3','ATM_CW_Amt_prev3','ATM_CW_Cnt_prev3',
             'BRN_CW_Amt_prev3','BRN_CW_Cnt_prev3','BRN_CASH_Dep_Amt_prev3','BRN_CASH_Dep_Cnt_prev3','CNR_prev3','BAL_prev3',
             'EOP_prev3','CR_AMB_Prev3','C_prev4','D_prev4','ATM_C_prev4','ATM_D_prev4','BRANCH_C_prev4','BRANCH_D_prev4',
             'IB_C_prev4','IB_D_prev4','MB_C_prev4','MB_D_prev4','POS_C_prev4','POS_D_prev4','count_C_prev4','count_D_prev4',
             'COUNT_ATM_C_prev4','COUNT_ATM_D_prev4','COUNT_BRANCH_C_prev4','COUNT_BRANCH_D_prev4','COUNT_IB_C_prev4',
             'COUNT_IB_D_prev4','COUNT_MB_C_prev4','COUNT_MB_D_prev4','COUNT_POS_C_prev4','COUNT_POS_D_prev4',
             'custinit_CR_amt_prev4','custinit_DR_amt_prev4','custinit_CR_cnt_prev4','custinit_DR_cnt_prev4','ATM_amt_prev4',
             'ATM_CW_Amt_prev4','ATM_CW_Cnt_prev4','BRN_CW_Amt_prev4','BRN_CW_Cnt_prev4','BRN_CASH_Dep_Amt_prev4',
             'BRN_CASH_Dep_Cnt_prev4','CNR_prev4','BAL_prev4','EOP_prev4','CR_AMB_Prev4','C_prev5','D_prev5','ATM_C_prev5',
             'ATM_D_prev5','BRANCH_C_prev5','BRANCH_D_prev5','IB_C_prev5','IB_D_prev5','MB_C_prev5','MB_D_prev5','POS_C_prev5',
             'POS_D_prev5','count_C_prev5','count_D_prev5','COUNT_ATM_C_prev5','COUNT_ATM_D_prev5','COUNT_BRANCH_C_prev5',
             'COUNT_BRANCH_D_prev5','COUNT_IB_C_prev5','COUNT_IB_D_prev5','COUNT_MB_C_prev5','COUNT_MB_D_prev5',
             'COUNT_POS_C_prev5','COUNT_POS_D_prev5','custinit_CR_amt_prev5','custinit_DR_amt_prev5','custinit_CR_cnt_prev5',
             'custinit_DR_cnt_prev5','ATM_amt_prev5','ATM_CW_Amt_prev5','ATM_CW_Cnt_prev5','BRN_CW_Amt_prev5','BRN_CW_Cnt_prev5',
             'BRN_CASH_Dep_Amt_prev5','BRN_CASH_Dep_Cnt_prev5','CNR_prev5','BAL_prev5','EOP_prev5','CR_AMB_Prev5','C_prev6',
             'D_prev6','ATM_C_prev6','ATM_D_prev6','BRANCH_C_prev6','BRANCH_D_prev6','IB_C_prev6','IB_D_prev6','MB_C_prev6',
             'MB_D_prev6','POS_C_prev6','POS_D_prev6','count_C_prev6','count_D_prev6','COUNT_ATM_C_prev6','COUNT_ATM_D_prev6',
             'COUNT_BRANCH_C_prev6','COUNT_BRANCH_D_prev6','COUNT_IB_C_prev6','COUNT_IB_D_prev6','COUNT_MB_C_prev6',
             'COUNT_MB_D_prev6','COUNT_POS_C_prev6','COUNT_POS_D_prev6','custinit_CR_amt_prev6','custinit_DR_amt_prev6',
             'custinit_CR_cnt_prev6','custinit_DR_cnt_prev6','ATM_amt_prev6','ATM_CW_Amt_prev6','ATM_CW_Cnt_prev6',
             'BRN_CW_Amt_prev6','BRN_CW_Cnt_prev6','BRN_CASH_Dep_Amt_prev6','BRN_CASH_Dep_Cnt_prev6','CNR_prev6','BAL_prev6',
             'EOP_prev6','CR_AMB_Prev6','Billpay_Reg_ason_Prev1','FD_AMOUNT_BOOK_PrevQ1','FD_AMOUNT_BOOK_PrevQ2',
             'NO_OF_FD_BOOK_PrevQ1','NO_OF_FD_BOOK_PrevQ2','NO_OF_RD_BOOK_PrevQ1','NO_OF_RD_BOOK_PrevQ2','RD_AMOUNT_BOOK_PrevQ1',
             'RD_AMOUNT_BOOK_PrevQ2','Total_Invest_in_MF_PrevQ1','Total_Invest_in_MF_PrevQ2','count_No_of_MF_PrevQ1',
             'count_No_of_MF_PrevQ2','Dmat_Investing_PrevQ1','Dmat_Investing_PrevQ2','Charges_PrevQ1','Charges_cnt_PrevQ1',
             'NO_OF_COMPLAINTS','CASH_WD_AMT_Last6','CASH_WD_CNT_Last6','age','Recency_of_CR_TXN','Recency_of_DR_TXN',
             'Recency_of_IB_TXN','Recency_of_ATM_TXN','Recency_of_BRANCH_TXN','Recency_of_POS_TXN','Recency_of_MB_TXN',
             'Recency_of_Activity','I_AQB_PrevQ1','I_AQB_PrevQ2','I_CR_AQB_PrevQ1','I_CR_AQB_PrevQ2','I_CNR_PrevQ1',
             'I_CNR_PrevQ2','I_NRV_PrevQ1','I_NRV_PrevQ2','CR_AMB_Drop_Build_1','CR_AMB_Drop_Build_2','CR_AMB_Drop_Build_3',
             'CR_AMB_Drop_Build_4','CR_AMB_Drop_Build_5','Req_Logged_PrevQ1','Req_Resolved_PrevQ1','Query_Logged_PrevQ1',
             'Query_Resolved_PrevQ1','Complaint_Logged_PrevQ1','Complaint_Resolved_PrevQ1','NO_OF_CHEQUE_BOUNCE_V1',
             'Percent_Change_in_Credits','Percent_Change_in_FT_Bank','Percent_Change_in_FT_outside','Percent_Change_in_Self_Txn',
             'Percent_Change_in_Big_Expenses']
dummy_vars = ['HNW_CATEGORY','FINAL_WORTH_prev1','EMAIL_UNSUBSCRIBE','ENGAGEMENT_TAG_prev1','FRX_PrevQ1',
              'EFT_SELF_TRANSFER_PrevQ1','Billpay_Active_PrevQ1','AGRI_PREM_CLOSED_PREVQ1','AL_CNC_PREM_CLOSED_PREVQ1',
              'AL_PREM_CLOSED_PREVQ1','BL_PREM_CLOSED_PREVQ1','CC_PREM_CLOSED_PREVQ1','CE_PREM_CLOSED_PREVQ1',
              'CV_PREM_CLOSED_PREVQ1','EDU_PREM_CLOSED_PREVQ1','OTHER_LOANS_PREM_CLOSED_PREVQ1','PL_PREM_CLOSED_PREVQ1',
              'RD_PREM_CLOSED_PREVQ1','FD_PREM_CLOSED_PREVQ1','TL_PREM_CLOSED_PREVQ1','TWL_PREM_CLOSED_PREVQ1',
              'AGRI_Closed_PrevQ1','AL_CNC_Closed_PrevQ1','AL_Closed_PrevQ1','BL_Closed_PrevQ1','CC_CLOSED_PREVQ1',
              'CE_Closed_PrevQ1','CV_Closed_PrevQ1','EDU_Closed_PrevQ1','GL_Closed_PrevQ1','OTHER_LOANS_Closed_PrevQ1',
              'PL_Closed_PrevQ1','RD_CLOSED_PREVQ1','FD_CLOSED_PREVQ1','TL_Closed_PrevQ1','TWL_Closed_PrevQ1',
              'DEMAT_CLOSED_PREV1YR','SEC_ACC_CLOSED_PREV1YR','AGRI_TAG_LIVE','AL_CNC_TAG_LIVE','AL_TAG_LIVE','BL_TAG_LIVE',
              'CC_TAG_LIVE','CE_TAG_LIVE','CV_TAG_LIVE','DEMAT_TAG_LIVE','EDU_TAG_LIVE','GL_TAG_LIVE','HL_TAG_LIVE',
              'SEC_ACC_TAG_LIVE','INS_TAG_LIVE','LAS_TAG_LIVE','MF_TAG_LIVE','OTHER_LOANS_TAG_LIVE','PL_TAG_LIVE','RD_TAG_LIVE',
              'FD_TAG_LIVE','TL_TAG_LIVE','TWL_TAG_LIVE','lap_tag_live','Billpay_Active_PrevQ1_N','Billpay_Reg_ason_Prev1_N',
              'Charges_cnt_PrevQ1_N','FRX_PrevQ1_N','RBI_Class_Audit','gender_bin']
# creating a ordered dict for embedding variables -> (varaible name : 'K'-d vector)
emb_vars_dict = OrderedDict([('OCCUP_ALL_NEW',5),('city',5),('zip',5),('brn_code',5)])
emb_vars = [evar for (evar, esize) in emb_vars_dict.items()]
# defining target/outcome variable
target_var = ['Responders']

In [14]:
# some columns have '>' printed instead of '>9'. we set those values to 10
for i in cont_vars:
    if df[i].dtype == 'O':
        df[i] = np.where(df[i] == '>','10',df[i])
    else:
        df[i] = df[i]

In [15]:
for i in cont_vars:
    df[i] = pd.to_numeric(df[i])

In [16]:
# replacing NA values by 'N' for character type columns and 0 for numeric type columns
df[cont_vars] = df[cont_vars].fillna(0)
'''
for i in dummy_vars:
    if df[i].dtype == 'O':
        df[i] = df[i].fillna('N')
    else:
        df[i] = df[i].fillna(0)
'''

"\nfor i in dummy_vars:\n    if df[i].dtype == 'O':\n        df[i] = df[i].fillna('N')\n    else:\n        df[i] = df[i].fillna(0)\n"

In [36]:
if Eliminate_Outlier == True:
    def outlier_elim(x,limit):
        if x > limit: 
            return limit
        else: 
            return x
    # setting a upper cutoff at Mean + (2 * std.dev)
    for i in cont_vars:
        upper_limit = df[i].mean() + 2*df[i].std()
        df.loc[:,i] = df.loc[:,i].apply(outlier_elim)

TypeError: outlier_elim() missing 1 required positional argument: 'upper_limit'

In [18]:
# function for applying label encoding to variables
def labelencoder(input_df,var_list):
    le = LabelEncoder()
    for var in var_list:
        try:
            input_df[var] = le.fit_transform(input_df[var])
        except:
            input_df[var] = input_df[var].astype(str)
            input_df[var] = le.fit_transform(input_df[var])
    return input_df

In [None]:
if Production == True:
    # subsetting df to contain only relevant variables
    df = df[cont_vars+dummy_vars+emb_vars+target_var+['DataType']]
else:
    # subsetting df to contain only relevant variables
    df = df[cont_vars+dummy_vars+emb_vars+target_var]

In [None]:
# applying 'labelencoder' function for emb_vars
df = labelencoder(df,emb_vars+target_var)
# applying one hot encoding to categorical variables
dummy_df = pd.get_dummies(df[dummy_vars],drop_first = True)
#concatenating dummy_df to master_df
df = (pd.concat([df.reset_index(drop = True),dummy_df.reset_index(drop = True)],axis = 1))
#dropping 'non one hot encoded' dummy variable columns
df = df.drop(dummy_vars, 1)

In [None]:
# scaling continous variables
scaler = StandardScaler()
df[cont_vars] = scaler.fit_transform(df.loc[:,cont_vars])

In [None]:
if Production == True:
    # resplitting train & test dfs
    train_df = df[df.DataType == 'Train']
    test_df = df[df.DataType == 'Test']
    # dropping 'DataType' column
    train_df = train_df.drop('DataType', 1)
    test_df = test_df.drop('DataType', 1)
    train_df = shuffle(train_df, random_state=666)
    del df
else:
    # splitting train & test dfs
    train_df,test_df = train_test_split(df,test_size = 0.3,random_state = 666)
    del df

In [None]:
if Rebalance == True:
    # upsampling train_df
    ros = RandomOverSampler(random_state = 666)
    train_dftmp,y = ros.fit_sample(train_df,train_df[target_var]) 
    train_dftmp = pd.DataFrame(train_dftmp)
    train_dftmp.columns = train_df.columns
    train_df = train_dftmp
    train_df = shuffle(train_df, random_state=666)
    del train_dftmp

In [None]:
# dummy variables
train_dummy_df = train_df.iloc[:,int(len(cont_vars))+int(len(emb_vars))+1:train_df.shape[1]]
test_dummy_df = test_df.iloc[:,int(len(cont_vars))+int(len(emb_vars))+1:test_df.shape[1]]

In [None]:
# continous variables
train_cont_df = train_df[cont_vars]
test_cont_df = test_df[cont_vars]

In [None]:
# embedded variables
train_x_emb = train_df[emb_vars]
test_x_emb = test_df[emb_vars]

In [None]:
# target variable
y = pd.get_dummies(train_df[target_var],drop_first = True)

In [None]:
# combine continous and dummy variables to a single numpy array
train_x_cont_dummy = (pd.concat([train_dummy_df.reset_index(drop = True),train_cont_df.reset_index(drop = True)],axis=1)).values
test_x_cont_dummy = (pd.concat([test_dummy_df.reset_index(drop = True),test_cont_df.reset_index(drop = True)],axis=1)).values
# creating the final train files for target variable
train_y = train_df[target_var].values
# creating the final train and test files for embedded variable
train_x_emb = train_x_emb.values
test_x_emb = test_x_emb.values

In [None]:
# function for creating hidden embeddeding layers for variables
def EmbeddingCreator (emb_var_dict,input_df):
    id_list = ['encoder_' + var for (var, size) in emb_var_dict.items()] #adding 'encoder_' to each variable name
    embeddingvars = {k: Sequential() for k in id_list} #creating a sequential layer for each variable
    for embvar,nn_obj in embeddingvars.items():
        colname = embvar.replace('encoder_', '') #removing 'encoder_' from each variable name
        nn_obj.add(Embedding(len(input_df[colname].unique()),emb_var_dict[colname],input_length = 1,
                             embeddings_regularizer = l2(1e-2))) #adding embedding layer for each variable
        nn_obj.add(Flatten()) #compressing layer to 1-D plane
    embedding_vars = [embeddingvars[embvar] for embvar in id_list] #extracting NN model objects
    return embedding_vars

In [None]:
# applying 'EmbeddingCreator' function for creating hidden embeddeding layers for variables
df = (pd.concat([train_df.reset_index(drop = True),test_df.reset_index(drop = True)],axis=0))
emb_layers = EmbeddingCreator(emb_vars_dict,df)

In [None]:
# defining fully connected layers
# input layer for continous and binary variables
dense_x = Sequential()
dense_x.add(Dense(250, input_dim=train_x_cont_dummy.shape[1]))
all_layers = emb_layers+[dense_x]
# input layer for embedded variables
model_emb = Sequential()
model_emb.add(Merge(all_layers,mode='concat'))
model_emb.add(Dense(units=180))
model_emb.add(Dropout(0.30))
model_emb.add(BatchNormalization())
model_emb.add(Activation('relu'))
model_emb.add(Dense(units=100))
model_emb.add(Dropout(0.25))
model_emb.add(BatchNormalization())
model_emb.add(Activation('linear'))
model_emb.add(Dense(units=50))
model_emb.add(Dropout(0.15))
model_emb.add(BatchNormalization())
model_emb.add(Activation('relu'))
model_emb.add(Dense(units=50))
model_emb.add(Dropout(0.15))
model_emb.add(BatchNormalization())
model_emb.add(Activation('relu'))
model_emb.add(Dense(units=1))
model_emb.add(Activation('sigmoid'))
model_emb.compile(optimizer='adagrad',loss='binary_crossentropy',metrics=['accuracy'])

In [None]:
# function to combine input features into a list
def comb_emb_cont_dummy(emb_dict,x_emb,x_cont_dummy):
    x_all = []
    for i in range(len(emb_dict)):
        x_all.append(x_emb[:,i])
    x_all.append(x_cont_dummy)
    return x_all

train_x_all = comb_emb_cont_dummy(emb_vars_dict,train_x_emb,train_x_cont_dummy)
test_x_all = comb_emb_cont_dummy(emb_vars_dict,test_x_emb,test_x_cont_dummy)

# defining class weights
class_weight = {0: 1.0,1: 1.0}

# fitting the model
model_emb.fit(train_x_all,train_y,epochs = 25,validation_split = 0.3,batch_size = 64,shuffle = True,class_weight = class_weight)

In [None]:
if cross_train == True:
    for i in range(1,20):
        if i % 2 == 0:
            class_weight = {0: 1.0,1: 2.0}
        else:
            class_weight = {0: 2.0,1: 1.0}
        model_emb.fit(train_x_all,train_y,epochs = 1,validation_split = 0.3,batch_size = 64,shuffle = True,
                      class_weight = class_weight)

In [None]:
# getting the predictions
predicted_y_prob = model_emb.predict_proba(test_x_all) #probability scores
if Production == False:
    test_y = test_df[target_var]
    # concatinating predicted scores and true labels
    predict_df = (pd.concat([pd.DataFrame(predicted_y_prob).reset_index(drop=True),
                             pd.DataFrame(test_y).reset_index(drop=True)], axis=1))
    predict_df.columns = ['Predicted_Probability','True_Class']
    predict_df.True_Class = predict_df.True_Class.astype(int)
else:
    predict_df = (pd.concat([pd.DataFrame(test_ids).reset_index(drop=True),
                             pd.DataFrame(predicted_y_prob).reset_index(drop=True)], axis=1))
    predict_df.columns = ['UCIC_ID','Responders']
    predict_df.to_csv('C:\\Users\\admin\\Downloads\\New folder\\output.csv', sep='\t')

In [None]:
if Production == False:
    # True 0 - Probability distribution
    predict_df_TrueLost = predict_df[predict_df['True_Class'] == 0]
    sns.distplot(predict_df_TrueLost.iloc[:,0])

In [None]:
if Production == False:    
    # True 1 - Probability distribution
    predict_df_TrueWon = predict_df[predict_df['True_Class'] == 1]
    sns.distplot(predict_df_TrueWon.iloc[:,0])

In [None]:
def liftcalculater(predict_df):
    predict_df = predict_df.sort_values('Predicted_Probability',ascending = False)
    predict_df = predict_df.reset_index(drop=True)
    predict_df_top2decile = predict_df.iloc[:int(predict_df.shape[0]/5),:]
    # finding % of responders in predict_df_top2decile
    Total_Responders = sum(predict_df.True_Class)
    FirstDecile_Responders = sum(predict_df_top2decile.True_Class)
    Lift = FirstDecile_Responders/Total_Responders
    return Lift

In [None]:
if Production == False:
    lift = liftcalculater(predict_df)
    print(lift)

In [None]:
print('Code run completed')