In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from pandas_profiling import ProfileReport
from datetime import date
from dateutil.relativedelta import relativedelta
from sklearn.preprocessing import OneHotEncoder

In [2]:
# Load the three monthly training snapshots (month 3 is the current month and,
# going by its file name, the one that carries the target).
data_train = pd.read_csv("train_month_3_with_target.csv")
data_train_1 = pd.read_csv("train_month_1.csv")
data_train_2 = pd.read_csv("train_month_2.csv")

# Join months 1 and 2 onto month 3 by client. Month-3 columns keep their names;
# clashing columns from the earlier months get the "_1" / "_2" suffix.
# The default inner join keeps only clients present in all three files.
train_1 = data_train.merge(data_train_1, on="client_id", suffixes=("", "_1"))
train = train_1.merge(data_train_2, on="client_id", suffixes=("", "_2"))


In [3]:
# Load the three monthly test snapshots (month 3 is the current month).
data_test = pd.read_csv("test_month_3.csv")
data_test_1 = pd.read_csv("test_month_1.csv")
data_test_2 = pd.read_csv("test_month_2.csv")

# Join months 1 and 2 onto month 3 by client. Month-3 columns keep their names;
# clashing columns from the earlier months get the "_1" / "_2" suffix.
# The default inner join keeps only clients present in all three files.
test_1 = data_test.merge(data_test_1, on="client_id", suffixes=("", "_1"))
test = test_1.merge(data_test_2, on="client_id", suffixes=("", "_2"))

In [4]:
# homebanking_active and has_homebanking are closely related
# has 5 types of insurance
# has 2 types of personal loans
# has 5 types of accounts (current, savings, pension, 2 starter ones)
# balances of the 5 types of insurance
# outstanding balances of the 2 types of personal loans
# balances on the 5 types of accounts (current, savings, pension, 2 starter ones)
# number of branches and areas visited in the past month
# 2 types of "customer since" dates (2 NAs)
# gender / birthday / occupation (coded) (NAs) / self-employed
# education level (NAs) / children (NAs) / relationship (NAs)


In [5]:
# for relationship, let's add a third category for NAs (so these will be nominal)
# for children, take the mode within the class
# for education, take the mean
# for occupation code, take the mode within the class
# for both "customer since" columns, take the mean within the class

In [6]:
# This will help to identify columns with the same values
###
#from itertools import combinations
#
#[(i, j) for i,j in combinations(x_train, 2) if x_train[i].equals(x_train[j])]
###

In [7]:
# Hold out 30% of the merged training data as a validation set (70% train).
# random_state is fixed so the split is the same on every run.
# NOTE(review): the split is not stratified on the target - consider
# stratify=train["target"] if the classes are imbalanced.
x_train, x_valid, y_train, y_valid = train_test_split(
    train.drop(columns="target"),
    train["target"],
    test_size=0.3,
    random_state=10,
)

In [8]:
def cleaning_tr(data_x, data_y, reference_date=None):
    """Clean the merged training features and attach the target.

    Steps, in order:
      1. Drop the month-1 / month-2 copies of columns that hold the same
         values as their month-3 counterpart.
      2. Parse the three date columns and replace them by the time elapsed up
         to ``reference_date``: months for ``customer_since_all`` and
         ``customer_since_bank``, years for ``customer_birth_date``.
      3. Add a 0/1 ``<column>_is_na`` indicator for each column listed below.
      4. Add ``data_y`` as the ``target`` column (aligned on the index).

    The input frame is NOT modified; a new frame is returned, so the calling
    cell can safely be re-run.

    Parameters
    ----------
    data_x : pandas.DataFrame
        Merged feature frame. The date columns are assumed to hold
        "YYYY-MM"-like strings, since "-01" is appended before parsing.
    data_y : pandas.Series
        Target values sharing ``data_x``'s index.
    reference_date : date-like, optional
        Date the elapsed times are measured against. Defaults to today's
        date; pass a fixed value to make the features reproducible.

    Returns
    -------
    pandas.DataFrame
        The cleaned features followed by the indicator columns and ``target``.
    """
    duplicated_columns = [
        "customer_since_all_1", "customer_since_all_2",
        "customer_since_bank_1", "customer_since_bank_2",
        "customer_gender_1", "customer_gender_2",
        "customer_birth_date_1", "customer_birth_date_2",
        "customer_postal_code_1", "customer_postal_code_2",
        "customer_occupation_code_1", "customer_occupation_code_2",
        "customer_education_1", "customer_education_2",
    ]
    date_columns = ["customer_since_all", "customer_since_bank", "customer_birth_date"]
    indicator_columns = [
        "customer_since_all", "customer_since_bank",
        "customer_occupation_code", "customer_education",
        "customer_children", "customer_relationship",
        "customer_children_1", "customer_relationship_1",
        "customer_children_2", "customer_relationship_2",
    ]

    # DROPPING COPIES (drop() returns a new frame, leaving the input untouched)
    cleaned = data_x.drop(columns=duplicated_columns)

    # DATE FORMATTING
    today = pd.Timestamp(date.today() if reference_date is None else reference_date)
    # Month/year timedelta units are ambiguous and rejected by recent pandas
    # versions, so an explicit average Gregorian month (365.2425 / 12 days) is
    # used as the divisor instead of np.timedelta64(1, 'M').
    one_month = pd.Timedelta(days=30.436875)
    for column in date_columns:
        first_of_month = pd.to_datetime(cleaned[column] + "-01")
        cleaned[column] = (today - first_of_month) / one_month
    # Age is expressed in years rather than months.
    cleaned["customer_birth_date"] = cleaned["customer_birth_date"] / 12

    # MAKING IS NA COLUMN (missing dates are NaT -> NaN after the division)
    for column in indicator_columns:
        cleaned[column + "_is_na"] = cleaned[column].isna().astype("int64")

    cleaned["target"] = data_y

    return cleaned


In [9]:
# Cleaned training features, with the NA indicators and the target attached.
x_train_clean = cleaning_tr(x_train, y_train)


In [12]:
def nahandle_tr(data_x):
    """Impute missing values in the cleaned training frame.

    * Missing ``customer_relationship*`` values become their own level,
      ``"unknown"``.
    * Every other column that still has missing values is filled per target
      class: with the class mean for numeric columns, with the class mode
      (first one, if several) for all other columns. If a class has no
      observed value at all for a column, those entries stay missing.

    Finally ``target`` and ``client_id`` are dropped.

    NOTE(review): the fill values depend on the target, so this can only be
    applied to labelled training data.

    Parameters
    ----------
    data_x : pandas.DataFrame
        Output of ``cleaning_tr``; must contain ``target``, ``client_id`` and
        the three ``customer_relationship*`` columns. It is not modified,
        because the cleaned frame is needed again later.

    Returns
    -------
    pandas.DataFrame
        A new, imputed frame without ``target`` and ``client_id``.
    """
    new = data_x.copy()

    # customer_relationship NA is given its own level. Assigning the result
    # back (instead of a chained inplace fillna) also works under pandas
    # copy-on-write.
    for column in ("customer_relationship", "customer_relationship_1",
                   "customer_relationship_2"):
        new[column] = new[column].fillna("unknown")

    def _first_mode(values):
        # mode() is empty when every value in the group is missing.
        modes = values.mode()
        return modes.iloc[0] if len(modes) else np.nan

    # FILLING IN NA COLUMNS with per-class statistics
    target = new["target"]
    for column in new.columns:
        if column == "target" or not new[column].isna().any():
            continue
        by_class = new[column].groupby(target)
        if pd.api.types.is_numeric_dtype(new[column].dtype):
            fill = by_class.transform("mean")
        else:
            fill = by_class.transform(_first_mode)
        new[column] = new[column].fillna(fill)

    # FINAL TOUCH
    return new.drop(columns=["target", "client_id"])
    

In [13]:
# Imputed training features (target and client_id removed).
x_train_handled = nahandle_tr(x_train_clean)

In [14]:
def encoding(data_x):
    """One-hot encode the relationship columns and code the children columns.

    For each of ``customer_relationship``, ``customer_relationship_1`` and
    ``customer_relationship_2`` a one-hot encoder is fitted on the given frame
    and three indicator columns are added (``rel_a/b/c``, ``rel1_a/b/c``,
    ``rel2_a/b/c``), in the encoder's category order. The original relationship
    columns are dropped. The three ``customer_children*`` columns are replaced
    by integer codes.

    Because the encoder is fitted on every call, frames encoded separately
    only get matching indicator columns if they contain the same three
    relationship categories.

    Parameters
    ----------
    data_x : pandas.DataFrame
        Imputed frame; it is not modified.

    Returns
    -------
    pandas.DataFrame
        New frame with the encoded columns.

    Raises
    ------
    ValueError
        If a relationship column does not contain exactly three categories.
    KeyError
        If a children column holds a value that has no code (including NaN).
    """
    relationship_prefixes = {
        "customer_relationship": "rel",
        "customer_relationship_1": "rel1",
        "customer_relationship_2": "rel2",
    }
    # customer_children levels are given numbers indicating how much care is
    # needed for the kids (per the original author's ordering).
    children_codes = {'no': 0, 'onebaby': 1, 'preschool': 2, 'young': 4,
                      'adolescent': 5, 'grownup': 6, 'mature': 7, 'yes': 3}

    encoded = data_x
    for column, prefix in relationship_prefixes.items():
        encoder = OneHotEncoder(handle_unknown='ignore')
        indicators = encoder.fit_transform(encoded[[column]]).toarray()
        names = [prefix + "_" + letter for letter in "abc"]
        if indicators.shape[1] != len(names):
            # Fail with an explicit message instead of a bare length mismatch.
            raise ValueError(
                "Expected exactly {} categories in '{}', found {}".format(
                    len(names), column, indicators.shape[1]))
        # join() returns a new frame, so the caller's frame is never touched.
        encoded = encoded.join(
            pd.DataFrame(indicators, columns=names, index=encoded.index))

    for column in ("customer_children", "customer_children_1", "customer_children_2"):
        codes = encoded[column].map(children_codes)
        unmapped = codes.isna()
        if unmapped.any():
            raise KeyError(
                "Unexpected value(s) in '{}': {}".format(
                    column, list(encoded.loc[unmapped, column].unique())))
        encoded[column] = codes.astype("int64")

    # FINAL TOUCH
    return encoded.drop(columns=list(relationship_prefixes))


In [15]:
# From here on x_train is the fully numeric, encoded training matrix
# (the name is rebound; the raw split is no longer available under it).
x_train = encoding(x_train_handled)

In [16]:
# Table of the values used to impute each column in the training set:
# row 0 holds the fill value used for target == 0, row 1 the one for target == 1.
val = pd.DataFrame(columns=["customer_children", "customer_children_1", "customer_children_2",
                            "customer_relationship", "customer_relationship_1", "customer_relationship_2",
                            "customer_since_all", "customer_since_bank",
                            "customer_occupation_code", "customer_education"], index=[0, 1])

for col in val.columns:
    was_missing = x_train_clean[col].isna()
    for target_class in (0, 1):
        # Rows of this class whose value was missing before imputation; the
        # imputed value of the first such row is the class's fill value.
        rows = x_train_clean.index[was_missing & (x_train_clean['target'] == target_class)]
        if len(rows) == 0:
            # Nothing was imputed for this class/column: leave the entry empty
            # instead of failing on an empty index.
            continue
        # .loc assignment (not chained val[col][i] = ...) so the write reaches
        # `val` itself, also under pandas copy-on-write.
        val.loc[target_class, col] = x_train_handled.loc[rows[0], col]



## At this point x_train is fully numeric and no longer contains the target, and `val` holds the values we need for imputation. We now proceed to the cleaning / NA handling of the validation and test sets.


In [17]:
def cleaning_val(data, reference_date=None):
    """Clean merged validation/test features (no target involved).

    Steps, in order:
      1. Drop the month-1 / month-2 copies of columns that hold the same
         values as their month-3 counterpart.
      2. Parse the three date columns and replace them by the time elapsed up
         to ``reference_date``: months for ``customer_since_all`` and
         ``customer_since_bank``, years for ``customer_birth_date``.
      3. Add a 0/1 ``<column>_is_na`` indicator for each column listed below.

    Parameters
    ----------
    data : pandas.DataFrame
        Merged feature frame; it is not modified. The date columns are assumed
        to hold "YYYY-MM"-like strings, since "-01" is appended before parsing.
    reference_date : date-like, optional
        Date the elapsed times are measured against. Defaults to today's
        date; pass a fixed value to make the features reproducible.

    Returns
    -------
    pandas.DataFrame
        A new frame with the cleaned features followed by the indicators.
    """
    duplicated_columns = [
        "customer_since_all_1", "customer_since_all_2",
        "customer_since_bank_1", "customer_since_bank_2",
        "customer_gender_1", "customer_gender_2",
        "customer_birth_date_1", "customer_birth_date_2",
        "customer_postal_code_1", "customer_postal_code_2",
        "customer_occupation_code_1", "customer_occupation_code_2",
        "customer_education_1", "customer_education_2",
    ]
    date_columns = ["customer_since_all", "customer_since_bank", "customer_birth_date"]
    indicator_columns = [
        "customer_since_all", "customer_since_bank",
        "customer_occupation_code", "customer_education",
        "customer_children", "customer_relationship",
        "customer_children_1", "customer_relationship_1",
        "customer_children_2", "customer_relationship_2",
    ]

    # DROPPING COPIES (drop() returns a new frame, so no explicit copy is needed)
    data_x = data.drop(columns=duplicated_columns)

    # DATE FORMATTING
    today = pd.Timestamp(date.today() if reference_date is None else reference_date)
    # Month/year timedelta units are ambiguous and rejected by recent pandas
    # versions, so an explicit average Gregorian month (365.2425 / 12 days) is
    # used as the divisor instead of np.timedelta64(1, 'M').
    one_month = pd.Timedelta(days=30.436875)
    for column in date_columns:
        first_of_month = pd.to_datetime(data_x[column] + "-01")
        data_x[column] = (today - first_of_month) / one_month
    # Age is expressed in years rather than months.
    data_x["customer_birth_date"] = data_x["customer_birth_date"] / 12

    # MAKING IS NA COLUMN (missing dates are NaT -> NaN after the division)
    for column in indicator_columns:
        data_x[column + "_is_na"] = data_x[column].isna().astype("int64")

    return data_x



In [18]:
def nahandle_val(data, fill_values=None):
    """Impute a cleaned validation/test frame with values learned on training.

    ``client_id`` is dropped, then every column of ``fill_values`` is filled:
    numeric columns with the mean of the stored entries, all other columns
    with their mode (first one, if several). Since the table holds one entry
    per target class, numeric columns therefore receive the unweighted
    average of the two class-specific training fill values.

    Parameters
    ----------
    data : pandas.DataFrame
        Output of ``cleaning_val``; it is not modified.
    fill_values : pandas.DataFrame, optional
        Table with one column per feature to impute. Defaults to the
        notebook-level ``val`` table built from the training set.

    Returns
    -------
    pandas.DataFrame
        A new, imputed frame without ``client_id``.
    """
    if fill_values is None:
        # Fall back to the table computed earlier in the notebook.
        fill_values = val

    data_x = data.drop(columns=["client_id"])

    # FILLING IN NA COLUMNS
    for col in fill_values.columns:
        if pd.api.types.is_numeric_dtype(data_x[col].dtype):
            # The table is built with object dtype, hence the conversion.
            replacement = pd.to_numeric(fill_values[col]).mean()
        else:
            modes = fill_values[col].mode()
            if len(modes) == 0:
                # No stored value for this column: nothing to fill with.
                continue
            replacement = modes.iloc[0]
        # Assign back instead of a chained inplace fillna, which does not
        # update the frame under pandas copy-on-write.
        data_x[col] = data_x[col].fillna(replacement)

    return data_x

In [19]:
def encoding(data):
    """One-hot encode the relationship columns and code the children columns.

    This definition replaces any ``encoding`` defined earlier in the kernel.

    For each of ``customer_relationship``, ``customer_relationship_1`` and
    ``customer_relationship_2`` a one-hot encoder is fitted on the given frame
    and three indicator columns are added (``rel_a/b/c``, ``rel1_a/b/c``,
    ``rel2_a/b/c``), in the encoder's category order. The original relationship
    columns are dropped. The three ``customer_children*`` columns are replaced
    by integer codes.

    Because the encoder is fitted on every call, frames encoded separately
    (e.g. training and validation) only get matching indicator columns if they
    contain the same three relationship categories.

    Parameters
    ----------
    data : pandas.DataFrame
        Imputed frame; it is not modified.

    Returns
    -------
    pandas.DataFrame
        New frame with the encoded columns.

    Raises
    ------
    ValueError
        If a relationship column does not contain exactly three categories.
    KeyError
        If a children column holds a value that has no code (including NaN).
    """
    relationship_prefixes = {
        "customer_relationship": "rel",
        "customer_relationship_1": "rel1",
        "customer_relationship_2": "rel2",
    }
    # customer_children levels are given numbers indicating how much care is
    # needed for the kids (per the original author's ordering).
    children_codes = {'no': 0, 'onebaby': 1, 'preschool': 2, 'young': 4,
                      'adolescent': 5, 'grownup': 6, 'mature': 7, 'yes': 3}

    # No explicit copy is needed: join() below always returns a new frame.
    data_x = data
    for column, prefix in relationship_prefixes.items():
        encoder = OneHotEncoder(handle_unknown='ignore')
        indicators = encoder.fit_transform(data_x[[column]]).toarray()
        names = [prefix + "_" + letter for letter in "abc"]
        if indicators.shape[1] != len(names):
            # Fail with an explicit message instead of a bare length mismatch.
            raise ValueError(
                "Expected exactly {} categories in '{}', found {}".format(
                    len(names), column, indicators.shape[1]))
        data_x = data_x.join(
            pd.DataFrame(indicators, columns=names, index=data_x.index))

    for column in ("customer_children", "customer_children_1", "customer_children_2"):
        codes = data_x[column].map(children_codes)
        unmapped = codes.isna()
        if unmapped.any():
            raise KeyError(
                "Unexpected value(s) in '{}': {}".format(
                    column, list(data_x.loc[unmapped, column].unique())))
        data_x[column] = codes.astype("int64")

    # FINAL TOUCH
    return data_x.drop(columns=list(relationship_prefixes))



In [20]:
# Cleaned validation features (dates converted, NA indicators added).
clean = cleaning_val(x_valid)

In [21]:
# Validation features imputed with the values stored in `val`.
handled = nahandle_val(clean)

In [22]:
# From here on x_valid is the fully numeric, encoded validation matrix
# (the name is rebound; the raw split is no longer available under it).
x_valid = encoding(handled)

## Now that we have the data for validation/test, it is time to run some tree-based models

In [29]:
from numpy import mean
from sklearn.datasets import make_classification
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.metrics import confusion_matrix

In [30]:
# bagged decision trees with random undersampling for imbalanced classification
from imblearn.ensemble import BalancedBaggingClassifier
# define model; random_state is fixed so the resampling and the fitted
# ensemble - and therefore the confusion matrix - are reproducible
model = BalancedBaggingClassifier(n_estimators=50, random_state=10)
model.fit(x_train, y_train)
y_pred = model.predict(x_valid)
# rows are the true classes, columns the predicted classes
print(confusion_matrix(y_valid, y_pred))


[[15917  2603]
 [  360   230]]


In [31]:
# random forest for imbalanced classification
from sklearn.ensemble import RandomForestClassifier
# define model; class_weight='balanced' weights the classes inversely to their
# frequency in y_train, random_state makes the forest reproducible
model = RandomForestClassifier(n_estimators=50, class_weight='balanced', random_state=10)
model.fit(x_train, y_train)
y_pred = model.predict(x_valid)
# rows are the true classes, columns the predicted classes
print(confusion_matrix(y_valid, y_pred))

[[18520     0]
 [  590     0]]


In [32]:
# bootstrap class balanced random forest for imbalanced classification
from sklearn.ensemble import RandomForestClassifier
# define model; 'balanced_subsample' recomputes the class weights on the
# bootstrap sample of every tree. (With class_weight='balanced' this cell was
# an exact repeat of the plain class-weighted forest.)
# random_state makes the forest reproducible.
model = RandomForestClassifier(n_estimators=50, class_weight='balanced_subsample', random_state=10)
model.fit(x_train, y_train)
y_pred = model.predict(x_valid)
# rows are the true classes, columns the predicted classes
print(confusion_matrix(y_valid, y_pred))

[[18520     0]
 [  590     0]]


In [33]:
# random forest with random undersampling for imbalanced classification
from imblearn.ensemble import BalancedRandomForestClassifier
# define model; random_state is fixed so the undersampling and the fitted
# forest - and therefore the confusion matrix - are reproducible
model = BalancedRandomForestClassifier(n_estimators=50, random_state=10)
model.fit(x_train, y_train)
y_pred = model.predict(x_valid)
# rows are the true classes, columns the predicted classes
print(confusion_matrix(y_valid, y_pred))

[[14601  3919]
 [  288   302]]


In [34]:
# easy ensemble for imbalanced classification
from imblearn.ensemble import EasyEnsembleClassifier
# define model; random_state is fixed so the undersampled subsets and the
# fitted ensemble - and therefore the confusion matrix - are reproducible
model = EasyEnsembleClassifier(n_estimators=20, random_state=10)
model.fit(x_train, y_train)
y_pred = model.predict(x_valid)
# rows are the true classes, columns the predicted classes
print(confusion_matrix(y_valid, y_pred))

[[14444  4076]
 [  292   298]]
