In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from pandas_profiling import ProfileReport
from datetime import date
from dateutil.relativedelta import relativedelta
from sklearn.preprocessing import OneHotEncoder

In [2]:
data_train = pd.read_csv("train_month_3_with_target.csv")
data_train_1 = pd.read_csv("train_month_1.csv")
data_train_2 = pd.read_csv("train_month_2.csv")
train_1 = pd.merge(data_train, data_train_1, on="client_id", suffixes=("", "_1"))
train=pd.merge(train_1, data_train_2, on="client_id", suffixes=("", "_2"))


In [3]:
data_test = pd.read_csv("test_month_3.csv")
data_test_1 = pd.read_csv("test_month_1.csv")
data_test_2 = pd.read_csv("test_month_2.csv")
test_1 = pd.merge(data_test, data_test_1, on="client_id", suffixes=("", "_1"))
test=pd.merge(test_1, data_test_2, on="client_id", suffixes=("", "_2"))

In [4]:
# Split the data in the training (70%) and testing set (30%)
x_train, x_test, y_train, y_test = train_test_split(train.drop(columns="target"), train['target'], test_size = .3, random_state = 10)

In [5]:
def cleaning_tr (data_x, data_y):
    data_x.drop(columns=["customer_since_all_1", "customer_since_all_2", 
                        "customer_since_bank_1", "customer_since_bank_2", 
                        "customer_gender_1", "customer_gender_2",
                        "customer_birth_date_1", "customer_birth_date_2", 
                        "customer_postal_code_1", "customer_postal_code_2", 
                        "customer_occupation_code_1", "customer_occupation_code_2",
                        "customer_education_1", "customer_education_2", 
                       ], inplace=True)
    for column in ["customer_since_all", "customer_since_bank", "customer_birth_date"]:
        data_x[column]=data_x[column]+"-01"
        data_x[column]=pd.to_datetime(data_x[column], infer_datetime_format=True)
    t=data_x.index
    l=data_x.shape[0]
    for column in ["customer_since_all", "customer_since_bank", "customer_birth_date"]:
        a={'today':date.today()}
        x=pd.DataFrame(a, index=[0])
        y=pd.concat([x]*l, ignore_index=True)
        y.index=t
        z=pd.to_datetime(y["today"], infer_datetime_format=True)
        data_x[column]= (z-data_x[column])/np.timedelta64(1,'M')
    data_x["customer_birth_date"]=data_x["customer_birth_date"]/12
    for column in ["customer_since_all", "customer_since_bank", 
               "customer_occupation_code", "customer_education", 
               "customer_children", "customer_relationship"]:
        data_x[column+"_is_na"]=data_x[column].isna().apply(lambda x: 0 if x==0 else 1)
    data_x["target"]=data_y
    
    data_x["customer_relationship"].fillna(value="unknown", inplace=True)
    data_x["customer_relationship_1"].fillna(value="unknown", inplace=True)
    data_x["customer_relationship_2"].fillna(value="unknown", inplace=True)
    
    f = lambda x: x.median() if np.issubdtype(x.dtype, np.number) else x.mode().iloc[0]
    data_x = data_x.fillna(data_x.groupby('target').transform(f))

    data_x=data_x.drop(columns=["target","client_id"])
    
    region_dictionary = {'no': 0, 'onebaby' : 2, 'preschool':3, 'young':5, 'adolescent':6, 'grownup':7, 'mature':8, 'yes': 4}
    data_x['customer_children'] = data_x['customer_children'].apply(lambda x: region_dictionary[x])
    data_x['customer_children_1'] = data_x['customer_children_1'].apply(lambda x: region_dictionary[x])
    data_x['customer_children_2'] = data_x['customer_children_2'].apply(lambda x: region_dictionary[x])
    
    encoder = OneHotEncoder(handle_unknown='ignore')
    encoder_df = pd.DataFrame(encoder.fit_transform(data_x[['customer_relationship']]).toarray())
    encoder_df.columns=["rel_a", "rel_b", "rel_c"]
    encoder_df.index=data_x.index
    data_x=data_x.join(encoder_df)
    encoder_df = pd.DataFrame(encoder.fit_transform(data_x[['customer_relationship_1']]).toarray())
    encoder_df.columns=["rel1_a", "rel1_b", "rel1_c"]
    encoder_df.index=data_x.index
    data_x=data_x.join(encoder_df)
    encoder_df = pd.DataFrame(encoder.fit_transform(data_x[['customer_relationship_2']]).toarray())
    encoder_df.columns=["rel2_a", "rel2_b", "rel2_c"]
    encoder_df.index=data_x.index
    data_x=data_x.join(encoder_df)

    data_x=data_x.drop(columns=['customer_relationship', 'customer_relationship_1', 'customer_relationship_2'])
    
    return data_x


In [6]:
def cleaning_tes (data_x, data_y):
    data_x.drop(columns=["customer_since_all_1", "customer_since_all_2", 
                        "customer_since_bank_1", "customer_since_bank_2", 
                        "customer_gender_1", "customer_gender_2",
                        "customer_birth_date_1", "customer_birth_date_2", 
                        "customer_postal_code_1", "customer_postal_code_2", 
                        "customer_occupation_code_1", "customer_occupation_code_2",
                        "customer_education_1", "customer_education_2", 
                       ], inplace=True)
    for column in ["customer_since_all", "customer_since_bank", "customer_birth_date"]:
        data_x[column]=data_x[column]+"-01"
        data_x[column]=pd.to_datetime(data_x[column], infer_datetime_format=True)
    t=data_x.index
    l=data_x.shape[0]
    for column in ["customer_since_all", "customer_since_bank", "customer_birth_date"]:
        a={'today':date.today()}
        x=pd.DataFrame(a, index=[0])
        y=pd.concat([x]*l, ignore_index=True)
        y.index=t
        z=pd.to_datetime(y["today"], infer_datetime_format=True)
        data_x[column]= (z-data_x[column])/np.timedelta64(1,'M')
    data_x["customer_birth_date"]=data_x["customer_birth_date"]/12
    for column in ["customer_since_all", "customer_since_bank", 
               "customer_occupation_code", "customer_education", 
               "customer_children", "customer_relationship"]:
        data_x[column+"_is_na"]=data_x[column].isna().apply(lambda x: 0 if x==0 else 1)
        
    x_train["target"]=y_train
    
    data_x["customer_relationship"].fillna(value="unknown", inplace=True)
    data_x["customer_relationship_1"].fillna(value="unknown", inplace=True)
    data_x["customer_relationship_2"].fillna(value="unknown", inplace=True)
    
    f = lambda x: x.median() if np.issubdtype(x.dtype, np.number) else x.mode().iloc[0]
    data_x = data_x.fillna(x_train.groupby('target').transform(f))

    data_x=data_x.drop(columns=["client_id"])
    
    region_dictionary = {'no': 0, 'onebaby' : 2, 'preschool':3, 'young':5, 'adolescent':6, 'grownup':7, 'mature':8, 'yes': 4}
    data_x['customer_children'] = data_x['customer_children'].apply(lambda x: region_dictionary[x])
    data_x['customer_children_1'] = data_x['customer_children_1'].apply(lambda x: region_dictionary[x])
    data_x['customer_children_2'] = data_x['customer_children_2'].apply(lambda x: region_dictionary[x])
    
    encoder = OneHotEncoder(handle_unknown='ignore')
    encoder_df = pd.DataFrame(encoder.fit_transform(data_x[['customer_relationship']]).toarray())
    encoder_df.columns=["rel_a", "rel_b", "rel_c"]
    encoder_df.index=data_x.index
    data_x=data_x.join(encoder_df)
    encoder_df = pd.DataFrame(encoder.fit_transform(data_x[['customer_relationship_1']]).toarray())
    encoder_df.columns=["rel1_a", "rel1_b", "rel1_c"]
    encoder_df.index=data_x.index
    data_x=data_x.join(encoder_df)
    encoder_df = pd.DataFrame(encoder.fit_transform(data_x[['customer_relationship_2']]).toarray())
    encoder_df.columns=["rel2_a", "rel2_b", "rel2_c"]
    encoder_df.index=data_x.index
    data_x=data_x.join(encoder_df)

    data_x=data_x.drop(columns=['customer_relationship', 'customer_relationship_1', 'customer_relationship_2'])
    
    return data_x

In [7]:
x_tr=cleaning_tr(x_train, y_train)

In [8]:
x_te=cleaning_tes(x_test, y_test)

KeyError: nan

In [None]:
# easy ensemble for imbalanced classification
from numpy import mean
from sklearn.datasets import make_classification
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from imblearn.ensemble import EasyEnsembleClassifier
# define model
model = EasyEnsembleClassifier(n_estimators=100)
model.fit(x_tr, y_train)
y_pred = model.predict(x_te)
print(confusion_matrix(y_test, y_pred))

In [None]:
# random forest for imbalanced classification
from numpy import mean
from sklearn.datasets import make_classification
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.ensemble import RandomForestClassifier
# define model
model = RandomForestClassifier(n_estimators=100)
model.fit(x_tr, y_train)
y_pred = model.predict(x_te)
print(confusion_matrix(y_test, y_pred))

In [None]:
sum(np.sum(train.isna(), axis=0)+np.sum(test.isna(), axis=0)>0)
#there is no problem of appearning NAs in other sets

In [None]:
# Find highly correlated variables. We could exclude those variables once we start building the model or we can use
#LASSO or elastic net so that these ones are removed, we will see
threshold = 0.3

def high_cor_function(df):
    cor = df.corr()
    corrm = np.corrcoef(df.transpose())
    corr = corrm - np.diagflat(corrm.diagonal())
    print("max corr:",corr.max(), ", min corr: ", corr.min())
    c1 = cor.stack().sort_values(ascending=False).drop_duplicates()
    high_cor = c1[c1.values!=1]    
    thresh = threshold 
    display(high_cor[high_cor<thresh])
  
high_cor_function(x_train.select_dtypes(exclude=['object']))

In [None]:
profile = ProfileReport(data_train, title="Profiling Report")

In [None]:
profile.to_file("Profiling.html")

In [None]:
# homebanking_active and has_homebanking are closely related 
# has 5 types of insurances 
# has 2 types of  personal loans 
# has 5 types of accounts (current, savings, pension, 2 starter ones)
# balances of 5 types of insurances 
# outstanding balances of 2 types of  personal loans 
# balances on 5 types of accounts (current, savings, pension, 2 starter ones)
# number of branches /and areas visited in the past month
# 2 types customer since (2 NAs)
# gender/ birthday / occupation (coded) (NAs)/ self employed
# education level (NAs)/ children (NAs) / relationship (NAs)


In [None]:
# Note that in the training set we are only given the datapoints with the stable account for (-3,+3) 
# but the test set contains all types of data

In [None]:
print(x_train.columns)

In [None]:
from itertools import combinations

[(i, j) for i,j in combinations(x_train, 2) if x_train[i].equals(x_train[j])]

In [None]:
print(x_train.shape)
print(x_train.columns)

In [None]:
for column in ["customer_since_all", "customer_since_bank", "customer_birth_date"]:
    x_train[column]=x_train[column]+"-01"
    x_train[column]=pd.to_datetime(x_train[column], infer_datetime_format=True)

In [None]:
from datetime import date
from dateutil.relativedelta import relativedelta
t=x_train.index
l=x_train.shape[0]
for column in ["customer_since_all", "customer_since_bank", "customer_birth_date"]:
    a={'today':date.today()}
    x=pd.DataFrame(a, index=[0])
    y=pd.concat([x]*l, ignore_index=True)
    y.index=t
    z=pd.to_datetime(y["today"], infer_datetime_format=True)
    x_train[column]= (z-x_train[column])/np.timedelta64(1,'M')
x_train["customer_birth_date"]=x_train["customer_birth_date"]/12

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline 
plt.scatter(x_train["customer_birth_date"], y_train)

In [None]:
#for relationship lets take a third category (so these will be nominal)
#for children take the mode in the class
#for education take the mean
#for occupation code take the mode in the class
# for both customers take the mean in the class

In [None]:
for column in ["customer_since_all", "customer_since_bank", 
               "customer_occupation_code", "customer_education", 
               "customer_children", "customer_relationship"]:
    x_train[column+"_is_na"]=x_train[column].isna().apply(lambda x: 0 if x==0 else 1)
x_train.shape

In [None]:
x_train["target"]=y_train
f = lambda x: x.median() if np.issubdtype(x.dtype, np.number) else x.mode().iloc[0]
x_train = x_train.fillna(x_train.groupby('target').transform(f))

x_train["customer_relationship"].fillna(value="unknown", inplace=True)
x_train["customer_relationship_1"].fillna(value="unknown", inplace=True)
x_train["customer_relationship_2"].fillna(value="unknown", inplace=True)

x_train=x_train.drop(columns="target")