In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler, Imputer
from sklearn.preprocessing import PolynomialFeatures
from scipy import stats
from scipy.stats import boxcox
pd.options.mode.chained_assignment = None

In [2]:
#Read in Data
train = pd.read_csv('dataFiles/application_train.csv')
test = pd.read_csv('dataFiles/application_test.csv')
bureau_data = pd.read_csv('dataFiles/bureau.csv')
bureau_balance_data = pd.read_csv('dataFiles/bureau_balance.csv')
prev_app_data = pd.read_csv('dataFiles/previous_application.csv')
pos_cash_balance_data = pd.read_csv('dataFiles/POS_CASH_balance.csv')
installments_data = pd.read_csv('dataFiles/installments_payments.csv')
cc_data = pd.read_csv('dataFiles/credit_card_balance.csv')

## Application Data

In [3]:
train_num = train.select_dtypes(exclude='object')
test_num = test.select_dtypes(exclude='object')

In [4]:
#Log Transformations
train_num[['AMT_INCOME_TOTAL','AMT_CREDIT','AMT_ANNUITY',\
       'AMT_GOODS_PRICE']] = np.log(train_num[['AMT_INCOME_TOTAL','AMT_CREDIT','AMT_ANNUITY','AMT_GOODS_PRICE']])

test_num[['AMT_INCOME_TOTAL','AMT_CREDIT','AMT_ANNUITY',\
       'AMT_GOODS_PRICE']] = np.log(test_num[['AMT_INCOME_TOTAL','AMT_CREDIT','AMT_ANNUITY','AMT_GOODS_PRICE']])

In [5]:
#Clipping Outliers
train_num['CNT_CHILDREN'] = train_num['CNT_CHILDREN'].apply(lambda x: 5 if x >=5 else x)
train_num['CNT_FAM_MEMBERS'] = train_num['CNT_FAM_MEMBERS'].apply(lambda x: 5 if x >=5 else x)
train_num['AMT_INCOME_TOTAL'] = train_num['AMT_INCOME_TOTAL'].apply(lambda x: 14 if x >=14 else x)
train_num['HOUR_APPR_PROCESS_START'] = train_num['HOUR_APPR_PROCESS_START'].apply(lambda x: 2 if x <= 2 else x)
train_num['COMMONAREA_AVG'] = train_num['COMMONAREA_AVG'].fillna(0).apply(lambda x: 0.3 if x >= 0.3 else x)
train_num['AMT_REQ_CREDIT_BUREAU_YEAR'] = train_num['AMT_REQ_CREDIT_BUREAU_YEAR'].apply(lambda x: 9 if x >= 9 else x).fillna(10)

test_num['CNT_CHILDREN'] = test_num['CNT_CHILDREN'].apply(lambda x: 5 if x >=5 else x)
test_num['CNT_FAM_MEMBERS'] = test_num['CNT_FAM_MEMBERS'].apply(lambda x: 5 if x >=5 else x)
test_num['AMT_INCOME_TOTAL'] = test_num['AMT_INCOME_TOTAL'].apply(lambda x: 14 if x >=14 else x)
test_num['HOUR_APPR_PROCESS_START'] = test_num['HOUR_APPR_PROCESS_START'].apply(lambda x: 2 if x <= 2 else x)
test_num['COMMONAREA_AVG'] = test_num['COMMONAREA_AVG'].fillna(0).apply(lambda x: 0.3 if x >= 0.3 else x)
test_num['AMT_REQ_CREDIT_BUREAU_YEAR'] = test_num['AMT_REQ_CREDIT_BUREAU_YEAR'].apply(lambda x: 9 if x >= 9 else x).fillna(10)

In [6]:
#BINNING THE TWO HUMPS FROM DAYS_EMPLOYED
train_num['DAYS_EMPLOYED_BIN_1'] = train_num['DAYS_EMPLOYED'].apply(lambda x: 1 if x < 150000 else 0)
train_num['DAYS_EMPLOYED_BIN_2'] = train_num['DAYS_EMPLOYED'].apply(lambda x: 1 if x >= 150000 else 0)

#BINNING THE TWO HUMPS FROM DAYS_EMPLOYED
test_num['DAYS_EMPLOYED_BIN_1'] = test_num['DAYS_EMPLOYED'].apply(lambda x: 1 if x < 150000 else 0)
test_num['DAYS_EMPLOYED_BIN_2'] = test_num['DAYS_EMPLOYED'].apply(lambda x: 1 if x >= 150000 else 0)

In [7]:
#ABS & BOX COX Transformation
train_num['DAYS_REGISTRATION'] = np.abs(train_num['DAYS_REGISTRATION'])
train_num['DAYS_REGISTRATION'] = train_num['DAYS_REGISTRATION'].apply(lambda x: 0.01 if x == 0 else x)
train_num['DAYS_REGISTRATION'] = boxcox(train_num['DAYS_REGISTRATION'],0.5)

train_num['DAYS_ID_PUBLISH'] = np.abs(train_num['DAYS_ID_PUBLISH'])
train_num['DAYS_ID_PUBLISH'] = train_num['DAYS_ID_PUBLISH'].apply(lambda x: 0.01 if x == 0 else x)
train_num['DAYS_ID_PUBLISH'] = boxcox(train_num['DAYS_ID_PUBLISH'],0.5)

train_num['ENTRANCES_AVG'] = train_num['ENTRANCES_AVG'].apply(lambda x: np.sqrt(x) if pd.notnull(x) else x)

test_num['DAYS_REGISTRATION'] = np.abs(test_num['DAYS_REGISTRATION'])
test_num['DAYS_REGISTRATION'] = test_num['DAYS_REGISTRATION'].apply(lambda x: 0.01 if x == 0 else x)
test_num['DAYS_REGISTRATION'] = boxcox(test_num['DAYS_REGISTRATION'],0.5)

test_num['DAYS_ID_PUBLISH'] = np.abs(test_num['DAYS_ID_PUBLISH'])
test_num['DAYS_ID_PUBLISH'] = test_num['DAYS_ID_PUBLISH'].apply(lambda x: 0.01 if x == 0 else x)
test_num['DAYS_ID_PUBLISH'] = boxcox(test_num['DAYS_ID_PUBLISH'],0.5)

test_num['ENTRANCES_AVG'] = test_num['ENTRANCES_AVG'].apply(lambda x: np.sqrt(x) if pd.notnull(x) else x)

In [8]:
#4 Bins for Own Car Age
def own_car_age_bins(x):
    if pd.isnull(x):
        return 0
    elif x <= 10:
        return 1
    elif x <= 30:
        return 2
    else:
        return 3
    
train['OWN_CAR_AGE_BINS'] = train['OWN_CAR_AGE'].apply(own_car_age_bins)

test['OWN_CAR_AGE_BINS'] = test['OWN_CAR_AGE'].apply(own_car_age_bins)

In [9]:
#Columns to drop because of sparse features
train_num = train_num.drop(['FLAG_MOBIL','FLAG_CONT_MOBILE','FLAG_DOCUMENT_2','FLAG_DOCUMENT_4',\
                            'FLAG_DOCUMENT_7','FLAG_DOCUMENT_10','FLAG_DOCUMENT_12','FLAG_DOCUMENT_15',\
                            'FLAG_DOCUMENT_17','FLAG_DOCUMENT_19','FLAG_DOCUMENT_20','FLAG_DOCUMENT_21',\
                            'OWN_CAR_AGE','DAYS_EMPLOYED','DAYS_EMPLOYED_BIN_1','AMT_REQ_CREDIT_BUREAU_HOUR',\
                            'AMT_REQ_CREDIT_BUREAU_DAY','AMT_REQ_CREDIT_BUREAU_WEEK','AMT_REQ_CREDIT_BUREAU_MON',\
                            'AMT_REQ_CREDIT_BUREAU_QRT'], axis = 1)

test_num = test_num.drop(['FLAG_MOBIL','FLAG_CONT_MOBILE','FLAG_DOCUMENT_2','FLAG_DOCUMENT_4',\
                          'FLAG_DOCUMENT_7','FLAG_DOCUMENT_10','FLAG_DOCUMENT_12','FLAG_DOCUMENT_15',\
                          'FLAG_DOCUMENT_17','FLAG_DOCUMENT_19','FLAG_DOCUMENT_20','FLAG_DOCUMENT_21',\
                          'OWN_CAR_AGE','DAYS_EMPLOYED','DAYS_EMPLOYED_BIN_1','AMT_REQ_CREDIT_BUREAU_HOUR',\
                            'AMT_REQ_CREDIT_BUREAU_DAY','AMT_REQ_CREDIT_BUREAU_WEEK','AMT_REQ_CREDIT_BUREAU_MON',\
                            'AMT_REQ_CREDIT_BUREAU_QRT'], axis = 1)

In [10]:
#FillNA Columns
train_num.iloc[:,25:76] = train_num.iloc[:,25:76].fillna(0).astype(int)
train_num['AMT_REQ_CREDIT_BUREAU_YEAR'] = train_num['AMT_REQ_CREDIT_BUREAU_YEAR'].fillna(11).astype(int)

test_num.iloc[:,24:75] = test_num.iloc[:,24:75].fillna(0).astype(int)
test_num['AMT_REQ_CREDIT_BUREAU_YEAR'] = test_num['AMT_REQ_CREDIT_BUREAU_YEAR'].fillna(11).astype(int)

In [11]:
#Fill Median Columns
train_num.loc[:,'AMT_ANNUITY'] = train_num.loc[:,'AMT_ANNUITY'].fillna(train_num.AMT_ANNUITY.median())
train_num.loc[:,'AMT_GOODS_PRICE'] = train_num.loc[:,'AMT_GOODS_PRICE'].fillna(train_num.AMT_GOODS_PRICE.median())
train_num.loc[:,'CNT_FAM_MEMBERS'] = train_num.loc[:,'CNT_FAM_MEMBERS'].fillna(train_num.CNT_FAM_MEMBERS.median())

test_num.loc[:,'AMT_ANNUITY'] = test_num.loc[:,'AMT_ANNUITY'].fillna(test_num.AMT_ANNUITY.median())
test_num.loc[:,'AMT_GOODS_PRICE'] = test_num.loc[:,'AMT_GOODS_PRICE'].fillna(test_num.AMT_GOODS_PRICE.median())
test_num.loc[:,'CNT_FAM_MEMBERS'] = test_num.loc[:,'CNT_FAM_MEMBERS'].fillna(test_num.CNT_FAM_MEMBERS.median())

In [12]:
train_num['children_ratio'] = train_num['CNT_CHILDREN'] / train_num['CNT_FAM_MEMBERS']
train_num['credit_to_annuity_ratio'] = train_num['AMT_CREDIT'] / train_num['AMT_ANNUITY']
train_num['credit_to_goods_ratio'] = train_num['AMT_CREDIT'] / train_num['AMT_GOODS_PRICE']
train_num['credit_to_income_ratio'] = train_num['AMT_CREDIT'] / train_num['AMT_INCOME_TOTAL']
train_num['income_credit_percentage'] = train_num['AMT_INCOME_TOTAL'] / train_num['AMT_CREDIT']
train_num['income_per_child'] = train_num['AMT_INCOME_TOTAL'] / (1 + train_num['CNT_CHILDREN'])
train_num['income_per_person'] = train_num['AMT_INCOME_TOTAL'] / train_num['CNT_FAM_MEMBERS']
train_num['payment_rate'] = train_num['AMT_ANNUITY'] / train_num['AMT_CREDIT']
train_num['phone_to_birth_ratio'] = train_num['DAYS_LAST_PHONE_CHANGE'] / train_num['DAYS_BIRTH']
train_num['cnt_non_child'] = train_num['CNT_FAM_MEMBERS'] - train_num['CNT_CHILDREN']
train_num['child_to_non_child_ratio'] = train_num['CNT_CHILDREN'] / (1+ train_num['cnt_non_child'])
train_num['income_per_non_child'] = train_num['AMT_INCOME_TOTAL'] / (1 + train_num['cnt_non_child'])
train_num['credit_per_person'] = train_num['AMT_CREDIT'] / train_num['CNT_FAM_MEMBERS']
train_num['credit_per_child'] = train_num['AMT_CREDIT'] / (1 + train_num['CNT_CHILDREN'])
train_num['credit_per_non_child'] = train_num['AMT_CREDIT'] / (1 + train_num['cnt_non_child'])

test_num['children_ratio'] = test_num['CNT_CHILDREN'] / test_num['CNT_FAM_MEMBERS']
test_num['credit_to_annuity_ratio'] = test_num['AMT_CREDIT'] / test_num['AMT_ANNUITY']
test_num['credit_to_goods_ratio'] = test_num['AMT_CREDIT'] / test_num['AMT_GOODS_PRICE']
test_num['credit_to_income_ratio'] = test_num['AMT_CREDIT'] / test_num['AMT_INCOME_TOTAL']
test_num['income_credit_percentage'] = test_num['AMT_INCOME_TOTAL'] / test_num['AMT_CREDIT']
test_num['income_per_child'] = test_num['AMT_INCOME_TOTAL'] / (1 + test_num['CNT_CHILDREN'])
test_num['income_per_person'] = test_num['AMT_INCOME_TOTAL'] / test_num['CNT_FAM_MEMBERS']
test_num['payment_rate'] = test_num['AMT_ANNUITY'] / test_num['AMT_CREDIT']
test_num['phone_to_birth_ratio'] = test_num['DAYS_LAST_PHONE_CHANGE'] / test_num['DAYS_BIRTH']
test_num['cnt_non_child'] = test_num['CNT_FAM_MEMBERS'] - test_num['CNT_CHILDREN']
test_num['child_to_non_child_ratio'] = test_num['CNT_CHILDREN'] / (1 + test_num['cnt_non_child'])
test_num['income_per_non_child'] = test_num['AMT_INCOME_TOTAL'] / (1 + test_num['cnt_non_child'])
test_num['credit_per_person'] = test_num['AMT_CREDIT'] / test_num['CNT_FAM_MEMBERS']
test_num['credit_per_child'] = test_num['AMT_CREDIT'] / (1 + test_num['CNT_CHILDREN'])
test_num['credit_per_non_child'] = test_num['AMT_CREDIT'] / (1 + test_num['cnt_non_child']) 

The minimum supported version is 2.4.6



In [13]:
###### Categorical Variables
from sklearn.preprocessing import LabelEncoder

train_cat = train.select_dtypes(include='object').fillna('Missing')
test_cat = test.select_dtypes(include='object').fillna('Missing')

In [14]:
#Drop columns that are sparse
train_cat = train_cat.drop('ORGANIZATION_TYPE', axis = 1)
test_cat = test_cat.drop('ORGANIZATION_TYPE', axis = 1)

In [15]:
#Label Encoding
train_cat = train_cat.apply(LabelEncoder().fit_transform)

test_cat = test_cat.apply(LabelEncoder().fit_transform)

In [16]:
#Concatenating Train & Test Numerical & Categorical Features
train_clean = pd.concat([train_num, train_cat], axis = 1)

test_clean = pd.concat([test_num, test_cat], axis = 1)

In [17]:
# from imblearn.over_sampling import SMOTE

# sm = SMOTE(random_state=42)
# X_res, y_res = sm.fit_sample(train_num_scaled, train.TARGET)

# from collections import Counter
# print('Resampled dataset shape {}'.format(Counter(y_res)))

## Bureau Balance Data

In [18]:
#### Bureau Balance ####
bureau_balance_data_grouped = pd.get_dummies(bureau_balance_data).groupby('SK_ID_BUREAU', as_index=False).agg({'STATUS_1':'count','MONTHS_BALANCE':min,\
                                                                                 'STATUS_C':sum,'STATUS_0':sum,'STATUS_X':sum})

bureau_balance_data_grouped = bureau_balance_data_grouped.rename(columns={'STATUS_1':'BALANCE_COUNT'})

bureau_balance_data_grouped['STATUS_X_RATIO'] = bureau_balance_data_grouped['STATUS_X'] / bureau_balance_data_grouped['BALANCE_COUNT'].astype(float)
bureau_balance_data_grouped['STATUS_C_RATIO'] = bureau_balance_data_grouped['STATUS_C'] / bureau_balance_data_grouped['BALANCE_COUNT'].astype(float)
bureau_balance_data_grouped['STATUS_0_RATIO'] = bureau_balance_data_grouped['STATUS_0'] / bureau_balance_data_grouped['BALANCE_COUNT'].astype(float)

bureau_data = bureau_data.merge(bureau_balance_data_grouped, how = 'left')

bureau_data[['MONTHS_BALANCE', 'STATUS_X', 'STATUS_C', 'BALANCE_COUNT', 'STATUS_0', 'STATUS_X_RATIO','STATUS_C_RATIO', 'STATUS_0_RATIO']] = bureau_data[['MONTHS_BALANCE', 'STATUS_X', 'STATUS_C', 'BALANCE_COUNT', 'STATUS_0', 'STATUS_X_RATIO','STATUS_C_RATIO', 'STATUS_0_RATIO']].fillna(0)

## Bureau Data

In [19]:
##### Bureau Data #####
bureau_data_grouped = bureau_data.select_dtypes(exclude='object').drop('SK_ID_BUREAU', axis = 1).groupby('SK_ID_CURR').sum()
#bureau_data_grouped.columns = ['_'.join(col) if col != ('SK_ID_CURR', '') else col[0] for col in bureau_data_grouped.columns]
bureau_data_grouped = bureau_data_grouped.reset_index()

#Past Loan Count
loan_count = bureau_data[['SK_ID_CURR','SK_ID_BUREAU']].groupby('SK_ID_CURR', \
                                                   as_index=False)['SK_ID_BUREAU'].count().rename(columns = {'SK_ID_BUREAU':'LOAN_COUNT'})

bureau_data_grouped = bureau_data_grouped.merge(loan_count, how = 'left')

#Unique Loan Types
unique_loan_count = bureau_data[['SK_ID_CURR','CREDIT_TYPE']].groupby('SK_ID_CURR',\
                                                                      as_index=False).agg({'CREDIT_TYPE':'nunique'}).rename(columns={'CREDIT_TYPE':'UNIQUE_CREDIT_TYPES'})

bureau_data_grouped = bureau_data_grouped.merge(unique_loan_count, how = 'left')

#Total Active Loans
bureau_data['CREDIT_ACTIVE_BINARY'] = bureau_data['CREDIT_ACTIVE'].apply(lambda x: 1 if x == 'Active' else 0)

active_loan_count = bureau_data[['SK_ID_CURR','CREDIT_ACTIVE_BINARY']].groupby('SK_ID_CURR', \
                                                   as_index=False)['CREDIT_ACTIVE_BINARY'].sum().rename(columns = {'CREDIT_ACTIVE_BINARY':'ACTIVE_LOANS'})

bureau_data_grouped = bureau_data_grouped.merge(active_loan_count, how = 'left')

In [20]:
#Days Between Successive Past Applications
grp = bureau_data[['SK_ID_CURR', 'SK_ID_BUREAU', 'DAYS_CREDIT']].groupby(by = ['SK_ID_CURR'])
grp1 = grp.apply(lambda x: x.sort_values(['DAYS_CREDIT'], ascending = False)).reset_index(drop = True)

grp1['DAYS_CREDIT1'] = grp1['DAYS_CREDIT']*-1
grp1['DAYS_DIFF'] = grp1.groupby(by = ['SK_ID_CURR'])['DAYS_CREDIT1'].diff()
grp1['DAYS_DIFF'] = grp1['DAYS_DIFF'].fillna(0).astype('uint32')
del grp1['DAYS_CREDIT1'], grp1['DAYS_CREDIT']

past_app_days = grp1.groupby('SK_ID_CURR', as_index=False)['DAYS_DIFF'].mean()

bureau_data_grouped = bureau_data_grouped.merge(past_app_days, how = 'left')

In [21]:
# Days Credit Expires
bureau_data['CREDIT_ENDDATE_BINARY'] = bureau_data['DAYS_CREDIT_ENDDATE'].apply(lambda x: 0 if x < 0 else 1) 

B1 = bureau_data.loc[bureau_data['CREDIT_ENDDATE_BINARY'] == 1]

grp = B1[['SK_ID_CURR', 'SK_ID_BUREAU', 'DAYS_CREDIT_ENDDATE']].groupby(by = ['SK_ID_CURR'])
# Sort the values of CREDIT_ENDDATE for each customer ID 
grp1 = grp.apply(lambda x: x.sort_values(['DAYS_CREDIT_ENDDATE'], ascending = True)).reset_index(drop = True)
del grp

grp1['DAYS_ENDDATE_DIFF'] = grp1.groupby(by = ['SK_ID_CURR'])['DAYS_CREDIT_ENDDATE'].diff()
grp1['DAYS_ENDDATE_DIFF'] = grp1['DAYS_ENDDATE_DIFF'].fillna(0).astype('uint32')
del grp1['DAYS_CREDIT_ENDDATE']

credit_expires_days = grp1.groupby('SK_ID_CURR', as_index = False)['DAYS_ENDDATE_DIFF'].mean()

bureau_data_grouped = bureau_data_grouped.merge(credit_expires_days, how = 'left')

In [22]:
# % Active Loans
bureau_data_grouped['ACTIVE_LOAN_PERC'] = bureau_data_grouped['ACTIVE_LOANS'] / bureau_data_grouped['LOAN_COUNT'].astype(float)

## Credit Card Data

In [23]:
#Loans per Customer
grp = cc_data.groupby(by = ['SK_ID_CURR'])['SK_ID_PREV'].nunique().reset_index().rename(index = str, columns = {'SK_ID_PREV': 'NO_LOANS'})
cc_data = cc_data.merge(grp, on = ['SK_ID_CURR'], how = 'left')

In [24]:
#Number of Installments paid by customer per loan
grp = cc_data.groupby(by = ['SK_ID_CURR', 'SK_ID_PREV'])['CNT_INSTALMENT_MATURE_CUM'].max().reset_index().rename(index = str, columns = {'CNT_INSTALMENT_MATURE_CUM': 'NO_INSTALMENTS'})
grp = grp.groupby(by = ['SK_ID_CURR'])['NO_INSTALMENTS'].sum().reset_index().rename(index = str, columns = {'NO_INSTALMENTS': 'TOTAL_INSTALMENTS'})
cc_data = cc_data.merge(grp, on = ['SK_ID_CURR'], how = 'left')

In [25]:
#Avg Number of Installments paid per Loan
cc_data['INSTALLMENTS_PER_LOAN'] = (cc_data['TOTAL_INSTALMENTS']/cc_data['NO_LOANS']).astype('uint32')

In [26]:
#Avg % Loading of Credit Limit Per Customer
cc_data['AMT_CREDIT_LIMIT_ACTUAL1'] = cc_data['AMT_CREDIT_LIMIT_ACTUAL']

def f(x1, x2):
    
    balance = x1.max() + 1
    limit = x2.max() + 1
    
    return (balance/limit)

# Calculate the ratio of Amount Balance to Credit Limit - CREDIT LOAD OF CUSTOMER 
# This is done for each Credit limit value per loan per Customer 

grp = cc_data.groupby(by = ['SK_ID_CURR', 'SK_ID_PREV', 'AMT_CREDIT_LIMIT_ACTUAL']).apply(lambda x: f(x.AMT_BALANCE, x.AMT_CREDIT_LIMIT_ACTUAL1)).reset_index().rename(index = str, columns = {0: 'CREDIT_LOAD1'})
del cc_data['AMT_CREDIT_LIMIT_ACTUAL1']

# We now calculate the mean Credit load of All Loan transactions of Customer 
grp1 = grp.groupby(by = ['SK_ID_CURR'])['CREDIT_LOAD1'].mean().reset_index().rename(index = str, columns = {'CREDIT_LOAD1': 'CREDIT_LOAD'})

cc_data = cc_data.merge(grp1, on = ['SK_ID_CURR'], how = 'left')

In [27]:
# Function to calculate number of times Days Past Due occurred 

def f(DPD):
    
    # DPD is a series of values of SK_DPD for each of the groupby combination 
    # We convert it to a list to get the number of SK_DPD values NOT EQUALS ZERO
    x = DPD.tolist()
    c = 0
    for i,j in enumerate(x):
        if j != 0:
            c += 1
    
    return c 

grp = cc_data.groupby(by = ['SK_ID_CURR', 'SK_ID_PREV']).apply(lambda x: f(x.SK_DPD)).reset_index().rename(index = str, columns = {0: 'NO_DPD'})
grp1 = grp.groupby(by = ['SK_ID_CURR'])['NO_DPD'].mean().reset_index().rename(index = str, columns = {'NO_DPD' : 'DPD_COUNT'})

cc_data = cc_data.merge(grp1, on = ['SK_ID_CURR'], how = 'left')


In [28]:
#Average of Days Past Due Per Customer
grp = cc_data.groupby(by= ['SK_ID_CURR'])['SK_DPD'].mean().reset_index().rename(index = str, columns = {'SK_DPD': 'AVG_DPD'})
cc_data = cc_data.merge(grp, on = ['SK_ID_CURR'], how = 'left')

In [29]:
# % of Minimum Payments Missed
def f(min_pay, total_pay):
    
    M = min_pay.tolist()
    T = total_pay.tolist()
    P = len(M)
    c = 0 
    # Find the count of transactions when Payment made is less than Minimum Payment 
    for i in range(len(M)):
        if T[i] < M[i]:
            c += 1  
    return (100*c)/P

grp = cc_data.groupby(by = ['SK_ID_CURR']).apply(lambda x: f(x.AMT_INST_MIN_REGULARITY, x.AMT_PAYMENT_CURRENT)).reset_index().rename(index = str, columns = { 0 : 'PERCENTAGE_MISSED_PAYMENTS'})
cc_data = cc_data.merge(grp, on = ['SK_ID_CURR'], how = 'left')

In [30]:
# Ratio of Cash vs Card Swipes

grp = cc_data.groupby(by = ['SK_ID_CURR'])['AMT_DRAWINGS_ATM_CURRENT'].sum().reset_index().rename(index = str, columns = {'AMT_DRAWINGS_ATM_CURRENT' : 'DRAWINGS_ATM'})
cc_data = cc_data.merge(grp, on = ['SK_ID_CURR'], how = 'left')

In [31]:
grp = cc_data.groupby(by = ['SK_ID_CURR'])['AMT_DRAWINGS_CURRENT'].sum().reset_index().rename(index = str, columns = {'AMT_DRAWINGS_CURRENT' : 'DRAWINGS_TOTAL'})
cc_data = cc_data.merge(grp, on = ['SK_ID_CURR'], how = 'left')

cc_data['CASH_CARD_RATIO1'] = (cc_data['DRAWINGS_ATM']/cc_data['DRAWINGS_TOTAL'])*100

grp = cc_data.groupby(by = ['SK_ID_CURR'])['CASH_CARD_RATIO1'].mean().reset_index().rename(index = str, columns ={ 'CASH_CARD_RATIO1' : 'CASH_CARD_RATIO'})
cc_data = cc_data.merge(grp, on = ['SK_ID_CURR'], how = 'left')

In [32]:
#Avg Drawing per Customer

grp = cc_data.groupby(by = ['SK_ID_CURR'])['AMT_DRAWINGS_CURRENT'].sum().reset_index().rename(index = str, columns = {'AMT_DRAWINGS_CURRENT' : 'TOTAL_DRAWINGS'})
cc_data = cc_data.merge(grp, on = ['SK_ID_CURR'], how = 'left')

In [33]:
grp = cc_data.groupby(by = ['SK_ID_CURR'])['CNT_DRAWINGS_CURRENT'].sum().reset_index().rename(index = str, columns = {'CNT_DRAWINGS_CURRENT' : 'NO_DRAWINGS'})
cc_data = cc_data.merge(grp, on = ['SK_ID_CURR'], how = 'left')

cc_data['DRAWINGS_RATIO1'] = (cc_data['TOTAL_DRAWINGS']/cc_data['NO_DRAWINGS'])*100

grp = cc_data.groupby(by = ['SK_ID_CURR'])['DRAWINGS_RATIO1'].mean().reset_index().rename(index = str, columns ={ 'DRAWINGS_RATIO1' : 'DRAWINGS_RATIO'})
cc_data = cc_data.merge(grp, on = ['SK_ID_CURR'], how = 'left')

In [34]:
##### Credit Card Data #####
cc_data_one_hot = pd.concat([cc_data['SK_ID_PREV'], \
                            pd.get_dummies(cc_data.select_dtypes(include=['object']), drop_first = True)],\
                            axis = 1)

cc_data_one_hot = cc_data_one_hot.drop(['NAME_CONTRACT_STATUS_Approved','NAME_CONTRACT_STATUS_Demand','NAME_CONTRACT_STATUS_Demand',\
                      'NAME_CONTRACT_STATUS_Refused','NAME_CONTRACT_STATUS_Sent proposal','NAME_CONTRACT_STATUS_Signed'], axis = 1)

cc_data_one_hot_grouped = cc_data_one_hot.groupby('SK_ID_PREV', as_index=False).sum()

cc_data_numeric_grouped = cc_data.select_dtypes(exclude=['object']).groupby('SK_ID_PREV', as_index = False).agg(['count', sum, 'mean', min, max])

cc_data_numeric_grouped.columns = ['_CC_'.join(col) if col != ('SK_ID_PREV', '') else col[0] for col in cc_data_numeric_grouped.columns]

filtered_cols = filter(lambda x: x[-5:] != 'count' and x[0:10] != 'SK_ID_CURR', cc_data_numeric_grouped.columns.tolist())
filtered_cols.insert(0, 'MONTHS_BALANCE_CC_count')

cc_data_numeric_grouped_filtered = cc_data_numeric_grouped[filtered_cols].reset_index()

cc_data_grouped = cc_data_numeric_grouped_filtered.merge(cc_data_one_hot_grouped, how = 'left')

## Prev App Data

In [35]:
prev_app_data.columns = [x + '_PREV' if x not in ('SK_ID_PREV','SK_ID_CURR') else x for x in prev_app_data.columns]

In [36]:
#Numeric Fields
prev_app_data_num = prev_app_data.select_dtypes(exclude='object')
prev_app_data_num = prev_app_data_num.drop(['RATE_INTEREST_PRIMARY_PREV','RATE_INTEREST_PRIVILEGED_PREV'], axis = 1)

In [37]:
#Categorical Fields
prev_app_data_cat = prev_app_data.select_dtypes(include='object').fillna('Missing')

prev_app_data_cat.NAME_CONTRACT_TYPE_PREV.replace('XNA_PREV', 'Revolving loans_PREV', inplace=True)
prev_app_data_cat.NAME_CLIENT_TYPE_PREV.replace('XNA_PREV', 'Refreshed_PREV', inplace=True)
prev_app_data_cat.NAME_PORTFOLIO_PREV.replace('Cars_PREV', 'Cards_PREV', inplace=True)

prev_app_data_cat = prev_app_data_cat.drop(['FLAG_LAST_APPL_PER_CONTRACT_PREV'], axis = 1)

prev_app_data_cat['NAME_CASH_LOAN_PURPOSE_PREV'] = prev_app_data_cat['NAME_CASH_LOAN_PURPOSE_PREV'].apply(lambda x: 'Other' if x != 'XAP' and x != 'XNA' else x)
prev_app_data_cat['NAME_PAYMENT_TYPE_PREV'] = prev_app_data_cat['NAME_PAYMENT_TYPE_PREV'].apply(lambda x: 'Other' if x != 'Cash through the bank' and x != 'XNA' else x)
prev_app_data_cat['NAME_GOODS_CATEGORY_PREV'] = prev_app_data_cat['NAME_GOODS_CATEGORY_PREV'].apply(lambda x: 'Other' if x not in ['XNA','Mobile',\
                                                                                                                         'Consumer Electronics','Computers','Audio/Video'] else x)
prev_app_data_cat['NAME_SELLER_INDUSTRY_PREV'] = prev_app_data_cat['NAME_SELLER_INDUSTRY_PREV'].apply(lambda x: 'Other' if x not in ['XNA','Connectivity','Consumer Electronics'] else x)

prev_app_data_cat = prev_app_data_cat.apply(LabelEncoder().fit_transform)

prev_app_data_clean = pd.concat([prev_app_data_num, prev_app_data_cat], axis = 1)

In [38]:
#Merging grouped CC data and prev_app_data
prev_app_data_merged = prev_app_data_clean.merge(cc_data_grouped, how = 'left', left_on = 'SK_ID_PREV', right_on = 'SK_ID_PREV')

## Installments

In [39]:
installments_data['DAYS_ENTRY_PAYMENT'] = installments_data['DAYS_ENTRY_PAYMENT'].fillna(installments_data['DAYS_ENTRY_PAYMENT'].median())
installments_data['AMT_PAYMENT'] = installments_data['AMT_PAYMENT'].fillna(installments_data['AMT_PAYMENT'].median())

installments_data_grouped = installments_data.groupby('SK_ID_PREV', as_index = False).agg(['count', sum, 'mean', min, max])
installments_data_grouped.columns = ['_INST_'.join(col) if col != ('SK_ID_PREV', '') else col[0] for col in installments_data_grouped.columns]
installments_data_grouped = installments_data_grouped.reset_index()

filtered_cols = filter(lambda x: x[-5:] != 'count' and x[0:10] != 'SK_ID_CURR', installments_data_grouped.columns.tolist())
filtered_cols.insert(0, 'SK_ID_CURR_INST_count')

installments_data_grouped = installments_data_grouped[filtered_cols]

In [40]:
#Merging grouped installments and previous data
prev_app_data_merged = prev_app_data_merged.merge(installments_data_grouped, how = 'left', left_on = 'SK_ID_PREV', right_on = 'SK_ID_PREV')

## POS CASH Balance

In [41]:
pos_cash_balance_data['NAME_CONTRACT_STATUS'] = pos_cash_balance_data['NAME_CONTRACT_STATUS'].apply(lambda x: 'Other' if x not in ['Active','Completed','Signed'] else x)
pos_cash_balance_data['NAME_CONTRACT_STATUS'] = LabelEncoder().fit_transform(pos_cash_balance_data['NAME_CONTRACT_STATUS'])

pos_cash_balance_data_grouped = pos_cash_balance_data.groupby('SK_ID_PREV', as_index=False).agg(['count', sum, 'mean', min, max])
pos_cash_balance_data_grouped.columns = ['_POS_'.join(col) if col != ('SK_ID_PREV', '') else col[0] for col in pos_cash_balance_data_grouped.columns]
pos_cash_balance_data_grouped = pos_cash_balance_data_grouped.reset_index()

filtered_cols = filter(lambda x: x[-5:] != 'count' and x[0:10] != 'SK_ID_CURR', pos_cash_balance_data_grouped.columns.tolist())
filtered_cols.insert(0, 'SK_ID_CURR_POS_count')

pos_cash_balance_data_grouped = pos_cash_balance_data_grouped[filtered_cols]

In [42]:
#Merging grouped installments and previous data
prev_app_data_merged = prev_app_data_merged.merge(pos_cash_balance_data_grouped, how = 'left', left_on = 'SK_ID_PREV', right_on = 'SK_ID_PREV')

## Merging DataFrames

In [63]:
#Bureau Data
app_train_merged = train_clean.merge(bureau_data_grouped, how = 'left', left_on = 'SK_ID_CURR', right_on = 'SK_ID_CURR')
app_test_merged = test_clean.merge(bureau_data_grouped, how = 'left', left_on = 'SK_ID_CURR', right_on = 'SK_ID_CURR')

In [64]:
#Prev App Data
prev_app_data_grouped = prev_app_data_merged.drop('SK_ID_PREV', axis = 1).groupby('SK_ID_CURR', as_index=False).sum()

app_train_merged = app_train_merged.merge(prev_app_data_grouped, how = 'left', left_on = 'SK_ID_CURR', right_on = 'SK_ID_CURR')
app_test_merged = app_test_merged.merge(prev_app_data_grouped, how = 'left', left_on = 'SK_ID_CURR', right_on = 'SK_ID_CURR')

## Cleaning final DataFrames

In [65]:
app_train_merged = app_train_merged.replace([np.inf, -np.inf], np.nan)
app_test_merged = app_test_merged.replace([np.inf, -np.inf], np.nan)

In [66]:
### filling NA's with 0

app_train_merged = app_train_merged.fillna(0).drop(['SK_ID_CURR','TARGET'], axis = 1)
app_test_merged = app_test_merged.fillna(0).drop(['SK_ID_CURR'], axis = 1)

In [68]:
poly_transformer = PolynomialFeatures(degree = 1)
poly_transformer.fit(app_train_merged)
train_poly_features = poly_transformer.transform(app_train_merged)

train_subset_poly = pd.DataFrame(train_poly_features, columns = poly_transformer.get_feature_names(input_features = app_train_merged.columns.tolist()))

test_poly_features = poly_transformer.transform(app_test_merged)
test_subset_poly = pd.DataFrame(test_poly_features, columns = poly_transformer.get_feature_names(input_features = app_test_merged.columns.tolist()))

In [74]:
scaler = StandardScaler()
scaler.fit(train_poly_features)
app_train_scaled = scaler.transform(train_poly_features)
app_test_scaled = scaler.transform(test_poly_features)

In [75]:
from catboost import CatBoostClassifier
cat_model = CatBoostClassifier(iterations = 10000, random_state = 42, learning_rate = 0.1)

In [76]:
cat_model.fit(app_train_scaled, train.TARGET)

0:	learn: 0.5916784	total: 1.2s	remaining: 3h 20m 24s
1:	learn: 0.5157939	total: 2.46s	remaining: 3h 25m 26s
2:	learn: 0.4584951	total: 3.5s	remaining: 3h 14m 36s
3:	learn: 0.4141408	total: 4.58s	remaining: 3h 10m 33s
4:	learn: 0.3809245	total: 5.57s	remaining: 3h 5m 25s
5:	learn: 0.3555386	total: 6.57s	remaining: 3h 2m 22s
6:	learn: 0.3356461	total: 7.61s	remaining: 3h 57s
7:	learn: 0.3204010	total: 8.75s	remaining: 3h 2m 8s
8:	learn: 0.3085038	total: 9.78s	remaining: 3h 53s
9:	learn: 0.2992691	total: 10.8s	remaining: 2h 59m 6s
10:	learn: 0.2915536	total: 11.8s	remaining: 2h 58m 6s
11:	learn: 0.2856138	total: 12.8s	remaining: 2h 57m 36s
12:	learn: 0.2809172	total: 13.9s	remaining: 2h 57m 27s
13:	learn: 0.2769184	total: 15.3s	remaining: 3h 1m 44s
14:	learn: 0.2739291	total: 17.3s	remaining: 3h 12m 11s
15:	learn: 0.2714840	total: 18.9s	remaining: 3h 16m 28s
16:	learn: 0.2693095	total: 19.9s	remaining: 3h 15m 9s
17:	learn: 0.2675282	total: 21.1s	remaining: 3h 14m 54s
18:	learn: 0.2661009

<catboost.core._CatBoostBase at 0x1afbe78510>

In [77]:
preds = pd.DataFrame(cat_model.predict_proba(app_test_scaled))
submission_cat = pd.concat([test.SK_ID_CURR, preds], axis=1).drop(0, axis = 1)
submission_cat.columns = ['SK_ID_CURR', 'Target']
submission_cat.to_csv('cat_sub3.csv', index=False)

######

In [None]:
train_merged_subset = train_merged.dropna(thresh=len(train_merged) - 200000000, axis = 1)

In [None]:
train_merged_subset.info(verbose=True, null_counts = True)

In [None]:
column_corr_subset = train_merged_subset.columns.tolist()

In [None]:
len(column_corr_subset)

In [None]:
#col_corr = train_merged_subset.corr()['TARGET'].sort_values()

In [None]:
#column_corr_subset = col_corr[(col_corr >= 0.03) | (col_corr < -0.035)].index.values.tolist()

In [None]:
column_corr_subset.remove('TARGET')

In [None]:
#column_corr_subset.remove('CODE_GENDER_XNA')
#column_corr_subset.remove('NAME_FAMILY_STATUS_Unknown')
#column_corr_subset.remove('NAME_INCOME_TYPE_Maternity leave')

In [None]:
train_subset = train_merged_subset[column_corr_subset]

test_subset = test_merged[column_corr_subset]

In [None]:
imputer = Imputer()
imputer.fit(train_subset)
train_merged_imputed = pd.DataFrame(imputer.transform(train_subset), columns = train_subset.columns)
test_merged_imputed = pd.DataFrame(imputer.transform(test_subset), columns = train_subset.columns)

In [None]:
train_merged_imputed['DAYS_EMPLOYED_^2'] = train_merged_imputed['DAYS_EMPLOYED'] ** 2
#train_merged_imputed['AMT_GOODS_PRICE_^2'] = train_merged_imputed['AMT_GOODS_PRICE'] ** 2
train_merged_imputed['DAYS_CREDIT^2'] = train_merged_imputed['DAYS_CREDIT'] ** 2
#train_merged_imputed['DAYS_CREDIT_median^2'] = train_merged_imputed['DAYS_CREDIT_median'] ** 2
train_merged_imputed['DAYS_BIRTH_^2'] = train_merged_imputed['DAYS_BIRTH'] ** 2
train_merged_imputed['REGION_RATING_CLIENT_W_CITY_^2'] = train_merged_imputed['REGION_RATING_CLIENT_W_CITY'] ** 2
train_merged_imputed['REGION_RATING_CLIENT_^2'] = train_merged_imputed['REGION_RATING_CLIENT'] ** 2
train_merged_imputed['NAME_INCOME_TYPE_Working_^2'] = train_merged_imputed['NAME_INCOME_TYPE_Working'] ** 2
train_merged_imputed['DAYS_LAST_PHONE_CHANGE_^2'] = train_merged_imputed['DAYS_LAST_PHONE_CHANGE'] ** 2
train_merged_imputed['EXT_SOURCE_1_^2'] = train_merged_imputed['EXT_SOURCE_1'] ** 2
train_merged_imputed['EXT_SOURCE_2_^2'] = train_merged_imputed['EXT_SOURCE_2'] ** 2
train_merged_imputed['EXT_SOURCE_3_^2'] = train_merged_imputed['EXT_SOURCE_3'] ** 2
train_merged_imputed['NAME_EDUCATION_TYPE_Higher education_^2'] = train_merged_imputed['NAME_EDUCATION_TYPE_Higher education'] ** 2
train_merged_imputed['CODE_GENDER_F_^2']= train_merged_imputed['CODE_GENDER_F'] ** 2

train_merged_imputed['DAYS_EMPLOYED_^3'] = train_merged_imputed['DAYS_EMPLOYED'] ** 3
#train_merged_imputed['AMT_GOODS_PRICE_^3'] = train_merged_imputed['AMT_GOODS_PRICE'] ** 3
train_merged_imputed['DAYS_CREDIT^3'] = train_merged_imputed['DAYS_CREDIT'] ** 3
#train_merged_imputed['DAYS_CREDIT_median^3'] = train_merged_imputed['DAYS_CREDIT_median'] ** 3
train_merged_imputed['DAYS_BIRTH_^3'] = train_merged_imputed['DAYS_BIRTH'] ** 3
train_merged_imputed['REGION_RATING_CLIENT_W_CITY_^3'] = train_merged_imputed['REGION_RATING_CLIENT_W_CITY'] ** 3
train_merged_imputed['REGION_RATING_CLIENT_^3'] = train_merged_imputed['REGION_RATING_CLIENT'] ** 3
train_merged_imputed['NAME_INCOME_TYPE_Working_^3'] = train_merged_imputed['NAME_INCOME_TYPE_Working'] ** 3
train_merged_imputed['DAYS_LAST_PHONE_CHANGE_^3'] = train_merged_imputed['DAYS_LAST_PHONE_CHANGE'] ** 3
train_merged_imputed['EXT_SOURCE_1_^3'] = train_merged_imputed['EXT_SOURCE_1'] ** 3
train_merged_imputed['EXT_SOURCE_2_^3'] = train_merged_imputed['EXT_SOURCE_2'] ** 3
train_merged_imputed['EXT_SOURCE_3_^3'] = train_merged_imputed['EXT_SOURCE_3'] ** 3
train_merged_imputed['NAME_EDUCATION_TYPE_Higher education_^3'] = train_merged_imputed['NAME_EDUCATION_TYPE_Higher education'] ** 3
train_merged_imputed['CODE_GENDER_F_^3']= train_merged_imputed['CODE_GENDER_F'] ** 3

test_merged_imputed['DAYS_EMPLOYED_^2'] = test_merged_imputed['DAYS_EMPLOYED'] ** 2
#test_merged_imputed['AMT_GOODS_PRICE_^2'] = test_merged_imputed['AMT_GOODS_PRICE'] ** 2
test_merged_imputed['DAYS_CREDIT_^2'] = test_merged_imputed['DAYS_CREDIT'] ** 2
#test_merged_imputed['DAYS_CREDIT_median^2'] = test_merged_imputed['DAYS_CREDIT_median'] ** 2
test_merged_imputed['DAYS_BIRTH_^2'] = test_merged_imputed['DAYS_BIRTH'] ** 2
test_merged_imputed['REGION_RATING_CLIENT_W_CITY_^2'] = test_merged_imputed['REGION_RATING_CLIENT_W_CITY'] ** 2
test_merged_imputed['REGION_RATING_CLIENT_^2'] = test_merged_imputed['REGION_RATING_CLIENT'] ** 2
test_merged_imputed['NAME_INCOME_TYPE_Working_^2'] = test_merged_imputed['NAME_INCOME_TYPE_Working'] ** 2
test_merged_imputed['DAYS_LAST_PHONE_CHANGE_^2'] = test_merged_imputed['DAYS_LAST_PHONE_CHANGE'] ** 2
test_merged_imputed['EXT_SOURCE_1_^2'] = test_merged_imputed['EXT_SOURCE_1'] ** 2
test_merged_imputed['EXT_SOURCE_2_^2'] = test_merged_imputed['EXT_SOURCE_2'] ** 2
test_merged_imputed['EXT_SOURCE_3_^2'] = test_merged_imputed['EXT_SOURCE_3'] ** 2
test_merged_imputed['NAME_EDUCATION_TYPE_Higher education_^2'] = test_merged_imputed['NAME_EDUCATION_TYPE_Higher education'] ** 2
test_merged_imputed['CODE_GENDER_F_^2']= test_merged_imputed['CODE_GENDER_F'] ** 2

test_merged_imputed['DAYS_EMPLOYED_^3'] = test_merged_imputed['DAYS_EMPLOYED'] ** 3
#test_merged_imputed['AMT_GOODS_PRICE_^3'] = test_merged_imputed['AMT_GOODS_PRICE'] ** 3
test_merged_imputed['DAYS_CREDIT^3'] = test_merged_imputed['DAYS_CREDIT'] ** 3
#test_merged_imputed['DAYS_CREDIT_median^3'] = test_merged_imputed['DAYS_CREDIT_median'] ** 3
test_merged_imputed['DAYS_BIRTH_^3'] = test_merged_imputed['DAYS_BIRTH'] ** 3
test_merged_imputed['REGION_RATING_CLIENT_W_CITY_^3'] = test_merged_imputed['REGION_RATING_CLIENT_W_CITY'] ** 3
test_merged_imputed['REGION_RATING_CLIENT_^3'] = test_merged_imputed['REGION_RATING_CLIENT'] ** 3
test_merged_imputed['NAME_INCOME_TYPE_Working_^3'] = test_merged_imputed['NAME_INCOME_TYPE_Working'] ** 3
test_merged_imputed['DAYS_LAST_PHONE_CHANGE_^3'] = test_merged_imputed['DAYS_LAST_PHONE_CHANGE'] ** 3
test_merged_imputed['EXT_SOURCE_1_^3'] = test_merged_imputed['EXT_SOURCE_1'] ** 3
test_merged_imputed['EXT_SOURCE_2_^3'] = test_merged_imputed['EXT_SOURCE_2'] ** 3
test_merged_imputed['EXT_SOURCE_3_^3'] = test_merged_imputed['EXT_SOURCE_3'] ** 3
test_merged_imputed['NAME_EDUCATION_TYPE_Higher education_^3'] = test_merged_imputed['NAME_EDUCATION_TYPE_Higher education'] ** 3
test_merged_imputed['CODE_GENDER_F_^3']= test_merged_imputed['CODE_GENDER_F'] ** 3

In [None]:
poly_transformer = PolynomialFeatures(degree = 1)
poly_transformer.fit(train_merged_imputed)
train_poly_features = poly_transformer.transform(train_merged_imputed)

In [None]:
train_subset_poly = pd.DataFrame(train_poly_features, columns = poly_transformer.get_feature_names(
    input_features = train_merged_imputed.columns.tolist()
))

test_poly_features = poly_transformer.transform(test_merged_imputed)
test_subset_poly = pd.DataFrame(test_poly_features, columns = poly_transformer.get_feature_names(input_features = test_merged_imputed.columns.tolist()))

In [None]:
scaler = StandardScaler()

In [None]:
scaler.fit(train_subset_poly)
train_scaled = scaler.transform(train_subset_poly)
test_scaled = scaler.transform(test_subset_poly)

In [None]:
from catboost import CatBoostClassifier
cat_model = CatBoostClassifier(iterations = 1000, random_state = 42, learning_rate = 0.25)

In [None]:
#cat_model.fit(train_scaled, train.TARGET)

In [None]:
from xgboost import XGBClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import RandomizedSearchCV
#xgb_model = XGBClassifier(n_estimators = 500, silent=True, learning_rate = 0.1)

In [None]:
from imblearn.over_sampling import SMOTE

In [None]:
#xgb_model.fit(train_scaled, train.TARGET)

In [None]:
test_y_cat = pd.DataFrame(xgb_model.predict_proba(test_scaled))
submission_cat = pd.concat([test.SK_ID_CURR, test_y_cat], axis=1).drop(0, axis = 1)
submission_cat.columns = ['SK_ID_CURR', 'Target']
submission_cat.to_csv('xgb_model10.csv', index=False)

In [None]:
#submission_cat['Target'] = submission_cat['Target'].apply(lambda x: (x - min_sub_cat) / (max_sub_cat - min_sub_cat))

In [None]:
#cat_submission = pd.read_csv('cat_lr75.csv')
xgb_submission = pd.read_csv('xgb1.csv')

In [None]:
submission_cat['Target'] = (submission_cat['Target']+ xgb_submission['Target']) /2

In [None]:
submission_cat.to_csv('combined11.csv', index = False)

In [None]:
import lightgbm

In [None]:
train_data = lightgbm.Dataset(train_scaled, train.TARGET)

In [None]:
parameters = {
    'application': 'binary',
    'objective': 'binary',
    'metric': 'auc',
    'is_unbalance': 'true',
    'boosting': 'gbdt',
    'num_leaves': 31,
    'feature_fraction': 0.5,
    'bagging_fraction': 0.5,
    'bagging_freq': 20,
    'learning_rate': 0.05,
    'verbose': 0
}

In [None]:
model = lightgbm.train(parameters,
                       train_data,
#                       valid_sets=test_data,
                       num_boost_round=10000,
#                       early_stopping_rounds=100
                      )

In [None]:
preds = pd.DataFrame(model.predict(test_scaled))

In [None]:
submission_lgbm = pd.concat([test.SK_ID_CURR, preds], axis = 1)

In [None]:
submission_lgbm.columns = ['SK_ID_CURR', 'Target']

In [None]:
submission_cat.to_csv('lgbm2.csv', index=False)

In [None]:
submission_cat.head()

In [None]:
pwd

In [None]:
combined_10 = pd.read_csv('combined10.csv')

In [None]:
combined_10['Target'] = (combined_10['Target'] + submission_cat['Target']) /2

In [None]:
combined_10.to_csv('new_combined.csv', index=False)