In [7]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sklearn
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LinearRegression, LogisticRegression
from catboost import CatBoostClassifier
from sklearn import preprocessing
import sklearn.model_selection as ms
from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.metrics import log_loss, roc_auc_score
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

In [17]:
def fill_missing_values(data):
    
    '''
    Function to input missing values based on the column object type
    '''
    
    cols = list(data.columns)
    for col in cols:
        if data[col].dtype == 'int64' or data[col].dtype == 'float64':
        
            data[col] = data[col].fillna(data[col].mean())
            
        else:
            data[col] = data[col].fillna(data[col].mode()[0])
            
    return data
 
def one_hot_encoding(traindata, *args):
    
    for ii in args:
        traindata = pd.get_dummies(traindata, prefix=[ii], columns=[ii])
        
    return traindata
 
def drop_columns(traindata, *args):
    
    columns = []
    for _ in args:
        columns.append(_)
        
    traindata = traindata.drop(columns, axis=1)
        
    return traindata

def make_submission(prediction, filename):
    sample = pd.read_csv('SampleSubmission.csv')
    test = pd.read_csv('Test.csv')
    sample.Applicant_ID = test.Applicant_ID
    sample.default_status = prediction
    sample.to_csv(filename, index = False)
 
def process(traindata):
    
    cols = list(traindata.columns)
    for _ in cols:
        traindata[_] = np.where(traindata[_] == np.inf, -999, traindata[_])
        traindata[_] = np.where(traindata[_] == np.nan, -999, traindata[_])
        traindata[_] = np.where(traindata[_] == -np.inf, -999, traindata[_])
        
    return traindata
 
def show_evaluation(pred, true):
  print(f'Default score: {score(true.values, pred)}')
  print(f'Accuracy is: {accuracy_score(true, pred)}')
  print(f'F1 is: {f1_score(pred, true.values, average="weighted")}')
 
def freq_encode(data, cols):
    for i in cols:
        encoding = data.groupby(i).size()
        encoding = encoding/len(data)
        data[i + '_enc'] = data[i].map(encoding)
    return data
 
 
def mean_target(data, cols):
    kf = KFold(5)
    a = pd.DataFrame()
    for tr_ind, val_ind in kf.split(data):
        X_tr, X_val= data.iloc[tr_ind].copy(), data.iloc[val_ind].copy()
        for col in cols:
            means = X_val[col].map(X_tr.groupby(col).default_status.mean())
            X_val[col + '_mean_target'] = means + 0.0001
        a = pd.concat((a, X_val))
    return a
        
def scale_data(data):
  
  data = scaler.transform(data)
  #testdata = scaler.transform(testdata)
  data = pd.DataFrame(data)
 
  return data

In [8]:
train = pd.read_csv('Train.csv')
test = pd.read_csv('Test.csv')
sample = pd.read_csv('SampleSubmission.csv')

In [9]:
target = train.default_status
target_numbers = {'yes': 1,
                  'no': 0}

target = target.map(target_numbers)

In [10]:
ntrain = train.shape[0]
ntest = test.shape[0]
df = pd.concat((train, test)).reset_index(drop=True)
df.shape

(80000, 52)

In [11]:
df = df.drop(['Applicant_ID'], axis=1)

df.default_status = target
df = freq_encode(df, ['form_field47'])
print(df.shape)
df = mean_target(df, ['form_field47'])
df.shape

(80000, 52)


(80000, 53)

In [12]:
df['form_field47_encoded'] = df['form_field47'].astype('category')
df['form_field47_encoded'] = df['form_field47_encoded'].cat.codes
df.form_field47_encoded.value_counts()

0    51840
1    28160
Name: form_field47_encoded, dtype: int64

# GENERATING MORE FEATURES

In [13]:
feature_cols = ['form_field1', 'form_field2', 'form_field42',
                'form_field6']

new = pd.DataFrame()
for i in feature_cols:
    for j in feature_cols:
        if i != j:
            new[i + '*' + j] = df[i] * df[j]
            new[i + '/' + j] = df[i] / df[j]
            #new[i + '-' + j] = df[i] - df[j]
            new[i + '*' + j] = df[i] + df[j]
            
    new['_log' + i] = np.log(df[i])
    new['_sqrt' + i] = np.sqrt(df[i])
    
df1 = df.copy()
df = pd.concat((df, new), axis=1)
df.shape

  result = getattr(ufunc, method)(*inputs, **kwargs)


(80000, 86)

In [14]:
from sklearn.decomposition import PCA
pca = PCA(n_components=10)

In [15]:
df = df.drop(['form_field47'], axis=1)
df.shape
 
df = df.fillna(-999)
df = process(df)
data = df.copy()
 
train2 = data[:ntrain].copy()
train2.drop(['default_status'], axis=1, inplace=True)
 
test2 = data[ntrain:(ntest+ntrain)].copy()
test2.drop(['default_status'], axis=1, inplace=True)
test2 = test2.reset_index(drop=True)
 
print(train2.shape, test2.shape)

(56000, 84) (24000, 84)


In [16]:
pca.fit(train2)

train3 = pca.transform(train2)
train2 = pd.concat((train2, pd.DataFrame(train3)), axis=1)
test3 = pca.transform(test2)
test2 = pd.concat((test2, pd.DataFrame(test3)), axis=1)
print(train2.shape, test2.shape)

(56000, 94) (24000, 94)


In [18]:
traindata = train2
testdata = test2
 
scaler = preprocessing.StandardScaler().fit(traindata)

In [19]:
traindata = scale_data(traindata)
testdata = scale_data(testdata)

# TRAINING 

In [20]:
nsplit=20

kf = StratifiedKFold(n_splits=nsplit, shuffle=True)

lgbm = CatBoostClassifier(n_estimators=50000, max_depth=8, 
                          random_state=14, learning_rate=0.033, 
                          use_best_model=True, task_type='CPU', 
                          eval_metric='AUC')

lgbm = XGBClassifier(n_estimators=100000, max_depth=8, 
                     booster='gbtree', base_score=0.7,
                     learning_rate=0.033, reg_lambda=30,
                     subsample=0.9, colsample_bytree=0.9,
                     eval_metric='auc', random_state=20920)

lgbm = LGBMClassifier(max_depth=8, num_leaves=64, boosting_type='gbdt', 
                      learning_rate=0.01, n_estimators=50000, subsample=0.9, 
                      eval_metric='auc', colsample_bytree=0.9, random_state=2)
            
pred_test2 = np.zeros((len(test2), 2))

for (train_index,test_index) in kf.split(pd.DataFrame(traindata), target):
    
    X_train,X_test = pd.DataFrame(traindata).iloc[train_index], pd.DataFrame(traindata).iloc[test_index]
    y_train,y_test = train.default_status.iloc[train_index],train.default_status.iloc[test_index]
    lgbm.fit(X_train, y_train, early_stopping_rounds=100, eval_set=[(X_test, y_test)], verbose=50)
    pred_test2 += lgbm.predict_proba(testdata)

Training until validation scores don't improve for 100 rounds
[50]	valid_0's binary_logloss: 0.484584
[100]	valid_0's binary_logloss: 0.45362
[150]	valid_0's binary_logloss: 0.437704
[200]	valid_0's binary_logloss: 0.429592
[250]	valid_0's binary_logloss: 0.425149
[300]	valid_0's binary_logloss: 0.422397
[350]	valid_0's binary_logloss: 0.420494
[400]	valid_0's binary_logloss: 0.418729
[450]	valid_0's binary_logloss: 0.417655
[500]	valid_0's binary_logloss: 0.416945
[550]	valid_0's binary_logloss: 0.416523
[600]	valid_0's binary_logloss: 0.416067
[650]	valid_0's binary_logloss: 0.415967
[700]	valid_0's binary_logloss: 0.415772
[750]	valid_0's binary_logloss: 0.41542
[800]	valid_0's binary_logloss: 0.415422
[850]	valid_0's binary_logloss: 0.415446
[900]	valid_0's binary_logloss: 0.41553
Early stopping, best iteration is:
[820]	valid_0's binary_logloss: 0.415337
Training until validation scores don't improve for 100 rounds
[50]	valid_0's binary_logloss: 0.478704
[100]	valid_0's binary_log

[600]	valid_0's binary_logloss: 0.410077
[650]	valid_0's binary_logloss: 0.410163
[700]	valid_0's binary_logloss: 0.410203
Early stopping, best iteration is:
[618]	valid_0's binary_logloss: 0.410023
Training until validation scores don't improve for 100 rounds
[50]	valid_0's binary_logloss: 0.479043
[100]	valid_0's binary_logloss: 0.444811
[150]	valid_0's binary_logloss: 0.426305
[200]	valid_0's binary_logloss: 0.415749
[250]	valid_0's binary_logloss: 0.410338
[300]	valid_0's binary_logloss: 0.406555
[350]	valid_0's binary_logloss: 0.404094
[400]	valid_0's binary_logloss: 0.402776
[450]	valid_0's binary_logloss: 0.401517
[500]	valid_0's binary_logloss: 0.400708
[550]	valid_0's binary_logloss: 0.400001
[600]	valid_0's binary_logloss: 0.399711
[650]	valid_0's binary_logloss: 0.399484
[700]	valid_0's binary_logloss: 0.399141
[750]	valid_0's binary_logloss: 0.399004
[800]	valid_0's binary_logloss: 0.398817
[850]	valid_0's binary_logloss: 0.398554
[900]	valid_0's binary_logloss: 0.398313
[9

[850]	valid_0's binary_logloss: 0.412381
[900]	valid_0's binary_logloss: 0.412472
[950]	valid_0's binary_logloss: 0.412506
[1000]	valid_0's binary_logloss: 0.412506
Early stopping, best iteration is:
[934]	valid_0's binary_logloss: 0.412327


In [21]:
pred_test_avg = pred_test2/nsplit
pred_test2 = pred_test_avg[:,1]
pred_test2[:20]

array([0.29167586, 0.44113823, 0.37187728, 0.72132975, 0.12927763,
       0.37456067, 0.36216038, 0.57949308, 0.44301859, 0.28477819,
       0.12845561, 0.03396455, 0.61630094, 0.03263858, 0.44867898,
       0.62573149, 0.29946163, 0.0420398 , 0.09430652, 0.72916864])

In [22]:
make_submission(pred_test2, '20lgbfold-8max-depth1.csv')

The final result was gotten from the blending and ensemble of five different models;
    -One LightGBM
    -One Xgboost
    -Three catboost predictions
    
All models were trained with the hyperparameters above and features generated apart from the other two catboost
models that were trained with different amount of features. One of them used only the features provided (as the baseline), while the other used 50 more features generated using PCA from sklearn.decomposition

How the predictions were blended based on their performance on the LB

In [None]:
xg = pd.read_csv('20xgbfold-8max-depth1.csv')  #0.84323
lg = pd.read_csv('LGB_all_features.csv')  #0.8428
cat0 = pd.read_csv('cat_80features.csv')   #0.84393
cat1 = pd.read_csv('20catfold-8max-depth2.csv')  #0.84442
cat2 = pd.read_csv('10catfold-8max-depth2.csv')   #0.84441

b9 = (lg.default_status*0.1 + xg.default_status*0.2 +  cat0.default_status*0.1 +  
      cat1.default_status*0.3 + cat2.default_status * 0.3)

make_submission(b9, 'b9.csv')

submission b9 was the best result on the LB which I could reproduce with a score of 0.844741368813585.
However my final submission had a score of 0.844743419216839 butI could not find the right predictions I blended to get that as my Jupyter notebook wasn't saved