## Imports


In [0]:
import pandas as pd
import sklearn
import numpy as np
import matplotlib.pyplot as plt
import os
import warnings
import seaborn as sns
import gc
import lightgbm as lgb
import tensorflow as tf
import matplotlib.pyplot as plt

from sklearn.preprocessing import LabelEncoder, OneHotEncoder, MinMaxScaler, StandardScaler, power_transform, PolynomialFeatures
from sklearn.impute import SimpleImputer
from sklearn.svm import LinearSVC
from sklearn.metrics import roc_auc_score, confusion_matrix, accuracy_score, roc_curve
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.calibration import CalibratedClassifierCV
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold
from tensorflow import keras
from keras import Sequential
from keras.layers import Dense,Dropout
from keras.callbacks import EarlyStopping
from keras import metrics
from contextlib import contextmanager
from lightgbm import LGBMClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier

# Definizione funzioni

In [0]:
'''
Method that apply the label encoder to the features
of the dataset passed in input.
The feature inside 'black_list' are not considered for the
label encoding.
@param df : the dataset input
@return the dataset with label encoding features
'''
def use_label_encoder(df):
    black_list = ['CODE_GENDER']
    le = LabelEncoder()
    le_count = 0
    converted_columns = []
    for col in df:
        if df[col].dtype == 'object' and col not in black_list:
            if len(list(df[col].unique())) <= 2:
                le.fit(df[col])
                df[col] = le.transform(df[col])
                le_count += 1
                converted_columns.append(col)
    return df


In [0]:
'''
Method that return a list with all the feature names that contains
with days greater than 36500.
@param df : the dataset input
@return a list with features names
'''
def get_features_anomalies_on_days(training_data):
  result = []
  for x in training_data.columns:
    # if columns starts with DAYS and value >= 100 (years) then return a list with column's name
    cond = training_data[training_data[x] >= 36500].count()[1] > 0
    if x.startswith('DAYS') and cond:
      result.append(x)
  return result

'''
Method that correct inplace the anomalies days of
the dataset passed in input 
@param df : the dataset input
@param list_of_features : list with anomalies days features
'''
# if value equals 365243 then replace 365243 with nan and add a column with true 
def correct_anomalies_on_days(data,list_of_features):
  for f in list_of_features:
    data[f+"_ANOM"] = data[f] >= 365243
    data[f].replace({365243: np.nan}, inplace = True)


'''
Method that applies the absolute value to the features
that contains days in negative of the dataset passed in
input
@param df : the dataset input
@return the dataset modified
'''
# if column starts with DAYS and value is negative then apply absolute function
def correct_negative_days(data):
  for x in data.columns:
    if x.startswith('DAYS') and data[data[x] < 0].count()[1] > 0:
      data[x] = abs(data[x])
  return data

In [0]:
'''
Method that reads a dataset and does some operations
of cleaning
@param file_path : the file.csv of the dataset to read
@return a dataset as dataframe 
'''
def read_csv_and_clean(file_path, out_path = '/content/'):
  print("reading: " + file_path)
  # read csv
  res = pd.read_csv(file_path)

  # replace XNA and XAP with NaN
  res.replace(to_replace =['XNA', 'XAP'], value = np.nan, inplace = True)

  # transform categorical features
  res = use_label_encoder(res)
  res = pd.get_dummies(res)

  # correct anomalies on days
  correct_anomalies_on_days(res, get_features_anomalies_on_days(res))
  res = correct_negative_days(res)
  return res

In [0]:
'''
Method that merges the dataset passed in input to the
application.csv dataset through the SK_ID_CURR
@param df_application : the application.csv dataset 
@param df_to_merge : the dataset to merge
@param suffix : the suffix to join to new merged features names
@param prefix : the prefix to join to new merged features names
@return the dataset merged
'''
def merge_file_to_application(df_application,df_to_merge,suffix,prefix):
  #count how many samples are correlated to the primary key and merge them
  grp = df_to_merge[['SK_ID_CURR','SK_ID_'+suffix]].groupby(by=['SK_ID_CURR'])['SK_ID_'+suffix].count().reset_index().rename(columns={'SK_ID_'+suffix: prefix+'_COUNT'})
  df_application = df_application.merge(grp, on =['SK_ID_CURR'], how = 'left')
  #fill NaN with 0
  df_application[prefix+'_COUNT'] = df_application[prefix+'_COUNT'].fillna(0)

  #merge the numerical features using the mean
  grp = df_to_merge.drop('SK_ID_'+suffix, axis =1).groupby(by=['SK_ID_CURR']).mean().reset_index()
  grp.columns = change_column_name(grp.columns,prefix)
  df_application = df_application.merge(grp, on =['SK_ID_CURR'], how = 'left')
  df_application.update(df_application[grp.columns].fillna(0))
  return df_application

'''
Method that change the column names to the columns list passed in input
@param columns : list of columns to change
@param prefix : the prefix to join to the columns names passed in input
@return a list of columns changed
'''
def change_column_name(columns, prefix):
    result = []
    for c in columns:
      if c != 'SK_ID_CURR' and c != 'SK_ID_BUREAU':
        result.append(prefix +'_'+ c)
      else:
        result.append(c)
    return result

In [0]:
'''
Method that align the shape of the training and test data
@param training_data : the dataset of train 
@param test_data : the dataset of test
@return the both datasets aligned
'''
def align_train_test_sets(training_data,test_data):
    train_labels = training_data['TARGET']

    prev_cols_train = training_data.columns
    prev_cols_test = test_data.columns
    training_data, test_data = training_data.align(test_data, join = 'inner', axis = 1)

    training_data['TARGET'] = train_labels

    return training_data,test_data

In [0]:
'''
Method that for each list passed in input, reduce the features inside in 1 pca component and 
replace them whit this latter.
@param train_data : the train data
@param test_data : the test data
@param list_of_list_features : list that contains list of features to reduce
@return the train data and test data reduced
'''
def reduce_with_PCA(train_data,test_data,list_of_list_features):

  train = train_data.copy()
  test = test_data.copy()

  #define imputer and PCA
  imputer = SimpleImputer(strategy = 'median')
  pca = PCA(n_components=1)

  for i,l in enumerate(list_of_list_features):
    print("PCA REDUCTION N°",i)
    print("shape train before reduction -->",train.shape)
    print("shape test before reduction -->",test.shape)
    #remove from train and test the features to reduce
    features_data_train = train[l]
    features_data_test = test[l]
    
    #impute data
    features_data_train = imputer.fit_transform(features_data_train)
    features_data_test = imputer.transform(features_data_test)

    #reduce with PCA data
    features_data_train = pca.fit_transform(features_data_train)
    features_data_test = pca.transform(features_data_test)

    #convert to dataframe
    features_data_train = pd.DataFrame(features_data_train)
    features_data_test = pd.DataFrame(features_data_test)

    #drop columns to the dataset
    train.drop(columns=l,inplace=True)
    test.drop(columns=l,inplace=True)

    #replace the columns dropped with reduced data
    col_name = 'PCA'+str(i)
    train[col_name] = features_data_train
    test[col_name] = features_data_test
    print("shape train after reduction -->",train.shape)
    print("shape test after reduction -->",test.shape)
    print()

  return train, test

In [0]:
'''
Method that plots the heatmap of correlations of features
passed in input.
@param df : the input dataset 
@param list_of_features : the features to calculate correlation
'''
def find_correlations(df,list_of_features):
    ext_data = df[list_of_features]
    ext_data_corrs = ext_data.corr()
    plt.figure(figsize = (15, 10))

    # Heatmap of correlations
    sns.heatmap(ext_data_corrs, cmap = plt.cm.RdYlBu_r, vmin = -0.25, annot = True, vmax = 0.6)
    plt.title('Correlation Heatmap')

# Preprocessing and Merging


In [0]:
# read all csv
app_train = read_csv_and_clean('../input/home-credit-default-risk/application_train.csv')
app_test = read_csv_and_clean('../input/home-credit-default-risk/application_test.csv')
POS_CASH_balance = read_csv_and_clean("../input/home-credit-default-risk/POS_CASH_balance.csv")
bureau = read_csv_and_clean("../input/home-credit-default-risk/bureau.csv")
bureau_balance = read_csv_and_clean("../input/home-credit-default-risk/bureau_balance.csv")
credit_card_balance = read_csv_and_clean("../input/home-credit-default-risk/credit_card_balance.csv")
installments_payments = read_csv_and_clean("../input/home-credit-default-risk/installments_payments.csv")
previous_application = read_csv_and_clean("../input/home-credit-default-risk/previous_application.csv")

reading: ../input/home-credit-default-risk/application_train.csv


KeyboardInterrupt: 

In [0]:
# merge application train and test with previous_application
app_train = merge_file_to_application(app_train,previous_application,"PREV","PREV")

app_test = merge_file_to_application(app_test,previous_application,"PREV","PREV")
del previous_application
gc.collect()

In [0]:
# merge bureau with bureau_balance
bureau_balance = bureau_balance.groupby(by=['SK_ID_BUREAU']).mean().reset_index()
bureau_balance.columns = change_column_name(bureau_balance.columns, "BBALANCE")
bureau = bureau.merge(bureau_balance, on =['SK_ID_BUREAU'], how = 'left')
bureau.update(bureau[bureau_balance.columns].fillna(0))
del bureau_balance
gc.collect()

In [0]:
# merge application train and test with bureau
app_train = merge_file_to_application(app_train,bureau,"BUREAU","BUREAU")

app_test = merge_file_to_application(app_test,bureau,"BUREAU","BUREAU")
del bureau
gc.collect()

In [0]:
# merge application train and test with pos_cash_balace
app_train = merge_file_to_application(app_train,POS_CASH_balance,"PREV", "POS")

app_test = merge_file_to_application(app_test,POS_CASH_balance,"PREV","POS")
del POS_CASH_balance
gc.collect()

In [0]:
# merge application train and test with credit_card_balance
app_train = merge_file_to_application(app_train,credit_card_balance,"PREV","CREDIT")

app_test = merge_file_to_application(app_test,credit_card_balance,"PREV","CREDIT")
del credit_card_balance
gc.collect()

In [0]:
# merge application train and test with instalments_payments
app_train = merge_file_to_application(app_train,installments_payments ,"PREV", "INSTA")

app_test = merge_file_to_application(app_test,installments_payments ,"PREV","INSTA")
del installments_payments
gc.collect()

In [0]:
# align train and test data
app_train, app_test = align_train_test_sets(app_train,app_test)

# Feature Engineering and Selection

In [0]:
'''
Method that modifies some features of the application dataset
@dataset : the application dataset
@return the dataset cleaned by some features
'''
def clean_application(dataset):
  # drop the columns relative to FLAG DOCUMENTS
  flag_documents = [col for col in dataset.columns if col.startswith("FLAG_DOCUMENT")]
  dataset = dataset.drop(columns=flag_documents)
  # add some domain knowledge features
  dataset['CREDIT_INCOME_PERCENT'] = dataset['AMT_CREDIT'] / dataset['AMT_INCOME_TOTAL']
  dataset['ANNUITY_INCOME_PERCENT'] = dataset['AMT_ANNUITY'] / dataset['AMT_INCOME_TOTAL']
  dataset['DAYS_EMPLOYED_PERCENT'] = dataset['DAYS_EMPLOYED'] / dataset['DAYS_BIRTH']
  dataset['INCOME_GT_CREDIT_FLAG'] = dataset['AMT_INCOME_TOTAL'] > dataset['AMT_CREDIT']
  dataset['CREDIT_TERM'] = dataset['AMT_CREDIT'] / dataset['AMT_ANNUITY'] 
  dataset['INCOME_CREDIT_PERC'] = dataset['AMT_INCOME_TOTAL'] / dataset['AMT_CREDIT']
  dataset['INCOME_PER_PERSON'] = dataset['AMT_INCOME_TOTAL'] / dataset['CNT_FAM_MEMBERS']
  dataset['PAYMENT_RATE'] = dataset['AMT_ANNUITY'] / dataset['AMT_CREDIT']
  dataset['YEARS_BIRTH'] = dataset['DAYS_BIRTH'] / 365

  return dataset

app_train = clean_application(app_train)
app_test = clean_application(app_test)


In [0]:
'''
Method used to select a specific number of features to drop starting from the lower correlated features
@param correlations : a dataframe containing the correlations of features with target
@param features_number : the number of the feature to select and add to the drop list
@ return a list with features selected to drop
'''
def drop_last_correlations(correlations, features_number):
  correlations = correlations.reset_index()
  correlations['TARGET'] = correlations['TARGET'].apply(abs)
  correlations.sort_values(by='TARGET',inplace=True)
  column_dropped = correlations[:features_number]['index']
  return column_dropped

In [0]:
'''
THIS PART IS COMMENTED BECAUSE AT THE END WE USE DROP LAST CORRELATION METHOD!

#SEARCH SIMILAR COLUMNS BY CRITERIA AND CHECK IF THEY ARE STRICTLY CORRELATED
cols = [x for x in app_train.columns if 'REGION' in x]
find_correlations(app_train,cols)
'''

In [0]:
'''
THIS PART IS COMMENTED BECAUSE AT THE END WE USE DROP LAST CORRELATION METHOD!

#Some features subset strictly correlated
list_of_features_AVG = ['APARTMENTS_AVG',
 'BASEMENTAREA_AVG',
 'YEARS_BEGINEXPLUATATION_AVG',
 'YEARS_BUILD_AVG',
 'COMMONAREA_AVG',
 'ELEVATORS_AVG',
 'ENTRANCES_AVG',
 'FLOORSMAX_AVG',
 'FLOORSMIN_AVG',
 'LANDAREA_AVG',
 'LIVINGAPARTMENTS_AVG',
 'LIVINGAREA_AVG']
 list_of_features_MEDI = ['APARTMENTS_MEDI',
 'BASEMENTAREA_MEDI',
 'YEARS_BEGINEXPLUATATION_MEDI',
 'YEARS_BUILD_MEDI',
 'COMMONAREA_MEDI',
 'ELEVATORS_MEDI',
 'ENTRANCES_MEDI',
 'FLOORSMAX_MEDI',
 'FLOORSMIN_MEDI',
 'LANDAREA_MEDI',
 'LIVINGAPARTMENTS_MEDI',
 'LIVINGAREA_MEDI']
list_of_features_AMT = ['AMT_INCOME_TOTAL',
 'AMT_CREDIT',
 'AMT_ANNUITY',
 'AMT_GOODS_PRICE',
 'AMT_REQ_CREDIT_BUREAU_HOUR',
 'AMT_REQ_CREDIT_BUREAU_DAY',
 'AMT_REQ_CREDIT_BUREAU_WEEK',
 'AMT_REQ_CREDIT_BUREAU_MON',
 'AMT_REQ_CREDIT_BUREAU_QRT',
 'AMT_REQ_CREDIT_BUREAU_YEAR',
 'PREV_AMT_ANNUITY',
 'PREV_AMT_APPLICATION',
 'PREV_AMT_CREDIT',
 'PREV_AMT_DOWN_PAYMENT',
 'PREV_AMT_GOODS_PRICE',
 'BUREAU_AMT_CREDIT_MAX_OVERDUE',
 'BUREAU_AMT_CREDIT_SUM',
 'BUREAU_AMT_CREDIT_SUM_DEBT',
 'BUREAU_AMT_CREDIT_SUM_LIMIT',
 'BUREAU_AMT_CREDIT_SUM_OVERDUE',
 'BUREAU_AMT_ANNUITY',
 'CREDIT_AMT_BALANCE',
 'CREDIT_AMT_CREDIT_LIMIT_ACTUAL',
 'CREDIT_AMT_DRAWINGS_ATM_CURRENT',
 'CREDIT_AMT_DRAWINGS_CURRENT',
 'CREDIT_AMT_DRAWINGS_OTHER_CURRENT',
 'CREDIT_AMT_DRAWINGS_POS_CURRENT',
 'CREDIT_AMT_INST_MIN_REGULARITY',
 'CREDIT_AMT_PAYMENT_CURRENT',
 'CREDIT_AMT_PAYMENT_TOTAL_CURRENT',
 'CREDIT_AMT_RECEIVABLE_PRINCIPAL',
 'CREDIT_AMT_RECIVABLE',
 'CREDIT_AMT_TOTAL_RECEIVABLE',
 'INSTA_AMT_INSTALMENT',
 'INSTA_AMT_PAYMENT']
list_of_features_PREV_DAYS = [
 'DAYS_LAST_PHONE_CHANGE',
 'PREV_DAYS_DECISION',
 'PREV_DAYS_FIRST_DRAWING',
 'PREV_DAYS_FIRST_DUE',
 'PREV_DAYS_LAST_DUE_1ST_VERSION',
 'PREV_DAYS_LAST_DUE',
 'PREV_DAYS_TERMINATION',
  'INSTA_DAYS_INSTALMENT',
 'INSTA_DAYS_ENTRY_PAYMENT']
list_of_features_PREV_ANOM =[
 'PREV_DAYS_FIRST_DRAWING_ANOM',
 'PREV_DAYS_FIRST_DUE_ANOM',
 'PREV_DAYS_LAST_DUE_1ST_VERSION_ANOM',
 'PREV_DAYS_LAST_DUE_ANOM',
 'PREV_DAYS_TERMINATION_ANOM']
list_of_features_BUREAU = [
 'BUREAU_DAYS_CREDIT',
 'BUREAU_DAYS_CREDIT_ENDDATE',
 'BUREAU_DAYS_ENDDATE_FACT',
 'BUREAU_DAYS_CREDIT_UPDATE']
list_of_features_REGION = ['REGION_POPULATION_RELATIVE',
 'REGION_RATING_CLIENT',
 'REGION_RATING_CLIENT_W_CITY',
 'REG_REGION_NOT_LIVE_REGION',
 'REG_REGION_NOT_WORK_REGION',
 'LIVE_REGION_NOT_WORK_REGION']

#replace each subset with a PCA
 app_train,app_test = reduce_with_PCA(app_train,app_test,[list_of_features_MEDI,list_of_features_AVG,list_of_features_AMT,
                                                   list_of_features_PREV_DAYS,list_of_features_PREV_ANOM,list_of_features_BUREAU])
'''

In [0]:
# find correlations and sort them
correlations = app_train.corr()['TARGET'].sort_values()
correlations.drop('SK_ID_CURR',inplace=True)

In [0]:
#drop the selected columns
column_dropped = drop_last_correlations(correlations,70)
app_test = app_test.drop(column_dropped, axis=1)
app_train = app_train.drop(column_dropped, axis=1)

# TRAINING AND TESTING

In [0]:
'''
Method used to prepare data to the training.
Convert the dataset as dataframe form in numpy array 
useful for the models, and use some methods of cleaning data
like imputer scaler etc.
@param training_data : the dataset with train data
@param test_data : the dataset with test data
@param use_scaler : flag that indicates if use scaler during preprocessing
@param use_imputer : flag that indicates if use imputer during preprocessing for missing values 
@param use_power_transform_bool : flag that indicates if use power transform on data during preprocessing and make data more Gaussian like

@param scaler_type : type of scaler to use. Two options : 'minmax between 0-1' and 'StandardScaler'
@return  the training data, labels, test data as numpy array, the keys of the test clients and the features
'''
def prepare_data_for_training_and_test(training_data,
                                       test_data,
                                       use_scaler = True, 
                                       use_imputer = True,
                                       use_power_transform_bool = False,
                                       scaler_type = "minmax"):
  
  #save the important columns like the keys and the targets
  train_labels = training_data['TARGET']
  train = training_data.drop(columns = ['TARGET','SK_ID_CURR'])
  test_keys = test_data[['SK_ID_CURR']]
  test = test_data.drop(columns = ['SK_ID_CURR'])
  features = list(train.columns)

  imputer = SimpleImputer(strategy = 'median')

  if use_imputer:
    print("use imputer")
    imputer.fit(train)
    train = imputer.transform(train)
    test = imputer.transform(test)
    print("done")

  if use_scaler:
    print("use scaler")
    if scaler_type == 'minmax':
      scaler = MinMaxScaler(feature_range = (0, 1))
    elif scaler_type == 'standard':
      scaler = StandardScaler()
    scaler.fit(train)
    train = scaler.transform(train)
    test = scaler.transform(test)
    print("done")

  if use_power_transform_bool:
    print("use power transform")
    train = power_transform(train, method='yeo-johnson')
    test = power_transform(test, method='yeo-johnson')
    print("done")

  return train, train_labels, test, test_keys, features


#call the method for preprocessed data                    
train_p, train_labels_p, test_p,test_keys_p, features_p = prepare_data_for_training_and_test(app_train, app_test,use_scaler = True, 
                                                                                                        use_imputer = True,
                                                                                                        use_power_transform_bool = False,
                                                                                                       scaler_type='minmax')
#call method for no preprocessed data
train, train_labels, test,test_keys, features = prepare_data_for_training_and_test(app_train, app_test,use_imputer = False,use_power_transform_bool = False)

In [0]:
'''
Method used to do prediction probabilities on data passed in input using a specific
trained model
@param model : the trained model
@param test_data : the test data
@param k_id_curr_test : the client keys of the test data
@param file_csv : the name of the file.csv where save the submit prediction
@param type_pred : the type of prediction to do. Two options: 'deep' for deep learning
models and 'no_deep' for other models
'''
def make_prediction_and_submit(model,test_data,sk_id_curr_test,file_csv,type_pred = 'no_deep'):
    if(type_pred == 'deep'):
      predictions = model.predict_proba(test_data)
    elif(type_pred == 'no_deep'):
      predictions = model.predict_proba(test_data)[:, 1]

    submit = sk_id_curr_test
    submit['TARGET'] = predictions
    submit.to_csv(file_csv, index = False)
    print(file_csv + " created!")
    
    del model
    gc.collect()

In [0]:
'''
Method that train and predicts the respectively training data and test data passed in input.
Some model use the k-fold and it is possible to choice the type of k-fold. It's possible
also use the SMOTE oversampling to add instances of minority class during each k-fold iteration
@param model_type :  it is possible to choice between 5 models using the correct string --> 3 without kfold {linear_regression,random_forest,mlp} and 2 with kfold {lgbm, xgb}
@param training_data : the data to train, relative to the X
@param training_labels : the correspective Y of the training data
@param test_data : the data to test
@param sk_id_curr_test : the keys of the clients of the test data
@param num_folds : if is used kfold algorithm, indicates the number of folds
@param file_csv : the file.csv where submit the predicted results
@param stratified : if selected, use the Stratified KFold, else the Normal KFold
@param use_over_sampling : flag that indicates if use over sampling of minority class during preprocessing
'''
def train_model(model_type,training_data,train_labels,test_data,sk_id_curr_test ,file_csv,num_folds = 5, stratified = False,use_over_sampling = False):
    
    print("Train with "+model_type)

    pred_proba = np.zeros(test_data.shape[0])
    
    #model used without KFOLD
    if model_type == "linear_regression":
      model = LogisticRegression(C = 0.001, random_state=42)
      model.fit(training_data, train_labels)
      pred_proba += model.predict_proba(test_data)[:, 1]

    elif model_type == "random_forest":
      model = RandomForestClassifier(n_estimators=100, criterion='gini', max_depth=None, min_samples_split=2, 
                                     max_features='auto', max_leaf_nodes=None, min_impurity_decrease=0.0,
                                      min_impurity_split=None, bootstrap=True, oob_score=False, n_jobs=-1, random_state=42, verbose=1,
                                      warm_start=False, class_weight=None)
      model.fit(training_data, train_labels)
      pred_proba += model.predict_proba(test_data)[:, 1]
    
    elif model_type == "mlp":
      model = MLPClassifier(hidden_layer_sizes=(100, ), activation='relu', solver='adam', alpha=0.0001, batch_size='auto', learning_rate='constant',
                learning_rate_init=0.001, power_t=0.5, max_iter=200, shuffle=True, random_state=42, tol=0.0001, verbose=False, warm_start=False,
                momentum=0.9, nesterovs_momentum=True, early_stopping=True, validation_fraction=0.1, beta_1=0.9, beta_2=0.999, epsilon=1e-08, n_iter_no_change=10)
      model.fit(training_data, train_labels)
      pred_proba += model.predict_proba(test_data)[:, 1]
      

    #MODEL USED WITH KFOLD
    else:
      print("Train with {} Folds... \nTrain shape: {}, test shape: {}".format(num_folds,training_data.shape, test_data.shape))
      # Select type of KFOLD
      if stratified:
          kfold = StratifiedKFold(n_splits= num_folds, shuffle=True, random_state=1001)
      else:
          kfold = KFold(n_splits= num_folds, shuffle=True, random_state=1001)

      #repeat the process for each fold
      for n_fold, (train_idx, valid_idx) in enumerate(kfold.split(training_data, train_labels)):
        if(type(training_data) == pd.DataFrame):
          train_x, train_y = training_data.iloc[train_idx], train_labels.iloc[train_idx]
          valid_x, valid_y = training_data.iloc[valid_idx], train_labels.iloc[valid_idx]
        else:
          train_x, train_y = training_data[train_idx], train_labels[train_idx]
          valid_x, valid_y = training_data[valid_idx], train_labels[valid_idx]

        #use SMOTE is Flag is true
        if use_over_sampling :
          print("oversampling")
          sm = SVMSMOTE(random_state=42)
          train_x, train_y = sm.fit_resample(train_x, train_y)
          print("done")

        #select model between lgbm and xgb
        if model_type == "lgbm" :
          model = LGBMClassifier(
              nthread=4,
              n_estimators=200,
              learning_rate=0.08,
              num_leaves=256,
              colsample_bytree=0.9497036,
              subsample=0.8715623,
              max_depth=10,
              reg_alpha=0.041545473,
              reg_lambda=0.0735294,
              min_split_gain=0.0222415,
              min_child_weight=39.3259775,
              silent=-1,
              verbose=-1)
          
          model.fit(train_x, train_y, eval_set=[(train_x, train_y), (valid_x, valid_y)], 
            eval_metric= 'auc', verbose= 20, early_stopping_rounds= 20)
          pred_proba += model.predict_proba(test_data, num_iteration=model.best_iteration_)[:, 1] / kfold.n_splits

        elif model_type == "xgb":
        ## objective = 'binary:logistic'
          model = XGBClassifier(learning_rate=0.2, n_estimators=200,
            max_depth=4,num_leaves=8, min_child_weight=39.3259775, subsample=0.87156238, colsample_by_tree=0.9497036,
            objective= 'binary:logistic',
            nthread=4)
          model.fit(train_x, train_y, eval_set=[(valid_x, valid_y)], early_stopping_rounds=20, eval_metric='auc',verbose=20)
          pred_proba += model.predict_proba(test_data,ntree_limit=model.get_booster().best_iteration)[:, 1] / kfold.n_splits

        #Free the memory
        del train_x, train_y, valid_x, valid_y
        gc.collect()


  #submit the produced data
    submit = sk_id_curr_test
    submit['TARGET'] = pred_proba
    submit.to_csv(file_csv, index = False)
    print(file_csv + " created")
    del model

# call the method
num_folds = 5

train_model("lgbm",train_p,train_labels_p,test_p,test_keys_p,"k_fold_lgbm_p.csv",num_folds, stratified = False)
train_model("linear_regression",train_p,train_labels_p,test_p,test_keys_p,"k_fold_linear_regression_p.csv",num_folds, stratified = False)
train_model("random_forest",train_p,train_labels_p,test_p,test_keys_p,"k_fold_random_forest_p.csv",num_folds, stratified = False)
train_model("mlp",train_p,train_labels_p,test_p,test_keys_p,"k_fold_mlp_p.csv",num_folds, stratified = False)
train_model("xgb",train_p,train_labels_p,test_p,test_keys_p,"k_fold_xgb_p.csv",num_folds, stratified = False)

Train with mlp
Train with 5 Folds... 
Train shape: (307511, 406), test shape: (48744, 406)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


k_fold_mlp_p.csv created


In [0]:
# split data into train_set and eval_set
train_data_p, eval_data_p, train_labels_p, eval_labels_p = train_test_split(
    train_p, train_labels_p, test_size=0.15, random_state=None)

train_data, eval_data, train_labels, eval_labels = train_test_split(
    train, train_labels, test_size=0.15, random_state=None)

ValueError: Found input variables with inconsistent numbers of samples: [307511, 261384]

In [0]:
#TRY TO TRAIN WITHOUT THE KFOLD
#PARAMETERS OF LIGHTGBM
#WE HAVE USED PRINCIPALLY THIS MODEL TO DO HYPERTUNING OF PARAMETERS
boosting_type =['gbdt','rf','dart','goss']
def LGBM():
  model_gbm = lgb.LGBMClassifier(boosting_type=boosting_type[3],
                                 class_weight='balanced',
                                 colsample_bytree=1.0, 
                                 importance_type='split',
                                 learning_rate=0.09,
                                 max_depth=10,
                                 min_child_samples=60, 
                                 min_child_weight=0.001,
                                 min_data=100,
                                 min_split_gain=0.0, 
                                 n_estimators=500,
                                 n_jobs=-1,
                                 num_leaves=256,
                                 objective=None, 
                                 random_state=42, 
                                 silent=True, 
                                 subsample=1.0,
                                 subsample_for_bin=200000, 
                                 subsample_freq=0)
  return model_gbm
  
#TRAIN LGBM
model_gbm = LGBM()
print('Training with data no preprocessed')
model_gbm.fit(train_data, train_labels,eval_set=(eval_data,eval_labels),
              verbose=10,eval_metric='auc',early_stopping_rounds=20)
make_prediction_and_submit(model_gbm, test, test_keys, 'lightgbm_no_processed.csv')

print()

#train with preprocessed data
model_gbm = LGBM()
print('Training with data preprocessed')
model_gbm.fit(train_data_p, train_labels_p,eval_set=(eval_data_p,eval_labels_p),
              verbose=10,eval_metric='auc',early_stopping_rounds=20)
make_prediction_and_submit(model_gbm, test_p, test_keys_p, 'lightgbm_processed.csv')


Training with data no preprocessed
Training until validation scores don't improve for 20 rounds
[10]	valid_0's auc: 0.752434	valid_0's binary_logloss: 0.595099
[20]	valid_0's auc: 0.761925	valid_0's binary_logloss: 0.563143
[30]	valid_0's auc: 0.769135	valid_0's binary_logloss: 0.542886
[40]	valid_0's auc: 0.77241	valid_0's binary_logloss: 0.528962
[50]	valid_0's auc: 0.774414	valid_0's binary_logloss: 0.51779
[60]	valid_0's auc: 0.775294	valid_0's binary_logloss: 0.509003
[70]	valid_0's auc: 0.776679	valid_0's binary_logloss: 0.501496
[80]	valid_0's auc: 0.777337	valid_0's binary_logloss: 0.495729
[90]	valid_0's auc: 0.777283	valid_0's binary_logloss: 0.490169
[100]	valid_0's auc: 0.77781	valid_0's binary_logloss: 0.485224
[110]	valid_0's auc: 0.777542	valid_0's binary_logloss: 0.48094
[120]	valid_0's auc: 0.77776	valid_0's binary_logloss: 0.475617
Early stopping, best iteration is:
[100]	valid_0's auc: 0.77781	valid_0's binary_logloss: 0.485224


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # Remove the CWD from sys.path while we load stuff.


lightgbm_no_processed.csv created!

Training with data preprocessed
Training until validation scores don't improve for 20 rounds
[10]	valid_0's auc: 0.742005	valid_0's binary_logloss: 0.597565
[20]	valid_0's auc: 0.753862	valid_0's binary_logloss: 0.56512
[30]	valid_0's auc: 0.760458	valid_0's binary_logloss: 0.545688
[40]	valid_0's auc: 0.764788	valid_0's binary_logloss: 0.530918
[50]	valid_0's auc: 0.769028	valid_0's binary_logloss: 0.518814
[60]	valid_0's auc: 0.769849	valid_0's binary_logloss: 0.50994
[70]	valid_0's auc: 0.771943	valid_0's binary_logloss: 0.501739
[80]	valid_0's auc: 0.772574	valid_0's binary_logloss: 0.495995
[90]	valid_0's auc: 0.773298	valid_0's binary_logloss: 0.490842
[100]	valid_0's auc: 0.773237	valid_0's binary_logloss: 0.485436
[110]	valid_0's auc: 0.77344	valid_0's binary_logloss: 0.480994
[120]	valid_0's auc: 0.773521	valid_0's binary_logloss: 0.476375
[130]	valid_0's auc: 0.774154	valid_0's binary_logloss: 0.472034
[140]	valid_0's auc: 0.774268	valid_0'

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # Remove the CWD from sys.path while we load stuff.


lightgbm_processed.csv created!


In [0]:
#TRY TO USE DEEP LEARNING WITH SEQUENTIAL KERAS MODEL AND VARIOUS HIDDEN LAYER
#DROPOUT IS USED TO AVOID OVERFITTING

#define some parameters
input_size = train_data_p.shape[1]
first_units = int(input_size/2)
second_units = int(first_units/2)

model = Sequential()
#First Hidden Layer
model.add(Dense(first_units, activation='relu', kernel_initializer='random_normal', input_dim=input_size))
model.add(Dropout(0.2))
#Second  Hidden Layer
model.add(Dense(second_units, activation='relu', kernel_initializer='random_normal'))
#Output Layer
model.add(Dense(1, activation='sigmoid', kernel_initializer='random_normal'))

#compile the model
model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])

model.summary()

#parameters of training
batch_size = [32,64,128,256,512]
epochs = 20
early_stopping = EarlyStopping(monitor='val_acc', min_delta=0, patience=2, verbose=1, mode='auto', baseline=None)

#train the model
history = model.fit(train_data_p, train_labels_p, epochs=epochs, batch_size=batch_size[4],validation_data=(eval_data_p,eval_labels_p),callbacks=[early_stopping])

#predict model and submit
make_prediction_and_submit(model,test_p,test_keys_p,'NN_predictions.csv',type_pred='deep')

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_1 (Dense)              (None, 203)               82621     
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 204       
Total params: 82,825
Trainable params: 82,825
Non-trainable params: 0
_________________________________________________________________
Train on 261384 samples, validate on 46127 samples
Epoch 1/20
Epoch 2/20
 15872/261384 [>.............................] - ETA: 2s - loss: 0.2399 - accuracy: 0.9221



Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # Remove the CWD from sys.path while we load stuff.


NN_predictions.csv created!
