In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from matplotlib import pyplot as plt
import seaborn as sns
import gc

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
for folder, _, names in os.walk('/kaggle/input'):
    full_paths = (os.path.join(folder, name) for name in names)
    for full_path in full_paths:
        print(full_path)

# Ask pandas to treat +/-inf as missing values (same handling as NaN).
pd.set_option('use_inf_as_na', True)
# Any results you write to the current directory are saved as output.


In [None]:
from scipy.signal import convolve2d

# Toy example: convolve a 4x4 input with a 2x2 kernel.
I = [
    [8, 6, 2, 7],
    [6, 2, 4, 1],
    [5, 8, 5, 2],
    [3, 0, 3, 2],
]
K = [
    [4, 3],
    [7, 2],
]
S = [[2, 7], [3, 4]]  # K rotated by 180 degrees; not used below
m = 4  # matches the side length of I; not used below
k = 2  # matches the side length of K; not used below

# mode='valid' keeps only the positions where the kernel lies fully inside the input.
print(convolve2d(I, K, mode='valid'))

In [None]:
def mean_encode_categorical(df, target_col):
    """Mean encode low-cardinality features with an expanding target mean.

    Every column other than ``target_col`` that has at most 60 distinct values is
    replaced, row by row, with the mean of ``target_col`` over the *earlier* rows
    (in frame order) that share the same value. The current row's own target is
    excluded from that mean. Rows with no earlier occurrence of their value have no
    mean (0 / 0) and are filled with the mean of the encoded column.

    Args:
        df: pandas DataFrame holding the features and ``target_col``.
        target_col: name of the target column; it is never encoded.

    Returns:
        A copy of ``df`` with the qualifying columns replaced by their encoding.
        ``df`` itself is left untouched.
    """

    df_with_mean = df.copy()
    for column in df.columns:
        # only features with at most 60 unique values are encoded
        if column == target_col or df[column].nunique() > 60:
            continue

        gb_col = df.groupby([column])  # group by the categorical feature
        # per-category running sum of the target, excluding the current row
        cumsum = gb_col[target_col].cumsum() - df[target_col]
        # cumcount() is the number of earlier rows in the same category
        encoded = cumsum / gb_col.cumcount()
        # Assign the filled result instead of calling fillna(inplace=True) on a column
        # selection, which does not write back to the frame under copy-on-write.
        df_with_mean[column] = encoded.fillna(encoded.mean())

    return df_with_mean

def test_mean_encode_categorical(df_test, df_train, target_col):
    """Mean encode categorical features with epanding mean. nans will be replaced with mean of column. df: pd dataframe, target_col: target column to get values from"""
    
    df_with_mean = df_test.copy()
    for column in df_test.columns:
        if (df_test[column].nunique() <= 60) & (column != target_col): # only features with less than 60 unique values will be encoded
            gb_col = df_train.groupby([column]) # groupby categorical feature
            means = gb_col[target_col].mean()
            df_with_mean[column] = df_with_mean[column].map(means)
            df_with_mean[column].fillna(df_with_mean[column].mean(), inplace=True) 
            
    return df_with_mean

def print_outliers(df):
    """Draw one boxplot per numeric column so outliers can be spotted visually.

    The plots are laid out on a grid four columns wide. Nothing is drawn when ``df``
    has no numeric columns.

    Dependencies: matplotlib.pyplot as plt, seaborn as sns.

    Args:
        df: pandas DataFrame; non-numeric columns are ignored.
    """

    reduce_df = df.select_dtypes(np.number)
    num_plots = len(reduce_df.columns)
    if num_plots == 0:
        return

    plots_per_row = 4
    # Integer ceiling division: plt.subplot needs an int row count ("/" yields a float).
    num_rows = -(-num_plots // plots_per_row)
    plt.figure(figsize=(20, 10 * num_rows))  # define the figure

    for position, column in enumerate(reduce_df.columns, start=1):  # plot each numeric column
        plt.subplot(num_rows, plots_per_row, position)
        sns.boxplot(x=reduce_df[column])

    plt.show()

def print_null_places(df):
    """Plot the number of missing values per column (left) and per row (right)."""

    null_mask = df.isnull()
    # (subplot position, axis to sum over, title)
    panels = (
        (1, 0, "columnwise nulls"),
        (2, 1, "rowwise nulls"),
    )

    plt.figure(figsize=(20,10))
    for position, axis, title in panels:
        plt.subplot(1, 2, position)
        plt.plot(null_mask.sum(axis=axis))
        plt.title(title)

    plt.show()

def replace_numeric_columns_nulls(df):
    """Return a copy of ``df`` with missing values filled in.

    Numeric columns are filled with their own mean. Whatever is still missing
    afterwards (non-numeric columns, or numeric columns with no mean) is set to 0.
    """

    result = df.copy()
    for name in df.select_dtypes(np.number).columns:
        column_mean = result[name].mean()
        result[name] = result[name].fillna(column_mean)
    return result.fillna(0)

def get_rows_above_threshold(df, column, threshold):
    """Return the values of ``column`` (as a Series) that are strictly greater than ``threshold``."""

    is_above = df[column] > threshold
    return df.loc[is_above, column]

def get_correlated_features(df, feature, threshold_corr):
    """List the columns whose absolute correlation with ``feature`` exceeds a threshold.

    args --> df: pd dataframe, feature: name of the column to compare the others against,
    threshold_corr: correlation threshold (compared against the absolute value of
    ``Series.corr``). ``feature`` itself is never returned.
    """

    reference = df[feature]
    return [
        candidate
        for candidate in df.columns
        if abs(reference.corr(df[candidate])) > threshold_corr and candidate != feature
    ]

def get_correlation_graph(df, threshold):
    """Build an undirected graph linking features whose correlation exceeds ``threshold``.

    Columns are visited in frame order. A column that is already a node (because an
    earlier column was found to be correlated with it) is skipped, so an edge between
    two such columns is not added. Columns with no correlated partner never become nodes.
    """

    graph = Graph()

    for column in df.columns:
        if graph.has_node(column):
            continue
        for neighbour in get_correlated_features(df, column, threshold):
            graph.add_edge((column, neighbour))

    return graph

class Graph():
    """Undirected graph stored as an adjacency mapping: node -> set of neighbouring nodes."""

    def __init__(self):
        self._dict = {}

    def add_node(self, node):
        """Add an isolated node; does nothing if the node already exists."""
        if node not in self._dict:
            self._dict[node] = set()

    def add_edge(self, edge):
        """Add an undirected edge given as a ``(node1, node2)`` pair.

        Either endpoint is created first if it is not in the graph yet.
        """

        (node1, node2) = edge
        self.add_node(node1)
        self.add_node(node2)
        self._dict[node1].add(node2)
        self._dict[node2].add(node1)

    def has_edge(self, edge):
        """Return True if the ``(node1, node2)`` edge exists, False otherwise.

        An unknown ``node1`` gives False rather than raising KeyError.
        """
        (node1, node2) = edge
        return node2 in self._dict.get(node1, ())

    def has_node(self, node):
        """Return True if ``node`` is in the graph."""
        return node in self._dict

    def get_edges(self, node):
        """Return the set of neighbours of ``node``, or None if the node is unknown."""
        return self._dict.get(node)

    def print(self):
        """Print the raw adjacency mapping."""
        print(self._dict)
    
def get_uncorrolated_features(df, threshold):
    """Greedily pick a set of features that are not strongly correlated with each other.

    Columns are visited in frame order. A column is kept unless an earlier kept column
    is linked to it in the correlation graph built by ``get_correlation_graph``.
    All features have to be numerical or encoded.

    Args:
        df: pandas DataFrame of features.
        threshold: correlation threshold passed to ``get_correlation_graph``.

    Returns:
        List of kept column names, in frame order.
    """

    corr_graph = get_correlation_graph(df, threshold)
    un_corr_feats = []
    # Track dropped features in a set rather than removing items from the list
    # that is being iterated over.
    dropped = set()

    for feat in df.columns:
        if feat in dropped:
            continue
        un_corr_feats.append(feat)
        corr_feats = corr_graph.get_edges(feat)
        if corr_feats:
            dropped.update(corr_feats)

    return un_corr_feats
  
    
def lgbm_eval(df,target_col, n_splits=1, test_size=0.20, verbose=False, get_model=False, n_rounds=200
              , lgb_params={
               'feature_fraction': 0.75,
               'metric': 'auc',
               'nthread':4, 
               'min_data_in_leaf': 2**7, 
               'bagging_fraction': 0.75, 
               'learning_rate': 0.01, 
               'objective': 'binary', 
               'bagging_seed': 2**7, 
               'num_leaves': 2**7,
               'bagging_freq':1,
               'verbose':0,
               'early_stopping_rounds': 100
    }):
    """Train LightGBM on stratified shuffle splits of ``df`` and report the mean validation AUC.

    For every split, the training fold is mean encoded with ``mean_encode_categorical``
    and the validation fold with ``test_mean_encode_categorical`` (using the training
    fold as reference). A model is then trained with the validation fold as ``valid_sets``.

    args --> df: pd dataframe including ``target_col``, target_col: name of the label column,
    n_splits: int number of data splits, test_size: fraction of rows held out in each split,
    verbose: forwarded to ``lgb.train`` as ``verbose_eval``, get_model: also return the model
    trained on the last split, n_rounds: boosting rounds passed to ``lgb.train``,
    lgb_params: LightGBM parameter dict.

    Returns the AUC averaged over the splits, or ``(average AUC, model)`` when ``get_model`` is True.
    """

    import lightgbm as lgb
    from sklearn.model_selection import StratifiedShuffleSplit
    from sklearn.metrics import roc_auc_score

    splitter = StratifiedShuffleSplit(n_splits=n_splits, test_size=test_size)
    features = df.drop([target_col], axis=1)
    labels = df[target_col]

    auc_total = 0

    for train_idx, valid_idx in splitter.split(features, labels):
        train_fold = df.iloc[train_idx]
        valid_fold = df.iloc[valid_idx]
        valid_labels = labels.iloc[valid_idx]

        # mean encode the training fold (expanding mean) and drop the label
        train_enc = mean_encode_categorical(train_fold, target_col).drop([target_col], axis=1)
        # mean encode the validation fold with statistics from the training fold
        valid_enc = test_mean_encode_categorical(valid_fold.drop([target_col], axis=1), train_fold, target_col)

        train_data = lgb.Dataset(train_enc, label=labels.iloc[train_idx])
        valid_data = lgb.Dataset(valid_enc, valid_labels)
        model = lgb.train(lgb_params, train_data, n_rounds, valid_sets=valid_data,
                          verbose_eval=verbose)

        # accumulate the validation AUC of this split
        auc_total += roc_auc_score(valid_labels, model.predict(valid_enc))

        # free up space before the next split
        del train_enc, valid_enc
        gc.collect()

    mean_auc = auc_total / splitter.get_n_splits()
    if get_model:
        return (mean_auc, model)
    return mean_auc
    

def print_feature_correlation(df):
    """Show the pairwise correlation matrix of ``df`` as a heatmap."""

    correlations = df.corr()
    plt.figure(figsize=(30,20))
    sns.heatmap(correlations)
    plt.show()
    

def normalize_dataframe(df):
    """Mean-center and range-scale the columns of a numeric dataframe.

    Each column becomes ``(x - mean) / (max - min)``. Columns whose distinct values
    are all in {0, 1} (for example one-hot columns) are left unchanged. A constant
    column has a range of 0; it is only centered (all zeros) instead of being divided
    by zero.

    Args:
        df: pandas DataFrame; every column has to be numeric.

    Returns:
        A normalized copy of ``df``; ``df`` itself is left untouched.
    """

    df_norm = df.copy()
    for column in df.columns:
        values = df[column]
        # binary / one-hot column: keep as is, whatever the order of its unique values
        if set(values.unique()) <= {0, 1}:
            continue

        c_range = values.max() - values.min()  # range of column
        centered = values - values.mean()
        # avoid 0 / 0 = NaN for constant columns
        df_norm[column] = centered if c_range == 0 else centered / c_range

    return df_norm


def explore_target(df, target_column):
    """Plot the balance and the row-wise distribution of the target column.

    Args:
        df: pandas DataFrame containing ``target_column``.
        target_column: name of the target column to explore.
    """

    plt.figure(figsize=(30,60))
    # target balance: histogram of the target values
    plt.subplot(2,1,1)
    plt.hist(df[target_column])
    plt.title("Target balance:")

    # target distribution: target value against row position
    plt.subplot(2,1,2)
    # use the target_column argument rather than a hard-coded column name
    sns.stripplot(data=df, x=target_column, y=range(len(df)))
    plt.title("Target distirbution")


def nn_classifier(input_shape):
    """Build and compile a small feed-forward binary classifier.

    Architecture: input -> Dense(32, relu) -> Dropout(0.3) -> Dense(1, sigmoid),
    compiled with the adam optimizer, binary cross-entropy loss and an accuracy metric.
    """

    import keras
    from keras.layers import InputLayer, Dense, Dropout
    from keras.models import Sequential

    layers = [
        InputLayer(input_shape=input_shape),
        Dense(32, activation='relu', kernel_initializer='random_normal'),
        Dropout(0.3),
        Dense(1, activation='sigmoid', kernel_initializer='random_normal'),
    ]

    classifier = Sequential()
    for layer in layers:
        classifier.add(layer)

    classifier.compile(
        optimizer='adam',
        loss=keras.losses.BinaryCrossentropy(from_logits=False),
        metrics=["accuracy"],
    )
    return classifier

class Data():
    """A high level data class.

    Holds a train set and a test set and supports getting them raw or mean encoded.
    """

    def __init__(self, train_set, test_set, target_column, unique_coulmn):
        """Init a data structure.

        Positional arguments:
        train_set -- pandas dataframe for training
        test_set -- pandas dataframe for testing
        target_column -- target column of train set
        unique_coulmn -- name of the identifier ("ID") column of the data
        """

        self._train_set = train_set
        self._test_set = test_set
        self._target = target_column
        # the parameter is spelled "unique_coulmn"; read that exact name
        self._id = unique_coulmn

    def get_all(self):
        """Returns train and test rows stacked together, without the target column"""

        train_features = self._train_set.drop([self._target], axis=1)
        # pd.concat instead of DataFrame.append, which was removed in pandas 2.0
        return pd.concat([train_features, self._test_set]).reset_index(drop=True)

    def get_train(self):
        """Returns a copy of the train set, with target column"""

        return self._train_set.copy()

    def get_test(self):
        """Returns a copy of the test set"""

        return self._test_set.copy()

    def get_train_enc(self):
        """Returns train set mean encoded with target"""

        return mean_encode_categorical(self._train_set, self._target)

    def get_test_enc(self):
        """Returns test set mean encoded with the train set's target means"""

        return test_mean_encode_categorical(self._test_set, self._train_set, self._target)

    def get_all_enc(self):
        """Returns train and test rows stacked together, mean encoded, without the target column"""

        train_features_enc = self.get_train_enc().drop([self._target], axis=1)
        # pd.concat instead of DataFrame.append, which was removed in pandas 2.0
        return pd.concat([train_features_enc, self.get_test_enc()]).reset_index(drop=True)

    def add_features(self, df_with_features):
        """Joins the columns of df_with_features onto the train and test sets.

        The identifier column of each set is matched against the index of df_with_features.
        """

        self._train_set = self._train_set.join(df_with_features, on=self._id)
        self._test_set = self._test_set.join(df_with_features, on=self._id)

def change_aggragate_column_names(aggs, prefix):
    """Flatten (column, aggregation) column tuples into "<prefix>_<column>_<AGGREGATION>" strings.

    The columns of ``aggs`` are replaced in place; nothing is returned.
    """

    flat_names = []
    for entry in aggs.columns.tolist():
        flat_names.append("_".join([prefix, entry[0], entry[1].upper()]))
    aggs.columns = pd.Index(flat_names)
    
def get_numeric_columns_aggragates(df, gp_column, prefix, to_agg=['min', 'max', 'mean', 'sum', 'var']):
    """Aggregate every numeric column of ``df`` per value of ``gp_column``.

    Each numeric column (except the grouping column itself) is aggregated with every
    function in ``to_agg``. The result is indexed by ``gp_column`` and its columns are
    renamed by ``change_aggragate_column_names`` using ``prefix``.
    """

    value_columns = df.select_dtypes(np.number).columns
    if gp_column in value_columns:
        # the grouping key must not be aggregated
        value_columns = value_columns.drop([gp_column])

    agg_spec = dict.fromkeys(value_columns, to_agg)
    grouped = df.groupby([gp_column]).agg(agg_spec)
    change_aggragate_column_names(grouped, prefix)  # flatten the column names
    return grouped

def get_bayesian_hp_space():
    """Returns a dictionary with the hyperopt search space for the LightGBM hyper parameters.

    ``boosting_type`` is a nested choice: each option is a dict carrying the boosting
    type together with its own ``subsample`` value.
    """
    from hyperopt import hp

    boosting_options = [
        {'boosting_type': 'gbdt', 'subsample': hp.uniform('gdbt_subsample', 0.5, 1)},
        {'boosting_type': 'dart', 'subsample': hp.uniform('dart_subsample', 0.5, 1)},
        {'boosting_type': 'goss', 'subsample': 1.0},
    ]

    space = {}
    space['boosting_type'] = hp.choice('boosting_type', boosting_options)
    space['num_leaves'] = hp.quniform('num_leaves', 20, 150, 1)
    space['learning_rate'] = hp.loguniform('learning_rate', np.log(0.01), np.log(0.5))
    space['subsample_for_bin'] = hp.quniform('subsample_for_bin', 20000, 300000, 20000)
    space['min_child_samples'] = hp.quniform('min_child_samples', 20, 500, 5)
    space['reg_alpha'] = hp.uniform('reg_alpha', 0.0, 1.0)
    space['reg_lambda'] = hp.uniform('reg_lambda', 0.0, 1.0)
    space['colsample_bytree'] = hp.uniform('colsample_by_tree', 0.6, 1.0)
    space['is_unbalance'] = hp.choice('is_unbalance', [True, False])
    return space


# Load data

In [None]:
# Read every table of the Home Credit dataset from the Kaggle input directory.
# main train table (contains the TARGET column used below)
applications_org = pd.read_csv("/kaggle/input/home-credit-default-risk/application_train.csv")
bureau = pd.read_csv("/kaggle/input/home-credit-default-risk/bureau.csv")
previous_app = pd.read_csv("/kaggle/input/home-credit-default-risk/previous_application.csv")
# main test table (same layout as the train table, without TARGET)
app_test = pd.read_csv("/kaggle/input/home-credit-default-risk/application_test.csv")
bureau_balance = pd.read_csv("/kaggle/input/home-credit-default-risk/bureau_balance.csv")
credit_card_balance = pd.read_csv("/kaggle/input/home-credit-default-risk/credit_card_balance.csv")
installment_payments = pd.read_csv("/kaggle/input/home-credit-default-risk/installments_payments.csv")
pos_cash = pd.read_csv("/kaggle/input/home-credit-default-risk/POS_CASH_balance.csv")

In [None]:
# Plot the balance and row-wise distribution of the TARGET column of the train table.
explore_target(applications_org, "TARGET")

In [None]:
# Fraction of train rows whose TARGET equals 1.
print((applications_org["TARGET"] == 1).sum() / len(applications_org)) # TARGET balance

# Merge train and test for feature generation

In [None]:
# Stack the train rows (without TARGET) on top of the test rows so every feature is
# generated once for both sets. The first len(applications_org) rows are the train rows.
applications = applications_org.drop(["TARGET"], axis=1)
# pd.concat instead of DataFrame.append, which was removed in pandas 2.0
applications = pd.concat([applications, app_test]).reset_index(drop=True)

# Utilizing applications for features

In [None]:
# Ratio features: new column -> (numerator column, denominator column)
ratio_features = {
    "PAYMENT_RATE": ("AMT_ANNUITY", "AMT_CREDIT"),
    "RELATIVE_LOAN": ("AMT_CREDIT", "AMT_INCOME_TOTAL"),
    "PER_PERSON_INCOME": ("AMT_INCOME_TOTAL", "CNT_FAM_MEMBERS"),
    "ANNUITY_INCOME_PERC": ("AMT_ANNUITY", "AMT_INCOME_TOTAL"),
    "DAYS_EMPLOYED_PERC": ("DAYS_EMPLOYED", "DAYS_BIRTH"),
}
for ratio_name, (numerator, denominator) in ratio_features.items():
    applications[ratio_name] = applications[numerator] / applications[denominator]

# Utilizing bureau table for features

In [None]:
## Create older loans status feature (num open / closed)
# CREDIT_ACTIVE values that are not keys of the mapping become NaN and are skipped by sum()
bureau["OPEN_CREDITS"] = bureau["CREDIT_ACTIVE"].map({'Active': 1, 'Closed': 0})
credit_active_per_id = bureau.groupby(["SK_ID_CURR"])['OPEN_CREDITS'].sum() # sum open credits
bureau["CLOSE_CREDITS"] = bureau["CREDIT_ACTIVE"].map({'Active': 0, 'Closed': 1})
credit_closed_per_id = bureau.groupby(["SK_ID_CURR"])['CLOSE_CREDITS'].sum() # sum closed credits
# merge
# applicants with no bureau rows get NaN in the joined columns
applications = applications.join(credit_active_per_id, on="SK_ID_CURR")
applications = applications.join(credit_closed_per_id, on="SK_ID_CURR")


## Create monthly balance features
# first / last MONTHS_BALANCE and number of balance rows for each bureau credit
bb_aggregations = {'MONTHS_BALANCE': ['min', 'max', 'size']}
bureau_balance_gp_agg = bureau_balance.groupby(["SK_ID_BUREAU"]).agg(bb_aggregations)
# flatten (column, aggregation) tuples into "COLUMN_AGGREGATION" names
bureau_balance_gp_agg.columns = pd.Index([e[0] + "_" + e[1].upper() for e in bureau_balance_gp_agg.columns.tolist()])
# NOTE: bureau is extended in place, so this cell cannot be re-run without reloading bureau
bureau = bureau.join(bureau_balance_gp_agg, on="SK_ID_BUREAU")

## Create numeric aggs features
# bureau column -> aggregations computed per applicant (SK_ID_CURR)
num_aggregations = {
        'DAYS_CREDIT': ['min', 'max', 'mean', 'var'],
        'DAYS_CREDIT_ENDDATE': ['min', 'max', 'mean'],
        'DAYS_CREDIT_UPDATE': ['mean'],
        'CREDIT_DAY_OVERDUE': ['max', 'mean'],
        'AMT_CREDIT_MAX_OVERDUE': ['mean'],
        'AMT_CREDIT_SUM': ['max', 'mean', 'sum'],
        'AMT_CREDIT_SUM_DEBT': ['max', 'mean', 'sum'],
        'AMT_CREDIT_SUM_OVERDUE': ['mean'],
        'AMT_CREDIT_SUM_LIMIT': ['mean', 'sum'],
        'AMT_ANNUITY': ['max', 'mean'],
        'CNT_CREDIT_PROLONG': ['sum'],
        'MONTHS_BALANCE_MIN': ['min'],
        'MONTHS_BALANCE_MAX': ['max'],
        'MONTHS_BALANCE_SIZE': ['mean', 'sum']
    }
bureau_gp_agg = bureau.groupby("SK_ID_CURR").agg(num_aggregations)
# flatten the column names; unlike the previous-application aggregates these get no prefix
bureau_gp_agg.columns = pd.Index([e[0] + "_" + e[1].upper() for e in bureau_gp_agg.columns.tolist()])
applications = applications.join(bureau_gp_agg, on="SK_ID_CURR")

# Utilizing previous applications table for features

In [None]:
## Create previous home credit loans status feature
# One 0/1 indicator column per outcome of a previous application.
# new column -> NAME_CONTRACT_STATUS value it counts
# NOTE(review): the unused status is assumed to be spelled 'Unused offer' in
# previous_application.csv - confirm against the data.
status_features = {
    "PREV_HC_APPROVED": "Approved",
    "PREV_HC_REFUSED": "Refused",
    "PREV_HC_CANCELED": "Canceled",
    "PREV_HC_UNUSED": "Unused offer",
}
for status_feature, status_value in status_features.items():
    # each indicator counts its own status only
    previous_app[status_feature] = (previous_app["NAME_CONTRACT_STATUS"] == status_value).astype(int)

# number of previous applications with each outcome, per applicant
prev_hc_approved = previous_app.groupby(["SK_ID_CURR"])["PREV_HC_APPROVED"].sum()
prev_hc_refused = previous_app.groupby(["SK_ID_CURR"])["PREV_HC_REFUSED"].sum()
prev_hc_canceled = previous_app.groupby(["SK_ID_CURR"])["PREV_HC_CANCELED"].sum()
prev_hc_unused = previous_app.groupby(["SK_ID_CURR"])["PREV_HC_UNUSED"].sum()

#merge
applications = applications.join(prev_hc_approved, on='SK_ID_CURR')
applications = applications.join(prev_hc_refused, on='SK_ID_CURR')
applications = applications.join(prev_hc_canceled, on='SK_ID_CURR')
applications = applications.join(prev_hc_unused, on='SK_ID_CURR')
    
## Create previous hc agg features
# ratio of the amount applied for to the amount granted
# NOTE(review): presumably infinite where AMT_CREDIT is 0 - confirm how those rows are handled
previous_app['APP_CREDIT_PERC'] = previous_app['AMT_APPLICATION'] / previous_app['AMT_CREDIT']
# previous-application column -> aggregations computed per applicant (SK_ID_CURR)
num_aggregations = {
        'AMT_ANNUITY': ['min', 'max', 'mean'],
        'AMT_APPLICATION': ['min', 'max', 'mean'],
        'AMT_CREDIT': ['min', 'max', 'mean'],
        'APP_CREDIT_PERC': ['min', 'max', 'mean', 'var'],
        'AMT_DOWN_PAYMENT': ['min', 'max', 'mean'],
        'AMT_GOODS_PRICE': ['min', 'max', 'mean'],
        'HOUR_APPR_PROCESS_START': ['min', 'max', 'mean'],
        'RATE_DOWN_PAYMENT': ['min', 'max', 'mean'],
        'DAYS_DECISION': ['min', 'max', 'mean'],
        'CNT_PAYMENT': ['mean', 'sum'],
    }
prev_app_aggs = previous_app.groupby(["SK_ID_CURR"]).agg(num_aggregations)
# flatten (column, aggregation) tuples into "PREV_COLUMN_AGGREGATION" names
prev_app_aggs.columns = pd.Index(["PREV_"+e[0] + "_" + e[1].upper() for e in prev_app_aggs.columns.tolist()])
applications = applications.join(prev_app_aggs, on="SK_ID_CURR")

In [None]:
## Create previous home credit loans rejection reason
rej_reason = previous_app["CODE_REJECT_REASON"].unique()

for reason in rej_reason:
    reason_column = "PREV_REASON_REJ_{}_COUNT".format(reason)
    # 1 where the previous application was rejected for this reason, 0 otherwise
    reason_map = (previous_app["CODE_REJECT_REASON"] == reason).astype(int)
    previous_app[reason_column] = reason_map
    # number of previous applications with this reason, per applicant
    reason_sum = previous_app.groupby(['SK_ID_CURR'])[reason_column].sum()
    #merge
    applications = applications.join(reason_sum, on='SK_ID_CURR')

# Utilizing credit card balance for features

In [None]:
# SK_ID_PREV is an identifier, not a measurement, so drop it before aggregating
credit_card_balance = credit_card_balance.drop(columns=["SK_ID_PREV"])
# per-applicant aggregates of every remaining numeric column, with a "CC" prefix
cc_aggs = get_numeric_columns_aggragates(
    df=credit_card_balance,
    gp_column="SK_ID_CURR",
    prefix="CC",
)
applications = applications.join(cc_aggs, on="SK_ID_CURR")

# Get depth 2 features

In [None]:
# Aggregate the instalment payments per previous application (SK_ID_PREV) ...
ip_prev_sk_id_aggs = get_numeric_columns_aggragates(installment_payments[["SK_ID_PREV", "DAYS_INSTALMENT",  "AMT_PAYMENT", "AMT_INSTALMENT"]]
                                                   , "SK_ID_PREV", "IP_GP_PREV")
# ... and attach them to every payment row. The aggregates are indexed by SK_ID_PREV,
# so the join key has to be SK_ID_PREV (same as for pos_cash below).
installment_payments = installment_payments.join(ip_prev_sk_id_aggs, on="SK_ID_PREV")

# Same two steps for the POS CASH balance table.
pos_cash_prev_sk_id_aggs = get_numeric_columns_aggragates(pos_cash[["SK_ID_PREV", "MONTHS_BALANCE", "CNT_INSTALMENT", "CNT_INSTALMENT_FUTURE"]], "SK_ID_PREV", "POS_CASH_GP_PREV")
pos_cash = pos_cash.join(pos_cash_prev_sk_id_aggs, on="SK_ID_PREV")

# Utilizing installment payment for features

In [None]:
# SK_ID_PREV is an identifier, not a measurement, so drop it before aggregating
installment_payments = installment_payments.drop(columns=["SK_ID_PREV"])
# per-applicant aggregates of every remaining numeric column, with an "IP" prefix
ip_aggs = get_numeric_columns_aggragates(
    df=installment_payments,
    gp_column="SK_ID_CURR",
    prefix="IP",
)
applications = applications.join(ip_aggs, on="SK_ID_CURR")

# Utilize POS CASH for features

In [None]:
# SK_ID_PREV is an identifier, not a measurement, so drop it before aggregating
pos_cash = pos_cash.drop(columns=["SK_ID_PREV"])
# per-applicant aggregates of every remaining numeric column, with a "POS" prefix
pos_cash_aggs = get_numeric_columns_aggragates(
    df=pos_cash,
    gp_column="SK_ID_CURR",
    prefix="POS",
)
applications = applications.join(pos_cash_aggs, on="SK_ID_CURR")

In [None]:
# Replace every remaining missing value (in all columns, numeric or not) with 0.
# NOTE(review): infinite values are presumably filled too, because the first cell sets
# the 'use_inf_as_na' pandas option - confirm.
applications = applications.fillna(0)

In [None]:
# mean encode train applications with expanding mean
# Take explicit copies: assigning a new column to a plain slice of `applications`
# triggers SettingWithCopyWarning and may not behave as intended.
train_app = applications[:len(applications_org)].copy()
# row i of train_app is row i of applications_org, so the labels align on the index
train_app["TARGET"] = applications_org["TARGET"]
#train_app_enc = mean_encode_categorical(train_app, "TARGET")
#train_app_enc = train_app_enc.drop(["TARGET"], axis=1)

# mean encode test applications with train whole mean
test_app = applications[len(applications_org):].copy()
test_app_enc = test_mean_encode_categorical(test_app, train_app, "TARGET")

# append train and test again
#applications_enc = train_app_enc.append(test_app_enc).reset_index()
#applications_enc = applications_enc.drop(["index"], axis=1)

In [None]:
# The feature selection below (correlation threshold 0.9) is commented out; its result
# is loaded from a CSV saved in a previous session, read from the column named "0".
#un_corr_feats = get_uncorrolated_features(applications_enc, 0.9)
un_corr_feats = pd.read_csv("/kaggle/input/uncorrelated-features/uncorr_feats.csv")["0"].tolist() # load from previous sessions
print(len(un_corr_feats))
#print_feature_correlation(applications_enc[un_corr_feats])

In [None]:
#pd.DataFrame(un_corr_feats).to_csv("/kaggle/working/uncorr_feats.csv") # saving feats

In [None]:
# Baseline: validation AUC of LightGBM with the default parameters of lgbm_eval on one 80/20 split.
# NOTE(review): un_corr_feats has to contain "TARGET" for this call to work - confirm against the CSV.
print(lgbm_eval(train_app[un_corr_feats], target_col="TARGET", n_splits=1, test_size=0.2, verbose=True, n_rounds=6000))

# Use bayesian search for hyper parameter optimization

In [None]:
import hyperopt as hp
from hyperopt import tpe, Trials, fmin
from hyperopt import STATUS_OK

space = get_bayesian_hp_space()
trials = Trials()

def objective(params):
    """Hyperopt objective: 1 - mean validation AUC of LightGBM for one sampled parameter set.

    ``params`` is one sample of the space returned by ``get_bayesian_hp_space``. Its
    nested ``boosting_type`` choice is flattened into top-level ``boosting_type`` and
    ``subsample`` keys, fixed training settings are added, and the model is evaluated
    with ``lgbm_eval`` on a random sample of 100000 train rows (2 splits).

    Returns a dict with the ``loss`` to minimise, the flattened ``hyperparameters``
    and the hyperopt ``status``.
    """
    # work on a copy so the dict handed in by the caller is not modified
    params = dict(params)

    boosting_choice = params['boosting_type']
    params['subsample'] = boosting_choice.get('subsample', 1.0) # make subsample a top-level key
    params['boosting_type'] = boosting_choice['boosting_type'] # make boosting type a top-level key
    params['early_stopping_round'] = 100
    params['metric'] = 'auc'
    params['objective'] = 'binary'

    for param in ['num_leaves', 'subsample_for_bin', 'min_child_samples']:
        params[param] = int(params[param]) # make sure supposed-to-be-int params are int

    auc = lgbm_eval(train_app[un_corr_feats].sample(100000), target_col="TARGET", n_splits=2, test_size=0.2,
                    verbose=False, n_rounds=2000, lgb_params=params)
    return {'loss': 1 - auc, 'hyperparameters': params, 'status': STATUS_OK}


# NOTE(review): for hp.choice parameters fmin presumably returns the index of the chosen
# option rather than its value; hyperopt.space_eval(space, best_params) recovers the values - confirm.
best_params = fmin(fn=objective, algo=tpe.suggest, space=space, trials=trials, max_evals=100)

In [None]:
# Display the result of the hyperopt search.
best_params

In [None]:
#really_un_corr_feats = get_uncorrolated_features(applications_enc, 0.7)
# The stricter feature selection (correlation threshold 0.7) is commented out; its result
# is loaded from a CSV saved in a previous session.
#really_un_corr_feats = get_uncorrolated_features(applications_enc, 0.7)
really_un_corr_feats = pd.read_csv("/kaggle/input/uncorrelated-features/really_uncorr_feats.csv")["0"].tolist() # load from previous sessions

In [None]:
#pd.DataFrame(really_un_corr_feats).to_csv("/kaggle/working/really_uncorr_feats.csv") # saving feats#

In [None]:
# Work on the stricter feature subset (really_un_corr_feats) for the neural network.
# one hot encode categorical features
app_with_dummies = pd.get_dummies(applications[really_un_corr_feats], prefix_sep='_', drop_first=True)
# get only train set
# (the train rows are the first len(applications_org) rows of applications)
train_with_dummies = app_with_dummies[:len(applications_org)]

In [None]:
## preprocess data for nn
# mean-center and range-scale the non-binary columns
norm_train = normalize_dataframe(train_with_dummies) 
norm_train = replace_numeric_columns_nulls(norm_train) # fill nulls with average

In [None]:
# The input layer takes one value per column of the normalized train frame.
nn = nn_classifier((len(norm_train.columns), )) # build classifier

In [None]:
# Positional hold-out (no shuffling): the last 50000 train rows are used for
# validation, everything before them for fitting.
# train set
X_train = norm_train.iloc[:-50000]
Y_train = applications_org["TARGET"].iloc[:-50000]
# test set

X_test = norm_train.iloc[-50000:]
Y_test = applications_org["TARGET"].iloc[-50000:]

In [None]:
# Train the network for 25 epochs, evaluating on the hold-out rows after every epoch.
hist = nn.fit(X_train, Y_train, batch_size=128, epochs=25, validation_data=(X_test,Y_test), shuffle=True)

In [None]:
from sklearn.metrics import roc_auc_score
# AUC of the network's predictions on the hold-out rows
print(roc_auc_score(Y_test, nn.predict(X_test)))

# Get nn classifier features for lgbm

In [None]:
# normalize entire dataframe for nn digestion
# NOTE: statistics are computed over train and test rows together here, whereas the
# network was fitted on data normalized with train-only statistics (norm_train).
norm_app = normalize_dataframe(app_with_dummies)
norm_app = replace_numeric_columns_nulls(norm_app)

In [None]:
# Add the network's prediction as a feature for the LightGBM model.
# nn.predict is flattened with ravel() so a 1-D vector (one value per row) is
# assigned rather than a single-column 2-D array.
train_app["NN_PREDICTION"] = nn.predict(norm_app[:len(applications_org)]).ravel()
test_app_enc["NN_PREDICTION"] = nn.predict(norm_app[len(applications_org):]).ravel()

In [None]:
# Register the new feature once, so re-running this cell does not add a duplicate column name.
if "NN_PREDICTION" not in un_corr_feats:
    un_corr_feats.append("NN_PREDICTION")

In [None]:
import gc
gc.enable()
# Drop the one-hot encoded frame, which is not used by the cells below, and reclaim its memory.
# NOTE: re-running this cell raises NameError because the name has already been deleted.
del app_with_dummies
gc.collect()

In [None]:
# LightGBM parameters used for the final model.
copied_params = {
    # 'nthread' is the thread-count key already used by lgbm_eval's default parameters;
    # the bare key 'thread' is not a LightGBM parameter name.
    'nthread':4,
    # NOTE(review): n_estimators is an alias of num_iterations and presumably takes
    # precedence over the n_rounds argument given to lgbm_eval - confirm.
    'n_estimators':10000,
    'learning_rate':0.03,
    'num_leaves':34,
    'colsample_bytree':0.9497036,
    'subsample':0.8715623,
    'max_depth':8,
    'reg_alpha':0.041545473,
    'reg_lambda':0.0735294,
    'min_split_gain':0.0222415,
    'min_child_weight':39.3259775,
    'metric': 'auc',
    'objective': 'binary',
    'early_stopping_rounds':100
}

In [None]:
#best_params

In [None]:
"""best_params['boosting_type'] = 'goss'
for param in ['num_leaves', 'subsample_for_bin', 'min_child_samples']:
        best_params[param] = int(best_params[param])
best_params['is_unbalance'] = True
best_params['early_stopping_rounds'] = 100
best_params['verbose'] = 0
best_params['metric'] = 'auc'
"""

In [None]:
# Train the final model on a single 90/10 split. `loss` holds the validation AUC
# returned by lgbm_eval and `model` the trained booster.
loss, model = lgbm_eval(train_app[un_corr_feats], target_col="TARGET", n_splits=1, test_size=0.1, verbose=True, get_model=True, n_rounds=6000, lgb_params=copied_params)

In [None]:
# The test set has no label: use the same feature list without "TARGET".
# remove() raises ValueError if "TARGET" is not in the list.
test_feats = un_corr_feats.copy()
test_feats.remove("TARGET")

# Predicting test set

In [None]:
# Start from the sample submission file; only its TARGET column is overwritten below.
submission = pd.read_csv("/kaggle/input/home-credit-default-risk/sample_submission.csv")

In [None]:
# NOTE(review): assumes the rows of test_app_enc are in the same order as the rows of
# sample_submission.csv - confirm.
submission["TARGET"] = model.predict(test_app_enc[test_feats]) # lgbm predict

In [None]:
# Write the submission file to the Kaggle working directory, without the index column.
submission.to_csv("/kaggle/working/submission.csv", index=False)