In [None]:
import numpy as np
import pandas as pd
import lightgbm as gbm


In [None]:
# Application-level tables: one row per loan application.
train = pd.read_csv('../input/application_train.csv')
test = pd.read_csv('../input/application_test.csv')

# History tables, read as-is; they are summarised per SK_ID_CURR further down.
bureau = pd.read_csv('../input/bureau.csv')
bureau_balance = pd.read_csv('../input/bureau_balance.csv')
credit_card_balance = pd.read_csv('../input/credit_card_balance.csv')
pos_cash_balance = pd.read_csv('../input/POS_CASH_balance.csv')
previous_applications = pd.read_csv('../input/previous_application.csv')

    

In [None]:
# Preview the first rows of the training applications read from application_train.csv.
train.head()

In [None]:
# Preview the first rows of the table read from bureau.csv.
bureau.head()

In [None]:
# Preview the first rows of the table read from bureau_balance.csv.
bureau_balance.head()

In [None]:
# Preview the first rows of the table read from credit_card_balance.csv.
credit_card_balance.head()

In [None]:
# Preview the first rows of the table read from POS_CASH_balance.csv.
pos_cash_balance.head()

In [None]:
# Preview the first rows of the table read from previous_application.csv.
previous_applications.head()

In [None]:
from sklearn.preprocessing import LabelEncoder

def encoding_missing_values(df):
    """Label-encode object columns and zero-fill numeric gaps, in place.

    Every column of dtype ``object`` is cast to ``str`` and replaced by the
    integer codes a freshly fitted ``LabelEncoder`` assigns to it. After that,
    missing values in all ``float64`` / ``int64`` columns are replaced by 0.

    The frame passed in is modified and the same object is returned. Because
    the encoder is refitted on every call, the integer assigned to a given
    category depends on which values are present in *this* frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to encode; mutated by this call.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, now with encoded / filled columns.
    """
    for text_col in df.select_dtypes('object').columns.tolist():
        # The cast to str happens before fitting so the encoder only ever
        # sees strings, whatever mix of values the column held.
        df[text_col] = LabelEncoder().fit_transform(df[text_col].astype(str))

    # Selected after the loop, so the freshly encoded integer columns that
    # land in one of these two dtypes are part of the selection as well.
    number_cols = df.select_dtypes(['float64', 'int64']).columns.tolist()
    df[number_cols] = df[number_cols].fillna(0)
    return df
    

In [None]:
# Remember how many rows are training rows (used later to split the stacked
# train/test frame apart again) and set the label aside before encoding.
train_shape = train.shape[0]
target = train.TARGET.copy()
train = train.drop(labels=['TARGET'], axis=1)

# Encode train and test in ONE call. encoding_missing_values fits its label
# encoder on whatever frame it is given, so encoding the two frames
# separately gives the same category different integer codes whenever their
# category sets differ (for example a level that only occurs in train). The
# model would then be scored on test codes that mean something else than the
# codes it was trained on.
applications = encoding_missing_values(
    pd.concat([train, test], ignore_index=True)
)
train = applications.iloc[:train_shape].reset_index(drop=True)
test = applications.iloc[train_shape:].reset_index(drop=True)

# The history tables are only ever used on their own, so each one can be
# encoded independently.
bureau = encoding_missing_values(bureau)
bureau_balance = encoding_missing_values(bureau_balance)
credit_card_balance = encoding_missing_values(credit_card_balance)
pos_cash_balance = encoding_missing_values(pos_cash_balance)
previous_applications = encoding_missing_values(previous_applications)

In [None]:
# Per-applicant summary of every bureau column: mean, count, median and max
# within each SK_ID_CURR group.
bureau_stats = bureau.groupby('SK_ID_CURR').agg(['mean', 'count', 'median', 'max'])

# Build the flat names from the (column, statistic) pairs the aggregation
# actually produced, so names can never drift out of step with the data.
# The 'bureau_' prefix keeps these features identifiable after merging:
# without it, another aggregated table with an equally named raw column
# would yield the same feature name and pandas would rename both to _x/_y.
bureau_labels = ['bureau_' + column + '_' + stat for column, stat in bureau_stats.columns]
bureau_stats.columns = bureau_labels
bureau_stats = bureau_stats.reset_index()
bureau_stats.head()

In [None]:
# Per-applicant summary of every credit-card-balance column: mean, count,
# median and max within each SK_ID_CURR group.
credit_card_stats = credit_card_balance.groupby('SK_ID_CURR').agg(['mean', 'count', 'median', 'max'])

# Build the flat names from the (column, statistic) pairs the aggregation
# actually produced, so names can never drift out of step with the data.
# The 'credit_card_' prefix keeps these features identifiable after merging:
# without it, another aggregated table with an equally named raw column
# would yield the same feature name and pandas would rename both to _x/_y.
credit_card_balance_labels = [
    'credit_card_' + column + '_' + stat for column, stat in credit_card_stats.columns
]
credit_card_stats.columns = credit_card_balance_labels
credit_card_stats = credit_card_stats.reset_index()
credit_card_stats.head()

In [None]:
# Per-applicant summary of every POS/cash-balance column: mean, count,
# median and max within each SK_ID_CURR group.
pos_cash_stats = pos_cash_balance.groupby('SK_ID_CURR').agg(['mean', 'count', 'median', 'max'])

# Build the flat names from the (column, statistic) pairs the aggregation
# actually produced, so names can never drift out of step with the data.
# The 'pos_cash_' prefix keeps these features identifiable after merging:
# without it, another aggregated table with an equally named raw column
# would yield the same feature name and pandas would rename both to _x/_y.
pos_cash_balance_labels = [
    'pos_cash_' + column + '_' + stat for column, stat in pos_cash_stats.columns
]
pos_cash_stats.columns = pos_cash_balance_labels
pos_cash_stats = pos_cash_stats.reset_index()
pos_cash_stats.head()

In [None]:
# Per-applicant summary of every previous-application column: mean, count,
# median and max within each SK_ID_CURR group.
previous_applications_stats = previous_applications.groupby('SK_ID_CURR').agg(['mean', 'count', 'median', 'max'])

# Build the flat names from the (column, statistic) pairs the aggregation
# actually produced, so names can never drift out of step with the data.
# The 'previous_applications_' prefix keeps these features identifiable
# after merging: without it, another aggregated table with an equally named
# raw column would yield the same feature name and pandas would rename both
# to _x/_y.
previous_applications_labels = [
    'previous_applications_' + column + '_' + stat
    for column, stat in previous_applications_stats.columns
]
previous_applications_stats.columns = previous_applications_labels
previous_applications_stats = previous_applications_stats.reset_index()
previous_applications_stats.head()

In [None]:
# Re-inspect train now that TARGET has been dropped and the columns have been encoded.
train.head()

In [None]:
# Stack the training rows on top of the test rows, then attach each
# per-applicant statistics table with a left join on SK_ID_CURR. A left join
# keeps every application row, in order, whether or not the applicant has
# any history, so the first `train_shape` rows are still the training rows.
#
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
train_test_combo = (
    pd.concat([train, test])
    .merge(bureau_stats, on='SK_ID_CURR', how='left')
    .merge(credit_card_stats, on='SK_ID_CURR', how='left')
    .merge(pos_cash_stats, on='SK_ID_CURR', how='left')
    .merge(previous_applications_stats, on='SK_ID_CURR', how='left')
)

In [None]:
# Check the merged frame: application columns followed by the joined statistics columns.
train_test_combo.head()

In [None]:
# The applicant id was only needed as the join key; remove it before
# modelling. Rebinding (rather than inplace=True) together with
# errors='ignore' keeps this cell safe to re-run: a second execution is a
# no-op instead of a KeyError on the already-missing column.
train_test_combo = train_test_combo.drop(columns=['SK_ID_CURR'], errors='ignore')

In [None]:
# Undo the stacking by position: the first `train_shape` rows are the
# training applications, everything after them is the test set.
train_df = train_test_combo.iloc[:train_shape]
test_df = train_test_combo.iloc[train_shape:]

In [None]:
# LightGBM training parameters for the binary classifier, evaluated by AUC.
param = {
    'objective': 'binary',
    'boosting_type': 'gbdt',
    'metric': 'auc',
    'nthread': 4,
    'shrinkage_rate': 0.025,   # LightGBM alias of learning_rate
    'max_depth': 8,
    'min_data_in_leaf': 100,
    'min_child_weight': 2,
    'bagging_fraction': 0.75,
    # bagging_fraction is ignored unless bagging_freq is non-zero (the
    # default 0 disables bagging), so re-sample rows at every iteration.
    'bagging_freq': 1,
    'feature_fraction': 0.75,
    'min_split_gain': .01,
    'lambda_l1': 1,
    'lambda_l2': 1,
    'num_leaves': 36,
}





In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the labelled rows for validation. random_state pins the
# shuffle so the split, and therefore the reported validation AUC and the
# early-stopping point, are the same on every Restart & Run All.
train_x, valid_x, train_y, valid_y = train_test_split(
    train_df, target, test_size=0.2, shuffle=True, random_state=42
)

In [None]:
# Wrap both splits in LightGBM Dataset objects, features plus labels.
train_data = gbm.Dataset(data=train_x, label=train_y)
valid_data = gbm.Dataset(data=valid_x, label=valid_y)

In [None]:
# Train for at most 1000 boosting rounds, monitoring the validation set.
# Early stopping and logging go through callbacks: the early_stopping_rounds
# and verbose_eval keyword arguments of train() were removed in LightGBM 4.0.
lgbm = gbm.train(
    param,
    train_data,
    num_boost_round=1000,
    valid_sets=[valid_data],
    callbacks=[
        # Stop once the validation metric has not improved for 100 rounds.
        gbm.early_stopping(stopping_rounds=100),
        # Print the validation metric every 10 rounds.
        gbm.log_evaluation(period=10),
    ],
)

In [None]:
from sklearn.metrics import roc_auc_score
# Score both splits with the trained booster and report ROC AUC for each;
# a large gap between the two numbers points at overfitting.
train_y_lgb = lgbm.predict(train_x)
val_y_lgb = lgbm.predict(valid_x)

train_auc = roc_auc_score(train_y, train_y_lgb)
valid_auc = roc_auc_score(valid_y, val_y_lgb)

print("ROC for Train ROC", train_auc)
print("ROC for Validation ROC", valid_auc)


In [None]:
# Score the test rows and write them into the sample submission layout.
predictions = lgbm.predict(test_df)
submission = pd.read_csv('../input/sample_submission.csv')

# Item assignment always writes a column. Attribute assignment
# (submission.TARGET = ...) only does so when the column already exists;
# otherwise it sets a plain Python attribute and the CSV would be written
# without the predictions.
# NOTE(review): predictions are matched to submission rows purely by
# position; this assumes sample_submission.csv lists applicants in the same
# order as application_test.csv. Confirm against the data.
submission['TARGET'] = predictions

submission.to_csv('lgbm_submission.csv', index=False)