In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
import gc
import time

from sklearn.metrics import roc_auc_score
from sklearn.model_selection import KFold
from lightgbm import LGBMClassifier

from sklearn.model_selection import train_test_split
import lightgbm as lgb

from sklearn import preprocessing

In [2]:
# Thank you, Guillaume Martin, for the awesome memory optimizer!
# https://www.kaggle.com/gemartin/load-data-reduce-memory-usage
def reduce_mem_usage(df):
    """Downcast the numeric columns of ``df`` in place to reduce memory usage.

    Each NumPy signed-integer column is cast to the narrowest of
    int8 / int16 / int32 whose range contains the column's minimum and
    maximum (bounds inclusive); if none fits, the column is left as it is.
    Each NumPy float column is cast to float16 or float32 when its minimum
    and maximum lie inside that type's finite range, otherwise to float64.
    Columns of any other dtype (object, bool, unsigned integer, datetime,
    category, pandas extension dtypes) are left untouched.

    Memory usage before and after, and the percentage saved, are printed.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to optimise. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.

    Notes
    -----
    The float downcast checks range only, not precision. float16 keeps
    roughly three significant decimal digits, so stored values are rounded.
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    int_candidates = (np.int8, np.int16, np.int32)
    float_candidates = (np.float16, np.float32)

    for col in df.columns:
        col_type = df[col].dtype

        # Only plain NumPy signed-int ('i') and float ('f') columns are
        # downcast. Everything else would either be corrupted by a float cast
        # (bool, unsigned int) or fail the min/max comparison (datetime,
        # category, strings).
        if not isinstance(col_type, np.dtype) or col_type.kind not in ('i', 'f'):
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        if col_type.kind == 'i':
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                # Inclusive bounds: -128 and 127 are valid int8 values.
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            # No candidate fits (int64-sized values, or NaN min/max on an
            # empty column): keep the existing dtype.
        else:
            for candidate in float_candidates:
                limits = np.finfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            else:
                # Out of float32 range, infinite, or all-NaN (comparisons with
                # NaN are always False).
                df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    # Guard the ratio so a frame reporting zero bytes cannot raise.
    saved_pct = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
    print('Decreased by {:.1f}%'.format(saved_pct))

    return df

def import_data(file):
    """Read the CSV at ``file`` and return the frame after memory optimisation."""
    raw_frame = pd.read_csv(file, parse_dates=True, keep_date_col=True)
    return reduce_mem_usage(raw_frame)

# Load Files - Thanks Cafeal
# https://www.kaggle.com/cafeal/lightgbm-trial-public-0-742
# Every CSV found in ../input is loaded through import_data and bound to a
# notebook-level variable named after the file (e.g. "bureau.csv" -> `bureau`).
input_files = os.listdir("../input")
for filename in input_files:
    # splitext removes the extension as a suffix. str.rstrip('.csv') instead
    # strips any trailing '.', 'c', 's' or 'v' characters, which mangles names
    # ending in those letters ("installments_payments.csv" would become
    # "installments_payment").
    table_name, extension = os.path.splitext(filename)
    if extension.lower() != '.csv':
        continue  # not a CSV table; nothing to load
    # globals() is the notebook namespace; writing through locals() is only
    # reliable at module level, where it happens to be the same dict.
    globals()[table_name] = import_data(f'../input/{filename}')#.sample(5000)
    print(table_name, "## Loaded and Optimized ##\n")

In [3]:
# Remember which applicant ids belong to each split so the combined frame can
# be separated again once all features have been merged in.
traindex = application_train["SK_ID_CURR"]
testdex = application_test["SK_ID_CURR"]
print('Train shape: %d Rows, %d Columns' % application_train.shape)
print('Test shape: %d Rows, %d Columns' % application_test.shape)

# Dependent variable: pull it out of the training frame before stacking.
y = application_train["TARGET"].values
application_train = application_train.drop(columns="TARGET")

# Train and test are stacked so that encodings and merges are applied to both.
df = pd.concat([application_train, application_test])
del application_train, application_test
gc.collect()

# Label-encode every object column; values are stringified first, so missing
# values are encoded as their string form rather than dropped.
categorical_columns = [name for name in df.columns if df[name].dtype == 'object']
lbl = preprocessing.LabelEncoder()
for col in categorical_columns:
    as_text = df[col].astype(str)
    df[col] = lbl.fit_transform(as_text)

In [4]:
# Sanity check: share of tracked categorical columns present in df (expected 1.0).
sum(1 for name in df.columns if name in categorical_columns) / len(categorical_columns)

In [5]:
# Aggregate bureau_balance per SK_ID_BUREAU, attach the result to bureau, then
# aggregate bureau per SK_ID_CURR and merge it into the central dataframe `df`.
# Statement order matters: `bureau` is extended and mutated in place step by step.

# Step 1: per-SK_ID_BUREAU statistics of MONTHS_BALANCE plus a row count.
agg_bureau_balance = bureau_balance.reset_index().groupby('SK_ID_BUREAU').agg(
    dict(MONTHS_BALANCE = ["sum","mean","max","min","std"],
         SK_ID_BUREAU = 'count'))
# Collapse the (column, statistic) MultiIndex into "<column>_<statistic>" names
agg_bureau_balance.columns = pd.Index([e[0] +"_"+ e[1] for e in agg_bureau_balance.columns.tolist()])
# One indicator column per STATUS value, summed per SK_ID_BUREAU, i.e. how many
# rows of each status every bureau record has.
STATUS = pd.get_dummies(bureau_balance[["SK_ID_BUREAU","STATUS"]], columns=["STATUS"]).groupby('SK_ID_BUREAU').sum()
# Cast the summed indicator columns to integer
for col in STATUS.columns: STATUS[col] = STATUS[col].astype(int)
# Both frames carry SK_ID_BUREAU as their index name (set by the groupby), which
# is what `on=` matches here.
agg_bureau_balance = pd.merge(agg_bureau_balance,STATUS,on="SK_ID_BUREAU", how= "left")
# Step 2: attach the bureau_balance aggregates to each bureau row
bureau = pd.merge(bureau,agg_bureau_balance, on="SK_ID_BUREAU", how= "left")
# Step 3: aggregate the bureau dataset per applicant (SK_ID_CURR).
# SK_ID_BUREAU is dropped first so it is not aggregated as a feature.
bureau.drop("SK_ID_BUREAU",axis=1,inplace=True)
cat = ["CREDIT_ACTIVE","CREDIT_CURRENCY","CREDIT_TYPE"]
# Every remaining column except the three categoricals and the group key
notcat = [x for x in bureau.columns if x not in cat + ["SK_ID_CURR"]]
# Bureau continuous variables: five statistics per column, flattened to
# "<column>_<statistic>" and left-merged into df.
agg_bureau = bureau.groupby('SK_ID_CURR').agg({k:["sum","mean","max","min","std"] for k in notcat})
agg_bureau.columns = pd.Index([e[0] +"_"+ e[1] for e in agg_bureau.columns.tolist()])
df = pd.merge(df,agg_bureau, on="SK_ID_CURR", how= "left")
# Bureau categorical variables: label-encode the object columns in place, then
# keep the most frequent code per applicant (first one when several are tied,
# since mode() returns its results sorted).
lbl = preprocessing.LabelEncoder()
for col in bureau.select_dtypes(include=['object']).columns: bureau[col] = lbl.fit_transform(bureau[col].astype(str))
agg_bureau = bureau.groupby('SK_ID_CURR').agg({k: lambda x: x.mode().iloc[0] for k in cat})
agg_bureau.columns = ['{}_AGGMODE'.format(a) for a in agg_bureau.columns]
df = pd.merge(df,agg_bureau, on="SK_ID_CURR", how= "left")
# Register the new *_AGGMODE columns as categorical features. This appends to
# the shared list, and the source tables are deleted below, so the cell is not
# re-runnable without reloading the data.
categorical_columns.extend(agg_bureau.columns)
del bureau, agg_bureau_balance, bureau_balance, agg_bureau; gc.collect();

In [6]:
# Sanity check after the bureau merge: share of tracked categorical columns found in df.
sum(1 for name in df.columns if name in categorical_columns) / len(categorical_columns)

In [7]:
# Summarise POS_CASH_balance per applicant (SK_ID_CURR) and merge the summary
# into the central dataframe.
pos_stats = ["sum","mean","max","min","std"]
pos_numeric = ["MONTHS_BALANCE", "CNT_INSTALMENT", "CNT_INSTALMENT_FUTURE", "SK_DPD", "SK_DPD_DEF"]
pos_spec = {name: pos_stats for name in pos_numeric}
pos_spec["SK_ID_CURR"] = 'count'  # number of rows per applicant

agg_POS_CASH_balance = POS_CASH_balance.reset_index().groupby('SK_ID_CURR').agg(pos_spec)
# Flatten (column, statistic) pairs into "<column>_<statistic>" names.
agg_POS_CASH_balance.columns = pd.Index(
    ["_".join(pair) for pair in agg_POS_CASH_balance.columns.tolist()]
)

# Count, per applicant, how many rows carry each NAME_CONTRACT_STATUS value.
status_dummies = pd.get_dummies(
    POS_CASH_balance[["SK_ID_CURR","NAME_CONTRACT_STATUS"]],
    columns=["NAME_CONTRACT_STATUS"],
)
NAME_CONTRACT_STATUS = status_dummies.groupby('SK_ID_CURR').sum().astype(int)
del status_dummies

agg_POS_CASH_balance = pd.merge(
    agg_POS_CASH_balance, NAME_CONTRACT_STATUS,
    left_on="SK_ID_CURR", right_on="SK_ID_CURR", how="left",
)
df = pd.merge(
    df, agg_POS_CASH_balance,
    left_on="SK_ID_CURR", right_on="SK_ID_CURR", how="left",
)
del agg_POS_CASH_balance, NAME_CONTRACT_STATUS, POS_CASH_balance
gc.collect();

In [8]:
# Aggregate previous_application per applicant (SK_ID_CURR) and merge it into
# the central dataframe.

# Distinguish column types: a column is continuous when its (already downcast)
# dtype is in the list below and it is not one of the explicitly excluded names.
# Every other column except SK_ID_CURR is handled as categorical.
# NOTE(review): that rule places SK_ID_PREV in categorical_var - confirm intended.
prev_excluded = ["SK_ID_PREV","SK_ID_CURR", "SELLERPLACE_AREA","NFLAG_LAST_APPL_IN_DAY","NFLAG_INSURED_ON_APPROVAL"]
prev_numeric_cols = previous_application.select_dtypes(
    include=['float16','float32','int8','int16','int32']
).columns
continuous_var = [name for name in prev_numeric_cols if name not in prev_excluded]
categorical_var = [
    name for name in previous_application.columns
    if name not in continuous_var + ['SK_ID_CURR']
]

# Previous-application categorical variables: label-encode in place, then keep
# the most frequent code per applicant.
lbl = preprocessing.LabelEncoder()
for col in categorical_var:
    previous_application[col] = lbl.fit_transform(previous_application[col].astype(str))
agg_previous_application = previous_application.groupby('SK_ID_CURR').agg(
    {name: (lambda values: values.mode().iloc[0]) for name in categorical_var}
)
agg_previous_application.columns = [
    '{}_AGGMODE'.format(name) for name in agg_previous_application.columns
]
df = pd.merge(df, agg_previous_application, on="SK_ID_CURR", how="left")
categorical_columns.extend(agg_previous_application.columns)
print(agg_previous_application.columns)
del agg_previous_application
gc.collect()

# Previous-application continuous variables: five statistics per column.
agg_previous_application = previous_application.groupby('SK_ID_CURR').agg(
    {name: ["sum","mean","max","min","std"] for name in continuous_var}
)
agg_previous_application.columns = pd.Index(
    ["_".join(pair) for pair in agg_previous_application.columns.tolist()]
)
df = pd.merge(
    df, agg_previous_application,
    left_on="SK_ID_CURR", right_on="SK_ID_CURR", how="left",
)
del previous_application, agg_previous_application
gc.collect();

In [9]:
len([x for x in df.columns if x in categorical_columns]) / len(categorical_columns)

In [10]:


# Encoder:
# categorical_columns2 = df.select_dtypes(include=['object']).columns:
# lbl = preprocessing.LabelEncoder()
# for col in categorical_columns2:
#     df[col] = lbl.fit_transform(df[col].astype(str))
# #categorical_columns.extend(categorical_columns2)

# Shrink the merged frame, then move the applicant id into the index so it is
# not used as a feature.
df = reduce_mem_usage(df)
df = df.set_index("SK_ID_CURR")

# Final train and test sets, selected by the ids saved before the concat.
X = df.loc[traindex]
test = df.loc[testdex]

# Target as a Series so it can be sliced positionally in the fold loop.
y = pd.Series(y)

import warnings
warnings.filterwarnings("ignore")

del df
gc.collect();

In [11]:
# Single 85/15 hold-out split for the scikit-learn style LightGBM model.
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, test_size=0.15, random_state=23
)

sk_lgbm_params = dict(
    n_estimators=1000,
    learning_rate=0.1,
    num_leaves=100,
    colsample_bytree=0.8,
    subsample=0.9,
    max_depth=-1,
    reg_alpha=0.1,
    reg_lambda=0.1,
    min_split_gain=0.01,
    min_child_weight=2,
)
clf = LGBMClassifier(**sk_lgbm_params)

# AUC is tracked on both sets; training stops once the last eval set
# (validation) has not improved for 150 rounds.
clf.fit(
    X_train, y_train,
    eval_set=[(X_train, y_train), (X_valid, y_valid)],
    eval_metric='auc',
    verbose=50,
    early_stopping_rounds=150,
)

print("Model Evaluation Stage")
valid_scores = clf.predict_proba(X_valid, num_iteration=clf.best_iteration_)[:, 1]
print('Valid AUC score %.6f' % roc_auc_score(y_valid, valid_scores))

# Score the test set at the best iteration and write the submission file.
pred = clf.predict_proba(test, num_iteration=clf.best_iteration_)[:, 1]
sklearn_lgbm_sub = pd.DataFrame(pred, columns=["TARGET"], index=testdex)
sklearn_lgbm_sub.to_csv("sklearn_lgbm_sub.csv", index=True)

In [None]:
# Native LightGBM API on the same hold-out split, this time declaring the
# label-encoded columns as categorical features.
lgbm_params = dict(
    boosting_type='gbdt',
    objective='binary',
    metric='auc',
    # Tuning options left switched off:
#     'max_depth': 9,
#     #'num_leaves': 500,
#     'learning_rate': 0.01,
#     'feature_fraction': 0.80,
#     'bagging_fraction': 0.80,
#     'bagging_freq': 5,
#     'max_bin':300,
#     #'verbose': 0,
#     #'num_threads': 1,
#     'lambda_l2': 1.5,
#     #'min_gain_to_split': 0,
#     'is_unbalance': True
    #'scale_pos_weight':0.15
)

# LGBM Dataset formatting
lgtrain = lgb.Dataset(X_train, y_train, categorical_feature=categorical_columns)
lgvalid = lgb.Dataset(X_valid, y_valid, categorical_feature=categorical_columns)

modelstart = time.time()
lgb_clf = lgb.train(
    lgbm_params,
    lgtrain,
    num_boost_round=1000,
    valid_sets=[lgtrain, lgvalid],
    valid_names=['train', 'valid'],
    early_stopping_rounds=500,
    verbose_eval=150,
)
print("Model Evaluation Stage")
#print('Valid AUC score %.6f' % roc_auc_score(y_valid, lgb_clf.predict(X_valid)))

# Score the test set and write the submission file.
lgbmpred = lgb_clf.predict(test)
lgbm_sub = pd.DataFrame(lgbmpred, columns=["TARGET"], index=testdex)
lgbm_sub.to_csv("lgbm_sub.csv", index=True)

elapsed_minutes = (time.time() - modelstart) / 60
print("Model Runtime: %0.2f Minutes" % elapsed_minutes)
# del lgvalid, lgtrain; gc.collect();

In [None]:
# K-fold training: every training row gets an out-of-fold prediction, and the
# test prediction is the average over the fold models.
folds = KFold(n_splits=2, shuffle=True, random_state=546789)
oof_preds = np.zeros(len(X))
sub_preds = np.zeros(len(test))

lgbm_params = dict(
    boosting_type='gbdt',
    objective='binary',
    metric='auc',
    learning_rate=0.05,
    is_unbalance=True,
    max_depth=7,
    # Options left switched off:
#     "num_leaves":30,
#     "colsample_bytree":.8,
#     'feature_fraction': 0.7,
#     'bagging_fraction': 0.8,
#     'bagging_freq': 5,
#     "subsample":.9,
    lambda_l2=20,
#     "min_split_gain":.01,
#     "min_child_weight":2
)

for n_fold, (trn_idx, val_idx) in enumerate(folds.split(X)):
    # Positional slicing: X is indexed by SK_ID_CURR while y has a default index.
    X_train, X_valid = X.iloc[trn_idx], X.iloc[val_idx]
    y_train, y_valid = y.iloc[trn_idx], y.iloc[val_idx]

    # LGBM Dataset formatting
    lgtrain = lgb.Dataset(X_train, y_train, categorical_feature=categorical_columns)
    lgvalid = lgb.Dataset(X_valid, y_valid, categorical_feature=categorical_columns)

    modelstart = time.time()
    lgb_clf = lgb.train(
        lgbm_params,
        lgtrain,
        num_boost_round=2000,
        valid_sets=[lgtrain, lgvalid],
        valid_names=['train', 'valid'],
        early_stopping_rounds=300,
        verbose_eval=100,
    )

    # Store this fold's validation predictions and add its share of the
    # averaged test prediction.
    oof_preds[val_idx] = lgb_clf.predict(X_valid)
    sub_preds += lgb_clf.predict(test) / int(folds.n_splits)

    fold_auc = roc_auc_score(y_valid, oof_preds[val_idx])
    print('\nFold %2d AUC : %.6f' % (n_fold + 1, fold_auc))

    del X_train, y_train, X_valid, y_valid #,lgb_clf
    gc.collect()


full_auc = roc_auc_score(y, oof_preds)
print('\nFull AUC score %.6f' % full_auc)
lgbm_sub_oof = pd.DataFrame(sub_preds, columns=["TARGET"], index=testdex)
lgbm_sub_oof.to_csv("lgbm_sub_oof.csv", index=True, float_format='%.8f')

In [None]:
# Visualisation: importance of the top 50 features of the most recently
# trained booster (`lgb_clf`), saved as a PNG.
import seaborn as sns
import matplotlib.pyplot as plt

f, ax = plt.subplots(figsize=(7, 10))
lgb.plot_importance(lgb_clf, max_num_features=50, ax=ax)
plt.title("Light GBM Feature Importance")
plt.savefig('feature_import.png')

In [None]:
#     fold_importance_df = pd.DataFrame()
#     fold_importance_df["feature"] = X.columns
#     fold_importance_df["importance"] = clf.feature_importances_
#     fold_importance_df["fold"] = n_fold + 1
#     feature_importance_df = pd.concat([feature_importance_df, fold_importance_df], axis=0)
    
#     del clf, trn_x, trn_y, val_x, val_y
#     gc.collect()
    
# print('Full AUC score %.6f' % roc_auc_score(y, oof_preds))   

# test['TARGET'] = sub_preds

# test[['SK_ID_CURR', 'TARGET']].to_csv('first_submission.csv', index=False, float_format='%.8f')

# # Plot feature importances
# cols = feature_importance_df[["feature", "importance"]].groupby("feature").mean().sort_values(
#     by="importance", ascending=False)[:50].index

# best_features = feature_importance_df.loc[feature_importance_df.feature.isin(cols)]

# plt.figure(figsize=(8,10))
# sns.barplot(x="importance", y="feature", data=best_features.sort_values(by="importance", ascending=False))
# plt.title('LightGBM Features (avg over folds)')
# plt.tight_layout()
# plt.savefig('lgbm_importances.png')

# # Plot ROC curves
# plt.figure(figsize=(6,6))
# scores = [] 
# for n_fold, (_, val_idx) in enumerate(folds.split(data)):  
#     # Plot the roc curve
#     fpr, tpr, thresholds = roc_curve(y.iloc[val_idx], oof_preds[val_idx])
#     score = roc_auc_score(y.iloc[val_idx], oof_preds[val_idx])
#     scores.append(score)
#     plt.plot(fpr, tpr, lw=1, alpha=0.3, label='ROC fold %d (AUC = %0.4f)' % (n_fold + 1, score))

# plt.plot([0, 1], [0, 1], linestyle='--', lw=2, color='r', label='Luck', alpha=.8)
# fpr, tpr, thresholds = roc_curve(y, oof_preds)
# score = roc_auc_score(y, oof_preds)
# plt.plot(fpr, tpr, color='b',
#          label='Avg ROC (AUC = %0.4f $\pm$ %0.4f)' % (score, np.std(scores)),
#          lw=2, alpha=.8)

# plt.xlim([-0.05, 1.05])
# plt.ylim([-0.05, 1.05])
# plt.xlabel('False Positive Rate')
# plt.ylabel('True Positive Rate')
# plt.title('LightGBM ROC Curve')
# plt.legend(loc="lower right")
# plt.tight_layout()

# plt.savefig('roc_curve.png')