In [8]:
import numpy as np
import pandas as pd
import seaborn as sns
import gc


from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB 
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from lightgbm import LGBMClassifier
from sklearn.metrics import roc_auc_score, roc_curve



from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.utils import resample
from sklearn.preprocessing import LabelEncoder
from imblearn.over_sampling import SMOTE
lb_make = LabelEncoder()


In [14]:
def process_train_test_file():

    df = pd.read_csv('application_train.csv')
    test_df = pd.read_csv('application_test.csv')
    print("Train samples: {}, test samples: {}".format(len(df), len(test_df)))

    df = df.append(test_df).reset_index()
    df = df[df['CODE_GENDER'] != 'XNA']

    target = df['TARGET']
    df = df.drop(['TARGET'],axis=1)

    Dict = {'reg oper account':0 ,'org spec account':1 , 'reg oper spec account': 2, 'not specified':3}
    df.FONDKAPREMONT_MODE = df.FONDKAPREMONT_MODE.map(Dict)
    df['FONDKAPREMONT_MODE'] = df['FONDKAPREMONT_MODE'].fillna(df['FONDKAPREMONT_MODE'].value_counts().index[0])

    Dict = {'Stone':0 ,'brick':1 , 'Block': 2, 'Panel':3, 'Mixed':4, 'Wooden':5, 'Others':6}
    df.WALLSMATERIAL_MODE = df.WALLSMATERIAL_MODE.map(Dict)
    df['WALLSMATERIAL_MODE'] = df['WALLSMATERIAL_MODE'].fillna(df['WALLSMATERIAL_MODE'].value_counts().index[0])

    Dict = {'block of flats':0 ,'terraced house':1 , 'specific housing': 2}
    df.HOUSETYPE_MODE = df.HOUSETYPE_MODE.map(Dict)
    df['HOUSETYPE_MODE'] = df['HOUSETYPE_MODE'].fillna(df['HOUSETYPE_MODE'].value_counts().index[0])

    Dict = {'No':0 ,'Yes':1 }
    df.EMERGENCYSTATE_MODE = df.EMERGENCYSTATE_MODE.map(Dict)
    df['EMERGENCYSTATE_MODE'] = df['EMERGENCYSTATE_MODE'].fillna(df['EMERGENCYSTATE_MODE'].value_counts().index[0])

    Dict = {'Cash loans':0 ,'Revolving loans':1 }
    df.NAME_CONTRACT_TYPE = df.NAME_CONTRACT_TYPE.map(Dict)

    Dict = {'M':0 ,'F':1}
    df.CODE_GENDER = df.CODE_GENDER.map(Dict)
    df['CODE_GENDER'] = df['CODE_GENDER'].fillna(df['CODE_GENDER'].value_counts().index[0])

    Dict = {'N':0 ,'Y':1}
    df.FLAG_OWN_CAR = df.FLAG_OWN_CAR.map(Dict)

    Dict = {'N':0 ,'Y':1}
    df.FLAG_OWN_REALTY = df.FLAG_OWN_REALTY.map(Dict)

    Dict = {'Unaccompanied':0 ,'Family':1, 'Spouse':2, 'partner':3, 'Children':4, 'Other_A':5, 'Other_B':6, 'Group of people':7 }
    df.NAME_TYPE_SUITE = df.NAME_TYPE_SUITE.map(Dict)
    df['NAME_TYPE_SUITE'] = df['NAME_TYPE_SUITE'].fillna(df['NAME_TYPE_SUITE'].value_counts().index[0])

    Dict = {'Working':0 ,'State servant':1, 'Commercial associate':2, 'Pensioner':3, 'Unemployed':4, 'Student':5, 'Businessman':6, 'Maternity leave':7 }
    df.NAME_INCOME_TYPE = df.NAME_INCOME_TYPE.map(Dict)

    Dict = {'Secondary / secondary special':0 ,'Higher education':1, 'Incomplete higher':2, 'Lower secondary':3, 'Academic degree':4 }
    df.NAME_EDUCATION_TYPE = df.NAME_EDUCATION_TYPE.map(Dict)

    Dict = {'Single / not married':0 ,'Married':1, 'Civil marriage':2, 'Widow':3, 'Separated':4 }
    df.NAME_FAMILY_STATUS = df.NAME_FAMILY_STATUS.map(Dict)
    df['NAME_FAMILY_STATUS'] = df['NAME_FAMILY_STATUS'].fillna(df['NAME_FAMILY_STATUS'].value_counts().index[0])

    Dict = {'House / apartment':0 ,'Rented apartment':1, 'With parents':2, 'Municipal apartment':3, 'Office apartment':4, 'Co-op apartment':5 }
    df.NAME_HOUSING_TYPE = df.NAME_HOUSING_TYPE.map(Dict)

    Dict = {'Laborers':0 ,'Core staff':1, 'Accountants':2, 'Managers':3, 'Drivers':4, 'Sales staff':5, 'Cleaning staff':6, 'Cooking staff':7, 'Private service staff':8, 'Medicine staff':9, 'Security staff':10, 'High skill tech staff':11, 'Waiters/barmen staff':12, 'Low-skill Laborers':13, 'Realty agents':14, 'Secretaries':15, 'IT staff':16, 'HR staff':17 }
    df.OCCUPATION_TYPE = df.OCCUPATION_TYPE.map(Dict)
    df['OCCUPATION_TYPE'] = df['OCCUPATION_TYPE'].fillna(df['OCCUPATION_TYPE'].value_counts().index[0])

    Dict = {'WEDNESDAY':0 ,'MONDAY':1, 'THURSDAY':2, 'SUNDAY':3, 'SATURDAY':4, 'FRIDAY':5, 'TUESDAY':6 }
    df.WEEKDAY_APPR_PROCESS_START = df.WEEKDAY_APPR_PROCESS_START.map(Dict)

    df["ORGANIZATION_TYPE"] = lb_make.fit_transform(df["ORGANIZATION_TYPE"])

    df['DAYS_EMPLOYED'].replace(365243, np.nan, inplace= True)

    df = df.fillna(df.mean())
    df['TARGET'] = target
    df['DAYS_EMPLOYED_PERC'] = df['DAYS_EMPLOYED'] / df['DAYS_BIRTH']
    df['INCOME_CREDIT_PERC'] = df['AMT_INCOME_TOTAL'] / df['AMT_CREDIT']
    df['INCOME_PER_PERSON'] = df['AMT_INCOME_TOTAL'] / df['CNT_FAM_MEMBERS']
    df['ANNUITY_INCOME_PERC'] = df['AMT_ANNUITY'] / df['AMT_INCOME_TOTAL']
    df['PAYMENT_RATE'] = df['AMT_ANNUITY'] / df['AMT_CREDIT']
    del test_df
    gc.collect()
    return df

In [15]:
def process_bureau_balance_file():
    bureau = pd.read_csv('bureau.csv')
    bb = pd.read_csv('bureau_balance.csv')
    bb, bb_cat = one_hot_encoder(bb)
    bureau, bureau_cat = one_hot_encoder(bureau)
    

    bb_aggregations = {'MONTHS_BALANCE': ['min', 'max', 'size']}
    for col in bb_cat:
        bb_aggregations[col] = ['mean']
    bb_agg = bb.groupby('SK_ID_BUREAU').agg(bb_aggregations)
    bb_agg.columns = pd.Index([e[0] + "_" + e[1].upper() for e in bb_agg.columns.tolist()])
    bureau = bureau.join(bb_agg, how='left', on='SK_ID_BUREAU')
    bureau.drop(['SK_ID_BUREAU'], axis=1, inplace= True)
    del bb, bb_agg
    gc.collect()
    

    num_aggregations = {
        'DAYS_CREDIT': ['min', 'max', 'mean', 'var'],
        'DAYS_CREDIT_ENDDATE': ['min', 'max', 'mean'],
        'DAYS_CREDIT_UPDATE': ['mean'],
        'CREDIT_DAY_OVERDUE': ['max', 'mean'],
        'AMT_CREDIT_MAX_OVERDUE': ['mean'],
        'AMT_CREDIT_SUM': ['max', 'mean', 'sum'],
        'AMT_CREDIT_SUM_DEBT': ['max', 'mean', 'sum'],
        'AMT_CREDIT_SUM_OVERDUE': ['mean'],
        'AMT_CREDIT_SUM_LIMIT': ['mean', 'sum'],
        'AMT_ANNUITY': ['max', 'mean'],
        'CNT_CREDIT_PROLONG': ['sum'],
        'MONTHS_BALANCE_MIN': ['min'],
        'MONTHS_BALANCE_MAX': ['max'],
        'MONTHS_BALANCE_SIZE': ['mean', 'sum']
    }

    cat_aggregations = {}
    for cat in bureau_cat: cat_aggregations[cat] = ['mean']
    for cat in bb_cat: cat_aggregations[cat + "_MEAN"] = ['mean']
    
    bureau_agg = bureau.groupby('SK_ID_CURR').agg({**num_aggregations, **cat_aggregations})
    bureau_agg.columns = pd.Index(['BURO_' + e[0] + "_" + e[1].upper() for e in bureau_agg.columns.tolist()])

    active = bureau[bureau['CREDIT_ACTIVE_Active'] == 1]
    active_agg = active.groupby('SK_ID_CURR').agg(num_aggregations)
    active_agg.columns = pd.Index(['ACTIVE_' + e[0] + "_" + e[1].upper() for e in active_agg.columns.tolist()])
    bureau_agg = bureau_agg.join(active_agg, how='left', on='SK_ID_CURR')
    del active, active_agg
    gc.collect()

    closed = bureau[bureau['CREDIT_ACTIVE_Closed'] == 1]
    closed_agg = closed.groupby('SK_ID_CURR').agg(num_aggregations)
    closed_agg.columns = pd.Index(['CLOSED_' + e[0] + "_" + e[1].upper() for e in closed_agg.columns.tolist()])
    bureau_agg = bureau_agg.join(closed_agg, how='left', on='SK_ID_CURR')
    del closed, closed_agg, bureau
    gc.collect()
    return bureau_agg


In [16]:
def one_hot_encoder(df):
    original_columns = list(df.columns)
    categorical_columns = [col for col in df.columns if df[col].dtype == 'object']
    df = pd.get_dummies(df, columns= categorical_columns)
    new_columns = [c for c in df.columns if c not in original_columns]
    return df, new_columns

In [17]:
def kfold_lightgbm(df, num_folds, submission_file_name, stratified = False):

    train_df = df[df['TARGET'].notnull()]


    nums = train_df['TARGET'].value_counts()
    train_0 = train_df[train_df['TARGET']==0]
    train_1 = train_df[train_df['TARGET']==1]

    train_1_balanced = resample(train_1, 
                                 replace=True,    
                                 n_samples=nums.loc[0]-10000,    
                                 random_state=123) 

    train_df = pd.concat([train_0, train_1_balanced])
    train_df = train_df.sample(frac=1).reset_index(drop=True)
    test_df = df[df['TARGET'].isnull()]
    print("Start LightGBM...")
    del df
    gc.collect()

    if stratified:
        folds = StratifiedKFold(n_splits= num_folds, shuffle=True, random_state=1001)
    else:
        folds = KFold(n_splits= num_folds, shuffle=True, random_state=1001)

    oof_preds = np.zeros(train_df.shape[0])
    sub_preds = np.zeros(test_df.shape[0])

    feats = [f for f in train_df.columns if f not in ['TARGET','SK_ID_CURR','SK_ID_BUREAU','SK_ID_PREV','index']]
    
    for n_fold, (train_idx, valid_idx) in enumerate(folds.split(train_df[feats], train_df['TARGET'])):
        train_x, train_y = train_df[feats].iloc[train_idx], train_df['TARGET'].iloc[train_idx]
        valid_x, valid_y = train_df[feats].iloc[valid_idx], train_df['TARGET'].iloc[valid_idx]

        clf = LGBMClassifier(
            nthread=4,
            n_estimators=900,
            learning_rate=0.03)

        clf.fit(train_x, train_y, eval_set=[(train_x, train_y), (valid_x, valid_y)], 
            eval_metric= 'auc', verbose= 200, early_stopping_rounds= 200)

        oof_preds[valid_idx] = clf.predict_proba(valid_x, num_iteration=clf.best_iteration_)[:, 1]
        sub_preds += clf.predict_proba(test_df[feats], num_iteration=clf.best_iteration_)[:, 1] / folds.n_splits

        print('Fold %2d AUC : %.6f' % (n_fold + 1, roc_auc_score(valid_y, oof_preds[valid_idx])))
        del clf, train_x, train_y, valid_x, valid_y
        gc.collect()

    print('Full AUC score %.6f' % roc_auc_score(train_df['TARGET'], oof_preds))

    test_df['TARGET'] = sub_preds
    test_df[['SK_ID_CURR', 'TARGET']].to_csv(submission_file_name, index= False)


In [18]:

df = process_train_test_file()
bureau = process_bureau_balance_file()
print("Bureau df shape:", bureau.shape)
df = df.join(bureau, how='left', on='SK_ID_CURR')
del bureau
gc.collect()

# 

Train samples: 307511, test samples: 48744


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sort=sort,


Bureau df shape: (305811, 112)


0

In [19]:
df.shape


(356251, 240)

In [None]:
df=df.fillna(df.mean())

In [21]:
print('**** Enter your choice ****\n      1- Logistic Regressor\n      2- K-Neighbors\n      3- Naive Basie\n      4- Random Forest\n      5- Decision Tree Classifier\n      6- XGBClassifier\n      7- SVM\n      8- LightGBM  ')
algorithm = int(input('Enter your number: '))-1


algorithm_list = [('LogisticRegressor' , LogisticRegression(solver='liblinear')) , 
                  ('KNeighbors' , KNeighborsClassifier()), 
                  ('GaussianNB' , GaussianNB()),
                  ('RandomForest' , RandomForestClassifier(n_estimators=200)),
                  ('DecisionTreeClassifier' , DecisionTreeClassifier(random_state=0, max_depth=3, min_samples_split=5)),
                  ('XGBClassifier' , XGBClassifier()),
                  ('SVM' , SVC(kernel='linear', class_weight='balanced', probability=True)),
                  ('LightGBM' , 'S')]

if algorithm is 7:
  kfold_lightgbm(df, num_folds= 3, stratified= False,submission_file_name = algorithm_list[algorithm] [0] )
else:  
  target = df['TARGET']
  target = pd.DataFrame(target.dropna())
  train_df = df[df['TARGET'].notnull()]
  train_df.drop(['TARGET'],axis=1,inplace=True)
  train_df = train_df.fillna(train_df.mean())

  test_df = df[df['TARGET'].isnull()]
  test_df.drop(['TARGET'],axis=1,inplace=True)
  test_df = test_df.fillna(test_df.mean())
  algorithm_ = algorithm_list[algorithm] [1]   
  algorithm_.fit(train_df,target)
  y_pred = algorithm_.predict_proba(test_df)

  submission = test_df[['SK_ID_CURR']]
  submission['TARGET'] = y_pred[:,1]

  submission.to_csv(algorithm_list[algorithm] [0]+'.csv' ,index=False)



**** Enter your choice ****
      1- Logistic Regressor
      2- K-Neighbors
      3- Naive Basie
      4- Random Forest
      5- Decision Tree Classifier
      6- XGBClassifier
      7- SVM
      8- LightGBM  
Enter your number: 8
Start LightGBM...
Training until validation scores don't improve for 200 rounds
[200]	training's auc: 0.793294	training's binary_logloss: 0.555083	valid_1's auc: 0.788957	valid_1's binary_logloss: 0.55864
[400]	training's auc: 0.819665	training's binary_logloss: 0.527352	valid_1's auc: 0.811334	valid_1's binary_logloss: 0.534954
[600]	training's auc: 0.839064	training's binary_logloss: 0.507349	valid_1's auc: 0.827697	valid_1's binary_logloss: 0.518103
[800]	training's auc: 0.85589	training's binary_logloss: 0.489261	valid_1's auc: 0.841979	valid_1's binary_logloss: 0.502861
Did not meet early stopping. Best iteration is:
[900]	training's auc: 0.863353	training's binary_logloss: 0.481026	valid_1's auc: 0.848275	valid_1's binary_logloss: 0.49597
Fold  1 AUC :

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
