Libraries

In [None]:
import warnings

import numpy as np
import pandas as pd
import xgboost as xgb
from lightgbm import LGBMClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score,GridSearchCV, KFold, train_test_split, RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler, StandardScaler
from sklearn.preprocessing import StandardScaler
warnings.filterwarnings("ignore")

Importing Files

In [None]:
# Read the two application tables and the four auxiliary history tables from
# the working directory, in the same order as the names on the left-hand side.
train_df, test_df, bureau, pos, credit_bal, prev_df = [
    pd.read_csv(csv_name)
    for csv_name in (
        'application_train.csv',
        'application_test.csv',
        'bureau.csv',
        'POS_CASH_balance.csv',
        'credit_card_balance.csv',
        'previous_application.csv',
    )
]

### Feature Engineering  
This is important as we want to be able to incorporate additional (meaningful) information to help our model's classification capabilities. Initial exploration of using only application_train and application_test which contained only basic information on the clients yielded very poor results.

Feature Engineering for Bureau  

credit_overdue : To determine if there’s risk of overdue payments based on applicant history  
debt_credit_ratio : Flag for at-risk customers that have incurred debt over credit limit

In [None]:
# Display the column index of the bureau table (cell output only; nothing is assigned).
bureau.columns

In [None]:
# Per-applicant bureau features:
#   CREDIT_OVERDUE    - share of bureau records with CREDIT_DAY_OVERDUE > 0
#   DEBT_CREDIT_RATIO - mean of AMT_CREDIT_SUM_DEBT / AMT_CREDIT_SUM
bureau_new = bureau[['SK_ID_CURR', 'CREDIT_DAY_OVERDUE', 'AMT_CREDIT_SUM', 'AMT_CREDIT_SUM_DEBT']].copy()
# Vectorised flag; NaN compares False and therefore maps to 0, like the lambda it replaces.
bureau_new['CREDIT_OVERDUE'] = (bureau_new['CREDIT_DAY_OVERDUE'] > 0).astype(int)
bureau_new['DEBT_CREDIT_RATIO'] = bureau_new['AMT_CREDIT_SUM_DEBT'] / bureau_new['AMT_CREDIT_SUM']
bureau_new = bureau_new.drop(columns=['CREDIT_DAY_OVERDUE', 'AMT_CREDIT_SUM', 'AMT_CREDIT_SUM_DEBT'])
# A zero AMT_CREDIT_SUM produces +/-inf (or NaN for 0/0). Both must be
# neutralised BEFORE averaging: a single inf row would otherwise turn the
# applicant's whole mean into inf, which the later inf -> 0 replacement on
# train_df/test_df then wipes out entirely.
bureau_new = bureau_new.replace([np.inf, -np.inf], np.nan).fillna(0)
bureau_new = bureau_new.groupby('SK_ID_CURR').mean().reset_index()
bureau_new.head()

Feature Engineering for POS Cash Balance  

cnt_installment_future : number of installments still left to pay on the previous credit  
sk_dpd_def : days past due during the month with tolerance for loans with low amount; possible flag for inability to pay

In [None]:
# Average CNT_INSTALMENT_FUTURE and SK_DPD_DEF over all POS cash records of
# each applicant, giving one row per SK_ID_CURR.
pos_feature_cols = ['SK_ID_CURR', 'CNT_INSTALMENT_FUTURE', 'SK_DPD_DEF']
pos_new = (
    pos[pos_feature_cols]
    .copy()
    .groupby('SK_ID_CURR')
    .mean()
    .reset_index()
)
pos_new.head()

Feature Engineering for Credit Card Balance  

avg_vs_min_pay : Can be determinant of the applicant’s ability to pay above minimum requirements  
amt_balance : credit balance  
amt_credit_limit_actual : credit limit; a higher credit limit could indicate better client financial capability

In [None]:
# Per-applicant credit card features: mean AMT_BALANCE, mean
# AMT_CREDIT_LIMIT_ACTUAL and mean AVG_VS_MIN_PAY (payment relative to the
# minimum instalment).
credit_bal_new = credit_bal[['SK_ID_CURR', 'AMT_BALANCE',
                             'AMT_CREDIT_LIMIT_ACTUAL',
                             'AMT_INST_MIN_REGULARITY',
                             'AMT_PAYMENT_CURRENT',
                             'AMT_PAYMENT_TOTAL_CURRENT']].copy()
# NOTE(review): algebraically TOTAL / (TOTAL / CURRENT) reduces to
# AMT_PAYMENT_CURRENT wherever both operands are finite and non-zero; the
# formula is kept as written - confirm whether a different average was intended.
credit_bal_new['AVG_PAYMENTS'] = credit_bal_new['AMT_PAYMENT_TOTAL_CURRENT'] / (
    credit_bal_new['AMT_PAYMENT_TOTAL_CURRENT'] / credit_bal_new['AMT_PAYMENT_CURRENT'])
credit_bal_new['AVG_VS_MIN_PAY'] = credit_bal_new['AVG_PAYMENTS'] / credit_bal_new['AMT_INST_MIN_REGULARITY']
credit_bal_new = credit_bal_new.drop(columns=['AMT_INST_MIN_REGULARITY',
                                              'AMT_PAYMENT_CURRENT',
                                              'AMT_PAYMENT_TOTAL_CURRENT',
                                              'AVG_PAYMENTS'])
# Division by a zero minimum instalment yields +/-inf. Neutralise inf together
# with NaN BEFORE averaging, otherwise one inf month makes the applicant's
# mean inf and the later inf -> 0 replacement discards the whole feature.
credit_bal_new = credit_bal_new.replace([np.inf, -np.inf], np.nan).fillna(0)
credit_bal_new = credit_bal_new.groupby('SK_ID_CURR').mean().reset_index()
credit_bal_new.head()

Feature Engineering for Previous Applications  

approval_rate : Can be determinant of successful applications in the past  
refusal_rate : Can be determinant of rejected applications in the past  
avg_days_bet_rejection : Sum of DAYS_DECISION (sign-flipped) over all previous applications divided by the number of refused applications; 0 when the applicant has no refusals

In [None]:
# One-hot encode the status of every previous application, then count each
# status per applicant.
status_flags = pd.get_dummies(
    prev_df[['SK_ID_CURR', 'NAME_CONTRACT_STATUS']].copy(),
    columns=["NAME_CONTRACT_STATUS"],
    prefix=["Type_is"],
)
status_counts = status_flags.groupby('SK_ID_CURR').sum().reset_index()

# Rates are taken over approved + refused + canceled applications only
# ('Unused offer' is not part of the denominator).
decided_total = (status_counts['Type_is_Approved'] +
                 status_counts['Type_is_Refused'] +
                 status_counts['Type_is_Canceled'])
status_counts['Approval_Rate'] = status_counts['Type_is_Approved'] / decided_total
status_counts['Refusal_Rate'] = status_counts['Type_is_Refused'] / decided_total

# Total DAYS_DECISION per applicant, summed over every previous application.
days_total = (
    prev_df[['SK_ID_CURR', 'DAYS_DECISION']]
    .copy()
    .groupby('SK_ID_CURR')
    .sum()
    .reset_index()
)

prev_combined_df = status_counts.merge(days_total, how='left', on='SK_ID_CURR')
prev_combined_df['DAYS_DECISION'] = -1 * prev_combined_df['DAYS_DECISION']

# NOTE(review): the numerator covers all applications, not only refused ones -
# confirm this matches the intended meaning of the feature.
prev_combined_df['avg_days_bet_rejection'] = (
    prev_combined_df['DAYS_DECISION'] / prev_combined_df['Type_is_Refused']
)

# Zero out undefined ratios (NaN from 0/0, inf from x/0).
prev_combined_df = prev_combined_df.fillna(0).replace([np.inf, -np.inf], 0)

helper_columns = ['Type_is_Approved',
                  'Type_is_Canceled',
                  'Type_is_Refused',
                  'Type_is_Unused offer',
                  'DAYS_DECISION']
prev_combined_df = prev_combined_df.drop(columns=helper_columns)

prev_combined_df.head()

Combining with application_train and application_test datasets

In [None]:
# Left-join every engineered feature table onto the train and test application
# tables by SK_ID_CURR, in the order bureau -> POS -> credit card -> previous
# applications. train_df/test_df are reassigned to the merged result.
engineered_tables = (bureau_new, pos_new, credit_bal_new, prev_combined_df)
for feature_table in engineered_tables:
    train_df = train_df.merge(feature_table, how='left', on='SK_ID_CURR')
    test_df = test_df.merge(feature_table, how='left', on='SK_ID_CURR')

# Replace any remaining infinite values with 0 in both tables.
infinite_values = [np.inf, -np.inf]
train_df = train_df.replace(infinite_values, 0)
test_df = test_df.replace(infinite_values, 0)

Handling Missing Values

In [None]:
# handling missing values
def fill_missing_values(frame):
    """Fill missing values of ``frame`` column by column and return it.

    Numeric columns receive 0 and every other column receives the string
    'N/A'. Only columns that actually contain nulls are touched. The frame
    is modified in place and also returned for convenience.

    This replaces two copy-pasted loops that only handled the exact dtypes
    'float64' and 'object' and carried an unreachable 'int' branch (a NumPy
    integer column cannot contain NaN).
    """
    columns_with_nulls = frame.columns[frame.isnull().any()]
    for column in columns_with_nulls:
        filler = 0 if pd.api.types.is_numeric_dtype(frame[column]) else 'N/A'
        frame[column] = frame[column].fillna(filler)
    return frame


train_df = fill_missing_values(train_df)
test_df = fill_missing_values(test_df)

Splitting Features and Labels

In [None]:
# split features and labels for train dataset
X_train = train_df.drop(['TARGET'], axis=1)
y_train = train_df['TARGET'].copy()
X_test = test_df.copy()

Label Encoding

In [None]:
# Names of the training columns whose dtype compares equal to 'object' or
# 'str'; these are the columns that get label-encoded.
str_type = [
    column_name
    for column_name in X_train.columns
    if X_train[column_name].dtype == 'object' or X_train[column_name].dtype == 'str'
]

In [None]:
from sklearn.preprocessing import LabelEncoder

labelencoder = LabelEncoder()

# Encode each string column with ONE mapping shared by train and test.
# Calling fit_transform separately on the test column would rebuild the
# mapping from the test values alone, so the same category could receive
# different integer codes in the two sets whenever their category sets differ.
for column in str_type:
    train_values = X_train[column].astype(str)
    test_values = X_test[column].astype(str)
    # Fit on the union so categories seen in only one of the sets still transform.
    labelencoder.fit(pd.concat([train_values, test_values], ignore_index=True))
    X_train[column] = labelencoder.transform(train_values)
    X_test[column] = labelencoder.transform(test_values)

X_train.head()

In [None]:
# Remove the SK_ID_CURR identifier column from both feature tables.
id_column = 'SK_ID_CURR'
X_train2, X_test2 = (
    feature_frame.drop(columns=id_column)
    for feature_frame in (X_train, X_test)
)

Scaling

In [None]:
# Fit the scaler on the training features only, then apply the same fitted
# transformation to both the training and the test features.
std_scale = StandardScaler().fit(X_train2)
X_train_std, X_test_std = (
    std_scale.transform(feature_block)
    for feature_block in (X_train2, X_test2)
)

### MODELS  
Below we test different learning models and determine best hyper parameter settings for each in order to maximize the AUC score. For each individual model run, the scores were noted upon the upload of test data set prediction probabilities to Kaggle. 

In [None]:
# Shared 5-fold shuffled splitter with a fixed seed, used by the searches below.
cv = KFold(
    n_splits=5,
    random_state=42,
    shuffle=True,
)

#### Logistic Regression  
SCORE: 0.717

In [None]:
# Pipeline whose final step is named 'lr' so that the `lr__*` grid keys
# resolve. Scaling lives inside the pipeline so it is refit on every CV
# training fold. `lr_pipeline` was previously referenced without being defined.
lr_pipeline = Pipeline([('scaler', StandardScaler()),
                        ('lr', LogisticRegression(solver='liblinear'))])

lr_grid = {'lr__C': [0.01, 0.1, 1, 100, 1000],
           'lr__class_weight': [None, 'balanced']}

# Nested cross-validation repeated over three seeds: the inner loop tunes the
# hyperparameters, the outer loop estimates the ROC AUC of the tuned model.
# NOTE(review): `X_train_drp` was not defined anywhere in the visible notebook;
# X_train2 (features with SK_ID_CURR dropped, unscaled) is used instead because
# the pipeline performs the scaling itself - confirm this was the intent.
lr_scores = []
for i in range(3):
    inner_cv = KFold(n_splits=5, shuffle=True, random_state=i)
    outer_cv = KFold(n_splits=5, shuffle=True, random_state=i)

    lr_clf = RandomizedSearchCV(estimator=lr_pipeline, param_distributions=lr_grid,
                                scoring='roc_auc', cv=inner_cv, n_jobs=-1,
                                random_state=i)

    lr_score = cross_val_score(lr_clf, X=X_train2, y=y_train,
                               cv=outer_cv, scoring='roc_auc',
                               error_score="raise")
    # Report each repetition; previously the scores were computed and discarded.
    print(lr_score)
    lr_scores.append(lr_score)

print("Mean nested-CV ROC AUC: ", np.mean(lr_scores))

# determining the best parameter settings
lr_clf.fit(X_train2, y_train)
print("Best Parameter Settings: ",lr_clf.best_params_)

#### XGBoost  
SCORE: 0.761

In [None]:
# Pipeline whose final step is named 'xgb' so that the `xgb__*` grid keys
# resolve. `xgb_pipeline` was previously referenced without being defined.
xgb_pipeline = Pipeline([('scaler', StandardScaler()),
                         ('xgb', xgb.XGBClassifier())])

# The comma after the max_depth entry was missing, which made the whole cell
# a SyntaxError.
xgb_grid = {'xgb__gamma': [0.1, 1, 10],
            'xgb__max_depth': range(2, 17, 5),
            'xgb__n_estimators': range(200, 1700, 500)}

# Nested cross-validation repeated over three seeds (inner loop tunes, outer
# loop scores).
# NOTE(review): `X_train_drp` was not defined anywhere in the visible notebook;
# X_train2 (features with SK_ID_CURR dropped, unscaled) is used instead because
# the pipeline performs the scaling itself - confirm this was the intent.
for i in range(3):
    inner_cv = KFold(n_splits=5, shuffle=True, random_state=i)
    outer_cv = KFold(n_splits=5, shuffle=True, random_state=i)

    # Seed the search: only 10 of the 27 combinations are sampled by default.
    xgb_clf = RandomizedSearchCV(estimator=xgb_pipeline, param_distributions=xgb_grid,
                                 scoring='roc_auc', cv=inner_cv, n_jobs=-1,
                                 random_state=i)

    xgb_score = cross_val_score(xgb_clf, X=X_train2, y=y_train,
                                cv=outer_cv, scoring='roc_auc',
                                error_score="raise")
    print(xgb_score)

# determining best parameter settings
xgb_clf.fit(X_train2, y_train)
print("Best Parameter Settings: ",xgb_clf.best_params_)

#### LGBM  
SCORE: 0.68

In [None]:
# Hold out 30% of the training rows as an evaluation set for LightGBM.
X_train3, X_val, y_train3, y_val = train_test_split(X_train2, y_train, test_size=0.30, random_state=42)

# Use a dedicated scaler here. Refitting the shared `std_scale` on this 70%
# subset would overwrite the statistics that produced X_train_std/X_test_std,
# so any later `std_scale.transform(...)` call would no longer match them.
lgbm_scale = StandardScaler()
X_train3_std = lgbm_scale.fit_transform(X_train3)
X_val_std = lgbm_scale.transform(X_val)

# Fixed LightGBM settings; the remaining hyperparameters are searched below.
# NOTE(review): acceptance of `silent` here and of `verbose` in fit() depends
# on the installed LightGBM version - confirm against the environment.
lgbm = LGBMClassifier(
    nthread=4,
    
    colsample_bytree=0.9,
    subsample=0.8,
    max_depth=8,
    reg_alpha=0.04,
    reg_lambda=0.07,
    min_split_gain=0.04,
    min_child_weight=39,
    silent=-1,
    verbose=-1
   )

lgbm_grid={'boosting_type':['gbdt','goss'],
           'n_estimators':[10000, 15000,20000],
           'learning_rate':[0.0005, 0.001,0.01,0.05],
           'num_leaves':[20,30,40,50]}

# Exhaustive search (2 * 3 * 4 * 4 = 96 combinations) scored by ROC AUC on the
# shared `cv` splitter; the validation split is passed through as eval_set.
lgbm_cl=GridSearchCV(lgbm, param_grid=lgbm_grid, cv = cv, scoring='roc_auc', n_jobs=-1)
lgbm_cl.fit(X_train3_std,y_train3, eval_set=[(X_val_std,y_val)], verbose=200)
lgbm_cl.best_params_

#### Random Forest  
SCORE: 0.652

In [None]:
# Candidate values for the random-forest search: 100 evenly spaced tree counts
# in [200, 1000] (truncated to int) and the two depths 5 and 15.
n_estimators = np.linspace(start=200, stop=1000, num=100).astype(int).tolist()
max_depth = np.linspace(5, 15, num=2).astype(int).tolist()
min_samples_split = [2, 5, 10]
min_samples_leaf = [1, 2, 4]
rf_grid = dict(
    n_estimators=n_estimators,
    max_depth=max_depth,
    min_samples_split=min_samples_split,
    min_samples_leaf=min_samples_leaf,
)

# Randomized search scored by ROC AUC on the shared `cv` splitter.
rf = RandomForestClassifier()
rf_cl = RandomizedSearchCV(
    estimator=rf,
    param_distributions=rf_grid,
    scoring='roc_auc',
    cv=cv,
    random_state=42,
    n_jobs=-1,
    verbose=2,
)
rf_cl.fit(X_train_std, y_train.values.ravel())
rf_cl.best_params_

#### AdaBoost  
SCORE: 0.668

In [None]:
# Search space for AdaBoost.
# NOTE(review): availability of the 'SAMME.R' algorithm depends on the
# installed scikit-learn version - confirm against the environment.
ada_grid = dict(
    n_estimators=[50, 100, 150, 200, 300, 500],
    learning_rate=[0.5, 1.0, 1.5],
    algorithm=['SAMME', 'SAMME.R'],
)

# Randomized search scored by ROC AUC on the shared `cv` splitter.
ada = AdaBoostClassifier()
ada_clf = RandomizedSearchCV(
    estimator=ada,
    param_distributions=ada_grid,
    scoring='roc_auc',
    cv=cv,
    random_state=42,
    n_jobs=-1,
    verbose=2,
)
ada_clf.fit(X_train_std, y_train.values.ravel())
ada_clf.best_params_

### Stacking  
In an attempt to further increase the scores obtained from the individual models, we employed stacking, which combines all of the previous models at their best hyperparameter settings.  
SCORE: 0.764

In [None]:
# Base learners for the stacked ensemble, each built with fixed hyperparameters.
rf = RandomForestClassifier(
    n_estimators=507,
    max_depth=15,
    min_samples_leaf=4,
    min_samples_split=5,
    random_state=42,
)
lr = LogisticRegression(class_weight="balanced", C=100, solver='liblinear')
ada = AdaBoostClassifier(n_estimators=500, learning_rate=1.5, algorithm='SAMME')

lgbm_settings = dict(
    nthread=4,
    colsample_bytree=0.9,
    subsample=0.8,
    max_depth=8,
    reg_alpha=0.04,
    reg_lambda=0.07,
    min_split_gain=0.04,
    min_child_weight=39,
    boosting_type='goss',
    n_estimators=15000,
    learning_rate=0.001,
    num_leaves=30,
)
lgbm = LGBMClassifier(**lgbm_settings)

xgb_c = xgb.XGBClassifier(n_estimators=700, max_depth=2, gamma=0.01)

# The XGBoost configuration serves both as a base learner and as the final
# estimator; passthrough=True also feeds the original features to the final
# estimator.
estimators = [('xgb', xgb_c), ('rf', rf), ('lr', lr), ('ada', ada), ('lgbm', lgbm)]
clf = StackingClassifier(
    estimators=estimators,
    final_estimator=xgb_c,
    passthrough=True,
    verbose=2,
)

clf.fit(X_train_std, y_train)




In [None]:
# `out` was used without ever being assigned. Compute the class probabilities
# of the fitted stacking ensemble on the scaled test features; column 1 is the
# probability of the second class in clf.classes_.
# NOTE(review): assumes the submission is meant to come from `clf` - confirm.
out = clf.predict_proba(X_test_std)

# One row per test applicant: identifier plus predicted probability.
df = pd.DataFrame({'SK_ID_CURR': test_df['SK_ID_CURR'],
                   'TARGET': out[:, 1]})
df.to_csv('output3.csv',index=False)

### Other Initial Explorations - but incorrect approach which we did not pursue further

In [None]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import roc_curve, auc, matthews_corrcoef,cohen_kappa_score,make_scorer
from sklearn.model_selection import cross_val_score,GridSearchCV,KFold, train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier 
from sklearn.naive_bayes import CategoricalNB
from sklearn.metrics import classification_report, confusion_matrix,confusion_matrix, ConfusionMatrixDisplay,roc_curve, auc, precision_recall_curve, recall_score, precision_score, f1_score
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler, StandardScaler
from sklearn.pipeline import Pipeline, make_pipeline

# Load both raw application tables. `app_test` was previously used without
# ever being read, which raised a NameError.
# NOTE: this cell reassigns X_train, y_train and X_test from the main pipeline.
app_train = pd.read_csv('application_train.csv')
app_test = pd.read_csv('application_test.csv')

# Fill missing values in BOTH tables (0 for numeric columns, 'N/A' otherwise).
# Previously only the test table was filled, leaving NaNs in the training
# features that the encoder/PCA steps below cannot handle.
for frame in (app_train, app_test):
    for column in frame.columns[frame.isnull().any()]:
        filler = 0 if pd.api.types.is_numeric_dtype(frame[column]) else 'N/A'
        frame[column] = frame[column].fillna(filler)

# Features are every column except TARGET; the label is kept as a 1-D Series
# (a one-column DataFrame triggers shape warnings in scikit-learn).
X_train = app_train.loc[:, app_train.columns != 'TARGET']
y_train = app_train['TARGET']
X_test = app_test.loc[:, app_test.columns != 'TARGET']



# Split the feature columns into string (object dtype) and everything else.
cols = X_train.columns
string_type = [column for column in cols if X_train[column].dtype == 'object']
other_type = [column for column in cols if column not in string_type]

# Fit the one-hot encoder on the training data only. A second fit on the test
# data would discard the training categories; categories unseen during fit
# are encoded as all zeros through handle_unknown='ignore'.
enc = OneHotEncoder(handle_unknown='ignore')
enc.fit(X_train[string_type])
encoded_names = enc.get_feature_names_out(string_type)

# Encoded string columns followed by the untouched remaining columns. The
# index is passed explicitly so the concat aligns rows by label.
# NOTE(review): SK_ID_CURR is still among `other_type` and therefore enters the
# PCA as a feature - confirm whether it should be dropped.
dummy = pd.DataFrame(enc.transform(X_train[string_type]).toarray(),
                     columns=encoded_names, index=X_train.index)
X_train_trans = pd.concat([dummy, X_train[other_type]], axis=1)

# Same transformation for the test features; `X_test_trans` was previously
# passed to pca.transform without ever being created.
dummy_test = pd.DataFrame(enc.transform(X_test[string_type]).toarray(),
                          columns=encoded_names, index=X_test.index)
X_test_trans = pd.concat([dummy_test, X_test[other_type]], axis=1)


from sklearn.decomposition import PCA
# Project onto 4 principal components fitted on the training features.
pca = PCA(4)
X_train_trans2 = pca.fit_transform(X_train_trans)
X_test_trans = pca.transform(X_test_trans)






# Splitters for nested cross-validation, scalers, and the scoring metric.
inner_cv = KFold(n_splits=5, shuffle=True, random_state=42)
outer_cv = KFold(n_splits=5, shuffle=True, random_state=42)
minmax = MinMaxScaler()
std = StandardScaler()
#scoring = make_scorer(matthews_corrcoef)
#scoring = make_scorer(cohen_kappa_score)
#scoring = make_scorer(f1_score)
scoring = 'roc_auc'

# Candidate classifiers with default settings.
knn = KNeighborsClassifier()
lr = LogisticRegression()
svm = SVC()
dt = DecisionTreeClassifier()
rf = RandomForestClassifier()
xg = XGBClassifier()

knn_grid = {'n_neighbors': range(1, 5, 1)}

# Split by solver capability: newton-cg/lbfgs only support the l2 penalty,
# while saga supports elasticnet (l1_ratio 0 is pure l2, 1 is pure l1).
# The original single dict also lacked a comma after the 'C' entry.
lr_grid = [
    {'solver': ['newton-cg', 'lbfgs'],
     'penalty': ['l2'],
     'C': [.1, 1.0]},
    {'solver': ['saga'],
     'penalty': ['elasticnet'],
     'C': [.1, 1.0],
     'l1_ratio': [0, 0.5, 1]},
]

# A list (not a tuple) of grids, as GridSearchCV documents for param_grid.
svm_grid = [{'kernel': ['rbf'], 'gamma': [1e-3, 1e-4]},
            {'kernel': ['linear'], 'C': [0.1, 1, 10]}]

# 'mse'/'mae' are regression criteria; the classifiers take 'gini'/'entropy'.
# The original dicts also lacked the comma after the 'max_depth' entry.
dt_grid = {'max_depth': range(1, 3),
           'criterion': ['gini', 'entropy'],
           'min_samples_split': range(2, 5),
           'min_samples_leaf': range(2, 5)
           }

rf_grid = {'max_depth': range(1, 3),
           'criterion': ['gini', 'entropy'],
           'min_samples_split': range(2, 5),
           'min_samples_leaf': range(2, 5),
           'n_estimators': range(5, 10, 1)
           }

xg_grid = {'max_depth': range(1, 3)
           }



import warnings

# Nested cross-validation for every candidate model: GridSearchCV tunes on the
# inner folds, cross_val_score evaluates on the outer folds. Warnings raised
# inside the block are recorded instead of being printed.
with warnings.catch_warnings(record=True):
    knn_rg=make_pipeline(minmax, GridSearchCV(knn, param_grid=knn_grid, cv = inner_cv, scoring=scoring))
    svm_rg=make_pipeline(minmax, GridSearchCV(svm, param_grid=svm_grid, cv = inner_cv, scoring=scoring))
    lr_rg=make_pipeline(std, GridSearchCV(lr, param_grid=lr_grid, cv = inner_cv, scoring=scoring))
    dt_rg=GridSearchCV(dt, param_grid=dt_grid, cv = inner_cv, scoring=scoring)
    rf_rg=GridSearchCV(rf, param_grid=rf_grid, cv = inner_cv, scoring=scoring)
    xg_rg=GridSearchCV(xg, param_grid=xg_grid, cv = inner_cv, scoring=scoring)

    # Flatten the label so both a Series and a one-column DataFrame work.
    y_flat = np.ravel(y_train)

    # Pass the outer splitter and the metric explicitly. Without them
    # cross_val_score falls back to its default folds and the estimator's
    # default scorer, so `outer_cv` and `scoring` were silently unused.
    knn_score=cross_val_score(knn_rg, X=X_train_trans2, y=y_flat, cv=outer_cv, scoring=scoring)
    svm_score=cross_val_score(svm_rg, X=X_train_trans2, y=y_flat, cv=outer_cv, scoring=scoring)
    lr_score=cross_val_score(lr_rg, X=X_train_trans2, y=y_flat, cv=outer_cv, scoring=scoring)
    dt_score=cross_val_score(dt_rg, X=X_train_trans2, y=y_flat, cv=outer_cv, scoring=scoring)
    rf_score=cross_val_score(rf_rg, X=X_train_trans2, y=y_flat, cv=outer_cv, scoring=scoring)
    xg_score=cross_val_score(xg_rg, X=X_train_trans2, y=y_flat, cv=outer_cv, scoring=scoring)


    # Mean outer-fold score per model, in the order knn, svm, lr, dt, rf, xg.
    print((knn_score.mean()))
    print((svm_score.mean())) 
    print((lr_score.mean())) 
    print((dt_score.mean())) 
    print((rf_score.mean())) 
    print((xg_score.mean())) 





    
    
    

