# Give Me Some Credit - Kaggle Competition

Improve on the state of the art in credit scoring by predicting the probability that somebody will experience financial distress in the next two years.

## Get the train dataset

In [1]:
import pandas as pd
GMSC_train_data = pd.read_csv('sources/cs-training.csv', index_col=0)
GMSC_train_data.describe()

Unnamed: 0,SeriousDlqin2yrs,RevolvingUtilizationOfUnsecuredLines,age,NumberOfTime30-59DaysPastDueNotWorse,DebtRatio,MonthlyIncome,NumberOfOpenCreditLinesAndLoans,NumberOfTimes90DaysLate,NumberRealEstateLoansOrLines,NumberOfTime60-89DaysPastDueNotWorse,NumberOfDependents
count,150000.0,150000.0,150000.0,150000.0,150000.0,120269.0,150000.0,150000.0,150000.0,150000.0,146076.0
mean,0.06684,6.048438,52.295207,0.421033,353.005076,6670.221,8.45276,0.265973,1.01824,0.240387,0.757222
std,0.249746,249.755371,14.771866,4.192781,2037.818523,14384.67,5.145951,4.169304,1.129771,4.155179,1.115086
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.029867,41.0,0.0,0.175074,3400.0,5.0,0.0,0.0,0.0,0.0
50%,0.0,0.154181,52.0,0.0,0.366508,5400.0,8.0,0.0,1.0,0.0,0.0
75%,0.0,0.559046,63.0,0.0,0.868254,8249.0,11.0,0.0,2.0,0.0,1.0
max,1.0,50708.0,109.0,98.0,329664.0,3008750.0,58.0,98.0,54.0,98.0,20.0


In [2]:
Y_train = GMSC_train_data['SeriousDlqin2yrs']
X_train = GMSC_train_data.drop(columns=['SeriousDlqin2yrs'], axis=1)

## Preprocess the data

In [3]:
# def preprocess(dataframe):
#     dataframe.loc[dataframe['NumberOfTime30-59DaysPastDueNotWorse'] >= 95,'NumberOfTime30-59DaysPastDueNotWorse'] = 18
#     dataframe.loc[dataframe['NumberOfTime60-89DaysPastDueNotWorse'] >= 95,'NumberOfTime60-89DaysPastDueNotWorse'] = 18
#     dataframe.loc[dataframe['NumberOfTimes90DaysLate'] >= 95,'NumberOfTimes90DaysLate'] = 18
    
#     dataframe['NumberOfTimeGlobal'] = dataframe['NumberOfTime30-59DaysPastDueNotWorse'] + dataframe['NumberOfTime60-89DaysPastDueNotWorse']*2 + dataframe['NumberOfTimes90DaysLate']*3
    
#     dataframe.loc[dataframe['age'] < 60,'IsOld'] = 0
#     dataframe.loc[dataframe['age'] >= 60,'IsOld'] = 1
    
#     return dataframe

# X_train = preprocess(X_train)

In [4]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

num_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy="median")),
    ('std_scaler', StandardScaler()),
])

X_train_std = num_pipeline.fit_transform(X_train)

## Find bests hyperparameters for XGBClassifier 
### Cross-validation with StratifiedKFold

In [5]:
from sklearn.model_selection import StratifiedKFold
skf = StratifiedKFold(
    n_splits = 5,
    shuffle = True,
    random_state = 42
)

### GridSearchCV and RandomizedSearchCV

In [6]:
GridSearchCV_Activated = True # if set to False Then RandomizedSearchCV is used

In [7]:
from xgboost.sklearn import XGBClassifier

xgb = XGBClassifier(
    booster = 'gbtree',  # default = gbtree
    objective = 'binary:logistic', # default = reg:squarederror
)

In [8]:
# Best score : 0.8667878600445456
# from xgboost.sklearn import XGBClassifier
# xgb = XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
#               colsample_bynode=1, colsample_bytree=0.6, eta=0.005, gamma=0.6,
#               learning_rate=0.1, max_delta_step=0, max_depth=5,
#               min_child_weight=9, missing=None, n_estimators=100, n_jobs=1,
#               nthread=None, objective='binary:logistic', random_state=0,
#               reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
#               silent=None, subsample=0.8, verbosity=1)
# xgb.fit(X_train_std, Y_train)

In [9]:
if not GridSearchCV_Activated:
    from sklearn.model_selection import RandomizedSearchCV
    params = {
        'max_depth': [4, 5, 6, 7, 8, 9, 10], # default = 6
        'min_child_weight': [2, 3, 4, 5, 6, 7, 8, 9, 10],  # default = 1
        'gamma': [0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1, 1.1, 1.2], # default = 0
        'subsample': [0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1], # default = 1
        'colsample_bytree': [0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1], # default = 1
        'lambda' : [1, 1.05, 1.10, 1.15], # default = 1
        'eta' : [0.001], # alias learning_rate default  = 0.3 
        'reg_alpha' : [1, 1.2, 1.4],
        'reg_lambda' : [1, 1.2, 1.4],
    }
    
    search_cv = RandomizedSearchCV(
        estimator = xgb,
        param_distributions = params,
        scoring = 'roc_auc',
        n_iter = 1000,
        n_jobs = 8,
        cv = skf.split(X_train_std, Y_train),
        verbose = 3,
        random_state = 42,
    )

In [10]:
if GridSearchCV_Activated:
    from sklearn.model_selection import GridSearchCV
    params = {
        'max_depth': [4, 5, 6, 7, 8, 9, 10], # default = 6
        'min_child_weight': [2, 3, 4, 5, 6, 7, 8, 9, 10],  # default = 1
        'gamma': [0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1, 1.1, 1.2], # default = 0
        'subsample': [0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1], # default = 1
        'colsample_bytree': [0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1], # default = 1
        'lambda' : [1, 1.05, 1.10, 1.15], # default = 1
        'eta' : [0.001], # alias learning_rate default  = 0.3 
        'reg_alpha' : [1, 1.2, 1.4],
        'reg_lambda' : [1, 1.2, 1.4],
    }

    search_cv = GridSearchCV(
        estimator = xgb,
        param_grid = params,
        scoring = 'roc_auc',
        n_jobs = 8,
        cv = skf.split(X_train_std, Y_train),
        verbose = 3,
    )

In [11]:
search_cv.fit(X_train_std, Y_train)

Fitting 5 folds for each of 1000188 candidates, totalling 5000940 fits


[Parallel(n_jobs=8)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  16 tasks      | elapsed:   17.8s
[Parallel(n_jobs=8)]: Done 112 tasks      | elapsed:  1.6min
[Parallel(n_jobs=8)]: Done 272 tasks      | elapsed:  3.9min
[Parallel(n_jobs=8)]: Done 496 tasks      | elapsed:  7.0min
[Parallel(n_jobs=8)]: Done 784 tasks      | elapsed: 11.0min
[Parallel(n_jobs=8)]: Done 1136 tasks      | elapsed: 15.9min
[Parallel(n_jobs=8)]: Done 1552 tasks      | elapsed: 21.7min
[Parallel(n_jobs=8)]: Done 2032 tasks      | elapsed: 28.3min
[Parallel(n_jobs=8)]: Done 2576 tasks      | elapsed: 35.9min
[Parallel(n_jobs=8)]: Done 3184 tasks      | elapsed: 45.2min
[Parallel(n_jobs=8)]: Done 3856 tasks      | elapsed: 56.2min
[Parallel(n_jobs=8)]: Done 4592 tasks      | elapsed: 68.3min
[Parallel(n_jobs=8)]: Done 5392 tasks      | elapsed: 81.3min
[Parallel(n_jobs=8)]: Done 6256 tasks      | elapsed: 97.1min
[Parallel(n_jobs=8)]: Done 7184 tasks      | elapsed: 114.7min

KeyboardInterrupt: 

### Bests hyperparameters for XGBClassifier 

In [None]:
print(f'Best score : {search_cv.best_score_}')
print(f'Best estimator: {search_cv.best_estimator_}')
print(f'Best hyperparameters: {search_cv.best_params_}')

## Get the test dataset

In [None]:
GMSC_test_data = pd.read_csv('sources/cs-test.csv')
id_test = GMSC_test_data['Unnamed: 0']
GMSC_test_data.drop(columns=['SeriousDlqin2yrs'], inplace=True)
GMSC_test_data.drop(columns=['Unnamed: 0'], inplace=True)

GMSC_test_data.describe()

## Make a batch prediction on the test dataset

In [None]:
# GMSC_test_data = preprocess(GMSC_test_data)
X_test_std = num_pipeline.transform(GMSC_test_data)
y_pred = search_cv.predict_proba(X_test_std)
kaggle_df = pd.DataFrame(data={'id':id_test, 'Probability':y_pred[:,1]})
kaggle_df.to_csv('submission-kaggle.csv', index=False)