# Give Me Some Credit - Kaggle Competition

Improve on the state of the art in credit scoring by predicting the probability that somebody will experience financial distress in the next two years.

## Get the train dataset

In [1]:
import pandas as pd
GMSC_train_data = pd.read_csv('sources/cs-training.csv', index_col=0)
GMSC_train_data.describe()

Unnamed: 0,SeriousDlqin2yrs,RevolvingUtilizationOfUnsecuredLines,age,NumberOfTime30-59DaysPastDueNotWorse,DebtRatio,MonthlyIncome,NumberOfOpenCreditLinesAndLoans,NumberOfTimes90DaysLate,NumberRealEstateLoansOrLines,NumberOfTime60-89DaysPastDueNotWorse,NumberOfDependents
count,150000.0,150000.0,150000.0,150000.0,150000.0,120269.0,150000.0,150000.0,150000.0,150000.0,146076.0
mean,0.06684,6.048438,52.295207,0.421033,353.005076,6670.221,8.45276,0.265973,1.01824,0.240387,0.757222
std,0.249746,249.755371,14.771866,4.192781,2037.818523,14384.67,5.145951,4.169304,1.129771,4.155179,1.115086
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.029867,41.0,0.0,0.175074,3400.0,5.0,0.0,0.0,0.0,0.0
50%,0.0,0.154181,52.0,0.0,0.366508,5400.0,8.0,0.0,1.0,0.0,0.0
75%,0.0,0.559046,63.0,0.0,0.868254,8249.0,11.0,0.0,2.0,0.0,1.0
max,1.0,50708.0,109.0,98.0,329664.0,3008750.0,58.0,98.0,54.0,98.0,20.0


In [2]:
Y_train = GMSC_train_data['SeriousDlqin2yrs']
GMSC_train_data.drop(columns=['SeriousDlqin2yrs'], inplace=True)
X_train = GMSC_train_data

## Preprocess the data

In [3]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

num_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy="median")),
    ('std_scaler', StandardScaler()),
])

X_train_std = num_pipeline.fit_transform(X_train)

## Find bests hyperparameters for XGBClassifier 
### Cross-validation with StratifiedKFold

In [4]:
from sklearn.model_selection import StratifiedKFold
skf = StratifiedKFold(
    n_splits = 10,
    shuffle = True,
    random_state = 42
)

### GridSearchCV and RandomizedSearchCV

In [5]:
from sklearn.ensemble import ExtraTreesClassifier

clf = ExtraTreesClassifier()

In [6]:
from sklearn.model_selection import RandomizedSearchCV

params = {
    'bootstrap' : [True, False],
#     'class_weight' : [None],
    'criterion' : ['gini', 'entropy'],
    'max_depth' : [5, 10, 20, 30, 40, 50, None],
    'max_features' : ['auto', 'sqrt'],
#     'max_leaf_nodes' : [None],
#     'min_impurity_decrease' : [0.0],
#     'min_impurity_split' : [None],
    'min_samples_leaf' : [1, 2, 4],
    'min_samples_split' : [2, 5, 10],
#     'min_weight_fraction_leaf' : [0.0],
    'n_estimators' : [200, 400],
#     'oob_score' : [False],
#     'warm_start' : [False]
}

search_cv = RandomizedSearchCV(
    estimator = clf,
    param_distributions = params,
    scoring = 'roc_auc',
    n_iter = 15,
    n_jobs = 8,
    cv = skf.split(X_train_std, Y_train),
    verbose = 3,
    random_state = 42,
)

In [7]:
search_cv.fit(X_train_std, Y_train)

Fitting 10 folds for each of 15 candidates, totalling 150 fits


[Parallel(n_jobs=8)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  16 tasks      | elapsed:  2.0min
[Parallel(n_jobs=8)]: Done 112 tasks      | elapsed:  9.1min
[Parallel(n_jobs=8)]: Done 150 out of 150 | elapsed: 12.1min finished


RandomizedSearchCV(cv=<generator object _BaseKFold.split at 0x0000021485D2E048>,
                   error_score='raise-deprecating',
                   estimator=ExtraTreesClassifier(bootstrap=False,
                                                  class_weight=None,
                                                  criterion='gini',
                                                  max_depth=None,
                                                  max_features='auto',
                                                  max_leaf_nodes=None,
                                                  min_impurity_decrease=0.0,
                                                  min_impurity_split=None,
                                                  min_samples_leaf=1,
                                                  min_samples_split=2,
                                                  min_weight_fract...
                   iid='warn', n_iter=15, n_jobs=8,
                   param_distributions={

### Bests hyperparameters for XGBClassifier 

In [8]:
print(f'Best score : {search_cv.best_score_}')
print(f'Best estimator: {search_cv.best_estimator_}')
print(f'Best hyperparameters: {search_cv.best_params_}')

Best score : 0.856747327554313
Best estimator: ExtraTreesClassifier(bootstrap=True, class_weight=None, criterion='entropy',
                     max_depth=30, max_features='auto', max_leaf_nodes=None,
                     min_impurity_decrease=0.0, min_impurity_split=None,
                     min_samples_leaf=1, min_samples_split=10,
                     min_weight_fraction_leaf=0.0, n_estimators=400,
                     n_jobs=None, oob_score=False, random_state=None, verbose=0,
                     warm_start=False)
Best hyperparameters: {'n_estimators': 400, 'min_samples_split': 10, 'min_samples_leaf': 1, 'max_features': 'auto', 'max_depth': 30, 'criterion': 'entropy', 'bootstrap': True}


In [9]:
# print(f'CV results hyperparameters: {search_cv.cv_results_}')

## Get the test dataset

In [10]:
GMSC_test_data = pd.read_csv('sources/cs-test.csv')
id_test = GMSC_test_data['Unnamed: 0']
GMSC_test_data.drop(columns=['SeriousDlqin2yrs'], inplace=True)
GMSC_test_data.drop(columns=['Unnamed: 0'], inplace=True)

GMSC_test_data.describe()

Unnamed: 0,RevolvingUtilizationOfUnsecuredLines,age,NumberOfTime30-59DaysPastDueNotWorse,DebtRatio,MonthlyIncome,NumberOfOpenCreditLinesAndLoans,NumberOfTimes90DaysLate,NumberRealEstateLoansOrLines,NumberOfTime60-89DaysPastDueNotWorse,NumberOfDependents
count,101503.0,101503.0,101503.0,101503.0,81400.0,101503.0,101503.0,101503.0,101503.0,98877.0
mean,5.31,52.405436,0.45377,344.47502,6855.036,8.453514,0.296691,1.013074,0.270317,0.769046
std,196.156039,14.779756,4.538487,1632.595231,36508.6,5.1441,4.515859,1.110253,4.503578,1.136778
min,0.0,21.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.030131,41.0,0.0,0.173423,3408.0,5.0,0.0,0.0,0.0,0.0
50%,0.152586,52.0,0.0,0.36426,5400.0,8.0,0.0,1.0,0.0,0.0
75%,0.564225,63.0,0.0,0.851619,8200.0,11.0,0.0,2.0,0.0,1.0
max,21821.0,104.0,98.0,268326.0,7727000.0,85.0,98.0,37.0,98.0,43.0


## Make a batch prediction on the test dataset

In [13]:
# GMSC_test_data = preprocess(GMSC_test_data)
X_test_std = num_pipeline.transform(GMSC_test_data)
y_pred = search_cv.predict_proba(X_test_std)
kaggle_df = pd.DataFrame(data={'id':id_test, 'Probability':y_pred[:,1]})
kaggle_df.to_csv('submission-kaggle-etc.csv', index=False)