# Give Me Some Credit - Kaggle Competition

Improve on the state of the art in credit scoring by predicting the probability that somebody will experience financial distress in the next two years.

## Get the train dataset

In [None]:
import pandas as pd
# Load the labelled training set, using the first CSV column as the row index.
GMSC_train_data = pd.read_csv(
    'sources/cs-training.csv',
    index_col=0,
)

# Summary statistics for every column (shown as the cell output).
GMSC_train_data.describe()

In [None]:
# Feature matrix: every column of the training frame except the target.
X_train = GMSC_train_data.drop(columns='SeriousDlqin2yrs')

# Target vector: the `SeriousDlqin2yrs` column on its own.
Y_train = GMSC_train_data.loc[:, 'SeriousDlqin2yrs']

## Preprocess the data

In [None]:
# def preprocess(dataframe):
#     dataframe.loc[dataframe['NumberOfTime30-59DaysPastDueNotWorse'] >= 95,'NumberOfTime30-59DaysPastDueNotWorse'] = 18
#     dataframe.loc[dataframe['NumberOfTime60-89DaysPastDueNotWorse'] >= 95,'NumberOfTime60-89DaysPastDueNotWorse'] = 18
#     dataframe.loc[dataframe['NumberOfTimes90DaysLate'] >= 95,'NumberOfTimes90DaysLate'] = 18
    
#     dataframe['NumberOfTimeGlobal'] = dataframe['NumberOfTime30-59DaysPastDueNotWorse'] + dataframe['NumberOfTime60-89DaysPastDueNotWorse']*2 + dataframe['NumberOfTimes90DaysLate']*3
    
#     dataframe.loc[dataframe['age'] < 60,'IsOld'] = 0
#     dataframe.loc[dataframe['age'] >= 60,'IsOld'] = 1
    
#     return dataframe

# X_train = preprocess(X_train)
# TODO: when re-enabling preprocess here, also re-enable it on the test dataset

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

# Numeric preprocessing, applied in this order:
#   1. fill missing values with the median of each column;
#   2. standardise each column with StandardScaler.
median_imputer = SimpleImputer(strategy="median")
standardizer = StandardScaler()
num_pipeline = Pipeline(steps=[
    ('imputer', median_imputer),
    ('std_scaler', standardizer),
])

# Fit the pipeline on the training features and transform them in one pass.
# The fitted pipeline is reused later to transform the test set.
X_train_std = num_pipeline.fit_transform(X_train)

## Find the best hyperparameters for LightGBM
### Cross-validation with StratifiedKFold

In [None]:
from sklearn.model_selection import StratifiedKFold
# 10-fold stratified splitter; shuffling with a fixed seed keeps the folds
# reproducible from one run to the next.
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

### GridSearchCV and RandomizedSearchCV

In [None]:
import lightgbm as lgb

# Base LightGBM classifier created with default settings; it is handed to the
# hyperparameter search below as the estimator to tune.
clf = lgb.LGBMClassifier()

In [None]:
from sklearn.model_selection import RandomizedSearchCV

# Candidate values for each LightGBM hyperparameter. Entries with a single
# value are held fixed; the others make up the search space.
params = {
    'learning_rate': [0.001],
    'boosting_type': ['gbdt'],  # alternatives: 'dart', 'goss', 'rf'
    'objective': ['binary'],
    'n_estimators': [200],
    'num_leaves': [150, 300, 500],
    'random_state': [42],  # Updated from 'seed'
    'colsample_bytree': [0.5, 0.55, 0.6],
    'subsample': [0.6, 0.65, 0.7],
    # LightGBM only applies `subsample` (row bagging) when `subsample_freq`
    # is > 0. With its default of 0 the three `subsample` values above would
    # train identical models, so bagging is enabled at every iteration.
    'subsample_freq': [1],
    'reg_alpha': [1, 1.2, 1.4],
    'reg_lambda': [1, 1.2, 1.4],
}

# Number of distinct hyperparameter combinations described by `params`.
n_candidates = 1
for candidate_values in params.values():
    n_candidates *= len(candidate_values)

search_cv = RandomizedSearchCV(
    estimator=clf,
    param_distributions=params,
    scoring='roc_auc',
    # Cap the number of draws at the size of the search space. Asking for more
    # only makes scikit-learn warn and fall back to trying every combination.
    n_iter=min(1000, n_candidates),
    n_jobs=8,
    # Pass the splitter itself rather than a one-shot `skf.split(...)`
    # generator: the folds are then derived from whatever data is given to
    # `fit`, and stay identical across runs thanks to the fixed random_state.
    cv=skf,
    verbose=3,
    random_state=42,
)

In [None]:
# Run the hyperparameter search on the standardized training features:
# each sampled candidate is cross-validated and scored with ROC AUC.
search_cv.fit(X_train_std, Y_train)

### Best hyperparameters for LightGBM

In [None]:
# Report the winning candidate of the search: its mean cross-validated score,
# the refitted estimator and the hyperparameters that produced it.
print(
    f'Best score : {search_cv.best_score_}',
    f'Best estimator: {search_cv.best_estimator_}',
    f'Best hyperparameters: {search_cv.best_params_}',
    sep='\n',
)

In [None]:
# print(f'CV results hyperparameters: {search_cv.cv_results_}')

## Get the test dataset

In [None]:
# Load the test set. Unlike the training load, no index column is requested,
# so the first CSV column stays in the frame as 'Unnamed: 0'
# (presumably the row id -- confirm against the CSV).
GMSC_test_data = pd.read_csv('sources/cs-test.csv')

# Keep that column aside: it is written as the `id` column of the submission.
id_test = GMSC_test_data['Unnamed: 0']

# Remove the target column and the id column in a single pass so that only
# the feature columns remain.
GMSC_test_data.drop(columns=['SeriousDlqin2yrs', 'Unnamed: 0'], inplace=True)

# Summary statistics of the remaining feature columns (shown as cell output).
GMSC_test_data.describe()

## Make a batch prediction on the test dataset

In [None]:
# GMSC_test_data = preprocess(GMSC_test_data)

# Apply the imputation and scaling that were fitted on the training data.
X_test_std = num_pipeline.transform(GMSC_test_data)

# predict_proba returns one column per class; column 1 is kept as the
# submitted probability (the positive class, assuming 0/1 labels).
y_pred = search_cv.predict_proba(X_test_std)
positive_class_proba = y_pred[:, 1]

# Assemble the two-column submission and write it without the frame index.
submission_columns = {'id': id_test, 'Probability': positive_class_proba}
kaggle_df = pd.DataFrame(data=submission_columns)
kaggle_df.to_csv('submission-kaggle-lgbm.csv', index=False)