# Give Me Some Credit

## Get the train dataset

In [1]:
import pandas as pd
GMSC_train_data = pd.read_csv('sources/cs-training.csv', index_col=0)
GMSC_train_data.describe()

Unnamed: 0,SeriousDlqin2yrs,RevolvingUtilizationOfUnsecuredLines,age,NumberOfTime30-59DaysPastDueNotWorse,DebtRatio,MonthlyIncome,NumberOfOpenCreditLinesAndLoans,NumberOfTimes90DaysLate,NumberRealEstateLoansOrLines,NumberOfTime60-89DaysPastDueNotWorse,NumberOfDependents
count,150000.0,150000.0,150000.0,150000.0,150000.0,120269.0,150000.0,150000.0,150000.0,150000.0,146076.0
mean,0.06684,6.048438,52.295207,0.421033,353.005076,6670.221,8.45276,0.265973,1.01824,0.240387,0.757222
std,0.249746,249.755371,14.771866,4.192781,2037.818523,14384.67,5.145951,4.169304,1.129771,4.155179,1.115086
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.029867,41.0,0.0,0.175074,3400.0,5.0,0.0,0.0,0.0,0.0
50%,0.0,0.154181,52.0,0.0,0.366508,5400.0,8.0,0.0,1.0,0.0,0.0
75%,0.0,0.559046,63.0,0.0,0.868254,8249.0,11.0,0.0,2.0,0.0,1.0
max,1.0,50708.0,109.0,98.0,329664.0,3008750.0,58.0,98.0,54.0,98.0,20.0


In [2]:
Y_train = GMSC_train_data['SeriousDlqin2yrs']
GMSC_train_data.drop(columns=['SeriousDlqin2yrs'], inplace=True)
X_train = GMSC_train_data

## Preprocess the data

In [3]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import RobustScaler

num_pipeline = Pipeline([
        ('imputer', SimpleImputer(strategy="mean")),
        ('std_scaler', RobustScaler()),
    ])

X_train_std = num_pipeline.fit_transform(X_train)[0:3]

In [6]:
X_train_std.shape

(3, 10)

## Find bests hyperparameters for XGBClassifier with GridSearchCV 

In [7]:
import xgboost as xgboost
from sklearn.model_selection import StratifiedKFold, GridSearchCV

xgb = xgboost.XGBClassifier(
    nthread = 8,
    objective ='binary:logistic',
    tree_method = 'gpu_hist',
    gpu_id = 0,
#     predictor = 'cpu_predictor'
)

skf = StratifiedKFold(
    n_splits = 5,
    shuffle = True,
    random_state = 42
)

ModuleNotFoundError: No module named 'xgboost'

In [None]:
params = {
    'min_child_weight': [1, 5, 10],
    'gamma': [0.5, 1, 1.5, 2, 5],
    'subsample': [0.6, 0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0],
    'max_depth': [3, 4, 5]
}

grid_search = GridSearchCV(
    xgb,
    param_grid=params,
    scoring='roc_auc',
    n_jobs=8,
    cv=skf.split(X_train_std, Y_train),
    verbose=3,
)

grid_search.fit(X_train_std, Y_train)

In [None]:
print(f'Best score : {grid_search.best_score_}')
print(f'Best estimator: {grid_search.best_estimator_}')
print(f'Best hyperparameters: {grid_search.best_params_}')

## Get the test dataset

In [None]:
GMSC_test_data = pd.read_csv('sources/cs-test.csv')
id_test = GMSC_test_data['Unnamed: 0']
GMSC_test_data.drop(columns=['SeriousDlqin2yrs'], inplace=True)
GMSC_test_data.drop(columns=['Unnamed: 0'], inplace=True)
GMSC_test_data.describe()

## Make a batch prediction on the test dataset

In [None]:
X_test_std = num_pipeline.transform(GMSC_test_data)
y_pred = grid_search.predict_proba(X_test_std)
kaggle_df = pd.DataFrame(data={'id':id_test, 'Probability':y_pred[:,1]})
kaggle_df.to_csv('submission-kaggle.csv', index=False)