# Give Me Some Credit

## Get the train dataset

In [None]:
import pandas as pd
# Load the training set. `index_col=0` turns the first CSV column into the
# DataFrame index instead of keeping it as a regular column.
train_csv_path = 'sources/cs-training.csv'
GMSC_train_data = pd.read_csv(train_csv_path, index_col=0)

# Last expression of the cell: summary statistics of the numeric columns.
GMSC_train_data.describe()

In [None]:
# Separate the target from the features.
# `pop` removes the 'SeriousDlqin2yrs' column from the frame in place and
# returns it, so the frame that remains contains only the feature columns.
# NOTE: because the frame is mutated in place, running this cell a second
# time without reloading the data raises a KeyError.
Y_train = GMSC_train_data.pop('SeriousDlqin2yrs')

# X_train is the same object as GMSC_train_data (not a copy).
X_train = GMSC_train_data

## Preprocess the data

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import RobustScaler

# Preprocessing applied to every feature column, in order:
#   1. 'imputer'    - replace missing values with the column mean;
#   2. 'std_scaler' - rescale with RobustScaler (despite the step name, this
#                     is not a StandardScaler).
preprocessing_steps = [
    ('imputer', SimpleImputer(strategy="mean")),
    ('std_scaler', RobustScaler()),
]
num_pipeline = Pipeline(steps=preprocessing_steps)

# Learn the imputation and scaling statistics on the training features and
# transform them in one pass; the fitted pipeline is reused for the test set.
X_train_std = num_pipeline.fit_transform(X_train)

## Find the best hyperparameters for Naive Bayes with GridSearchCV

In [None]:
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import StratifiedKFold, GridSearchCV

# Base estimator whose hyperparameters are tuned below.
clf = GaussianNB()

# Stratified folds keep the class ratio of Y_train in every split; the fixed
# seed makes the shuffling (and therefore the CV scores) reproducible.
skf = StratifiedKFold(
    n_splits=5,
    shuffle=True,
    random_state=42,
)

# GaussianNB has a single tunable hyperparameter, `var_smoothing` (the
# fraction of the largest feature variance added to all variances for
# numerical stability). The previous grid listed gradient-boosting
# parameters (max_depth, gamma, subsample, ...) that GaussianNB does not
# accept, which makes GridSearchCV fail with "Invalid parameter".
# The candidates span 1e-12 .. 1, i.e. several decades around
# scikit-learn's default of 1e-9.
params = {
    'var_smoothing': [10.0 ** exponent for exponent in range(-12, 1)],
}

grid_search = GridSearchCV(
    clf,
    param_grid=params,
    scoring='roc_auc',
    n_jobs=8,
    # Pass the splitter itself rather than `skf.split(...)`: the latter is a
    # one-shot generator, while the splitter can be reused safely.
    cv=skf,
    verbose=3,
)

# Run the search itself. Fitting only `clf` (as before) left `grid_search`
# unfitted, so `best_score_` / `best_estimator_` did not exist.
grid_search.fit(X_train_std, Y_train)

# With the default refit=True, best_estimator_ is retrained on the whole
# training set using the best parameters. Rebind `clf` to it so that later
# cells calling `clf.predict_proba(...)` use the tuned model.
clf = grid_search.best_estimator_

In [None]:
# Report the outcome of the hyperparameter search.
# Both attributes only exist once `grid_search.fit(...)` has been called.
best_cv_score = grid_search.best_score_
best_model = grid_search.best_estimator_

print(f'Best score : {best_cv_score}')
print(f'Best estimator: {best_model}')

## Get the test dataset

In [None]:
# Load the test set. No index column is given here, so the first CSV column
# is read as a regular column named 'Unnamed: 0'.
test_csv_path = 'sources/cs-test.csv'
GMSC_test_data = pd.read_csv(test_csv_path)

# Keep that column aside: it is used as the 'id' column of the submission.
id_test = GMSC_test_data['Unnamed: 0']

# Remove, in place, the two columns that are not model features: the target
# column and the id column saved above.
non_feature_columns = ['SeriousDlqin2yrs', 'Unnamed: 0']
GMSC_test_data.drop(columns=non_feature_columns, inplace=True)

# Last expression of the cell: summary statistics of the remaining columns.
GMSC_test_data.describe()

## Make a batch prediction on the test dataset

In [None]:
# Apply the preprocessing fitted on the training data (transform only, no
# refit) so the test features go through the same imputation and scaling.
X_test_std = num_pipeline.transform(GMSC_test_data)

# predict_proba returns one column per class; column 1 is the probability of
# the second entry of clf.classes_ (presumably the positive class
# SeriousDlqin2yrs == 1 - verify against clf.classes_).
y_pred = clf.predict_proba(X_test_std)
positive_class_proba = y_pred[:, 1]

# Assemble the submission table and write it without the DataFrame index.
submission_columns = {'id': id_test, 'Probability': positive_class_proba}
kaggle_df = pd.DataFrame(data=submission_columns)
kaggle_df.to_csv('submission-kaggle.csv', index=False)