# Initialisation des packages et chargement du jeu de données

In [1]:
import os
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import sklearn

In [None]:
# Data directory. Defaults to the original local folder, but can be overridden with the
# P7_DATA_DIR environment variable so the notebook also runs on other machines.
dir_file = os.environ.get('P7_DATA_DIR', 'C:/Users/paul.bonte/Formation OC/P7_Bonte_Paul/data/')
# The file names below are appended by plain concatenation, so a trailing separator is required.
if not dir_file.endswith(('/', '\\')):
    dir_file += '/'

# One DataFrame per source CSV file
test = pd.read_csv(dir_file + 'application_test.csv')
train = pd.read_csv(dir_file + 'application_train.csv')
bureau = pd.read_csv(dir_file + 'bureau.csv')
bureau_balance = pd.read_csv(dir_file + 'bureau_balance.csv')
cc = pd.read_csv(dir_file + 'credit_card_balance.csv')
instal_paiement = pd.read_csv(dir_file + 'installments_payments.csv')
POS_cash = pd.read_csv(dir_file + 'POS_CASH_balance.csv')

In [None]:
print(test.info())

In [None]:
print(train.info())

In [None]:
print(bureau.info())

In [None]:
print(bureau_balance.info())

In [None]:
print(cc.info())

In [None]:
print(instal_paiement.info())

In [None]:
print(POS_cash.info())

# Feature engineering

## Variable de regroupement

In [None]:
print("train data: %s rows and %s cols"%train.shape)
print("Credit Card Balance: %s rows and %s cols"%cc.shape)
print("Bureau: %s rows and %s cols"%bureau.shape)
print("Installments Payments: %s rows and %s cols"%instal_paiement.shape)
print("POS_cash: %s rows and %s cols"%POS_cash.shape)

In [None]:
print(train.SK_ID_CURR.nunique())
print(cc.SK_ID_CURR.nunique())
print(bureau.SK_ID_CURR.nunique())
print(instal_paiement.SK_ID_CURR.nunique())
print(POS_cash.SK_ID_CURR.nunique())

## Création des features

In [None]:
def _safe_ratio(numerator, denominator):
    """Element-wise ``numerator / denominator``; NaN where the denominator is 0 or missing."""
    # Masking the zero denominators turns them into NaN, so the division yields NaN
    # instead of +/-inf. Missing denominators already propagate NaN on their own.
    return numerator / denominator.where(denominator != 0)


# Drawings of the month / credit limit: is there still money available?
cc['AMT_DRAWINGS_PCT'] = _safe_ratio(cc['AMT_DRAWINGS_CURRENT'], cc['AMT_CREDIT_LIMIT_ACTUAL'])
# ATM cash drawings of the month / credit limit
cc['AMT_DRAWINGS_ATM_PCT'] = _safe_ratio(cc['AMT_DRAWINGS_ATM_CURRENT'], cc['AMT_CREDIT_LIMIT_ACTUAL'])
# Other drawings of the month / credit limit
cc['AMT_DRAWINGS_OTHER_PCT'] = _safe_ratio(cc['AMT_DRAWINGS_OTHER_CURRENT'], cc['AMT_CREDIT_LIMIT_ACTUAL'])
# Amount drawn or spent on goods during the month / credit limit
cc['AMT_DRAWINGS_POS_PCT'] = _safe_ratio(cc['AMT_DRAWINGS_POS_CURRENT'], cc['AMT_CREDIT_LIMIT_ACTUAL'])
# Principal still receivable / total amount receivable on the previous credit
cc['AMT_PRINCIPAL_RECEIVABLE_PCT'] = _safe_ratio(cc['AMT_RECEIVABLE_PRINCIPAL'], cc['AMT_RECIVABLE'])

# Per-client mean of the main credit-card indicators (balance, drawings, days past due, ...).
# The string 'mean' is used instead of np.mean: same result, without the pandas deprecation warning.
cc_mean_columns = ['AMT_BALANCE',
                   'AMT_DRAWINGS_PCT',
                   'AMT_DRAWINGS_ATM_PCT',
                   'AMT_DRAWINGS_OTHER_PCT',
                   'AMT_DRAWINGS_POS_PCT',
                   'AMT_PRINCIPAL_RECEIVABLE_PCT',
                   'CNT_DRAWINGS_ATM_CURRENT',
                   'CNT_DRAWINGS_CURRENT',
                   'CNT_DRAWINGS_OTHER_CURRENT',
                   'CNT_DRAWINGS_POS_CURRENT',
                   'SK_DPD',
                   'SK_DPD_DEF']
cc_use = cc.groupby(['SK_ID_CURR'], as_index=False).agg({column: 'mean' for column in cc_mean_columns})

# Scheduled instalment date minus actual payment date: was the payment late?
instal_paiement['DAYS_INSTALMENT_DIFF'] = instal_paiement['DAYS_INSTALMENT'] - instal_paiement['DAYS_ENTRY_PAYMENT']
# What the client actually paid on the previous credit / what was due.
# (The column name keeps its original spelling because it ends up in the exported files.)
instal_paiement['AMT_PATMENT_PCT'] = _safe_ratio(instal_paiement['AMT_PAYMENT'], instal_paiement['AMT_INSTALMENT'])
# Per-client mean of the two instalment indicators
pmts_use = instal_paiement.groupby(['SK_ID_CURR'], as_index=False).agg({'DAYS_INSTALMENT_DIFF': 'mean',
                                                                        'AMT_PATMENT_PCT': 'mean'})

## Merge et récupération du Df final

In [None]:
# Left-join the per-client credit-card (cc_use) and instalment (pmts_use) aggregates
# onto the application table, keyed on SK_ID_CURR.
# NOTE(review): this cell rebinds `train`, so running it a second time merges the
# aggregate columns again; re-run the loading cell first if it must be repeated.
train = train.merge(cc_use,on='SK_ID_CURR',how='left')
train = train.merge(pmts_use,on='SK_ID_CURR',how='left')
# Persist the enriched table as train.csv in the current working directory
train.to_csv('train.csv', index=False)

In [None]:
print(train.shape)

# EDA

## Exploration des données

In [None]:
train.describe()

In [None]:
train.hist(figsize=(150,150))

In [None]:
# Walk through the text (object) columns: print the value counts for the ones with
# at most 3 distinct values, draw a count plot for the others.
for col in train.columns:
    column_values = train[col]
    if not column_values.dtype == 'object':
        continue
    n_distinct = column_values.nunique()
    print("object column %s have %s unique values"%(str(col), n_distinct))
    if n_distinct > 3:
        grid = sns.catplot(x=col, kind="count", data=train, height=6, aspect=2.2)
        # Tilt the category labels so that they stay readable
        for axis in grid.axes.flat:
            axis.set_xticklabels(axis.get_xticklabels(), rotation=45, horizontalalignment='right')
        continue
    print(column_values.value_counts(dropna=False))
    print('-------------')

In [None]:
train['CODE_GENDER'].unique()

In [None]:
# Drop the rows whose gender is the placeholder 'XNA'. The explicit copy makes `train`
# an independent frame, so later column assignments do not trigger SettingWithCopyWarning.
train = train[train['CODE_GENDER'] != 'XNA'].copy()

In [None]:
train['ORGANIZATION_TYPE'].unique()

In [None]:
def _group_organization_type(value, groups=('Industry', 'Trade', 'Business Entity', 'Transport')):
    """Return the first family name contained in `value`, or `value` itself when none matches."""
    # Non-string values (e.g. NaN) are returned untouched instead of raising TypeError on `in`.
    if not isinstance(value, str):
        return value
    for group in groups:
        if group in value:
            return group
    return value


# Collapse the detailed organisation labels into their family in a single pass.
# assign() builds a new frame, so no value is written into a slice of another DataFrame.
train = train.assign(ORGANIZATION_TYPE=train['ORGANIZATION_TYPE'].map(_group_organization_type))
print(train['ORGANIZATION_TYPE'].unique())

In [None]:
sns.catplot(x='ORGANIZATION_TYPE', kind="count", data=train,height=6, aspect=2.2)

## Données manquantes

In [None]:
# Percentage of missing values per column, from the most to the least incomplete
Nb_missing = (train.isna().mean() * 100).to_frame("p_missing")
Nb_missing = Nb_missing.sort_values(by='p_missing', ascending=False)

# Bar chart of the missing-value rates.
# The figure size has to be configured BEFORE the axes are created, otherwise it
# only applies to the figures drawn afterwards.
sns.set(rc={'figure.figsize':(10,20)})
fig1 = sns.barplot(x = Nb_missing.index, y = "p_missing" , data = Nb_missing)
plt.xticks(rotation=90)
plt.show()

In [None]:
train = train[train.columns[train.isnull().mean() < 0.4]]

In [None]:
# Percentage of missing values per column, from the most to the least incomplete
Nb_missing = (train.isna().mean() * 100).to_frame("p_missing")
Nb_missing = Nb_missing.sort_values(by='p_missing', ascending=False)

# Bar chart of the missing-value rates.
# The figure size has to be configured BEFORE the axes are created, otherwise it
# only applies to the figures drawn afterwards.
sns.set(rc={'figure.figsize':(12,12)})
fig1 = sns.barplot(x = Nb_missing.index, y = "p_missing" , data = Nb_missing)
plt.xticks(rotation=90)
plt.show()

In [None]:
train.shape

## Analyse TARGET

In [None]:
train.TARGET.value_counts(normalize= True)

In [None]:
train['TARGET'].plot.hist(title = 'target')

## Analyse des corrélations

In [None]:
# Correlation is only defined for numeric columns. Selecting them explicitly keeps the cell
# working on pandas >= 2.0, where DataFrame.corr() no longer drops text columns silently.
mat_corr = train.select_dtypes(include=['number', 'bool']).corr()
# Absolute values: the heatmap shows the strength of the relation, not its direction
ax = sns.heatmap(mat_corr.abs(), annot = True , cmap = 'coolwarm')
ax.set_yticklabels(ax.get_ymajorticklabels(), fontsize = 18)
ax.set_xticklabels(ax.get_xmajorticklabels(), fontsize = 18)
ax

In [None]:
# Correlation of every numeric column with TARGET, sorted in ascending order.
# Text columns are excluded explicitly (DataFrame.corr() raises on them with pandas >= 2.0).
correlations = train.select_dtypes(include=['number', 'bool']).corr()['TARGET'].sort_values()
print('Most Positive Correlations:\n', correlations.tail(8))
print('\nMost Negative Correlations:\n', correlations.head(8))

In [None]:
pos_corr = correlations.tail(8)
df_pos = train[pos_corr.index]
df_pos.hist()

In [None]:
# `correlations` is sorted in ascending order, so the most negative values are at the head
# (the tail holds the most positive ones, already plotted in the previous cell).
neg_corr = correlations.head(8)
df_neg = train[neg_corr.index]
df_neg.hist()

# Preprocessing

## Split des données

In [None]:
y_columns = ['TARGET']
X_id = train.drop(y_columns, axis=1)
print(X_id.shape)
y = train[y_columns]
print(y.shape)

In [None]:
# One-hot encode the text columns. The dtype is pinned to uint8 (the default before pandas 2.0):
# with the newer bool default the dummies would be routed to the categorical pipeline below,
# whose SimpleImputer does not accept boolean data.
X_id = pd.get_dummies(X_id, dtype='uint8')

In [None]:
X = X_id.drop(['SK_ID_CURR'],axis=1).reset_index(drop=True)

In [None]:
from sklearn.model_selection import train_test_split
# Stratified 70/30 split; the fixed seed makes the split (and every metric below) reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, stratify = y, random_state = 42)

## Création pipelines

In [None]:
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import StandardScaler

In [None]:
categorical_pipeline = Pipeline(steps=[("impute", SimpleImputer(strategy="most_frequent"))])

In [None]:
numeric_pipeline = Pipeline(steps=[("impute", SimpleImputer(strategy="median")), 
           ("scale", RobustScaler())])

In [None]:
cat_cols = X_train.select_dtypes(exclude="number").columns
num_cols = X_train.select_dtypes(exclude=["bool_","object_"]).columns

In [None]:
from sklearn.compose import ColumnTransformer

full_processor = ColumnTransformer(transformers=[("numeric", numeric_pipeline, num_cols),
        ("categorical", categorical_pipeline, cat_cols),])

In [None]:
X_train_pro = full_processor.fit_transform(X_train)

In [None]:
# The ColumnTransformer outputs the numeric block first, then the categorical one, so the
# labels are rebuilt in that order instead of assuming it matches the order of X.columns.
data = pd.DataFrame(data = X_train_pro, columns = list(num_cols) + list(cat_cols))

In [None]:
# `data` has a fresh 0..n-1 index while X_id still carries the original row labels, so a plain
# assignment would align on unrelated labels and attach the wrong client to each row.
# X was built from X_id with reset_index(drop=True), hence X_train.index holds row POSITIONS
# in X_id: select by position and assign the raw values.
data["SK_ID_CURR"] = X_id['SK_ID_CURR'].iloc[X_train.index].to_numpy()

In [None]:
data.to_csv('data.csv', index=False)

In [None]:
X_test_pro = full_processor.transform(X_test)

## Resampling

In [None]:
from imblearn.over_sampling import SMOTE

In [None]:
print(X_train_pro.shape)
print(y_train.shape)

In [None]:
y_train_imb = np.ravel(y_train)

In [None]:
# Oversample the minority class on the training set only.
# A fixed seed keeps the synthetic samples reproducible; the target is passed as a 1-D array.
smt = SMOTE(random_state = 42)
X_train_imp , y_train_imp = smt.fit_resample(X_train_pro, np.ravel(y_train))

In [None]:
print(X_train_imp.shape)
print(y_train_imp.shape)

In [None]:
y_train_imp = np.ravel(y_train_imp)

## Récupérer les noms des features

In [None]:
def get_feature_names(column_transformer):
    """Collect the output feature names of a column transformer or a pipeline.

    Every name is built as ``<transformer name>__<feature name>``. When a transformer
    has no ``get_feature_names`` method, a warning is emitted and the input column
    names are used instead (or nothing at all when the columns are unknown).

    NOTE(review): this helper relies on private attributes of the object it receives
    (``_iter``, ``_df_columns``, ``_n_features``) and on the ``get_feature_names``
    method of the transformers; confirm they exist in the installed scikit-learn version.

    Parameters
    ----------
    column_transformer : object
        Either a ``sklearn.pipeline.Pipeline`` (exact type check) or an object exposing
        ``_iter(fitted=True)``; presumably a fitted ColumnTransformer, to be confirmed
        by the caller.

    Returns
    -------
    feature_names : list of strings
        Names of the features produced by transform.
    """
    # The fitted-state check of the original scikit-learn method is intentionally left out.
    
    # The lookup is a nested function so it can be reused for each transformer of the loop below.
    # It reads `name` and `column` from the enclosing loop (closure), not from its arguments.
    def get_names(trans):
        # Logic taken from the original get_feature_names() method:
        # dropped transformers and empty column selections produce no feature.
        if trans == 'drop' or (
                hasattr(column, '__len__') and not len(column)):
            return []
        if trans == 'passthrough':
            if hasattr(column_transformer, '_df_columns'):
                # Columns already given as strings are returned as they are
                if ((not isinstance(column, slice))
                        and all(isinstance(col, str) for col in column)):
                    return column
                else:
                    return column_transformer._df_columns[column]
            else:
                # No column labels available: fall back to positional names x0, x1, ...
                indices = np.arange(column_transformer._n_features)
                return ['x%d' % i for i in indices[column]]
        if not hasattr(trans, 'get_feature_names'):
            # Change from the original method: warn instead of raising an error
            warnings.warn("Transformer %s (type %s) does not "
                                 "provide get_feature_names. "
                                 "Will return input column names if available"
                                 % (str(name), type(trans).__name__))
            # For transformers without a get_feature_names method, use the input
            # names given to the column transformer (none when `column` is None)
            if column is None:
                return []
            else:
                return [name + "__" + f for f in column]

        return [name + "__" + f for f in trans.get_feature_names()]
    
    # Start of processing
    feature_names = []
    
    # Pipelines are accepted too: their steps are converted to the 4-tuple layout
    # used for column transformers, with no column information (None).
    if type(column_transformer) == sklearn.pipeline.Pipeline:
        l_transformers = [(name, trans, None, None) for step, name, trans in column_transformer._iter()]
    else:
        # For column transformers, follow the original method
        l_transformers = list(column_transformer._iter(fitted=True))
    
    
    for name, trans, column, _ in l_transformers: 
        if type(trans) == sklearn.pipeline.Pipeline:
            # Recursive call on the nested pipeline
            _names = get_feature_names(trans)
            # If no step of the pipeline returned names, use the prefixed input columns
            if len(_names)==0:
                _names = [name + "__" + f for f in column]
            feature_names.extend(_names)
        else:
            feature_names.extend(get_names(trans))
    
    return feature_names

In [None]:
names = get_feature_names(full_processor)

# Modélisation

## Baseline

In [None]:
from sklearn import dummy 
from sklearn import metrics

In [None]:
my_dummy = dummy.DummyClassifier(strategy = 'most_frequent')

In [None]:
my_dummy.fit(X = X_train_imp, y = y_train_imp)

In [None]:
# Keep the probabilities in full precision: a float16 cast rounds the scores, which can
# create ties in the ROC curve and move values across the 0.5 threshold.
y_prob_dummy = my_dummy.predict_proba(X_test_pro)[:,1]

In [None]:
fpr_dummy, tpr_dummy, th_dummy = metrics.roc_curve(y_test, y_prob_dummy)
roc_auc_dummy = metrics.auc(fpr_dummy, tpr_dummy)
print(roc_auc_dummy)

In [None]:
sns.set(rc={'figure.figsize':(4,8)})
plt.plot(fpr_dummy, tpr_dummy ,linestyle='--', label = "Dummy AUC %0.2f" % roc_auc_dummy)
# Without a legend the label (and the AUC it carries) is never displayed
plt.legend()

In [None]:
cm = metrics.confusion_matrix(y_true= y_test, y_pred = (y_prob_dummy >= 0.5).astype(int) )
disp = metrics.ConfusionMatrixDisplay(confusion_matrix = cm)
sns.set(rc={'figure.figsize':(4,8)})
disp.plot()

## Régression Logistique

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn import svm, datasets
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, precision_recall_curve, f1_score, roc_auc_score, roc_curve, confusion_matrix, recall_score
from datetime import datetime

In [None]:
log_reg = LogisticRegression(solver = 'liblinear')

### Imbalanced

In [None]:
log_reg.fit(X_train_pro, y_train_imb)

In [None]:
# Keep the probabilities in full precision: a float16 cast rounds the scores, which can
# create ties in the ROC curve and move values across the 0.5 threshold.
y_prob_log = log_reg.predict_proba(X_test_pro)[:,1]

In [None]:
y_pred_log = log_reg.predict(X_test_pro)

In [None]:
sns.histplot(y_prob_log)

In [None]:
fpr_log, tpr_log, th_log = metrics.roc_curve(y_test, y_prob_log)
roc_auc_log = metrics.auc(fpr_log, tpr_log)
print('ROC_AUC:', roc_auc_log)
accuracy_score_log = accuracy_score(y_test, y_pred_log)
print('Accuracy_score:', accuracy_score_log)
recall_log = recall_score(y_test, y_pred_log)
print('Recall:' , recall_log)
f1_log = f1_score(y_test, y_pred_log)
print('f1_score:' , f1_log)

In [None]:
plt.plot(fpr_log, tpr_log ,linestyle='--', label = "Log_grid AUC %0.2f" % roc_auc_log)
plt.plot(fpr_dummy, tpr_dummy ,linestyle='-', label = "Dummy AUC %0.2f" % roc_auc_dummy)
# Without a legend the two curves cannot be told apart and the AUC labels are never displayed
plt.legend()

In [None]:
cm = metrics.confusion_matrix(y_true=y_test, y_pred = (y_prob_log >= 0.5).astype(int) )
disp = metrics.ConfusionMatrixDisplay(confusion_matrix = cm)
sns.set(rc={'figure.figsize':(4,8)})
disp.plot()

### Balanced

In [None]:
start_time = datetime.now()
log_reg.fit(X_train_imp, y_train_imp)
end_time = datetime.now()
log_time = end_time - start_time

In [None]:
# Keep the probabilities in full precision: a float16 cast rounds the scores, which can
# create ties in the ROC curve and move values across the 0.5 threshold.
y_prob_log = log_reg.predict_proba(X_test_pro)[:,1]

In [None]:
y_pred_log = log_reg.predict(X_test_pro)

In [None]:
sns.histplot(y_prob_log)

In [None]:
fpr_log, tpr_log, th_log = metrics.roc_curve(y_test, y_prob_log)
roc_auc_log = metrics.auc(fpr_log, tpr_log)
print('ROC_AUC:', roc_auc_log)
accuracy_score_log = accuracy_score(y_test, y_pred_log)
print('Accuracy_score:', accuracy_score_log)
recall_log = recall_score(y_test, y_pred_log)
print('Recall:' , recall_log)
f1_log = f1_score(y_test, y_pred_log)
print('f1_score:' , f1_log)

In [None]:
plt.plot(fpr_log, tpr_log ,linestyle='--', label = "Log_grid AUC %0.2f" % roc_auc_log)
plt.plot(fpr_dummy, tpr_dummy ,linestyle='-', label = "Dummy AUC %0.2f" % roc_auc_dummy)
# Without a legend the two curves cannot be told apart and the AUC labels are never displayed
plt.legend()

In [None]:
cm = metrics.confusion_matrix(y_true=y_test, y_pred = (y_prob_log >= 0.5).astype(int) )
disp = metrics.ConfusionMatrixDisplay(confusion_matrix = cm)
sns.set(rc={'figure.figsize':(4,8)})
disp.plot()

### Feature importance globale

In [None]:
feature_imp = pd.DataFrame(names, columns = ['Feature'])
feature_imp['importance'] = log_reg.coef_[0]
feature_imp.sort_values(by = 'importance', ascending = False, inplace = True)
top = feature_imp.head(10)
down = feature_imp.tail(10)

In [None]:
df_imp = pd.concat([top, down])

In [None]:
df_imp.plot.barh(x = 'Feature' , y = 'importance')

### Feature importance locale

In [None]:
import lime
from lime import lime_tabular

In [None]:
explainer = lime_tabular.LimeTabularExplainer(
    training_data=np.array(X_train_imp),
    feature_names= names,
    class_names=['0', '1'],
    mode='classification')

In [None]:
exp = explainer.explain_instance(
    data_row=X_train_imp[7], 
    predict_fn=log_reg.predict_proba
)

exp.show_in_notebook(show_table=True)

## Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
rf = RandomForestClassifier()
start_time = datetime.now()
rf.fit(X_train_imp, y_train_imp)
end_time = datetime.now()
rf_time = end_time - start_time

In [None]:
# Keep the probabilities in full precision: a float16 cast rounds the scores, which can
# create ties in the ROC curve and move values across the 0.5 threshold.
y_prob_rf = rf.predict_proba(X_test_pro)[:,1]

In [None]:
y_pred_rf = rf.predict(X_test_pro)

In [None]:
sns.histplot(y_prob_rf)

In [None]:
fpr_rf, tpr_rf, th_rf = metrics.roc_curve(y_test, y_prob_rf)
roc_auc_rf = metrics.auc(fpr_rf, tpr_rf)
print('ROC_auc' , roc_auc_rf)
accuracy_score_rf = accuracy_score(y_test, y_pred_rf)
print('Accuracy_score:', accuracy_score_rf)
recall_rf = recall_score(y_test, y_pred_rf)
print('Recall:' , recall_rf)
f1_rf = f1_score(y_test, y_pred_rf)
print('f1_score:' , f1_rf)

In [None]:
plt.plot(fpr_rf, tpr_rf ,linestyle='--', label = "RF_grid AUC %0.2f" % roc_auc_rf)
plt.plot(fpr_log, tpr_log ,linestyle='--', label = "Log_grid AUC %0.2f" % roc_auc_log)
plt.plot(fpr_dummy, tpr_dummy ,linestyle='-', label = "Dummy AUC %0.2f" % roc_auc_dummy)
plt.legend()

In [None]:
cm = metrics.confusion_matrix(y_true=y_test, y_pred = (y_prob_rf >= 0.5).astype(int) )
disp = metrics.ConfusionMatrixDisplay(confusion_matrix = cm)
sns.set(rc={'figure.figsize':(4,8)})
disp.plot()

### Feature importance globale

In [None]:
importance = rf.feature_importances_
fi_rf = pd.concat((pd.DataFrame(names, columns = ['Variables']), 
                      pd.DataFrame(importance, columns = ['Importance'])), axis = 1).sort_values(by='Importance', ascending = False)

In [None]:
plt.figure(figsize=(8,8))
plt.title('Rf - Top 10 Features')
sns.barplot(y = fi_rf['Variables'].head(10),
            x = fi_rf['Importance'].head(10))
plt.show()

### Feature importance locale

In [None]:
exp = explainer.explain_instance(
    data_row=X_train_imp[7], 
    predict_fn=rf.predict_proba
)

exp.show_in_notebook(show_table=True)

## Xgboost

In [None]:
import xgboost as xgb

In [None]:
# The class is imported by name because this cell rebinds `xgb` (the module alias) to the
# classifier instance: calling xgb.XGBClassifier here would fail when the cell is re-run.
# The instance keeps the name `xgb` since the following cells call xgb.fit / xgb.predict.
from xgboost import XGBClassifier
xgb = XGBClassifier(use_label_encoder=False , eval_metric = 'logloss')

In [None]:
start_time = datetime.now()
xgb.fit(X_train_imp, y_train_imp)
end_time = datetime.now()
xgb_time = end_time - start_time

In [None]:
# Keep the probabilities in the precision returned by the model: a float16 cast rounds the
# scores, which can create ties in the ROC curve and move values across the 0.5 threshold.
y_prob_xgb = xgb.predict_proba(X_test_pro)[:,1]

In [None]:
y_pred_xgb = xgb.predict(X_test_pro)

In [None]:
sns.histplot(y_prob_xgb)

In [None]:
fpr_xgb, tpr_xgb, th_xgb = metrics.roc_curve(y_test, y_prob_xgb)
roc_auc_xgb = metrics.auc(fpr_xgb, tpr_xgb)
print('ROC_auc' , roc_auc_xgb)
accuracy_score_xgb = accuracy_score(y_test, y_pred_xgb)
print('Accuracy_score:', accuracy_score_xgb)
recall_xgb = recall_score(y_test, y_pred_xgb)
print('Recall:' , recall_xgb)
f1_xgb = f1_score(y_test, y_pred_xgb)
print('f1_score:' , f1_xgb)

In [None]:
plt.plot(fpr_xgb, tpr_xgb ,linestyle='--', label = "XGB_grid AUC %0.2f" % roc_auc_xgb)
plt.plot(fpr_rf, tpr_rf ,linestyle='--', label = "RF_grid AUC %0.2f" % roc_auc_rf)
plt.plot(fpr_log, tpr_log ,linestyle='--', label = "Log_grid AUC %0.2f" % roc_auc_log)
plt.plot(fpr_dummy, tpr_dummy ,linestyle='-', label = "Dummy AUC %0.2f" % roc_auc_dummy)
plt.legend()

In [None]:
cm = metrics.confusion_matrix(y_true=y_test, y_pred = (y_prob_xgb >= 0.5).astype(int) )
disp = metrics.ConfusionMatrixDisplay(confusion_matrix = cm)
sns.set(rc={'figure.figsize':(4,8)})
disp.plot()

### Feature importance globale

In [None]:
importance = xgb.feature_importances_
fi_xgb = pd.concat((pd.DataFrame(names, columns = ['Variables']), 
                      pd.DataFrame(importance, columns = ['Importance'])), axis = 1).sort_values(by='Importance', ascending = False)

In [None]:
plt.figure(figsize=(8,8))
plt.title('XGB - Top 10 Features')
sns.barplot(y = fi_xgb['Variables'].head(10),
            x = fi_xgb['Importance'].head(10))
plt.show()

### Feature importance locale

In [None]:
exp = explainer.explain_instance(
    data_row=X_train_imp[7], 
    predict_fn=xgb.predict_proba
)

exp.show_in_notebook(show_table=True)

# Comparaison des modèles

In [None]:
# One row per model, one column per metric.
# total_seconds() is used for the durations: timedelta.seconds only holds the whole-second
# component, so a fit shorter than one second would be reported as 0.
df_comparaison = pd.DataFrame({'Roc_auc' : [roc_auc_log,roc_auc_rf,roc_auc_xgb],
                               'Accuracy_score' :[accuracy_score_log,accuracy_score_rf,accuracy_score_xgb],
                               'Recall_score' : [recall_log,recall_rf,recall_xgb],
                              'F1_score' :[f1_log,f1_rf,f1_xgb],
                              'Duration' : [log_time.total_seconds(), rf_time.total_seconds(), xgb_time.total_seconds()]},
                              index = ['log','RF', 'XGB'])

In [None]:
df_comparaison

In [None]:
df_comparaison.plot.bar(subplots=True)

# Modèle retenu

In [None]:
def customScore(y_true, y_pred, fn_value=5, fp_value=10):
    """Business cost of a set of binary predictions (lower is better).

    Parameters
    ----------
    y_true : array-like
        Observed labels (0 or 1).
    y_pred : array-like
        Predicted labels (0 or 1).
    fn_value : number, default 5
        Cost of one false negative (observed 1, predicted 0).
    fp_value : number, default 10
        Cost of one false positive (observed 0, predicted 1).

    Returns
    -------
    number
        ``fp * fp_value + fn * fn_value``.
    """
    # scikit-learn puts the observed labels on the rows and the predictions on the columns,
    # so with labels=[0, 1] the flattened matrix reads (tn, fp, fn, tp). Forcing `labels`
    # also guarantees a 2x2 matrix when a cross-validation fold contains a single class.
    _tn, fp, fn, _tp = confusion_matrix(y_true, y_pred, labels=[0, 1]).ravel()

    # NOTE(review): the previous indexing swapped fp and fn, so the default weights were
    # effectively applied the other way round. Confirm that fn_value=5 / fp_value=10 are
    # the intended business costs now that the two counts are labelled correctly.
    return fp*fp_value + fn*fn_value

# Scorer for GridSearchCV: the cost has to be minimised, hence greater_is_better=False
my_scorer = metrics.make_scorer(customScore, greater_is_better=False)

## Log Reg

In [None]:
param_grid_log_reg = {'C': [0.01, 0.1, 1, 10, 100, 500 , 1000]}
grid_log_reg=GridSearchCV(LogisticRegression(solver = 'liblinear', max_iter= 3000),param_grid_log_reg ,scoring = my_scorer , cv = 4)
grid_log_reg.fit(X_train_imp,y_train_imp)

In [None]:
print(grid_log_reg.best_params_) 

In [None]:
# Build the final model from the hyper-parameters selected by the grid search instead of a
# hard-coded value. `grid_log_reg` is rebound to the plain estimator (as before), so the
# selected parameters are kept in their own variable to leave the cell re-runnable.
if hasattr(grid_log_reg, 'best_params_'):
    best_params_log_reg = grid_log_reg.best_params_
grid_log_reg = LogisticRegression(solver = 'liblinear', max_iter= 3000 , **best_params_log_reg)

In [None]:
grid_log_reg.fit(X_train_imp,y_train_imp)

In [None]:
grid_predictions_log_reg = grid_log_reg.predict(X_test_pro) 

In [None]:
# Keep the probabilities in full precision: a float16 cast rounds the scores, which can
# create ties in the ROC curve and move values across the 0.5 threshold.
grid_proba_log_reg = grid_log_reg.predict_proba(X_test_pro)[:,1]

In [None]:
sns.histplot(grid_proba_log_reg)

In [None]:
fpr_grid_log_reg, tpr_grid_log_reg, th_grid_log_reg = metrics.roc_curve(y_test, grid_proba_log_reg)
roc_auc_grid_log_reg = metrics.auc(fpr_grid_log_reg, tpr_grid_log_reg)
print('ROC_auc' , roc_auc_grid_log_reg)
accuracy_score_grid_log_reg = accuracy_score(y_test, grid_predictions_log_reg)
print('Accuracy_score:', accuracy_score_grid_log_reg)
recall_grid_log_reg = recall_score(y_test, grid_predictions_log_reg)
print('Recall:' , recall_grid_log_reg)
f1_grid_log_reg = f1_score(y_test, grid_predictions_log_reg)
print('f1_score:' , f1_grid_log_reg)

In [None]:
plt.plot(fpr_grid_log_reg, tpr_grid_log_reg ,linestyle='--', label = "Grid_log_reg_grid AUC %0.2f" % roc_auc_grid_log_reg)
plt.plot(fpr_xgb, tpr_xgb ,linestyle='--', label = "XGB_grid AUC %0.2f" % roc_auc_xgb)
plt.plot(fpr_rf, tpr_rf ,linestyle='--', label = "RF_grid AUC %0.2f" % roc_auc_rf)
plt.plot(fpr_log, tpr_log ,linestyle='--', label = "Log_grid AUC %0.2f" % roc_auc_log)
plt.plot(fpr_dummy, tpr_dummy ,linestyle='-', label = "Dummy AUC %0.2f" % roc_auc_dummy)
plt.legend()

In [None]:
cm = metrics.confusion_matrix(y_true=y_test, y_pred = (grid_proba_log_reg >= 0.5).astype(int) )
disp = metrics.ConfusionMatrixDisplay(confusion_matrix = cm)
sns.set(rc={'figure.figsize':(4,8)})
disp.plot()

### Feature importance globale

In [None]:
feature_imp = pd.DataFrame(names, columns = ['Feature'])
feature_imp['importance'] = grid_log_reg.coef_[0]
feature_imp.sort_values(by = 'importance', ascending = False, inplace = True)
top = feature_imp.head(10)
down = feature_imp.tail(10)

In [None]:
df_imp = pd.concat([top, down])

In [None]:
df_imp.plot.barh(x = 'Feature' , y = 'importance')

### Feature importance locale

In [None]:
explainer = lime_tabular.LimeTabularExplainer(
    training_data=np.array(X_train_imp),
    feature_names= names,
    class_names=['0', '1'],
    mode='classification')

In [None]:
exp = explainer.explain_instance(
    data_row=X_train_imp[7], 
    predict_fn=grid_log_reg.predict_proba
)

exp.show_in_notebook(show_table=True)

## Xgboost

In [None]:
import xgboost as xgb

In [None]:
parameters = {
    'n_estimators' : [100,500, 1000] }
grid_xgb = GridSearchCV(xgb.XGBClassifier(use_label_encoder=False),
                        param_grid = parameters,
                        cv = 4,
                        scoring = my_scorer)
grid_xgb.fit(X_train_imp, y_train_imp)

In [None]:
print(grid_xgb.best_params_) 

In [None]:
grid_predictions_xgb = grid_xgb.predict(X_test_pro) 

In [None]:
# Keep the probabilities in the precision returned by the model: a float16 cast rounds the
# scores, which can create ties in the ROC curve and move values across the 0.5 threshold.
grid_proba_xgb = grid_xgb.predict_proba(X_test_pro)[:,1]

In [None]:
sns.histplot(grid_proba_xgb)

In [None]:
fpr_grid_xgb, tpr_grid_xgb, th_grid_xgb = metrics.roc_curve(y_test, grid_proba_xgb)
roc_auc_grid_xgb = metrics.auc(fpr_grid_xgb, tpr_grid_xgb)
print('ROC_auc' , roc_auc_grid_xgb)
accuracy_score_grid_xgb = accuracy_score(y_test, grid_predictions_xgb)
print('Accuracy_score:', accuracy_score_grid_xgb)
recall_grid_xgb = recall_score(y_test, grid_predictions_xgb)
print('Recall:' , recall_grid_xgb)
f1_grid_xgb = f1_score(y_test, grid_predictions_xgb)
print('f1_score:' , f1_grid_xgb)

In [None]:
plt.plot(fpr_grid_xgb, tpr_grid_xgb ,linestyle='--', label = "Grid_xgb_grid AUC %0.2f" % roc_auc_grid_xgb)
plt.plot(fpr_grid_log_reg, tpr_grid_log_reg ,linestyle='--', label = "Grid_log_reg_grid AUC %0.2f" % roc_auc_grid_log_reg)
plt.plot(fpr_xgb, tpr_xgb ,linestyle='--', label = "XGB_grid AUC %0.2f" % roc_auc_xgb)
plt.plot(fpr_rf, tpr_rf ,linestyle='--', label = "RF_grid AUC %0.2f" % roc_auc_rf)
plt.plot(fpr_log, tpr_log ,linestyle='--', label = "Log_grid AUC %0.2f" % roc_auc_log)
plt.plot(fpr_dummy, tpr_dummy ,linestyle='-', label = "Dummy AUC %0.2f" % roc_auc_dummy)
plt.legend()

In [None]:
cm = metrics.confusion_matrix(y_true=y_test, y_pred = (grid_proba_xgb >= 0.5).astype(int) )
disp = metrics.ConfusionMatrixDisplay(confusion_matrix = cm)
sns.set(rc={'figure.figsize':(4,8)})
disp.plot()

## Préparation au déploiement

In [None]:
import pickle

In [None]:
# The context manager closes (and flushes) the file even if the serialisation fails
with open('model_credit.pkl', 'wb') as model_file:
    pickle.dump(grid_log_reg, model_file)