# Problem description (and imports)

In [40]:
#EDA imports
import pandas as pd 
import numpy as np
import seaborn as sns

#Imbalancing imports
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from collections import Counter

#Preprocessing & model selection imports
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_validate
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import RobustScaler
from sklearn.pipeline import make_pipeline
from sklearn.pipeline import make_union
from sklearn.compose import make_column_transformer

# model imports
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from xgboost import XGBClassifier
from sklearn.svm import SVC


disponible à ce lien : https://archive-beta.ics.uci.edu/dataset/350/default+of+credit+card+clients

In [2]:
# Attribute information
# This research employed a binary variable, default payment (Yes = 1, No = 0), as the response variable. 
# This study reviewed the literature and used the following 23 variables as explanatory variables:
    
# X1: Amount of the given credit (NT dollar): it includes both the individual consumer credit and his/her family (supplementary) credit.
# X2: Gender (1 = male; 2 = female).
# X3: Education (1 = graduate school; 2 = university; 3 = high school; 4 = others).
# X4: Marital status (1 = married; 2 = single; 3 = others).
# X5: Age (year).
# X6 - X11: History of past payment. We tracked the past monthly payment records (from April to September, 2005) as follows: 
# X6 = the repayment status in September, 2005; 
# X7 = the repayment status in August, 2005; . . .;
# X11 = the repayment status in April, 2005. 

# The measurement scale for the repayment status is: -1 = pay duly; 1 = payment delay for one month; 
# 2 = payment delay for two months; . . .; 8 = payment delay for eight months; 9 = payment delay for nine months and above.

# X12-X17: Amount of bill statement (NT dollar). 
# X12 = amount of bill statement in September, 2005; 
# X13 = amount of bill statement in August, 2005; . . .; 
# X17 = amount of bill statement in April, 2005. 

# X18-X23: Amount of previous payment (NT dollar). 
# X18 = amount paid in September, 2005; 
# X19 = amount paid in August, 2005; . . .;
# X23 = amount paid in April, 2005.

In [3]:
path = "data/default_credit.xls"

In [4]:
df = pd.read_excel(path,header=1)
df.columns = df.columns.str.lower()
df.rename(columns={"default payment next month":"default_payment_next_month"},inplace=True)

# EDA & Data cleaning

In [5]:
df.set_index("id",inplace=True)
df.rename(columns={"pay_0" : "pay_1"},inplace=True)

In [6]:
df.head(3)

Unnamed: 0_level_0,limit_bal,sex,education,marriage,age,pay_1,pay_2,pay_3,pay_4,pay_5,...,bill_amt4,bill_amt5,bill_amt6,pay_amt1,pay_amt2,pay_amt3,pay_amt4,pay_amt5,pay_amt6,default_payment_next_month
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,20000,2,2,1,24,2,2,-1,-1,-2,...,0,0,0,0,689,0,0,0,0,1
2,120000,2,2,2,26,-1,2,0,0,0,...,3272,3455,3261,0,1000,1000,1000,0,2000,1
3,90000,2,2,2,34,0,0,0,0,0,...,14331,14948,15549,1518,1500,1000,1000,1000,5000,0


In [7]:
df.info() ## ==> only numeric (int) values

<class 'pandas.core.frame.DataFrame'>
Int64Index: 30000 entries, 1 to 30000
Data columns (total 24 columns):
 #   Column                      Non-Null Count  Dtype
---  ------                      --------------  -----
 0   limit_bal                   30000 non-null  int64
 1   sex                         30000 non-null  int64
 2   education                   30000 non-null  int64
 3   marriage                    30000 non-null  int64
 4   age                         30000 non-null  int64
 5   pay_1                       30000 non-null  int64
 6   pay_2                       30000 non-null  int64
 7   pay_3                       30000 non-null  int64
 8   pay_4                       30000 non-null  int64
 9   pay_5                       30000 non-null  int64
 10  pay_6                       30000 non-null  int64
 11  bill_amt1                   30000 non-null  int64
 12  bill_amt2                   30000 non-null  int64
 13  bill_amt3                   30000 non-null  int64
 14  bill_a

Context
It is important that credit card companies are able to recognize fraudulent credit card transactions so that customers are not charged for items that they did not purchase.

In [8]:
df.isnull().sum() ## ==> No null values

limit_bal                     0
sex                           0
education                     0
marriage                      0
age                           0
pay_1                         0
pay_2                         0
pay_3                         0
pay_4                         0
pay_5                         0
pay_6                         0
bill_amt1                     0
bill_amt2                     0
bill_amt3                     0
bill_amt4                     0
bill_amt5                     0
bill_amt6                     0
pay_amt1                      0
pay_amt2                      0
pay_amt3                      0
pay_amt4                      0
pay_amt5                      0
pay_amt6                      0
default_payment_next_month    0
dtype: int64

In [9]:
df.duplicated().sum() ## ==> 35 duplicates to drop
df.drop_duplicates(inplace=True)

In [10]:
df.pay_1.value_counts().head()
# ==> a quick view shows us that many "pay" column displays a "-2" value despite the description (see above)
# tells us the best rating is "-1". For this reason, we chose to convert the -2 into -1 and, in case,
# if there's a value above 9, we're replacing it with 9

 0    14737
-1     5682
 1     3667
-2     2750
 2     2666
Name: pay_1, dtype: int64

In [11]:
df.pay_1 = df.pay_1.apply(lambda x: -1 if x == -2 else x)
df.pay_2 = df.pay_2.apply(lambda x: -1 if x == -2 else x)
df.pay_3 = df.pay_3.apply(lambda x: -1 if x == -2 else x)
df.pay_4 = df.pay_4.apply(lambda x: -1 if x == -2 else x)
df.pay_5 = df.pay_5.apply(lambda x: -1 if x == -2 else x)
df.pay_6 = df.pay_6.apply(lambda x: -1 if x == -2 else x)

In [12]:
# We face the same issue with education and marriage
df.education.value_counts(), df.marriage.value_counts()

(2    14019
 1    10563
 3     4915
 5      280
 4      123
 6       51
 0       14
 Name: education, dtype: int64,
 2    15945
 1    13643
 3      323
 0       54
 Name: marriage, dtype: int64)

In [13]:

# education is supposed to be restricted between 1 and 4 but we have 7 differents values
# to dig further but in absence of info, we chose to transform all "irrelevant" values into 4 (meaning "other")

# marriage is supposed to be 1,2 or 3 (for others). We choose to replace the "0" with 3 for the same reasons.

df.marriage = df.marriage.apply(lambda x: 3 if x == 0 else x)
df.education = df.education.map({0:4 , 1:1, 2:2, 3:3, 4:4 , 5:4 , 6:4})

In [14]:
# Creating X and y before
X = df.drop(columns="default_payment_next_month")
y = df.default_payment_next_month

In [15]:
y.value_counts(normalize=False)

0    23335
1     6630
Name: default_payment_next_month, dtype: int64

In [16]:
y.value_counts(normalize=True)

0    0.778742
1    0.221258
Name: default_payment_next_month, dtype: float64

In [17]:
# ==> Clean dataset but we have to deal the imbalancing

# Dealing with the imbalanced dataset

In [18]:
# The idea, here, is to both apply over and undersampling in order to limit the potential overfitting (oversample)
# and the loss of informations (undersample)

In [19]:
# instantiating over and under sampler
over = RandomOverSampler(sampling_strategy=0.5)
under = RandomUnderSampler(sampling_strategy=0.8)
# first performing oversampling to minority class
X_over, y_over = over.fit_resample(X, y)
print(f"Oversampled: {Counter(y_over)}")

Oversampled: Counter({0: 23335, 1: 11667})


In [20]:
# now to comine under sampling 
X_combined_sampling, y_combined_sampling = under.fit_resample(X_over, y_over)
print(f"Combined Random Sampling: {Counter(y_combined_sampling)}")

Combined Random Sampling: Counter({0: 14583, 1: 11667})


In [21]:
y_combined_sampling.value_counts(normalize=True)

0    0.555543
1    0.444457
Name: default_payment_next_month, dtype: float64

In [22]:
# ==> We went from a 78/22 imbalanced dataset to a 55/45 splitted resampled dataset

# Preprocessing & model selection

In [23]:
X_train, X_test, y_train, y_test = train_test_split(X_combined_sampling,y_combined_sampling,
                                                    test_size=0.3,random_state=42,stratify=y_combined_sampling)

# Addind stratify to be sure to maintain a 55/45 (and not less) ratio in our subsets

## Creating our preprocessor

In [24]:
# We are going to create a preprocessor pipeline with different scaler because our features have differents values
# small_values include sex, education, marriage, and pay from 0 to 6. 
# As a reminder, no missing values so no need to impute

small_values = ["sex","education", "marriage", "pay_1","pay_2","pay_3","pay_4","pay_5","pay_6"]
age_column = ["age"]
limit_column = ["limit_bal"]
bill_column = ["bill_amt1", "bill_amt2", "bill_amt3", "bill_amt4","bill_amt5","bill_amt6"]
pay_amount_column = ["pay_amt1","pay_amt2","pay_amt3","pay_amt4","pay_amt5","pay_amt6"]

In [25]:
num_small_values = make_pipeline(RobustScaler())
num_age = make_pipeline(RobustScaler())
num_limit = make_pipeline(RobustScaler())
num_bill = make_pipeline(RobustScaler())
num_pay_amount = make_pipeline(RobustScaler())

In [26]:
preproc = make_column_transformer((num_small_values,small_values),
                                        (num_age, age_column),
                                        (num_limit, limit_column),
                                        (num_bill, bill_column),
                                        (num_pay_amount, pay_amount_column),
                                        remainder='passthrough')

In [27]:
preproc

## Testing various model through GridSearch cross validation

In [28]:
# At this stage, we consider every feature is important as we have no business insights
# Hence, we decide not to drop any features.
# We're choosing the f1 as scoring metric as we have now a slightly imbalanced dataset that derived from a more
# heavilt imbalanced dataset

### Logistic Regression

In [66]:
model_logistic = make_pipeline(preproc,LogisticRegression())
model_logistic.get_params()

{'memory': None,
 'steps': [('columntransformer',
   ColumnTransformer(remainder='passthrough',
                     transformers=[('pipeline-1',
                                    Pipeline(steps=[('robustscaler',
                                                     RobustScaler())]),
                                    ['sex', 'education', 'marriage', 'pay_1',
                                     'pay_2', 'pay_3', 'pay_4', 'pay_5',
                                     'pay_6']),
                                   ('pipeline-2',
                                    Pipeline(steps=[('robustscaler',
                                                     RobustScaler())]),
                                    ['age']),
                                   ('pipeline-3',
                                    Pipeline(steps=[('robustscaler',
                                                     RobustScaler())]),
                                    ['limit_bal']),
                                  

In [62]:
logistic_params = {"logisticregression__C" : [0.1,0.5,1,5],
                  "logisticregression__solver" : ["saga","lbfgs"],
                  "logisticregression__max_iter" : [1000]}

grid_logistic = GridSearchCV(model_logistic,param_grid=logistic_params,scoring="f1",n_jobs=-1,cv=5)

grid_logistic.fit(X_train,y_train)

In [67]:
best_logistic_model = grid_logistic.best_estimator_
best_logistic_model.score(X_test,y_test)

0.7127619047619047

### KNN

In [69]:
model_knn = make_pipeline(preproc,KNeighborsClassifier())
model_knn.get_params()

{'memory': None,
 'steps': [('columntransformer',
   ColumnTransformer(remainder='passthrough',
                     transformers=[('pipeline-1',
                                    Pipeline(steps=[('robustscaler',
                                                     RobustScaler())]),
                                    ['sex', 'education', 'marriage', 'pay_1',
                                     'pay_2', 'pay_3', 'pay_4', 'pay_5',
                                     'pay_6']),
                                   ('pipeline-2',
                                    Pipeline(steps=[('robustscaler',
                                                     RobustScaler())]),
                                    ['age']),
                                   ('pipeline-3',
                                    Pipeline(steps=[('robustscaler',
                                                     RobustScaler())]),
                                    ['limit_bal']),
                                  

In [70]:
knn_params = {"kneighborsclassifier__n_neighbors" : [2,5,8,10,12,15],
                  "kneighborsclassifier__leaf_size" : [20,30,50,60]}

grid_knn = GridSearchCV(model_knn,param_grid=knn_params,scoring="f1",n_jobs=-1,cv=5)

grid_knn.fit(X_train,y_train)

In [71]:
best_knn_model = grid_knn.best_estimator_
best_knn_model.score(X_test,y_test)

0.7041269841269842

### RandomForest

In [76]:
model_randomforest = make_pipeline(preproc,RandomForestClassifier())
model_randomforest.get_params()

{'memory': None,
 'steps': [('columntransformer',
   ColumnTransformer(remainder='passthrough',
                     transformers=[('pipeline-1',
                                    Pipeline(steps=[('robustscaler',
                                                     RobustScaler())]),
                                    ['sex', 'education', 'marriage', 'pay_1',
                                     'pay_2', 'pay_3', 'pay_4', 'pay_5',
                                     'pay_6']),
                                   ('pipeline-2',
                                    Pipeline(steps=[('robustscaler',
                                                     RobustScaler())]),
                                    ['age']),
                                   ('pipeline-3',
                                    Pipeline(steps=[('robustscaler',
                                                     RobustScaler())]),
                                    ['limit_bal']),
                                  

In [81]:
randomforest_params = {'randomforestclassifier__max_features': ["sqrt",None],
                    'randomforestclassifier__min_samples_leaf': [1,3, 5],
                    'randomforestclassifier__n_estimators': [100, 500, 1000]}

grid_randomforest = GridSearchCV(model_randomforest,param_grid=randomforest_params,scoring="f1",n_jobs=-1,cv=5)

grid_randomforest.fit(X_train,y_train)



In [82]:
best_randomforest_model = grid_randomforest.best_estimator_
best_randomforest_model.score(X_test,y_test)

0.8458412698412698

### GradientBoosting

In [84]:
model_gradient = make_pipeline(preproc,GradientBoostingClassifier())
model_gradient.get_params()

{'memory': None,
 'steps': [('columntransformer',
   ColumnTransformer(remainder='passthrough',
                     transformers=[('pipeline-1',
                                    Pipeline(steps=[('robustscaler',
                                                     RobustScaler())]),
                                    ['sex', 'education', 'marriage', 'pay_1',
                                     'pay_2', 'pay_3', 'pay_4', 'pay_5',
                                     'pay_6']),
                                   ('pipeline-2',
                                    Pipeline(steps=[('robustscaler',
                                                     RobustScaler())]),
                                    ['age']),
                                   ('pipeline-3',
                                    Pipeline(steps=[('robustscaler',
                                                     RobustScaler())]),
                                    ['limit_bal']),
                                  

In [86]:
gradient_params = {'gradientboostingclassifier__n_estimators': [50,150],
                    "gradientboostingclassifier__loss": ["log_loss",  "exponential"],
                    'gradientboostingclassifier__n_estimators': [100, 500]}

grid_gradient = GridSearchCV(model_gradient,param_grid=gradient_params,scoring="f1",n_jobs=-1,cv=5)

grid_gradient.fit(X_train,y_train)

In [87]:
best_gradientboosting_model = grid_gradient.best_estimator_
best_gradientboosting_model.score(X_test,y_test)

0.7490793650793651

### AdaBoost

In [88]:
model_adaboost = make_pipeline(preproc,AdaBoostClassifier())
model_adaboost.get_params()

{'memory': None,
 'steps': [('columntransformer',
   ColumnTransformer(remainder='passthrough',
                     transformers=[('pipeline-1',
                                    Pipeline(steps=[('robustscaler',
                                                     RobustScaler())]),
                                    ['sex', 'education', 'marriage', 'pay_1',
                                     'pay_2', 'pay_3', 'pay_4', 'pay_5',
                                     'pay_6']),
                                   ('pipeline-2',
                                    Pipeline(steps=[('robustscaler',
                                                     RobustScaler())]),
                                    ['age']),
                                   ('pipeline-3',
                                    Pipeline(steps=[('robustscaler',
                                                     RobustScaler())]),
                                    ['limit_bal']),
                                  

In [89]:
adaboost_params = {'adaboostclassifier__learning_rate': [0.01,0.1,1,10],
                    "adaboostclassifier__n_estimators": [50,100,500]}

grid_adaboost = GridSearchCV(model_adaboost,param_grid=adaboost_params,scoring="f1",n_jobs=-1,cv=5)

grid_adaboost.fit(X_train,y_train)

In [90]:
best_adaboost_model = grid_adaboost.best_estimator_
best_adaboost_model.score(X_test,y_test)

0.7243174603174604

### XGBoost

In [91]:
model_xgboost = make_pipeline(preproc,XGBClassifier())
model_xgboost.get_params()

{'memory': None,
 'steps': [('columntransformer',
   ColumnTransformer(remainder='passthrough',
                     transformers=[('pipeline-1',
                                    Pipeline(steps=[('robustscaler',
                                                     RobustScaler())]),
                                    ['sex', 'education', 'marriage', 'pay_1',
                                     'pay_2', 'pay_3', 'pay_4', 'pay_5',
                                     'pay_6']),
                                   ('pipeline-2',
                                    Pipeline(steps=[('robustscaler',
                                                     RobustScaler())]),
                                    ['age']),
                                   ('pipeline-3',
                                    Pipeline(steps=[('robustscaler',
                                                     RobustScaler())]),
                                    ['limit_bal']),
                                  

In [94]:
xgb_params = {'xgbclassifier__learning_rate': [0.1,0.3,0.5],
            "xgbclassifier__max_depth": [4,6,8],
             "xgbclassifier__n_estimators" : [100,500]}

grid_xgb = GridSearchCV(model_xgboost,param_grid=xgb_params,scoring="f1",n_jobs=-1,cv=5)

grid_xgb.fit(X_train,y_train)

In [93]:
best_xgb_model = grid_xgb.best_estimator_
best_xgb_model.score(X_test,y_test)

0.811047619047619

### SVC

In [96]:
model_svc = make_pipeline(preproc,SVC())
model_svc.get_params()

{'memory': None,
 'steps': [('columntransformer',
   ColumnTransformer(remainder='passthrough',
                     transformers=[('pipeline-1',
                                    Pipeline(steps=[('robustscaler',
                                                     RobustScaler())]),
                                    ['sex', 'education', 'marriage', 'pay_1',
                                     'pay_2', 'pay_3', 'pay_4', 'pay_5',
                                     'pay_6']),
                                   ('pipeline-2',
                                    Pipeline(steps=[('robustscaler',
                                                     RobustScaler())]),
                                    ['age']),
                                   ('pipeline-3',
                                    Pipeline(steps=[('robustscaler',
                                                     RobustScaler())]),
                                    ['limit_bal']),
                                  

In [97]:
svc_params = {'svc__C': [0.1,1,10],
            "svc__kernel": ["rbf","linear"],
             "svc__class_weight" : [None,"balanced"]}

grid_svc = GridSearchCV(model_svc , param_grid=svc_params , scoring="f1" , n_jobs=-1 , cv=5)

grid_svc.fit(X_train,y_train)



In [99]:
best_svc_model = grid_svc.best_estimator_
best_svc_model.score(X_test,y_test)

0.72

# Ranking our fine-tuned models

In [106]:
models = [best_adaboost_model,best_gradientboosting_model,best_knn_model,best_logistic_model,
          best_randomforest_model,best_svc_model,best_xgb_model]

models_names = ["AdaBoost",
                "GradientBoosting",
                "KNN",
                "Logistic",
                "RandomForest",
                "SVC",
                "XGB"]

In [107]:
different_test_scores = []

for model in models:
    temp_score = model.score(X_test,y_test)
    different_test_scores.append(temp_score)
    
comparing_models = pd.DataFrame(list(zip(models_names, different_test_scores)),
                                                columns =['fine_tuned_model', 'test_score'])

round(comparing_models.sort_values(by = "test_score", ascending = False),2)

Unnamed: 0,fine_tuned_model,test_score
4,RandomForest,0.85
6,XGB,0.81
1,GradientBoosting,0.75
0,AdaBoost,0.72
5,SVC,0.72
3,Logistic,0.71
2,KNN,0.7


In [108]:
# ==> a quick GridSearch on RandomForest confirmed it was the most fitted model to our problem
# Overall, a 0.85 F1 score seems really good
# We could further GridSearch a RandomForest model but for computional reasons, we won't