# Predicting Credit Default

In [None]:
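# Install the third-party dependencies if they are missing
# (the `imblearn` module is distributed on PyPI as `imbalanced-learn`):
# pip install imbalanced-learn
# pip install xgboost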

In [None]:
import pandas as pd

# Ignore every FutureWarning raised from this point on.
# NOTE(review): this also hides deprecation notices from pandas, scikit-learn
# and xgboost that may point at code in this notebook that needs updating.
from warnings import simplefilter
simplefilter(action='ignore', category=FutureWarning)

from imblearn.over_sampling import SMOTE

from sklearn.preprocessing import StandardScaler #OneHotEncoder,
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
#LabelEncoder, label_binarize, StandardScaler, MinMaxScaler

from collections import defaultdict

from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score

import xgboost as xgb
import matplotlib.pyplot as plt

In [None]:
# Random seed passed as random_state to the train/test split, SMOTE and the
# XGBoost classifier in the cells below.
# Original author's note for seed 1234: "credit, age, duration"
# (meaning not stated; presumably the top features observed with this seed).
seed = 1234

In [None]:
# Data source:
# https://archive.ics.uci.edu/ml/datasets/statlog+(german+credit+data)

# Labels for the 21 columns of german.data: 20 attributes followed by the
# "credit" target. They are supplied explicitly through `names=`.
col_names = [
    "checking_account",
    "duration",
    "credit_history",
    "purpose",
    "credit_amount",
    "savings",
    "employment_since",
    "installment_rate",
    "status",
    "debtors_guarantors",
    "residence",
    "property",
    "age",
    "other_installments",
    "housing",
    "credits",
    "job",
    "dependents",
    "telephone",
    "foreign_worker",
    "credit",
]
# Second name for the same list, kept from the original cell.
names = col_names

# Fields in the file are separated by a single space.
data_df = pd.read_csv("german.data", sep=" ", names=col_names)
data_df.head()

## Feature engineering

In [None]:
# Remap the target attribute: 1 - good credit, 0 - bad credit.
# The result is assigned back to the column on purpose. Calling
# replace(..., inplace=True) on data_df["credit"] acts on an intermediate
# Series: pandas has deprecated that pattern, and with Copy-on-Write enabled
# it leaves data_df unchanged, so the raw 1/2 codes would reach the model.
data_df["credit"] = data_df["credit"].replace([1, 2], [1, 0])

# Numeric attributes; these are standardized below.
num_attr_names = ["duration", "credit_amount", "installment_rate", "residence",
                  "age", "credits", "dependents"]

# Categorical attributes; these are encoded in a later cell.
cat_attr_names = ["checking_account", "credit_history", "purpose", "savings", "employment_since",
                  "status", "debtors_guarantors", "property", "other_installments", "housing",
                  "job", "telephone", "foreign_worker"]

# Standardize the numeric attributes and keep their column names.
# NOTE(review): the scaler is fitted on all rows, before the train/test split
# further down, so the held-out rows contribute to the scaling statistics.
# Fitting on the training rows only would avoid that leakage.
num_attr_norm = pd.DataFrame(StandardScaler().fit_transform(data_df[num_attr_names]),
                             columns=num_attr_names)

num_attr_norm.head()

In [None]:
# Integer-encode each categorical column with its own LabelEncoder; the fitted
# encoders are kept in `dd`, keyed by column name.
# NOTE(review): neither `dd` nor `cat_attr` is read by the other cells shown in
# this notebook; the model is trained on the one-hot columns built below.
dd = defaultdict(LabelEncoder)

cat_attr = data_df[cat_attr_names].apply(lambda col: dd[col.name].fit_transform(col))

# One-hot encode the categorical attributes.
# The dtype is pinned because the get_dummies default changed from uint8 to
# bool in pandas 2.0; without it the feature matrix handed to SMOTE and
# XGBoost would have different dtypes depending on the installed pandas.
cat_attr_dummy = pd.get_dummies(data_df[cat_attr_names], dtype="uint8")

cat_attr_dummy.head()

In [None]:
# Assemble the modelling table side by side: one-hot categorical columns,
# standardized numeric columns, then the "credit" target as the last column.
clean_df = pd.concat(
    [cat_attr_dummy, num_attr_norm, data_df["credit"]],
    axis="columns",
)
clean_df.head()

In [None]:
# Features: every column except the target. Target: the "credit" column.
X = clean_df.drop(columns="credit")
y = clean_df["credit"]

In [None]:
# Hold out 20% of the rows for testing; random_state=seed makes the split
# repeatable.
# NOTE(review): no `stratify=` is passed, so the class ratio may differ
# between the training and test parts.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=seed)
# Show the class counts of the training target before oversampling.
y_train.value_counts()

In [None]:
# Oversample with SMOTE using the training split only; X_test / y_test are
# not touched. X_train and y_train are rebound to the resampled data.
oversample = SMOTE(random_state=seed)
X_train, y_train = oversample.fit_resample(X_train, y_train)

In [None]:
# Class counts of the training target after resampling.
y_train.value_counts()

## Model training

In [None]:
# XGBoost classifier with a logistic objective, seeded with `seed`.
# Exercise: change max_depth from 15 to 5 and look at the feature importance
# plot again.
xgc = xgb.XGBClassifier(
    objective="binary:logistic",
    n_estimators=500,
    max_depth=15,
    base_score=0.5,
    eval_metric="logloss",
    use_label_encoder=False,
    random_state=seed,
)
# Fit on the resampled training data.
xgc.fit(X_train, y_train)

In [None]:
# Predict class labels for the held-out test features.
y_pred = xgc.predict(X_test)

In [None]:
# Score the test-set predictions.
# scikit-learn metrics take (y_true, y_pred) in that order. Accuracy and the
# binary F1 score give the same number either way, but keeping the documented
# order avoids wrong results if an order-sensitive metric (e.g. precision or
# recall) is added here later.
print("Accuracy: %.2f" % accuracy_score(y_test, y_pred))
print("F1 score: %.2f" % f1_score(y_test, y_pred))

## Feature importance

In [None]:
# Plot at most 10 features of the fitted model, using the "cover" importance
# type, without printing the numeric values next to the bars.
# The trailing semicolon keeps the notebook from echoing the returned object.
xgb.plot_importance(xgc, importance_type="cover", max_num_features=10, show_values=False);