# German Credit Score

In [129]:
import pandas as pd
import sklearn.preprocessing as preprocessing
import numpy  as np
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import auc, confusion_matrix
from sklearn import metrics
from lightgbm import LGBMClassifier as lgb
from sklearn.model_selection import GridSearchCV
from sklearn import datasets
from lightgbm import Dataset

In [130]:
df_raw = pd.read_csv('german_credit.csv')

In [131]:
def age_group(age):
    if age < 20:
        return 'Teenager'
    if age < 35:
        return 'Productive'
    if age < 50:
        return 'Mature'
    else :
         return 'Old'

In [132]:
df_raw['credit_per_person'] = df_raw['credit_amount']/df_raw['people_under_maintenance']
df_raw['age_group'] = df_raw['age'].apply(age_group)

In [133]:
# Create x, where x the 'scores' column's values as floats
x = df_raw[['credit_amount']].values.astype(float)

# Create a minimum and maximum processor object
min_max_scaler = preprocessing.MinMaxScaler()

# Create an object to transform the data to fit minmax processor
x_scaled = min_max_scaler.fit_transform(x)

# Run the normalizer on the dataframe
df_raw['credit_amount_norm'] = pd.DataFrame(x_scaled)
df_raw['age_norm']= pd.DataFrame(min_max_scaler.fit_transform(df_raw[['age']].values.astype(float)))
df_raw['duration_in_month']= pd.DataFrame(min_max_scaler.fit_transform(df_raw[['duration_in_month']].values.astype(float)))

In [134]:
df_raw['credit_amount_norm'].head(2)

0    0.050567
1    0.313690
Name: credit_amount_norm, dtype: float64

In [135]:
numeric_data = df.select_dtypes(include=[np.number]).columns

In [136]:
df_raw[numeric_data].head()

Unnamed: 0,default,duration_in_month,credit_amount,installment_as_income_perc,present_res_since,age,credits_this_bank,people_under_maintenance
0,0,0.029412,1169,4,4,67,2,1
1,1,0.647059,5951,2,2,22,1,1
2,0,0.117647,2096,2,3,49,1,2
3,0,0.558824,7882,2,4,45,1,2
4,1,0.294118,4870,3,4,53,2,2


In [221]:
df_prepare = df_raw

target = df_prepare['default']
temp = df_prepare.drop(['default','credit_amount','age','installment_as_income_perc','personal_status_sex','telephone'],axis=1)

In [222]:
dataset_dummies = temp.select_dtypes(exclude=['int','int64','float64'])
dataset_int = temp.select_dtypes(include = ['int','int64','float64'] )


dataset_dummies_2 = pd.get_dummies(dataset_dummies, drop_first=True)
df_model = pd.concat([dataset_int,dataset_dummies_2,target], axis = 1 )

x = df_model.iloc[:,0:len(df_model.columns)-1]
y = df_model.iloc[:,len(df_model.columns)-1]


x_train, x_test, y_train, y_test = train_test_split(x,y,test_size=0.25, random_state = 42,stratify=y)



In [223]:
# Logistic_regression = LogisticRegression()
# Logistic_regression.fit(x_train,y_train)

# y_pred_train = Logistic_regression.predict(x_train)
# y_pred_test = Logistic_regression.predict(x_test)


In [224]:
# fpr_train, tpr_train, threshold = metrics.roc_curve(y_train,y_pred_train)
# fpr_test, tpr_test, threshold = metrics.roc_curve(y_test,y_pred_test)

In [225]:
# print('AUC train = {} '.format(metrics.auc(fpr_train,tpr_train)))
# print('AUC train = {} '.format(metrics.auc(fpr_test,tpr_test)))

In [226]:
import lightgbm as lgb

In [227]:
np.random.seed(42)
train_data = lgb.Dataset(x_train, label=y_train)
test_data = lgb.Dataset(x_test, label=y_test)

parameters = {
        'application': 'binary',
        'objective': 'binary',
        'metric': 'auc',
        'is_unbalance': 'true',
        'boosting': 'gbdt',
        'num_leaves': 31,
        'feature_fraction': 0.5,
        'bagging_fraction': 0.5,
        'bagging_freq': 20,
        'learning_rate': 0.05,
        'verbose': 0
}

lgb_model = lgb.train(parameters,
                           train_data,
                           valid_sets=test_data,
                           num_boost_round=5000,
                           early_stopping_rounds=100)

[1]	valid_0's auc: 0.729105
Training until validation scores don't improve for 100 rounds
[2]	valid_0's auc: 0.747429
[3]	valid_0's auc: 0.755581
[4]	valid_0's auc: 0.757371
[5]	valid_0's auc: 0.759543
[6]	valid_0's auc: 0.763695
[7]	valid_0's auc: 0.758705
[8]	valid_0's auc: 0.76819
[9]	valid_0's auc: 0.771695
[10]	valid_0's auc: 0.774743
[11]	valid_0's auc: 0.770476
[12]	valid_0's auc: 0.77021
[13]	valid_0's auc: 0.771505
[14]	valid_0's auc: 0.773638
[15]	valid_0's auc: 0.773486
[16]	valid_0's auc: 0.776914
[17]	valid_0's auc: 0.779733
[18]	valid_0's auc: 0.779429
[19]	valid_0's auc: 0.778819
[20]	valid_0's auc: 0.779352
[21]	valid_0's auc: 0.782857
[22]	valid_0's auc: 0.784457
[23]	valid_0's auc: 0.787048
[24]	valid_0's auc: 0.787886
[25]	valid_0's auc: 0.786514
[26]	valid_0's auc: 0.784152
[27]	valid_0's auc: 0.785371
[28]	valid_0's auc: 0.786362
[29]	valid_0's auc: 0.783924
[30]	valid_0's auc: 0.785295
[31]	valid_0's auc: 0.785067
[32]	valid_0's auc: 0.786133
[33]	valid_0's auc: 0

In [228]:
lgb_model.predict(x_test)

array([0.73672241, 0.10206547, 0.13792675, 0.37109607, 0.31594598,
       0.62834555, 0.48190296, 0.82199085, 0.18769573, 0.40960829,
       0.19451499, 0.2252872 , 0.74700976, 0.66907954, 0.80356982,
       0.21103454, 0.47106518, 0.79445115, 0.56560076, 0.24153328,
       0.79078361, 0.06770179, 0.75084096, 0.23679056, 0.39359045,
       0.56477367, 0.17181013, 0.70489048, 0.77217944, 0.34109945,
       0.42256787, 0.30911828, 0.29925131, 0.59461588, 0.21771415,
       0.40659097, 0.24494424, 0.28521574, 0.11952574, 0.84629622,
       0.07025587, 0.36065545, 0.80257671, 0.68821656, 0.55970854,
       0.37507096, 0.19954464, 0.87122561, 0.53126872, 0.68837432,
       0.17296249, 0.87242748, 0.64433643, 0.64627546, 0.70165627,
       0.24699489, 0.6049088 , 0.55124246, 0.58772007, 0.37409598,
       0.16806433, 0.59323725, 0.61717154, 0.63314479, 0.55219152,
       0.14305841, 0.25023585, 0.24768719, 0.07872482, 0.29621863,
       0.84738838, 0.08287122, 0.16812539, 0.35057756, 0.32888

In [235]:
print("AUC Train : {}".format(metrics.roc_auc_score(y_train,lgb_model.predict(x_train))))
print("AUC Test : {}".format(metrics.roc_auc_score(y_test,lgb_model.predict(x_test))))

AUC Train : 0.895525925925926
AUC Test : 0.811047619047619
