# German Credit Score

In [405]:
import pandas as pd
import sklearn.preprocessing as preprocessing
import numpy  as np
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import auc, confusion_matrix
from sklearn import metrics
from lightgbm import LGBMRegressor as lgb
from sklearn.model_selection import GridSearchCV
from sklearn import datasets
from lightgbm import Dataset

In [406]:
df_raw = pd.read_csv('german_credit.csv')

In [407]:
def age_group(age):
    if age < 20:
        return 'Teenager'
    if age < 35:
        return 'Productive'
    if age < 50:
        return 'Mature'
    else :
         return 'Old'

In [408]:
df_raw['credit_per_person'] = df_raw['credit_amount']/df_raw['people_under_maintenance']
df_raw['age_group'] = df_raw['age'].apply(age_group)

In [409]:
# Create x, where x the 'scores' column's values as floats
x = df_raw[['credit_amount']].values.astype(float)

# Create a minimum and maximum processor object
min_max_scaler = preprocessing.MinMaxScaler()

# Create an object to transform the data to fit minmax processor
x_scaled = min_max_scaler.fit_transform(x)

# Run the normalizer on the dataframe
df_raw['credit_amount_norm'] = pd.DataFrame(x_scaled)
df_raw['age_norm']= pd.DataFrame(min_max_scaler.fit_transform(df_raw[['age']].values.astype(float)))
df_raw['duration_in_month']= pd.DataFrame(min_max_scaler.fit_transform(df_raw[['duration_in_month']].values.astype(float)))

In [410]:
df_raw['credit_amount_norm'].head(2)

0    0.050567
1    0.313690
Name: credit_amount_norm, dtype: float64

In [411]:
numeric_data = df.select_dtypes(include=[np.number]).columns

In [412]:
df_raw[numeric_data].head()

Unnamed: 0,default,duration_in_month,credit_amount,installment_as_income_perc,present_res_since,age,credits_this_bank,people_under_maintenance
0,0,0.029412,1169,4,4,67,2,1
1,1,0.647059,5951,2,2,22,1,1
2,0,0.117647,2096,2,3,49,1,2
3,0,0.558824,7882,2,4,45,1,2
4,1,0.294118,4870,3,4,53,2,2


In [413]:
df_prepare = df_raw

target = df_prepare['default']
temp = df_prepare.drop(['default','age','installment_as_income_perc','personal_status_sex','telephone'],axis=1)

In [414]:
dataset_dummies = temp.select_dtypes(exclude=['int','int64','float64'])
dataset_int = temp.select_dtypes(include = ['int','int64','float64'] )


dataset_dummies_2 = pd.get_dummies(dataset_dummies, drop_first=True)
df_model = pd.concat([dataset_int,dataset_dummies_2,target], axis = 1 )

x = df_model.iloc[:,0:len(df_model.columns)-1]
y = df_model.iloc[:,len(df_model.columns)-1]


x_train, x_test, y_train, y_test = train_test_split(x,y,test_size=0.25, random_state = 42,stratify=y)



In [415]:
# Logistic_regression = LogisticRegression()
# Logistic_regression.fit(x_train,y_train)

# y_pred_train = Logistic_regression.predict(x_train)
# y_pred_test = Logistic_regression.predict(x_test)


In [416]:
# fpr_train, tpr_train, threshold = metrics.roc_curve(y_train,y_pred_train)
# fpr_test, tpr_test, threshold = metrics.roc_curve(y_test,y_pred_test)

In [417]:
# print('AUC train = {} '.format(metrics.auc(fpr_train,tpr_train)))
# print('AUC train = {} '.format(metrics.auc(fpr_test,tpr_test)))

In [418]:
import lightgbm as lgb

In [437]:
np.random.seed(42)
train_data = lgb.Dataset(x_train, label=y_train)
test_data = lgb.Dataset(x_test, label=y_test)

parameters = {
        'application': 'binary',
        'objective': 'binary',
#         'metric': 'auc',
#         'is_unbalance': 'true',
#         'boosting': 'gbdt',
#         'num_leaves': 31,
#         'feature_fraction': 0.5,
#         'bagging_fraction': 0.5,
#         'bagging_freq': 20,
#         'learning_rate': 0.05,
#         'verbose': 0
}

lgb_model = lgb.train(parameters,
                           train_data,
                           valid_sets=test_data,
                           num_boost_round=5000,
                           early_stopping_rounds=100,)

[1]	valid_0's binary_logloss: 0.589516
Training until validation scores don't improve for 100 rounds
[2]	valid_0's binary_logloss: 0.574456
[3]	valid_0's binary_logloss: 0.563965
[4]	valid_0's binary_logloss: 0.556608
[5]	valid_0's binary_logloss: 0.546945
[6]	valid_0's binary_logloss: 0.540002
[7]	valid_0's binary_logloss: 0.534554
[8]	valid_0's binary_logloss: 0.527825
[9]	valid_0's binary_logloss: 0.524085
[10]	valid_0's binary_logloss: 0.518984
[11]	valid_0's binary_logloss: 0.514458
[12]	valid_0's binary_logloss: 0.511866
[13]	valid_0's binary_logloss: 0.510332
[14]	valid_0's binary_logloss: 0.508028
[15]	valid_0's binary_logloss: 0.505226
[16]	valid_0's binary_logloss: 0.504503
[17]	valid_0's binary_logloss: 0.501054
[18]	valid_0's binary_logloss: 0.499007
[19]	valid_0's binary_logloss: 0.496053
[20]	valid_0's binary_logloss: 0.492248
[21]	valid_0's binary_logloss: 0.48934
[22]	valid_0's binary_logloss: 0.487261
[23]	valid_0's binary_logloss: 0.485397
[24]	valid_0's binary_loglos

In [441]:
lgb_model.predict(x_test)

array([0.82390337, 0.0456382 , 0.06807897, 0.1210668 , 0.06797202,
       0.61632445, 0.31558322, 0.71536904, 0.05916233, 0.40969789,
       0.08922149, 0.18118821, 0.49048458, 0.39062195, 0.66754465,
       0.09609829, 0.34053226, 0.58244136, 0.33476436, 0.13643635,
       0.6138181 , 0.02118807, 0.6177013 , 0.23123042, 0.09530238,
       0.17420204, 0.27416596, 0.76143911, 0.84522104, 0.04768855,
       0.33069857, 0.06665556, 0.13439864, 0.48425927, 0.10700956,
       0.19261631, 0.17336717, 0.17664356, 0.07367127, 0.55423196,
       0.07579877, 0.42543038, 0.33338217, 0.55265764, 0.20233679,
       0.20666201, 0.05469338, 0.68067575, 0.38187011, 0.76255834,
       0.07426612, 0.85258775, 0.6988466 , 0.39100406, 0.28011678,
       0.12827294, 0.53852012, 0.34038283, 0.35634669, 0.11326185,
       0.06126064, 0.16014436, 0.51751799, 0.79626156, 0.43628595,
       0.03806304, 0.45206975, 0.42372774, 0.02844899, 0.1047379 ,
       0.79819154, 0.01313472, 0.12109911, 0.13758535, 0.21446

In [442]:
print("AUC Train : {}".format(metrics.roc_auc_score(y_train,lgb_model.predict(x_train))))
print("AUC Test : {}".format(metrics.roc_auc_score(y_test,lgb_model.predict(x_test))))

AUC Train : 0.9783619047619048
AUC Test : 0.8067047619047619


In [443]:
lgb_model.predict(x_test.iloc[0])



array([0.82390337])

In [435]:
import pickle

In [373]:

# Saving model using pickle
pickle.dump(lgb_model, open('model_lgb.pkl','wb'))



In [374]:
# Loading model to compare the results
model_lgb = pickle.load( open('model_lgb.pkl','rb'))
print(model_lgb.predict(x_test.iloc[0]))

[0.76958709]
