In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

import allinone as aio

%matplotlib inline
warnings.filterwarnings('ignore')
pd.set_option("display.float_format", "{:.3f}".format)
plt.style.use('ggplot')

In [2]:
df = pd.read_csv('hmelq_clean.csv')

In [3]:
df.head(3)

Unnamed: 0,bad,loan,mortdue,value,reason,job,yoj,derog,delinq,clage,ninq,clno,debtinc
0,0,29.311,64.074,16.199,HomeImp,Other,28.0,0.0,0.0,18.711,0.0,14.0,13.321
1,0,19.771,112.857,16.569,DebtCon,Other,2.0,0.0,0.0,18.002,0.0,25.0,13.443
2,0,21.39,86.871,14.924,DebtCon,Other,5.0,0.0,0.0,15.992,2.0,9.0,11.077


### Dummy

In [4]:
dfCopy = df.copy()

dms = pd.get_dummies(dfCopy[['reason','job']])
Xn_ = dfCopy.drop(['reason','job'], axis=1)
dfDummy = pd.concat([Xn_,dms], axis=1)
dfDummy = dfDummy.drop(['reason_DebtCon'], axis=1)

In [5]:
dfDummy.head(3)

Unnamed: 0,bad,loan,mortdue,value,yoj,derog,delinq,clage,ninq,clno,debtinc,reason_HomeImp,job_Mgr,job_Office,job_Other,job_ProfEx,job_Sales,job_Self
0,0,29.311,64.074,16.199,28.0,0.0,0.0,18.711,0.0,14.0,13.321,1,0,0,1,0,0,0
1,0,19.771,112.857,16.569,2.0,0.0,0.0,18.002,0.0,25.0,13.443,0,0,0,1,0,0,0
2,0,21.39,86.871,14.924,5.0,0.0,0.0,15.992,2.0,9.0,11.077,0,0,0,1,0,0,0


## Model

In [6]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier

In [35]:
y = dfDummy.bad
X = dfDummy.drop(['bad'], axis=1)
model = MLPClassifier()

In [9]:
aio.confusion(X, y, model)

Accuracy: 0.88408 | AUC: 0.83605


Unnamed: 0,pred_0,pred_1,precision,recall,f1-score,support
0,573,18,0.898,0.97,0.932,591.0
1,65,60,0.769,0.48,0.591,125.0


In [16]:
dfModel = pd.DataFrame(columns = ['cross-score', 'cross-train', 'train_score', 'test_score', 'precision', 'recall', 'f1-score', 'auc-roc', 'auc-pr'])

In [17]:
y = dfDummy.bad
X = dfDummy.drop(['bad'], axis=1)
model = LogisticRegression()

dfModel = dfModel.append(aio.modelframe(X, y, model, 'logistic'))

In [18]:
y = dfDummy.bad
X = dfDummy.drop(['bad'], axis=1)
model = DecisionTreeClassifier()

dfModel = dfModel.append(aio.modelframe(X, y, model, 'tree'))

In [19]:
y = dfDummy.bad
X = dfDummy.drop(['bad'], axis=1)
model = RandomForestClassifier()

dfModel = dfModel.append(aio.modelframe(X, y, model, 'forest'))

In [20]:
y = dfDummy.bad
X = dfDummy.drop(['bad'], axis=1)
model = MLPClassifier()

dfModel = dfModel.append(aio.modelframe(X, y, model, 'neural'))

In [21]:
dfModel

Unnamed: 0,cross-score,cross-train,train_score,test_score,precision,recall,f1-score,auc-roc,auc-pr
logistic,0.837,0.836,0.839,0.853,0.647,0.352,0.456,0.782,0.539
tree,0.863,0.852,1.0,0.86,0.603,0.584,0.593,0.751,0.63
forest,0.916,0.908,1.0,0.927,0.91,0.648,0.757,0.96,0.87
neural,0.852,0.845,0.862,0.884,0.862,0.4,0.546,0.825,0.657


### Parametreler

In [22]:
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, KFold

In [28]:
y = dfDummy.bad
X = dfDummy.drop(['bad'], axis=1)

neural = MLPClassifier(activation='logistic')
params = {'hidden_layer_sizes': [(100,20),(50,20)], 'solver': ['lbfgs', 'adam', 'sgd']}

In [29]:
k_fold = KFold(n_splits = 10, shuffle = True, random_state = 42)
gridcv_results = GridSearchCV(estimator = neural, param_grid = params, cv = k_fold, n_jobs = -1, verbose = 1).fit(X, y)

Fitting 10 folds for each of 6 candidates, totalling 60 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   36.1s
[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed:   52.7s finished


In [30]:
gridcv_results.best_params_

{'hidden_layer_sizes': (100, 20), 'solver': 'adam'}

In [26]:
model = MLPClassifier(hidden_layer_sizes=(20, 20), solver='lbfgs')

dfModel = dfModel.append(aio.modelframe(X, y, model, 'neural_p1'))

In [31]:
model = MLPClassifier(hidden_layer_sizes=(100, 20), solver='adam', activation='logistic')

dfModel = dfModel.append(aio.modelframe(X, y, model, 'neural_p2'))

In [32]:
dfModel

Unnamed: 0,cross-score,cross-train,train_score,test_score,precision,recall,f1-score,auc-roc,auc-pr
logistic,0.837,0.836,0.839,0.853,0.647,0.352,0.456,0.782,0.539
tree,0.863,0.852,1.0,0.86,0.603,0.584,0.593,0.751,0.63
forest,0.916,0.908,1.0,0.927,0.91,0.648,0.757,0.96,0.87
neural,0.852,0.845,0.862,0.884,0.862,0.4,0.546,0.825,0.657
neural_p1,0.846,0.836,0.845,0.853,0.628,0.392,0.483,0.797,0.56
neural_p2,0.872,0.85,0.885,0.883,0.747,0.496,0.596,0.83,0.638


## Sonuç

- Forest'a göre daha düşük performans gösterdi, ama ucu açık* parametre iyileştirmeleri yapılırsa daha iyi sonuç alınabilir. 
- Maliyet odaklı incelenirse iyi bir performans sergilemedi, tree ile aynı performansa sahip oldukları söylenebilir.
- Parametrik değişiklikler yapıldıktan sonra daha iyi sonuçlar alındı, ama çoklu parametre uygulamak istenirse aşırı zaman gerektirir.
    - hiddenlayersizes: (100, 20), solver: adam