In [2]:
import pandas as pd
import numpy as np

In [3]:
data = pd.read_csv('heloc_dataset_v1.csv')

# Zakodowanie zmiennej celu jako 1/0

In [4]:
data['RiskPerformance'] = np.where(data.RiskPerformance=='Bad',1,0)

In [5]:
data.head(2)

Unnamed: 0,RiskPerformance,ExternalRiskEstimate,MSinceOldestTradeOpen,MSinceMostRecentTradeOpen,AverageMInFile,NumSatisfactoryTrades,NumTrades60Ever2DerogPubRec,NumTrades90Ever2DerogPubRec,PercentTradesNeverDelq,MSinceMostRecentDelq,...,PercentInstallTrades,MSinceMostRecentInqexcl7days,NumInqLast6M,NumInqLast6Mexcl7days,NetFractionRevolvingBurden,NetFractionInstallBurden,NumRevolvingTradesWBalance,NumInstallTradesWBalance,NumBank2NatlTradesWHighUtilization,PercentTradesWBalance
0,1,55,144,4,84,20,3,0,83,2,...,43,0,0,0,33,-8,8,1,1,69
1,1,61,58,15,41,2,4,4,100,-7,...,67,0,0,0,0,-8,0,-8,-8,0


In [6]:
data.shape

(10459, 24)

In [7]:
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.metrics import roc_auc_score, accuracy_score, f1_score, precision_score, recall_score

In [8]:
from sklearn.feature_selection import SelectKBest

# Podział na train/test

In [9]:
X_train, X_test, y_train, y_test = \
train_test_split(data.drop('RiskPerformance', axis =1), data.RiskPerformance, test_size=.33, random_state=42)

# Feature selection na podstawie ANOVA

In [10]:
seletor_f_classif = SelectKBest(k='all')
seletor_f_classif.fit_transform(X_train, y_train)

array([[ 70, 153,   7, ...,   1,   1,  89],
       [ 64, 103,   1, ...,   3,   2,  58],
       [ 83, 210,   3, ...,   2,   0,  40],
       ...,
       [ 87, 230,   1, ...,   2,   0,  42],
       [ 68, 132,   4, ...,   2,   0,  88],
       [ 70, 130,   5, ...,   1,   0,  75]], dtype=int64)

In [11]:
cols_f_classif = pd.DataFrame({
    'column':X_train.columns, 
    'score': seletor_f_classif.scores_, 
    'p_value':seletor_f_classif.pvalues_
}).sort_values('score', ascending = False)

In [12]:
column_list = cols_f_classif.query("score>10").column.values

# Podział na train/test wybranych kolumn

In [14]:
X_train, X_test, y_train, y_test = \
train_test_split(data[column_list], data.RiskPerformance, test_size=.33, random_state=42)

## Analiza różnych algorytmów

In [None]:
names = ["Nearest Neighbors", "Gaussian Process",
         "Decision Tree", "Random Forest","AdaBoost",
         "Naive Bayes", "QDA"]

classifiers = [
    KNeighborsClassifier(3),
    GaussianProcessClassifier(1.0 * RBF(1.0)),
    DecisionTreeClassifier(max_depth=4),
    RandomForestClassifier(max_depth=4, n_estimators=10, max_features=1),
    AdaBoostClassifier(),
    GaussianNB(),
    QuadraticDiscriminantAnalysis()]

In [30]:
for name, clf in zip(names, classifiers):
    clf.fit(X_train, y_train)
    y_pred = clf.predict_proba(X_test)[:,1]
    y_pred2 = clf.predict(X_test)
    print(name)
    print(" AUC = ",round(roc_auc_score( y_test, y_pred)*100,2),
          " ACC = ", round(accuracy_score( y_test, y_pred2)*100,2), 
          " F1 = ", round(f1_score( y_test, y_pred2)*100,2), 
          " Precision = ", round(precision_score( y_test, y_pred2)*100,2), 
          " Recall = ", round(recall_score( y_test, y_pred2)*100,2))

Nearest Neighbors
 AUC =  66.87  ACC =  63.27  F1 =  65.34  Precision =  64.98  Recall =  65.7
Gaussian Process
 AUC =  49.53  ACC =  58.49  F1 =  57.67  Precision =  62.32  Recall =  53.66
Decision Tree
 AUC =  76.14  ACC =  70.16  F1 =  72.25  Precision =  70.84  Recall =  73.72
Random Forest
 AUC =  75.66  ACC =  69.7  F1 =  72.73  Precision =  69.16  Recall =  76.69
AdaBoost
 AUC =  78.48  ACC =  70.89  F1 =  73.14  Precision =  71.18  Recall =  75.21
Naive Bayes
 AUC =  73.22  ACC =  65.99  F1 =  62.61  Precision =  74.41  Recall =  54.04
QDA
 AUC =  73.75  ACC =  66.66  F1 =  65.9  Precision =  71.47  Recall =  61.13


In [15]:
from xgboost import XGBClassifier

In [17]:
clf  = XGBClassifier()
clf.fit(X_train, y_train)
y_pred = clf.predict_proba(X_test)[:,1]
y_pred2 = clf.predict(X_test)
print('XGB')
print(" AUC = ",round(roc_auc_score( y_test, y_pred)*100,2),
      " ACC = ", round(accuracy_score( y_test, y_pred2)*100,2), 
      " F1 = ", round(f1_score( y_test, y_pred2)*100,2), 
      " Precision = ", round(precision_score( y_test, y_pred2)*100,2), 
      " Recall = ", round(recall_score( y_test, y_pred2)*100,2))

XGB
 AUC =  76.82  ACC =  70.22  F1 =  72.5  Precision =  70.61  Recall =  74.49


## Tuningowanie parametrów

In [23]:
for x in [100,150,175,200,225,250]:
    clf_win = AdaBoostClassifier(
        n_estimators=x,
        random_state=123
    )
    clf_win.fit(X_train, y_train)
    y_pred = clf_win.predict_proba(X_test)[:,1]
    y_pred2 = clf_win.predict(X_test)
    print("n_estimators = ",x,
          " AUC = ",round(roc_auc_score( y_test, y_pred)*100,2),
          " ACC = ", round(accuracy_score( y_test, y_pred2)*100,2), 
          " F1 = ", round(f1_score( y_test, y_pred2)*100,2), 
          " Precision = ", round(precision_score( y_test, y_pred2)*100,2), 
          " Recall = ", round(recall_score( y_test, y_pred2)*100,2))

n_estimators =  100  AUC =  78.49  ACC =  70.63  F1 =  72.82  Precision =  71.06  Recall =  74.66
n_estimators =  150  AUC =  78.56  ACC =  70.68  F1 =  73.06  Precision =  70.83  Recall =  75.43
n_estimators =  175  AUC =  78.54  ACC =  70.63  F1 =  72.96  Precision =  70.84  Recall =  75.21
n_estimators =  200  AUC =  78.53  ACC =  70.63  F1 =  72.9  Precision =  70.93  Recall =  74.99
n_estimators =  225  AUC =  78.37  ACC =  70.48  F1 =  72.78  Precision =  70.79  Recall =  74.88
n_estimators =  250  AUC =  78.28  ACC =  70.54  F1 =  72.83  Precision =  70.84  Recall =  74.93


In [26]:
for x in [0.23,0.24,0.25,0.26,.27,0.28,.29]:
    clf_win = AdaBoostClassifier(
        n_estimators=150,
        learning_rate=x,
        random_state=123
    )
    clf_win.fit(X_train, y_train)
    y_pred = clf_win.predict_proba(X_test)[:,1]
    y_pred2 = clf_win.predict(X_test)
    print("learning_rate = ",x,
          " AUC = ",round(roc_auc_score( y_test, y_pred)*100,2),
          " ACC = ", round(accuracy_score( y_test, y_pred2)*100,2), 
          " F1 = ", round(f1_score( y_test, y_pred2)*100,2), 
          " Precision = ", round(precision_score( y_test, y_pred2)*100,2), 
          " Recall = ", round(recall_score( y_test, y_pred2)*100,2))

learning_rate =  0.23  AUC =  78.82  ACC =  71.32  F1 =  73.7  Precision =  71.31  Recall =  76.25
learning_rate =  0.24  AUC =  78.94  ACC =  71.49  F1 =  73.84  Precision =  71.49  Recall =  76.36
learning_rate =  0.25  AUC =  78.94  ACC =  71.55  F1 =  73.87  Precision =  71.58  Recall =  76.31
learning_rate =  0.26  AUC =  78.98  ACC =  71.55  F1 =  73.87  Precision =  71.58  Recall =  76.31
learning_rate =  0.27  AUC =  78.97  ACC =  71.61  F1 =  73.91  Precision =  71.66  Recall =  76.31
learning_rate =  0.28  AUC =  78.98  ACC =  71.38  F1 =  73.68  Precision =  71.47  Recall =  76.03
learning_rate =  0.29  AUC =  78.86  ACC =  71.38  F1 =  73.72  Precision =  71.41  Recall =  76.2


In [27]:
for x in ['SAMME', 'SAMME.R']:
    clf_win = AdaBoostClassifier(
        n_estimators=150,
        learning_rate=0.27,
        algorithm=x,
        random_state=123
    )
    clf_win.fit(X_train, y_train)
    y_pred = clf_win.predict_proba(X_test)[:,1]
    y_pred2 = clf_win.predict(X_test)
    print("algorithm = ",x,
          " AUC = ",round(roc_auc_score( y_test, y_pred)*100,2),
          " ACC = ", round(accuracy_score( y_test, y_pred2)*100,2), 
          " F1 = ", round(f1_score( y_test, y_pred2)*100,2), 
          " Precision = ", round(precision_score( y_test, y_pred2)*100,2), 
          " Recall = ", round(recall_score( y_test, y_pred2)*100,2))

algorithm =  SAMME  AUC =  77.86  ACC =  70.74  F1 =  73.29  Precision =  70.61  Recall =  76.2
algorithm =  SAMME.R  AUC =  78.97  ACC =  71.61  F1 =  73.91  Precision =  71.66  Recall =  76.31


In [30]:
for x in [1,2,8,9,10]:
    clf_win = AdaBoostClassifier(
        base_estimator = DecisionTreeClassifier(
            max_depth=x,
            random_state=123
        ),
        n_estimators=150,
        learning_rate=0.27,
        algorithm='SAMME.R',
        random_state=123
    )
    clf_win.fit(X_train, y_train)
    y_pred = clf_win.predict_proba(X_test)[:,1]
    y_pred2 = clf_win.predict(X_test)
    print("max_depth = ",x,
          " AUC = ",round(roc_auc_score( y_test, y_pred)*100,2),
          " ACC = ", round(accuracy_score( y_test, y_pred2)*100,2), 
          " F1 = ", round(f1_score( y_test, y_pred2)*100,2), 
          " Precision = ", round(precision_score( y_test, y_pred2)*100,2), 
          " Recall = ", round(recall_score( y_test, y_pred2)*100,2))

max_depth =  1  AUC =  78.97  ACC =  71.61  F1 =  73.91  Precision =  71.66  Recall =  76.31
max_depth =  2  AUC =  77.9  ACC =  71.44  F1 =  73.75  Precision =  71.5  Recall =  76.14
max_depth =  8  AUC =  75.34  ACC =  68.57  F1 =  71.35  Precision =  68.65  Recall =  74.27
max_depth =  9  AUC =  75.47  ACC =  69.64  F1 =  72.46  Precision =  69.4  Recall =  75.81
max_depth =  10  AUC =  76.55  ACC =  69.81  F1 =  72.62  Precision =  69.55  Recall =  75.98


# Ostateczny model

In [14]:
clf_win = AdaBoostClassifier(
    base_estimator = DecisionTreeClassifier(
        max_depth=1,
        random_state=123
    ),
    n_estimators=150,
    learning_rate=0.27,
    algorithm='SAMME.R',
    random_state=123
)
clf_win.fit(X_train, y_train)
y_pred = clf_win.predict_proba(X_test)[:,1]
y_pred2 = clf_win.predict(X_test)
print(" AUC = ",round(roc_auc_score( y_test, y_pred)*100,2),
      " ACC = ", round(accuracy_score( y_test, y_pred2)*100,2), 
      " F1 = ", round(f1_score( y_test, y_pred2)*100,2), 
      " Precision = ", round(precision_score( y_test, y_pred2)*100,2), 
      " Recall = ", round(recall_score( y_test, y_pred2)*100,2))

 AUC =  79.01  ACC =  71.32  F1 =  73.56  Precision =  71.53  Recall =  75.7
