In [110]:
import numpy as np
import pylab as pl
import pandas as pd
import matplotlib.pyplot as plt
from scipy import interp
from sklearn import svm
from sklearn.dummy import DummyClassifier
from sklearn.model_selection import cross_val_score,StratifiedKFold
from sklearn.preprocessing import OneHotEncoder, label_binarize, StandardScaler, MinMaxScaler
from sklearn.metrics import classification_report,confusion_matrix, roc_curve, auc
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier, BaggingClassifier
from sklearn.feature_selection import RFE, SelectKBest, chi2, SelectFromModel
from sklearn.linear_model import LogisticRegression
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import ExtraTreesClassifier
%matplotlib inline

In [111]:
#import categorical/numerical data (.csv) with Pandas DataFrame
names = ['existingchecking', 'duration', 'credithistory', 'purpose', 'creditamount', 
         'savings', 'employmentsince', 'installmentrate', 'statussex', 'otherdebtors', 
         'residencesince', 'property', 'age', 'otherinstallmentplans', 'housing', 
         'existingcredits', 'job', 'peopleliable', 'telephone', 'foreignworker', 'classification']
data=pd.read_csv('german.data',sep='\s+',header=0)
data.columns=names
data.head()

Unnamed: 0,existingchecking,duration,credithistory,purpose,creditamount,savings,employmentsince,installmentrate,statussex,otherdebtors,...,property,age,otherinstallmentplans,housing,existingcredits,job,peopleliable,telephone,foreignworker,classification
0,A12,48,A32,A43,5951,A61,A73,2,A92,A101,...,A121,22,A143,A152,1,A173,1,A191,A201,2
1,A14,12,A34,A46,2096,A61,A74,2,A93,A101,...,A121,49,A143,A152,1,A172,2,A191,A201,1
2,A11,42,A32,A42,7882,A61,A74,2,A93,A103,...,A122,45,A143,A153,1,A173,2,A191,A201,1
3,A11,24,A33,A40,4870,A61,A73,3,A93,A101,...,A124,53,A143,A153,2,A173,2,A191,A201,2
4,A14,36,A32,A46,9055,A65,A73,2,A93,A101,...,A124,35,A143,A153,1,A172,2,A192,A201,1


In [112]:
#create quickaccess list with categorical variables labels
catvars = ['existingchecking', 'credithistory', 'purpose', 'savings', 'employmentsince',
           'statussex', 'otherdebtors', 'property', 'otherinstallmentplans', 'housing', 'job', 
           'telephone', 'foreignworker']
#create quickaccess list with numerical variables labels

In [113]:
#create quickaccess list with numerical variables labels
numvars = ['creditamount', 'duration', 'installmentrate', 'residencesince', 'age', 
           'existingcredits', 'peopleliable', 'classification']

In [114]:
# Binarize the y output for easier use of e.g. ROC curves -> 0 = 'bad' credit; 1 = 'good' credit
data.classification.replace([1,2], [1,0], inplace=True)

In [115]:
#Standardistion
numdata_std=pd.DataFrame(StandardScaler().fit_transform(data[numvars].drop(['classification'], axis=1)))
## MinMax Rescaling to [0,1]
numdata_minmax = pd.DataFrame(MinMaxScaler().fit_transform(data[numvars].drop(['classification'], axis=1)))
# One hot encoding
#create dummy variables for every category of every categorical variable
dummyvars = pd.get_dummies(data[catvars])


In [116]:
numdata_std.head()

Unnamed: 0,0,1,2,3,4,5,6
0,0.94886,2.247553,-0.869196,-0.764966,-1.192617,-0.703918,-0.428543
1,-0.417216,-0.740102,-0.869196,0.14156,1.190056,-0.703918,2.333487
2,1.633138,1.749611,-0.869196,1.048086,0.837067,-0.703918,2.333487
3,0.565792,0.255783,0.025064,1.048086,1.543044,1.028136,2.333487
4,2.048808,1.251668,-0.869196,1.048086,-0.045404,-0.703918,2.333487


In [117]:
numdata_minmax.head()

Unnamed: 0,0,1,2,3,4,5,6
0,0.31369,0.647059,0.333333,0.333333,0.053571,0.0,0.0
1,0.101574,0.117647,0.333333,0.666667,0.535714,0.0,1.0
2,0.419941,0.558824,0.333333,1.0,0.464286,0.0,1.0
3,0.254209,0.294118,0.666667,1.0,0.607143,0.333333,1.0
4,0.484483,0.470588,0.333333,1.0,0.285714,0.0,1.0


In [118]:
dummyvars.head()

Unnamed: 0,existingchecking_A11,existingchecking_A12,existingchecking_A13,existingchecking_A14,credithistory_A30,credithistory_A31,credithistory_A32,credithistory_A33,credithistory_A34,purpose_A40,...,housing_A152,housing_A153,job_A171,job_A172,job_A173,job_A174,telephone_A191,telephone_A192,foreignworker_A201,foreignworker_A202
0,0,1,0,0,0,0,1,0,0,0,...,1,0,0,0,1,0,1,0,1,0
1,0,0,0,1,0,0,0,0,1,0,...,1,0,0,1,0,0,1,0,1,0
2,1,0,0,0,0,0,1,0,0,0,...,0,1,0,0,1,0,1,0,1,0
3,1,0,0,0,0,0,0,1,0,1,...,0,1,0,0,1,0,1,0,1,0
4,0,0,0,1,0,0,1,0,0,0,...,0,1,0,1,0,0,0,1,1,0


In [119]:
# append the dummy variable of the initial numerical variables numvars
data_clean = pd.concat([data[numvars], dummyvars], axis = 1)
data_std = pd.concat([numdata_std, data['classification'], dummyvars], axis = 1)
data_minmax = pd.concat([numdata_minmax, data['classification'], dummyvars], axis = 1)

In [120]:
data_clean.head()

Unnamed: 0,creditamount,duration,installmentrate,residencesince,age,existingcredits,peopleliable,classification,existingchecking_A11,existingchecking_A12,...,housing_A152,housing_A153,job_A171,job_A172,job_A173,job_A174,telephone_A191,telephone_A192,foreignworker_A201,foreignworker_A202
0,5951,48,2,2,22,1,1,0,0,1,...,1,0,0,0,1,0,1,0,1,0
1,2096,12,2,3,49,1,2,1,0,0,...,1,0,0,1,0,0,1,0,1,0
2,7882,42,2,4,45,1,2,1,1,0,...,0,1,0,0,1,0,1,0,1,0
3,4870,24,3,4,53,2,2,0,1,0,...,0,1,0,0,1,0,1,0,1,0
4,9055,36,2,4,35,1,2,1,0,0,...,0,1,0,1,0,0,0,1,1,0


In [121]:
data_std.head()

Unnamed: 0,0,1,2,3,4,5,6,classification,existingchecking_A11,existingchecking_A12,...,housing_A152,housing_A153,job_A171,job_A172,job_A173,job_A174,telephone_A191,telephone_A192,foreignworker_A201,foreignworker_A202
0,0.94886,2.247553,-0.869196,-0.764966,-1.192617,-0.703918,-0.428543,0,0,1,...,1,0,0,0,1,0,1,0,1,0
1,-0.417216,-0.740102,-0.869196,0.14156,1.190056,-0.703918,2.333487,1,0,0,...,1,0,0,1,0,0,1,0,1,0
2,1.633138,1.749611,-0.869196,1.048086,0.837067,-0.703918,2.333487,1,1,0,...,0,1,0,0,1,0,1,0,1,0
3,0.565792,0.255783,0.025064,1.048086,1.543044,1.028136,2.333487,0,1,0,...,0,1,0,0,1,0,1,0,1,0
4,2.048808,1.251668,-0.869196,1.048086,-0.045404,-0.703918,2.333487,1,0,0,...,0,1,0,1,0,0,0,1,1,0


In [122]:
data_minmax.head()

Unnamed: 0,0,1,2,3,4,5,6,classification,existingchecking_A11,existingchecking_A12,...,housing_A152,housing_A153,job_A171,job_A172,job_A173,job_A174,telephone_A191,telephone_A192,foreignworker_A201,foreignworker_A202
0,0.31369,0.647059,0.333333,0.333333,0.053571,0.0,0.0,0,0,1,...,1,0,0,0,1,0,1,0,1,0
1,0.101574,0.117647,0.333333,0.666667,0.535714,0.0,1.0,1,0,0,...,1,0,0,1,0,0,1,0,1,0
2,0.419941,0.558824,0.333333,1.0,0.464286,0.0,1.0,1,1,0,...,0,1,0,0,1,0,1,0,1,0
3,0.254209,0.294118,0.666667,1.0,0.607143,0.333333,1.0,0,1,0,...,0,1,0,0,1,0,1,0,1,0
4,0.484483,0.470588,0.333333,1.0,0.285714,0.0,1.0,1,0,0,...,0,1,0,1,0,0,0,1,1,0


In [123]:
# Unscaled, unnormalized data
X_clean = data_clean.drop('classification', axis=1)
y_clean = data_clean['classification']

In [124]:
X_std = data_std.drop('classification', axis=1)
y_std = data_std['classification']

In [125]:
# Rescaled data
X_minmax = data_minmax.drop('classification', axis=1)
y_minmax = data_minmax['classification']

In [135]:

# Oversampling
# http://contrib.scikit-learn.org/imbalanced-learn/auto_examples/combine/plot_smote_enn.html#sphx-glr-auto-examples-combine-plot-smote-enn-py
# Apply SMOTE
sm = SMOTE(ratio='auto')
X_clean_res, y_clean_res = sm.fit_sample(X_clean, y_clean)
X_std_res, y_std_res = sm.fit_sample(X_std, y_std)
X_minmax_res, y_minmax_res = sm.fit_sample(X_minmax, y_minmax)


In [127]:
def crossvalidate(clf, X,y):
    scores1 = cross_val_score(clf, X, y, cv=10, scoring='precision')
    scores2 = cross_val_score(clf, X, y, cv=10, scoring='recall')
    scores3 = cross_val_score(clf, X, y, cv=10, scoring='roc_auc')
    # The mean score and standard deviation of the score estimate
    print("Cross Validation Precision: %0.2f (+/- %0.2f)" % (scores1.mean(), scores1.std()))
    print("Cross Validation Recall: %0.2f (+/- %0.2f)" % (scores2.mean(), scores2.std()))
    print("Cross Validation roc_auc: %0.2f (+/- %0.2f)" % (scores3.mean(), scores3.std()))
    # Create and print confusion matrix

In [132]:
# Run classifier with cross-validation and plot ROC curves
# from http://scikit-learn.org/stable/auto_examples/model_selection/plot_roc_crossval.html
def get_crossval_roc(clfname, classifier,X,y):
    cv = StratifiedKFold(n_splits=10)
    #classifier = GradientBoostingClassifier()

    mean_tpr = 0.0
    mean_fpr = np.linspace(0, 1, 100)
    colors = cycle(['cyan', 'indigo', 'seagreen', 'yellow', 'blue', 'darkorange','cyan', 'indigo', 'seagreen', 'yellow'])
    lw = 2
    i = 0
    for (train, test), color in zip(cv.split(X_clean_res, y_clean_res), colors):
        probas_ = classifier.fit(X_clean_res[train], y_clean_res[train]).predict_proba(X_clean_res[test])
        # Compute ROC curve and area the curve
        fpr, tpr, thresholds = roc_curve(y_clean_res[test], probas_[:, 1])
        mean_tpr += interp(mean_fpr, fpr, tpr)
        mean_tpr[0] = 0.0
        roc_auc = auc(fpr, tpr)
        plt.plot(fpr, tpr, lw=lw, color=color,
                 label='ROC fold %d (area = %0.2f)' % (i, roc_auc))

        i += 1
    plt.plot([0, 1], [0, 1], linestyle='--', lw=lw, color='k',
             label='Luck')

    mean_tpr /= cv.get_n_splits(X_clean_res, y_clean_res)
    mean_tpr[-1] = 1.0
    mean_auc = auc(mean_fpr, mean_tpr)
    plt.plot(mean_fpr, mean_tpr, color='g', linestyle='--',
             label='Mean ROC (area = %0.2f)' % mean_auc, lw=lw)
    plt.xlim([-0.05, 1.05])
    plt.ylim([-0.05, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title(clfname+' ROC')
    plt.legend(loc="lower right")
    plt.savefig("CrossVal_ROC_"+clfname)
plt.show()
    

In [133]:
print("SVC")
svc = svm.SVC(kernel='rbf',degree=1,probability=True,C=2,class_weight="balanced")
crossvalidate(svc,X_std_res,y_std_res)
get_crossval_roc("SVC",svc,X_clean_res,y_clean_res)

SVC
Cross Validation Precision: 0.81 (+/- 0.08)
Cross Validation Recall: 0.75 (+/- 0.04)
Cross Validation roc_auc: 0.86 (+/- 0.04)


NameError: name 'cycle' is not defined

In [136]:
# Bagged Decision Trees to determine feature importance
# Use clean data, unstandardized and unscaled, because tree is robust to different scales
etcclf = ExtraTreesClassifier().fit(X_clean_res, y_clean_res)
    
# Select important features using sklearn SelectFromModel, vary treshold, good value turns out to be mean -> select overaverage important features
sfm = SelectFromModel(etcclf, prefit=True, threshold=0.01)

# Apply fitted transformation to datasets
SFMETC_X = sfm.transform(X_clean_res)
SFMETC_y = y_clean_res

# Print shape to see how many features have been selected
print(SFMETC_X.shape)

print("Gaussian Naive Bayes")
gauss_nb = GaussianNB()
crossvalidate(gauss_nb,SFMETC_X,SFMETC_y)
get_crossval_roc("gauss_nb",gauss_nb,SFMETC_X,SFMETC_y)

(1398, 40)
Gaussian Naive Bayes
Cross Validation Precision: 0.83 (+/- 0.18)
Cross Validation Recall: 0.84 (+/- 0.07)
Cross Validation roc_auc: 0.88 (+/- 0.12)


NameError: name 'cycle' is not defined

Object `cycle` not found.
