In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import confusion_matrix
from collections import Counter
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from imblearn.under_sampling import RandomUnderSampler 
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score, classification_report
from sklearn.ensemble import GradientBoostingClassifier
import pydotplus
from sklearn import tree
from IPython.display import Image
import itertools
import numpy as np
import matplotlib.pyplot as plt
from imblearn.over_sampling import SMOTE, ADASYN
from sklearn.datasets import make_classification
from sklearn.model_selection import StratifiedKFold
from sklearn.feature_selection import RFE
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_selection import RFECV
from sklearn.ensemble import RandomForestClassifier
import os, errno
import matplotlib.pyplot as plt
from sklearn.neural_network import MLPClassifier
from sklearn.cluster import DBSCAN


# Utilities 

In [2]:

def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues, save="False", path="/home/"):
    """
    Print a confusion matrix, draw it on the current figure and optionally save it.

    Parameters
    ----------
    cm : numpy.ndarray
        2-D confusion matrix. Rows are drawn along the "True label" axis and
        columns along the "Predicted label" axis.
    classes : sequence
        Tick labels, one per row/column of ``cm``.
    normalize : bool
        When True every row is divided by its row sum before printing/plotting
        and cells are rendered with two decimals; otherwise raw counts are shown.
    title : str
        Title of the plot.
    cmap : matplotlib colormap
        Colormap handed to ``plt.imshow``.
    save : bool
        The figure is written to disk only when this is exactly ``True``
        (identity check), so the default string ``"False"`` disables saving.
    path : str
        Output path without extension; ``".pdf"`` is appended when saving.

    Notes
    -----
    The current figure is cleared with ``plt.clf()`` before returning, so
    nothing drawn here remains on the figure after the call.
    """
    if normalize:
        row_totals = cm.sum(axis=1)[:, np.newaxis]
        # A class with no true samples has a zero row sum; keep that row at 0
        # instead of dividing by zero, which would give NaN/inf cells and a
        # NaN colour threshold below.
        cm = np.divide(cm.astype('float'), row_totals,
                       out=np.zeros(cm.shape, dtype=float),
                       where=row_totals != 0)
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')

    print(cm)

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    fmt = '.2f' if normalize else 'd'
    # Cells darker than half of the maximum get white text for contrast.
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    plt.tight_layout()
    if save is True:
        plt.savefig(path+".pdf")
    # Reset the figure so successive calls do not draw on top of each other.
    plt.clf()
    
def plot_confusion_matrix2(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """
    Print a confusion matrix and draw it on the current figure.

    Unlike ``plot_confusion_matrix`` this variant neither saves the figure nor
    clears it afterwards, so the drawing stays on the current figure.

    Parameters
    ----------
    cm : numpy.ndarray
        2-D confusion matrix. Rows are drawn along the "True label" axis and
        columns along the "Predicted label" axis.
    classes : sequence
        Tick labels, one per row/column of ``cm``.
    normalize : bool
        When True every row is divided by its row sum before printing/plotting
        and cells are rendered with two decimals; otherwise raw counts are shown.
    title : str
        Title of the plot.
    cmap : matplotlib colormap
        Colormap handed to ``plt.imshow``.
    """
    if normalize:
        row_totals = cm.sum(axis=1)[:, np.newaxis]
        # A class with no true samples has a zero row sum; keep that row at 0
        # instead of dividing by zero, which would give NaN/inf cells and a
        # NaN colour threshold below.
        cm = np.divide(cm.astype('float'), row_totals,
                       out=np.zeros(cm.shape, dtype=float),
                       where=row_totals != 0)
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')

    print(cm)

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    fmt = '.2f' if normalize else 'd'
    # Cells darker than half of the maximum get white text for contrast.
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    plt.tight_layout()


def saveInfos(fileName, class1, class2, rat, roc, best_params, graph, attributes, feature_importance, cva, cvf, report1, a1, r1, p1, f1, cm1, report2, a2, r2, p2, f2, cm2, rfroc, rf_best_params, feature_importance2, cvarf, cvfrf, report3, a3, r3, p3, f3, cm3):
    """
    Persist the artefacts of one experiment in a new directory named after ``fileName``.

    The directory is created under a fixed base path. If it already exists the
    function prints a message and returns without writing anything, so an
    earlier run is never overwritten.

    Written files:
      * ``dtree.png``     - rendering of ``graph`` (object exposing ``write_png``)
      * ``cmTrain.pdf``   - normalized confusion matrix ``cm1`` (labels ``class1``)
      * ``cmTest.pdf``    - normalized confusion matrix ``cm2`` (labels ``class1``)
      * ``cmTestRF.pdf``  - normalized confusion matrix ``cm3`` (labels ``class2``)
      * ``infos.txt``     - text summary of every remaining argument

    Parameters
    ----------
    fileName : directory name of the experiment (converted with ``str``).
    rat : value reported as "Oversampler Ratio".
    roc, best_params, feature_importance, cva, cvf :
        decision-tree ROC AUC, best parameters, per-attribute importances and
        the two cross-validation summaries.
    report1, a1, r1, p1, f1 : train report and metric strings.
    report2, a2, r2, p2, f2 : test report and metric strings.
    rfroc, rf_best_params, feature_importance2, cvarf, cvfrf :
        the random-forest counterparts of the decision-tree values.
    report3, a3, r3, p3, f3 : random-forest test report and metric strings.
    attributes : attribute names paired positionally with both importance vectors.
    """
    file_path = '/home/alessandro/Desktop/DM_JAN/models/'+str(fileName)+"/"
    print(file_path)
    directory = os.path.dirname(file_path)
    if os.path.exists(directory):
        print("Dir already exists")
        return
    os.makedirs(directory)

    graph.write_png(file_path+"dtree.png")
    plot_confusion_matrix(cm1, classes=class1, normalize=True, title='Normalized confusion matrix', save=True, path=file_path+"cmTrain")
    plot_confusion_matrix(cm2, classes=class1, normalize=True, title='Normalized confusion matrix', save=True, path=file_path+"cmTest")
    plot_confusion_matrix(cm3, classes=class2, normalize=True, title='Normalized confusion matrix', save=True, path=file_path+"cmTestRF")

    txt = file_path+"infos.txt"
    # The context manager closes the file even if one of the writes raises.
    with open(txt, "w") as out:
        out.write("Oversampler Ratio: {}\n".format(rat))
        out.write("DecisionTree: Area under the ROC curve = {}\n".format(roc))
        out.write("Best Params: {}\n".format(best_params))
        for col, imp in zip(attributes, feature_importance):
            out.write("Feature Importance: {} {}\n".format(col, imp))
        out.write("CrossValidScore_ACC_F1_Score: {} {}\n".format(cva, cvf))
        out.write("***************\n")
        out.write("Report_Train: {}\n".format(report1))
        out.write("***************\n")
        out.write("Accuracy_Train: {}\n".format(a1))
        out.write("Recall_Train: {}\n".format(r1))
        out.write("Precision_Train: {}\n".format(p1))
        out.write("F1_Score_Train: {}\n".format(f1))
        out.write("##############################\n")
        # Separator previously had its last '*' after the newline, which
        # prefixed the following "Report_Test" line with a stray star.
        out.write("***************\n")
        out.write("Report_Test: {}\n".format(report2))
        out.write("***************\n")
        out.write("Accuracy_Test: {}\n".format(a2))
        out.write("Recall_Test: {}\n".format(r2))
        out.write("Precision_Test: {}\n".format(p2))
        out.write("F1_Score_Test: {}\n".format(f2))
        out.write("##############################\n")
        out.write("RandomForest: Area under the ROC curve = {}\n".format(rfroc))
        out.write("Best Params: {}\n".format(rf_best_params))
        for col, imp in zip(attributes, feature_importance2):
            out.write("Feature ImportanceRF: {} {}\n".format(col, imp))
        # cvarf / cvfrf were accepted but never recorded; write them like the
        # decision-tree cross-validation line above.
        out.write("CrossValidScoreRF_ACC_F1_Score: {} {}\n".format(cvarf, cvfrf))
        out.write("***************\n")
        out.write("Report_Test: {}\n".format(report3))
        out.write("***************\n")
        out.write("Accuracy_Test: {}\n".format(a3))
        out.write("Recall_Test: {}\n".format(r3))
        out.write("Precision_Test: {}\n".format(p3))
        out.write("F1_Score_Test: {}\n".format(f3))


# Filtering Attributes

In [3]:
reduced = pd.read_csv("/home/dataset.csv")

# Two row-wise aggregates over the columns at positions 5..9:
#   sums - their total
#   g3   - how many of them are >= 1
# NOTE(review): which columns sit at positions 5..9 depends on the CSV layout - confirm.
reduced["sums"] = reduced.iloc[:, 5:10].sum(axis=1)
reduced["g3"] = (reduced.iloc[:, 5:10] >= 1).sum(axis=1)

# Column names left over after discarding the ones listed below
# (list.remove raises ValueError if a name is missing from the frame).
cols = reduced.columns.tolist()
for discarded in (
    'age', 'status', 'education', 'sex', 'credit_default',
    'pa-apr', 'pa-may', 'pa-jun', 'pa-jul', 'pa-aug', 'pa-sep',
    'ps-apr', 'ps-may', 'ps-jun', 'ps-jul', 'ps-aug', 'ps-sep',
    'sums',
    'ba-apr', 'ba-may', 'ba-jun', 'ba-jul', 'ba-aug', 'ba-sep',
    'g3',
):
    cols.remove(discarded)

# Keep only the modelling features plus the target column.
reduced = reduced.filter(
    ['g3', 'ps-sep', 'ps-apr', 'ps-jul', 'ps-aug', 'ps-jun', 'ps-may', 'credit_default'],
    axis=1,
)
# Alternative selections tried previously:
#reduced = reduced.filter(['ps-sep','g3','ps-apr','credit_default'], axis=1)
#reduced= reduced.drop(columns=['limit','varps','ps-apr','ps-may','ps-aug','ps-jul','ps-jun','ba-sep','ba-sep','ba-aug','ba-jul','ba-jun','ba-may','ba-apr','pa-sep','pa-aug','pa-jun','pa-may','pa-apr','sums'])
#print(reduced.head)



FileNotFoundError: [Errno 2] File b'/home/dataset.csv' does not exist: b'/home/dataset.csv'

# Split dataset into Training and Test sets

In [None]:
# Every column except the target acts as a predictor.
attributes = [name for name in reduced.columns if name != 'credit_default']
X, y = reduced[attributes], reduced['credit_default']

# 70/30 hold-out split, stratified on the target, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.3,
    random_state=100,
    stratify=y,
)

# UnderSampling

In [None]:
# Target class ratio after resampling (1 requests balanced classes).
rat = 1

# `ratio=` and `fit_sample` were removed from imbalanced-learn; their
# replacements are `sampling_strategy=` and `fit_resample`
# (available since imbalanced-learn 0.4).
sm = RandomUnderSampler(sampling_strategy=rat, random_state=42)

# Only the training split is resampled; the test split keeps its original
# class distribution.
X_train, y_train = sm.fit_resample(X_train, y_train)
print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

# Grid Search

In [None]:

# Hyper-parameter grid explored for the decision tree.
param_grid = {
    'criterion': ['gini', 'entropy'],
    'min_samples_split': range(2, 300, 50),
    'min_samples_leaf': range(2, 300, 50),
    'max_depth': np.arange(2, 7),
}
# random_state is fixed so that repeated runs select the same tree: without
# it the estimator is unseeded (scikit-learn permutes features at each split
# even with splitter="best").
dt1 = GridSearchCV(
    DecisionTreeClassifier(splitter="best", random_state=100),
    param_grid,
    cv=StratifiedKFold(15),
    scoring='accuracy',
)
dt1_fit = dt1.fit(X_train, y_train)

# ROC AUC on the held-out test split, using the probability of the second class.
tree_performance = roc_auc_score(y_test, dt1_fit.predict_proba(X_test)[:, 1])
print("DecisionTree: Area under the ROC curve = {}".format(tree_performance))
opt_dt1 = dt1_fit.best_estimator_
print("{}".format(dt1_fit.best_params_))


# Cross Validation

In [None]:
# 10-fold cross-validation of the tuned tree on the full X / y.
# First pass: no `scoring` argument, reported below as accuracy.
scores = cross_val_score(opt_dt1, X, y, cv=10)
cva = 'Accuracy: %0.4f (+/- %0.2f)' % (scores.mean(), scores.std() * 2)
print(cva)

# Second pass: macro-averaged F1.
scores = cross_val_score(opt_dt1, X, y, cv=10, scoring='f1_macro')
cvf = 'F1-score: %0.4f (+/- %0.2f)' % (scores.mean(), scores.std() * 2)
print(cvf)

# Decision Tree


In [None]:
# Importance of every attribute in the tuned tree.
for col, imp in zip(attributes, opt_dt1.feature_importances_):
    print(col, imp)

# export_graphviz concatenates each class name into the node label, so the
# names must be strings; classes_ is a NumPy array whose elements may be
# numeric, hence the explicit str() conversion.
dot_data = tree.export_graphviz(opt_dt1, out_file=None,
                                feature_names=attributes,
                                class_names=[str(label) for label in opt_dt1.classes_],
                                filled=True, rounded=True,
                                special_characters=True)
graph = pydotplus.graph_from_dot_data(dot_data)
Image(graph.create_png())

In [None]:
# Decision tree scored on the (resampled) training split.
y_pred1 = opt_dt1.predict(X_train)

a1 = 'Accuracy %s' % accuracy_score(y_train, y_pred1)
r1 = 'Recall: %s' % recall_score(y_train, y_pred1, average='weighted')
p1 = 'Precision: %s' % precision_score(y_train, y_pred1, average='weighted')
f1 = 'F1-score %s' % f1_score(y_train, y_pred1, average='weighted')

print(a1, r1, p1, f1, sep="\n")

In [None]:
report1 = classification_report(y_train, y_pred1)
print(report1)
cm1 = confusion_matrix(y_train, y_pred1)
# plot_confusion_matrix finishes with plt.clf(), which wipes the figure before
# the notebook can render it; plot_confusion_matrix2 draws the same plot and
# leaves it on the figure for display.
plot_confusion_matrix2(cm1, classes=opt_dt1.classes_, normalize=True, title='Normalized confusion matrix')

In [None]:
# Decision tree scored on the held-out test split.
y_pred2 = opt_dt1.predict(X_test)

a2 = 'Accuracy %s' % accuracy_score(y_test, y_pred2)
r2 = 'Recall: %s' % recall_score(y_test, y_pred2, average='weighted')
p2 = 'Precision: %s' % precision_score(y_test, y_pred2, average='weighted')
f2 = 'F1-score %s' % f1_score(y_test, y_pred2, average='weighted')

print(a2, r2, p2, f2, sep="\n")

In [None]:
report2 = classification_report(y_test, y_pred2)
print(report2)
cm2 = confusion_matrix(y_test, y_pred2)
# plot_confusion_matrix finishes with plt.clf(), which wipes the figure before
# the notebook can render it; plot_confusion_matrix2 draws the same plot and
# leaves it on the figure for display.
plot_confusion_matrix2(cm2, classes=opt_dt1.classes_, normalize=True, title='Normalized confusion matrix')

# Random Forest

In [None]:

# Hyper-parameter grid explored for the random forest.
param_grid = {
    'criterion': ['gini', 'entropy'],
    'n_estimators': [50, 100, 200],
    'min_samples_split': range(2, 300, 50),
    'min_samples_leaf': range(2, 300, 50),
    'max_depth': np.arange(1, 3),
}
# A random forest is stochastic (bootstrap samples, feature subsets); fixing
# random_state makes the selected parameters and the scores reproducible.
rf1 = GridSearchCV(
    RandomForestClassifier(random_state=100),
    param_grid,
    cv=StratifiedKFold(15),
    scoring='accuracy',
    n_jobs=-1,
)
rf1_fit = rf1.fit(X_train, y_train)

# ROC AUC on the held-out test split, using the probability of the second class.
rf_performance = roc_auc_score(y_test, rf1_fit.predict_proba(X_test)[:, 1])
print("RandomForest: Area under the ROC curve = {}".format(rf_performance))
opt_rf1 = rf1_fit.best_estimator_
print("{}".format(rf1_fit.best_params_))

# Importance of every attribute in the tuned forest.
for col, imp in zip(attributes, opt_rf1.feature_importances_):
    print(col, imp)
    

In [None]:
# 10-fold cross-validation of the tuned forest on the full X / y.
scores = cross_val_score(opt_rf1, X, y, cv=10)
cvarf = ('Accuracy: %0.4f (+/- %0.2f)' % (scores.mean(), scores.std() * 2))
print(cvarf)

scores = cross_val_score(opt_rf1, X, y, cv=10, scoring='f1_macro')
cvfrf = ('F1-score: %0.4f (+/- %0.2f)' % (scores.mean(), scores.std() * 2))
print(cvfrf)

# Tuned forest scored on the held-out test split.
y_predrfo = opt_rf1.predict(X_test)
a3=('Accuracy %s' % accuracy_score(y_test, y_predrfo))
print(a3)
r3=('Recall: %s' %  recall_score(y_test, y_predrfo, average='weighted'))
print(r3)
p3=('Precision: %s' %  precision_score(y_test, y_predrfo, average='weighted'))
print(p3)
f3=('F1-score %s' % f1_score(y_test, y_predrfo, average='weighted'))
print(f3)

report3=(classification_report(y_test, y_predrfo))
plt.figure()
cm3 =confusion_matrix(y_test, y_predrfo)
# plot_confusion_matrix finishes with plt.clf(), which wipes the figure before
# the notebook can render it; plot_confusion_matrix2 draws the same plot and
# leaves it on the figure for display.
plot_confusion_matrix2(cm3, classes=opt_rf1.classes_, normalize=True, title='Normalized confusion matrix')

In [None]:
# The output directory name is the concatenation of the decision-tree test
# metric strings and its ROC AUC, with all blanks stripped.
fileName = "".join(
    str(part) for part in (a2, r2, p2, f2, tree_performance)
).replace(" ", "")

# Keyword arguments make the mapping onto saveInfos' long signature explicit.
saveInfos(
    fileName=fileName,
    class1=opt_dt1.classes_,
    class2=opt_rf1.classes_,
    rat=rat,
    # decision tree: model-level information
    roc=tree_performance,
    best_params=dt1_fit.best_params_,
    graph=graph,
    attributes=attributes,
    feature_importance=opt_dt1.feature_importances_,
    cva=cva,
    cvf=cvf,
    # decision tree: training split
    report1=report1, a1=a1, r1=r1, p1=p1, f1=f1, cm1=cm1,
    # decision tree: test split
    report2=report2, a2=a2, r2=r2, p2=p2, f2=f2, cm2=cm2,
    # random forest: model-level information
    rfroc=rf_performance,
    rf_best_params=rf1_fit.best_params_,
    feature_importance2=opt_rf1.feature_importances_,
    cvarf=cvarf,
    cvfrf=cvfrf,
    # random forest: test split
    report3=report3, a3=a3, r3=r3, p3=p3, f3=f3, cm3=cm3,
)


# Multi Layer Perceptron

In [None]:

# Candidate values for the MLP hyper-parameters.
activation = ['relu', 'tanh', 'logistic']
solver = ['lbfgs', 'sgd', 'adam']
alpha = [0.0001, 1e-5, 0.01, 0.001]
batch_size = [1, 100]
learning_rate = ['constant']
learning_rate_init = [0.001, 0.01, 0.2, 0.3, 0.8, 1]

# NOTE(review): `solver` and `alpha` are defined above but are not part of the
# grid, so the estimator's defaults are used for them - confirm this is intended.
# NOTE(review): `hidden_layer_sizes` is given as the tuple (7,), which the grid
# treats as a single candidate value 7 - confirm this is intended.
param_grid = {
    'hidden_layer_sizes': (7,),
    'activation': activation,
    'batch_size': batch_size,
    'learning_rate': learning_rate,
    'learning_rate_init': learning_rate_init,
}
mlp = MLPClassifier(random_state=100, max_iter=1000)
grid = GridSearchCV(
    mlp,
    param_grid,
    cv=StratifiedKFold(15),
    scoring='f1',
)

grid_result = grid.fit(X_train, y_train)

# ROC AUC on the held-out test split, using the probability of the second class.
nn_performance = roc_auc_score(y_test, grid_result.predict_proba(X_test)[:, 1])
print("NeuralNet: Area under the ROC curve = {}".format(nn_performance))
opt_nn = grid_result.best_estimator_
print("{}".format(grid_result.best_params_))



In [None]:
# MLP scored on the (resampled) training split.
# Results go into MLP-specific names: reusing a1/r1/p1/f1/report3 would
# silently replace the decision-tree and random-forest values that the
# saveInfos cell reads, corrupting its output if that cell is run afterwards.
y_pred3 = opt_nn.predict(X_train)
a_nn_train = ('Accuracy %s' % accuracy_score(y_train, y_pred3))
print(a_nn_train)
r_nn_train = ('Recall: %s' %  recall_score(y_train, y_pred3, average='weighted'))
print(r_nn_train)
p_nn_train = ('Precision: %s' %  precision_score(y_train, y_pred3, average='weighted'))
print(p_nn_train)
f_nn_train = ('F1-score %s' % f1_score(y_train, y_pred3, average='weighted'))
print(f_nn_train)
report_nn_train = classification_report(y_train, y_pred3)
print(report_nn_train)
cm4 =confusion_matrix(y_train, y_pred3)
plot_confusion_matrix2(cm4, classes=opt_nn.classes_, normalize=True, title='Normalized confusion matrix')


In [None]:

# MLP scored on the held-out test split.
# Results go into MLP-specific names: reusing a2/r2/p2/f2 would silently
# replace the decision-tree test metrics that the saveInfos cell uses for
# both the output directory name and the file content.
y_pred4 = opt_nn.predict(X_test)
a_nn_test = ('Accuracy %s' % accuracy_score(y_test, y_pred4))
print(a_nn_test)
r_nn_test = ('Recall: %s' %  recall_score(y_test, y_pred4, average='weighted'))
print(r_nn_test)
p_nn_test = ('Precision: %s' %  precision_score(y_test, y_pred4, average='weighted'))
print(p_nn_test)
f_nn_test = ('F1-score %s' % f1_score(y_test, y_pred4, average='weighted'))
print(f_nn_test)

report4 = classification_report(y_test, y_pred4)
print(report4)
cm5 =confusion_matrix(y_test, y_pred4)
plot_confusion_matrix2(cm5, classes=opt_nn.classes_, normalize=True, title='Normalized confusion matrix')

In [3]:
from sklearn.preprocessing import LabelEncoder


# Unlabelled data to score with the tuned MLP.
des = pd.read_csv("/home/alessandro/Desktop/all/credit_default_test.csv")
des = des.drop(columns=["index"])

categorical_cols = ['sex', 'education', 'status']

# Missing categorical values are replaced by the column mode. The result is
# assigned back explicitly: calling fillna(inplace=True) on a column selection
# is not guaranteed to modify the parent frame.
for col in categorical_cols:
    des[col] = des[col].fillna(des[col].mode()[0])

# One encoder per categorical column, all kept in a single dict (the dict used
# to be re-created before each column, so only the last encoder survived).
# NOTE(review): the encoders are fitted on this file alone - confirm the
# resulting codes match the encoding used for the training data.
label_encoders = dict()
column2encode = categorical_cols
for col in column2encode:
    le = LabelEncoder()
    des[col] = le.fit_transform(des[col])
    label_encoders[col] = le

# Same engineered features as for the training frame.
# NOTE(review): assumes positions 5..9 hold the same columns as in the
# training CSV once "index" has been dropped - confirm.
des["sums"] = des.iloc[:, 5:10].sum(axis=1)
des["g3"] = des.iloc[:, 5:10].ge(1,axis=1).sum(axis=1)

# The network was fitted on the `attributes` columns only, so predict on
# exactly those columns, in that order, instead of on the whole frame.
y_pred = opt_nn.predict(des[attributes])

# Name the prediction column explicitly: pd.DataFrame(y_pred) would create a
# column called 0 and the 'credit_default' lookup below would raise KeyError.
da = pd.DataFrame({'credit_default': y_pred})
da['credit_default'] = da['credit_default'].replace({0: "no", 1: "yes"})

da.to_csv("/home/pred.csv", encoding='utf-8', index=True)

ModuleNotFoundError: No module named 'sklearn'