In [1]:
import pandas as pd
import utilities as utils

# Build the labelled dataset from one CSV per defect class.
# 'artifact.csv' keeps its 'Output' column as stored; for the remaining files
# 'Output' is multiplied by 2, 3 and 4 respectively so that each file maps to
# its own class id (assumes each file stores a 0/1 flag in 'Output' -- TODO confirm).
frames = [pd.read_csv('artifact.csv')]
for label, path in enumerate(['miscal.csv', 'featureless.csv', 'dust.csv'], start=2):
    frame = pd.read_csv(path)
    frame['Output'] = frame['Output'] * label
    frames.append(frame)

# Concatenate once instead of re-copying the growing frame on every iteration,
# and renumber rows so index labels are unique across the source files.
df = pd.concat(frames, ignore_index=True)

# Keep only the base file name, then keep the first row seen for each file name.
df['Filename'] = df['Filename'].apply(lambda x: x.split('/')[-1])
df = df.drop_duplicates(subset='Filename')
df.head()

Unnamed: 0,Filename,HistMean,HistVar,LaplVar,MaxLapl,MaxVar,MaxAvgLapl,MinAvgLapl,IQR,Deviation,...,AkazeNumKP,BriskNumKP,KazeNumKP,ORBNumKP,RidgeAreaProp,MaxX,AvgMaxX,MaxY,MinVar,Output
0,ESP_011314_1585_RED.NOMAP.browse-Block-7.jpg,172.465515,720.605256,613.496113,255.283676,3141.374252,297.757073,448.868534,29.0,22.579438,...,4,58,27,127,0.422052,46113,44643.0,32988,107.880293,1
1,ESP_011325_1845_RED.NOMAP.browse-Block-10.jpg,114.768188,9042.951879,1075.129234,299.942612,5497.009506,601.463473,919.15874,200.0,2099.592821,...,6,48,65,27,0.238823,599118,595215.0,55707,89.108415,1
2,ESP_011325_1845_RED.NOMAP.browse-Block-12.jpg,85.229126,5737.952018,989.958606,465.874008,5698.864609,487.412923,899.626289,153.0,245.102714,...,8,57,61,94,0.282823,388314,378452.0,36999,141.611445,1
3,ESP_011325_1845_RED.NOMAP.browse-Block-14.jpg,79.706726,5033.211659,1190.971659,374.875717,8084.393661,545.449855,1245.502768,134.0,507.546173,...,2,19,60,50,0.271832,429249,411315.0,42144,182.914684,1
4,ESP_011325_1845_RED.NOMAP.browse-Block-16.jpg,93.042786,6122.234436,932.756638,109.927353,7019.038086,553.311266,905.255167,165.0,1823.800329,...,5,52,44,101,0.287259,515292,498016.0,89109,75.190465,1


In [2]:
# Group id per row: the part of the file name before its first '.', encoded
# as an integer category code (presumably the source image id, so that blocks
# cut from one image share a group -- verify against the data).
groups = (
    df['Filename']
    .map(lambda name: name.partition('.')[0])
    .astype('category')
    .cat.codes
)

In [3]:
from sklearn.model_selection import GroupShuffleSplit

# Feature matrix: every column except the file name and the label.
main_x = df.drop(columns=['Filename', 'Output']).values
main_y = df['Output'].values

# Group-aware hold-out: only the first of the generated splits is used,
# with 20% of the groups requested for the test side.
cv = GroupShuffleSplit(n_splits=2, test_size=0.2, random_state=1337)
train_indices, test_indices = next(cv.split(main_x, main_y, groups=groups))

X = main_x[train_indices]
Y = main_y[train_indices]
x_test = main_x[test_indices]
y_test = main_y[test_indices]

In [4]:
from scipy.stats import uniform
# Search space for the randomized search:
#   penalty -- regularisation norm, chosen from the two listed options
#   C       -- inverse regularisation strength, drawn uniformly from [0, 4]
penalty = ['l1', 'l2']
C = uniform(0, 4)
hyperparameters = {'C': C, 'penalty': penalty}

In [5]:
# Rebuild the group codes for the training rows only, so that `groups` has
# one entry per row of X when it is passed to the cross-validated search.
# NOTE: this overwrites the full-dataset `groups` computed earlier.
groups = (
    df.iloc[train_indices]['Filename']
    .map(lambda name: name.partition('.')[0])
    .astype('category')
    .cat.codes
)

In [None]:
from sklearn.model_selection import GroupKFold
from sklearn.model_selection import RandomizedSearchCV
from sklearn.linear_model import LogisticRegression

# Group-aware 10-fold CV so rows sharing a group never straddle train/validation.
cv = GroupKFold(n_splits=10)

# The search space samples both 'l1' and 'l2' penalties. The solver is pinned
# to 'liblinear' because scikit-learn's current default ('lbfgs') rejects
# penalty='l1', which would make every L1 candidate fail to fit. 'liblinear'
# supports both penalties.
clf = RandomizedSearchCV(
    LogisticRegression(random_state=1337, solver='liblinear'),
    hyperparameters,
    random_state=1337,
    n_iter=100,
    cv=cv,
    verbose=0,
    n_jobs=-1,
)
hyp = clf.fit(X, Y, groups=groups)

In [None]:
# Display every parameter of the estimator selected by the search
# (the tuned ones and the ones left at their configured values).
hyp.best_estimator_.get_params()

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from itertools import cycle

from sklearn import svm, datasets
from sklearn.metrics import roc_curve, auc
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import label_binarize
from sklearn.multiclass import OneVsRestClassifier
# `scipy.interp` was a deprecated alias of `numpy.interp` that newer SciPy
# releases no longer provide; bind the same function straight from NumPy so
# the name `interp` stays available to later cells.
from numpy import interp
from sklearn.metrics import roc_auc_score


# One-vs-rest ROC analysis on the held-out test set.
n_classes = 5
y_score = hyp.best_estimator_.decision_function(x_test)
y = label_binarize(y_test, classes=list(range(n_classes)))

# Column i of y_score is compared against column i of y below, so both must
# have one column per class; fail loudly instead of mis-pairing columns.
assert y_score.shape == y.shape, (
    "decision_function returned shape {0}, expected {1}".format(y_score.shape, y.shape)
)

# Compute ROC curve and ROC area for each class
fpr = dict()
tpr = dict()
roc_auc = dict()
for i in range(n_classes):
    fpr[i], tpr[i], _ = roc_curve(y[:, i], y_score[:, i])
    roc_auc[i] = auc(fpr[i], tpr[i])

# Compute micro-average ROC curve and ROC area by pooling every
# (sample, class) decision into a single binary problem
fpr["micro"], tpr["micro"], _ = roc_curve(y.ravel(), y_score.ravel())
roc_auc["micro"] = auc(fpr["micro"], tpr["micro"])

In [None]:
# Macro-average ROC: interpolate every per-class curve onto the union of all
# false-positive-rate grid points, then average the true-positive rates.
lw = 2
all_fpr = np.unique(np.concatenate([fpr[i] for i in range(n_classes)]))

mean_tpr = np.zeros_like(all_fpr)
for i in range(n_classes):
    # np.interp replaces the former scipy.interp alias (same function).
    mean_tpr += np.interp(all_fpr, fpr[i], tpr[i])
mean_tpr /= n_classes

fpr["macro"] = all_fpr
tpr["macro"] = mean_tpr
roc_auc["macro"] = auc(fpr["macro"], tpr["macro"])

# Plot all ROC curves
plt.figure(figsize=(20, 15))
plt.plot(fpr["micro"], tpr["micro"],
         label='micro-average ROC curve (area = {0:0.2f})'
               ''.format(roc_auc["micro"]),
         color='deeppink', linestyle=':', linewidth=4)

plt.plot(fpr["macro"], tpr["macro"],
         label='macro-average ROC curve (area = {0:0.2f})'
               ''.format(roc_auc["macro"]),
         color='navy', linestyle=':', linewidth=4)

# One distinct colour per class, so no two class curves share a colour
# (the previous three-colour cycle repeated for classes 3 and 4).
colors = cycle(['aqua', 'darkorange', 'cornflowerblue', 'forestgreen', 'crimson'])

labels = ['Good', 'Artifact', 'Miscal', 'Featureless', 'Dusty']
for i, color in zip(range(n_classes), colors):
    plt.plot(fpr[i], tpr[i], color=color, lw=lw,
             label='ROC curve of class {0} (area = {1:0.2f})'
             ''.format(labels[i], roc_auc[i]))

# Chance-level diagonal for reference
plt.plot([0, 1], [0, 1], 'k--', lw=lw)
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xticks(fontsize=14)
plt.yticks(fontsize=14)
plt.xlabel('False Positive Rate', size=20)
plt.ylabel('True Positive Rate', size=20)
plt.title('Multi-class ROC Curve', size=30)
plt.legend(loc="lower right", prop={'size': 20})
plt.show()

In [None]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score

# Hard predictions of the selected model on the held-out test set.
y_pred = hyp.best_estimator_.predict(x_test)

# Overall accuracy rounded to two decimals, followed by the confusion matrix
# as the cell's displayed value.
print(f"Accuracy: {round(accuracy_score(y_test, y_pred), 2)}")
confusion_matrix(y_true=y_test, y_pred=y_pred)