In [None]:
import matplotlib.pyplot as plt
from skimage import io
from skimage import color
from skimage.transform import resize
import math
from skimage.feature import hog
import numpy as np
import pandas as pd
from PIL import Image
from skimage import data, exposure
import seaborn as sns
from imblearn.over_sampling import RandomOverSampler
from imblearn.over_sampling import SMOTE

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics
from sklearn.preprocessing import StandardScaler

In [None]:
# self-written scripts
# Put the `Python_Scripts` directory at the front of the module search path so
# that `import util` below can resolve to a module in that directory. The path
# is relative, so it is resolved against the kernel's working directory.
import sys
sys.path.insert(0, 'Python_Scripts')

import util

In [None]:
!pip install seaborn

In [None]:
!pip install imblearn

In [None]:
!pip install scikit-learn

In [None]:
# Load the training table. Later cells rely on its `ImageId` and `ClassId`
# columns (occurrence counting and the label merges).
df = pd.read_csv('data/train_complete.csv')

In [None]:
# Load the HOG feature table; it is joined to `df` on `ImageId` further down.
# NOTE(review): this rebinds the name `hog`, which the first cell imported
# from skimage.feature, so the skimage function is no longer reachable under
# that name after this cell has run.
hog = pd.read_csv('data/train_HOG.csv')

In [None]:
# Count occurrences of `ImageId` in df.
# The frequency table is built once and each row is looked up in it; the
# previous version rebuilt value_counts() inside the lambda for every row,
# which is quadratic in the number of rows.
df['count'] = df['ImageId'].map(df['ImageId'].value_counts())

In [None]:
# isolate only images that have 0 or 1 defect
# NOTE(review): the return value is discarded and `hog` is used as-is in the
# following merge, so this call only has an effect if the helper filters
# `hog` in place — confirm against the `util` module.
util.isolate_single_defects(hog)

In [None]:
# Attach the class label to the HOG feature rows via the shared `ImageId` key
# (default inner join), then preview the first rows.
hog_complete = pd.merge(hog, df[['ClassId', 'ImageId']], on='ImageId')
hog_complete.head()

In [None]:
# Split hog_complete into the target labels (y) and the feature matrix (X);
# `ImageId` is only a join key and is not used as a feature.
y = hog_complete['ClassId']
X = hog_complete.drop(columns=['ClassId', 'ImageId'])
y

In [None]:
from sklearn.model_selection import train_test_split

# Stratified 80/20 hold-out on the original (not oversampled) data with a
# fixed seed. Oversampling is applied to the training portion only, in the
# cells that follow.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)
print('Training data and target sizes: \n{}, {}'.format(X_train.shape, y_train.shape))
print('Test data and target sizes: \n{}, {}'.format(X_test.shape, y_test.shape))

In [None]:
# Oversampling with RandomOverSampler (fixed seed).
# Only the training split is resampled; X_test / y_test stay untouched.
ros = RandomOverSampler(random_state=42)
X_train_ros, y_train_ros = ros.fit_resample(X_train, y_train)

In [None]:
# Oversampling with SMOTE (training split only).
# A fixed seed keeps the synthetic samples, and therefore the scores computed
# from them, repeatable. This matches the seeded train/test split and the
# seeded RandomOverSampler.
X_train_smo, y_train_smo = SMOTE(random_state=42).fit_resample(X_train, y_train)

In [None]:
# Fit and evaluate a 3-nearest-neighbour classifier on the HOG features.
# The scaler is fitted on the training data and then applied unchanged to the
# test data.
scaler = StandardScaler()

# Training-set variants (use exactly one X/y pair in the two marked lines):
#   no oversampling     -> X_train,     y_train
#   random oversampling -> X_train_ros, y_train_ros
#   SMOTE               -> X_train_smo, y_train_smo
X_train_scaled = scaler.fit_transform(X_train_smo)   # <- X of the chosen pair
X_test_scaled = scaler.transform(X_test)

classifier = KNeighborsClassifier(n_neighbors=3, algorithm='brute')
classifier.fit(X_train_scaled, y_train_smo)          # <- y of the chosen pair

# Predict once and derive the accuracy from those predictions;
# classifier.score() would repeat the brute-force neighbour search.
y_pred = classifier.predict(X_test_scaled)
test_accuracy = metrics.accuracy_score(y_test, y_pred)
print(test_accuracy)

In [None]:
# Compute the confusion matrix once, print the raw counts and plot the same
# matrix as an annotated heatmap. fmt='d' shows the integer counts in full;
# the default '.2g' rounds them to two significant digits.
conf_matrix = metrics.confusion_matrix(y_test, y_pred)
print(conf_matrix)
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='YlGn');

In [None]:
# Per-class precision, recall and F1 for the current y_test / y_pred pair.
print(metrics.classification_report(y_test, y_pred))

# KNN-HOG results without oversampling

              precision    recall  f1-score   support

           0       0.74      0.88      0.81      1181
           1       0.46      0.56      0.51       154
           2       0.38      0.72      0.50        39
           3       0.87      0.70      0.77       952
           4       0.67      0.02      0.04       103

    accuracy                           0.75      2429
   macro avg       0.63      0.58      0.53      2429
weighted avg       0.77      0.75      0.74      2429

# KNN-HOG results with random oversampling
              precision    recall  f1-score   support

           0       0.80      0.80      0.80      1181
           1       0.35      0.73      0.47       154
           2       0.31      0.82      0.45        39
           3       0.85      0.71      0.77       952
           4       0.44      0.11      0.17       103

    accuracy                           0.73      2429
   macro avg       0.55      0.63      0.53      2429
weighted avg       0.77      0.73      0.74      2429

# KNN-HOG results with SMOTE oversampling
              precision    recall  f1-score   support

           0       0.98      0.39      0.55      1181
           1       0.15      0.85      0.25       154
           2       0.17      0.87      0.29        39
           3       0.87      0.53      0.66       952
           4       0.19      0.56      0.28       103

    accuracy                           0.49      2429
   macro avg       0.47      0.64      0.41      2429
weighted avg       0.84      0.49      0.56      2429

---

## SURF

In [None]:
# Load the SURF feature table. Later cells join it to `df` on `ImageId`,
# drop its `keypoints` column and aggregate its `NumberKP` column.
surf = pd.read_csv('data/train_surf.csv')

In [None]:
# Same helper call as for the HOG table.
# NOTE(review): the return value is discarded, so this only has an effect if
# the helper filters `surf` in place — confirm against the `util` module.
util.isolate_single_defects(surf)

In [None]:
# Attach the class label to the SURF feature rows via the shared `ImageId`
# key (default inner join), then preview the first rows.
surf_complete = pd.merge(surf, df[['ClassId', 'ImageId']], on='ImageId')
surf_complete.head()

In [None]:
# Separate the target labels from the SURF features; the label, the join key
# and the `keypoints` column are excluded from the feature matrix.
y_surf = surf_complete['ClassId']
X_surf = surf_complete.drop(columns=['ClassId', 'ImageId', 'keypoints'])

In [None]:
#Oversampling
ros = RandomOverSampler(random_state=42)
X_sros, y_sros = ros.fit_resample(X_surf, y_surf)

In [None]:
from sklearn.model_selection import train_test_split
# Split without oversampled data
#X_train, X_test, y_train, y_test = train_test_split(X_surf, y_surf, test_size=0.2, stratify=y_surf)
# Split with oversampling
X_train, X_test, y_train, y_test = train_test_split(X_sros, y_sros, test_size=0.2, stratify=y_sros)
print('Training data and target sizes: \n{}, {}'.format(X_train.shape,y_train.shape))
print('Test data and target sizes: \n{}, {}'.format(X_test.shape,y_test.shape))

In [None]:
# Fit and evaluate a 3-nearest-neighbour classifier on the SURF features.
# The scaler is fitted on the training data and then applied unchanged to the
# test data.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

classifier = KNeighborsClassifier(n_neighbors=3, algorithm='brute')
classifier.fit(X_train_scaled, y_train)

# Predict once and derive the accuracy from those predictions;
# classifier.score() would repeat the brute-force neighbour search.
y_pred = classifier.predict(X_test_scaled)
test_accuracy = metrics.accuracy_score(y_test, y_pred)
print(test_accuracy)

In [None]:
# Compute the confusion matrix once, print the raw counts and plot the same
# matrix as an annotated heatmap. fmt='d' shows the integer counts in full;
# the default '.2g' rounds them to two significant digits.
conf_matrix = metrics.confusion_matrix(y_test, y_pred)
print(conf_matrix)
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='YlGn');

In [None]:
# Per-class precision, recall and F1 for the current y_test / y_pred pair.
print(metrics.classification_report(y_test, y_pred))

---

### KNN-SURF without oversampling
              precision    recall  f1-score   support

           0       0.57      0.77      0.65      1181
           1       0.35      0.14      0.20       154
           2       0.12      0.03      0.04        39
           3       0.53      0.43      0.47       952
           4       0.40      0.02      0.04       103

    accuracy                           0.55      2429
   macro avg       0.40      0.28      0.28      2429
weighted avg       0.53      0.55      0.52      2429

In [None]:
# Mean of `NumberKP` per `ClassId`.
# The column is selected before aggregating: averaging every column first is
# wasted work, and on pandas >= 2.0 it raises a TypeError as soon as the frame
# contains a non-numeric column.
surf_complete.groupby('ClassId')['NumberKP'].mean()