In [None]:
import matplotlib.pyplot as plt
from skimage import io
from skimage import color
from skimage.transform import resize
import math
from skimage.feature import hog
import numpy as np
import pandas as pd
from PIL import Image
from skimage import data, exposure
import seaborn as sns
from imblearn.over_sampling import RandomOverSampler
from imblearn.over_sampling import SMOTE

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

from sklearn import svm

In [None]:
# self-written scripts
# Put the relative directory 'Python_Scripts' first on the module search path
# so that the project-local modules below can be imported.
import sys
sys.path.insert(0, 'Python_Scripts')

import util
import surf_hog_analysis 

In [None]:
!pip install seaborn

In [None]:
!pip install imblearn

In [None]:
!pip install scikit-learn

---

### Data preparation

In [None]:
# Load the complete training table from the relative data directory
df = pd.read_csv(filepath_or_buffer='data/train_complete.csv')

In [None]:
# Count occurrences of `ImageId` in df.
# value_counts() is computed once and mapped back onto the rows; calling it
# inside a per-row lambda recomputes the whole table for every row.
df['count'] = df['ImageId'].map(df['ImageId'].value_counts())

---

# KNN

---

## Use HoG Feature Vector (based on 4 imbalanced classes in train_images) in KNN

In [None]:
# Load the precomputed HOG feature table.
# NOTE(review): this rebinds the name `hog`, which the import cell bound to
# skimage.feature.hog; that function is no longer reachable under this name afterwards.
hog = pd.read_csv(filepath_or_buffer='data/train_HOG.csv')

In [None]:
# isolate only images that have 0 or 1 defect
# NOTE(review): the return value is not assigned, so this only changes `hog` if
# util.isolate_single_defects mutates its argument in place — confirm against
# the helper; otherwise the result is merely shown as cell output.
util.isolate_single_defects(hog)

In [None]:
# Attach the ClassId of every image to its HOG feature row (default inner join on ImageId)
hog_complete = pd.merge(hog, df[['ClassId', 'ImageId']], on='ImageId')
hog_complete.head()

In [None]:
# Keep only the rows whose ClassId differs from 0
hog_complete = hog_complete[hog_complete['ClassId'] != 0]

In [None]:
# Number of rows per ClassId (class balance check)
hog_complete.groupby(by='ClassId')['ImageId'].agg('count')

In [None]:
# Split hog_complete into the target vector y and the feature matrix X
y = hog_complete['ClassId']
X = hog_complete.drop(columns=['ClassId', 'ImageId'])

In [None]:
from sklearn.model_selection import train_test_split

# Stratified 80/20 hold-out split of the original (not oversampled) data
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)
train_shapes = (X_train.shape, y_train.shape)
test_shapes = (X_test.shape, y_test.shape)
print('Training data and target sizes: \n{}, {}'.format(*train_shapes))
print('Test data and target sizes: \n{}, {}'.format(*test_shapes))

In [None]:
# Oversample the training split with RandomOverSampler (fixed seed);
# the test split is not resampled.
ros = RandomOverSampler(random_state=42)
X_train_ros, y_train_ros = ros.fit_resample(X=X_train, y=y_train)

In [None]:
# Oversampling with SMOTE.
# A fixed random_state makes the synthetic samples (and every metric derived
# from them) reproducible across runs, matching the seeded RandomOverSampler.
X_train_smo, y_train_smo = SMOTE(random_state=42).fit_resample(X_train, y_train)

In [None]:
# Standardise the features: the scaler is fitted on the training data only and
# its statistics are reused for the test split.
# Swap in the commented lines (here and at fit) to train on oversampled data.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
#X_train_scaled = scaler.fit_transform(X_train_ros)
#X_train_scaled = scaler.fit_transform(X_train_smo)
X_test_scaled = scaler.transform(X_test)

# 3-nearest-neighbour classifier with exhaustive (brute-force) search
classifier = KNeighborsClassifier(n_neighbors=3, algorithm='brute')
classifier.fit(X_train_scaled, y_train)
#classifier.fit(X_train_scaled, y_train_ros)
#classifier.fit(X_train_scaled, y_train_smo)

# Predict once and derive the accuracy from those predictions;
# classifier.score() would predict the whole test set a second time.
y_pred = classifier.predict(X_test_scaled)
test_accuracy = metrics.accuracy_score(y_test, y_pred)
print(test_accuracy)

In [None]:
# Confusion matrix (rows: true class, columns: predicted class), computed once
# and reused for the printout and the heatmap.
class_labels = np.union1d(y_test, y_pred)
conf_matrix = metrics.confusion_matrix(y_test, y_pred, labels=class_labels)
print(conf_matrix)
# fmt='d' prints the counts as plain integers (the default '.2g' abbreviates
# larger counts); the tick labels show the real ClassIds instead of 0..n-1.
ax = sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='YlGn',
                 xticklabels=class_labels, yticklabels=class_labels)
ax.set(xlabel='Predicted ClassId', ylabel='True ClassId');

In [None]:
# Per-class precision, recall and F1 for the predictions above
print(metrics.classification_report(y_true=y_test, y_pred=y_pred))

#### results without oversampling   
    
           precision    recall  f1-score   support

           0       0.74      0.88      0.81      1181
           1       0.46      0.56      0.51       154
           2       0.38      0.72      0.50        39
           3       0.87      0.70      0.77       952
           4       0.67      0.02      0.04       103

    accuracy                           0.75      2429
   macro avg       0.63      0.58      0.53      2429
weighted avg       0.77      0.75      0.74      2429

#### results with random oversampling  
           precision    recall  f1-score   support

           0       0.80      0.80      0.80      1181
           1       0.35      0.73      0.47       154
           2       0.31      0.82      0.45        39
           3       0.85      0.71      0.77       952
           4       0.44      0.11      0.17       103

    accuracy                           0.73      2429
   macro avg       0.55      0.63      0.53      2429
weighted avg       0.77      0.73      0.74      2429

#### results with SMOTE oversampling  
            precision    recall  f1-score   support

           0       0.98      0.39      0.55      1181
           1       0.15      0.85      0.25       154
           2       0.17      0.87      0.29        39
           3       0.87      0.53      0.66       952
           4       0.19      0.56      0.28       103

    accuracy                           0.49      2429
   macro avg       0.47      0.64      0.41      2429
weighted avg       0.84      0.49      0.56      2429

---

### Visualization of falsely predicted pictures

In [None]:
# Disabled call to the project helper; the following cells build the list of
# misclassified images step by step instead.
#surf_hog_analysis.print_false_classifications(df, hog_complete, y_test, y_pred)

---

#### Piece-by-piece visualization

In [None]:
# Index labels (carried over from `hog_complete` via y_test) of the test
# samples whose prediction was incorrect
false_predictions = y_test.index[np.asarray(y_pred != y_test)].values

In [None]:
# Predicted labels of the misclassified test samples
y_pred[np.asarray(y_pred != y_test)]

In [None]:
# Wrongly predicted labels, indexed by the labels of the misclassified samples
y_pred_false = y_pred[np.asarray(y_pred != y_test)]

predictions = pd.DataFrame(
    {'ClassId_predicted': y_pred_false},
    index=false_predictions,
)
predictions

In [None]:
# Rows of `hog_complete` whose prediction was incorrect, reduced to the id and
# label columns, then extended by the file path needed to find the
# corresponding picture.
false_predicted_images = (
    hog_complete
    .join(predictions)
    .loc[false_predictions, ['ImageId', 'ClassId', 'ClassId_predicted']]
    .merge(df[['FilePath', 'ImageId']], on='ImageId')
)


In [None]:
# Display the table of misclassified images built in the previous cell
false_predicted_images

In [None]:
number_images = 5

# Draw `number_images` random row positions in [0, len). The previous expression
# round(rand * len) + 1 produced floats in 1..len+1, which could point past the
# last row and could never select the first one.
random_index = np.random.randint(0, len(false_predicted_images), size=number_images)

# One figure with one subplot row per sampled image (creating the figure inside
# the loop opened a new, mostly empty figure for every image).
plt.figure(figsize=(18, 10))
for i in range(number_images):
    # positional lookup, independent of the frame's index labels
    row = false_predicted_images.iloc[random_index[i]]
    file_path_to_image = row['FilePath']
    class_id = row['ClassId']
    image_id = row['ImageId']
    class_id_pred = int(row['ClassId_predicted'])

    img = io.imread(file_path_to_image)
    ax = plt.subplot(number_images, 1, i + 1)
    plt.imshow(img)
    plt.title(f'Image ID: {image_id} | True ClassId: {class_id} | Predicted ClassId: {class_id_pred}', fontsize=16);
    plt.axis("off")
plt.tight_layout()

## Use HoG Feature Vector (based on augmented train_images; balanced) in KNN
#### Use augmented hog pictures for training and the initial hog pictures (hog_complete) for test

In [None]:
# Load the table describing the augmented single-defect images
df2 = pd.read_csv(filepath_or_buffer='data/train_single_defects_augmented.csv')
df2.head()

In [None]:
# HOG features of the augmented pictures
# (augmented pictures are based on single defect pictures of classes 1-4)
hog_augmented = pd.read_csv(filepath_or_buffer='data/train_HOG_augmented.csv')
hog_augmented.head()

In [None]:
# Attach the ClassId from df2 to every augmented HOG feature row (default inner join on ImageId)
hog_augmented_complete = pd.merge(hog_augmented, df2[['ClassId', 'ImageId']], on='ImageId')
hog_augmented_complete.head()

In [None]:
# Number of rows per ClassId in the augmented data
hog_augmented_complete.groupby(by='ClassId')['ImageId'].agg('count')

In [None]:
# Idea: train on the augmented HOG features and test on the original ones (hog_complete)

# Training set: augmented features and their class labels
y_train = hog_augmented_complete['ClassId']
X_train = hog_augmented_complete.drop(columns=['ClassId', 'ImageId'])

# Test set: original features without "class 0"
hog_complete2 = hog_complete[hog_complete['ClassId'] != 0]
y_test = hog_complete2['ClassId']
X_test = hog_complete2.drop(columns=['ClassId', 'ImageId'])

train_shapes = (X_train.shape, y_train.shape)
test_shapes = (X_test.shape, y_test.shape)
print('Training data and target sizes: \n{}, {}'.format(*train_shapes))
print('Test data and target sizes: \n{}, {}'.format(*test_shapes))

In [None]:

# Standardise with statistics from the training data only
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# 3-nearest-neighbour classifier with exhaustive (brute-force) search
classifier = KNeighborsClassifier(n_neighbors=3, algorithm='brute')
classifier.fit(X_train_scaled, y_train)

# Predict once and derive the accuracy from those predictions;
# classifier.score() would predict the whole test set a second time.
y_pred = classifier.predict(X_test_scaled)
test_accuracy = metrics.accuracy_score(y_test, y_pred)
print(test_accuracy)

In [None]:
# Confusion matrix (rows: true class, columns: predicted class), computed once
# and reused for the printout and the heatmap.
class_labels = np.union1d(y_test, y_pred)
conf_matrix = metrics.confusion_matrix(y_test, y_pred, labels=class_labels)
print(conf_matrix)
# fmt='d' prints the counts as plain integers (the default '.2g' abbreviates
# larger counts); the tick labels show the real ClassIds instead of 0..n-1.
ax = sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='YlGn',
                 xticklabels=class_labels, yticklabels=class_labels)
ax.set(xlabel='Predicted ClassId', ylabel='True ClassId');

In [None]:
# Per-class precision, recall and F1 for the predictions above
print(metrics.classification_report(y_true=y_test, y_pred=y_pred))

### Same model without class 3

In [None]:

# Idea: train on the augmented HOG features and test on the original ones
# (hog_complete), this time with class 3 removed from both sides.

# Training set: augmented features without "class 3"
hog_augmented_complete2 = hog_augmented_complete[hog_augmented_complete['ClassId'] != 3]
y_train = hog_augmented_complete2['ClassId']
X_train = hog_augmented_complete2.drop(columns=['ClassId', 'ImageId'])

# Test set: hog_complete2 (class 0 already eliminated) without "class 3"
hog_complete3 = hog_complete2[hog_complete2['ClassId'] != 3]
y_test = hog_complete3['ClassId']
X_test = hog_complete3.drop(columns=['ClassId', 'ImageId'])

train_shapes = (X_train.shape, y_train.shape)
test_shapes = (X_test.shape, y_test.shape)
print('Training data and target sizes: \n{}, {}'.format(*train_shapes))
print('Test data and target sizes: \n{}, {}'.format(*test_shapes))

In [None]:

# Standardise with statistics from the training data only
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# 3-nearest-neighbour classifier with exhaustive (brute-force) search
classifier = KNeighborsClassifier(n_neighbors=3, algorithm='brute')
classifier.fit(X_train_scaled, y_train)

# Predict once and derive the accuracy from those predictions;
# classifier.score() would predict the whole test set a second time.
y_pred = classifier.predict(X_test_scaled)
test_accuracy = metrics.accuracy_score(y_test, y_pred)
print(test_accuracy)

In [None]:
# Confusion matrix (rows: true class, columns: predicted class), computed once
# and reused for the printout and the heatmap.
class_labels = np.union1d(y_test, y_pred)
conf_matrix = metrics.confusion_matrix(y_test, y_pred, labels=class_labels)
print(conf_matrix)
# fmt='d' prints the counts as plain integers (the default '.2g' abbreviates
# larger counts); the tick labels show the real ClassIds instead of 0..n-1.
ax = sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='YlGn',
                 xticklabels=class_labels, yticklabels=class_labels)
ax.set(xlabel='Predicted ClassId', ylabel='True ClassId');

In [None]:
# Per-class precision, recall and F1 for the predictions above
print(metrics.classification_report(y_true=y_test, y_pred=y_pred))

#### Use augmented hog pictures for Training and test (split data)

In [None]:
# Target vector and feature matrix from the full augmented table
# (variant without class 3: filter with .query('ClassId != 3') first)
y = hog_augmented_complete['ClassId']
X = hog_augmented_complete.drop(columns=['ClassId', 'ImageId'])

In [None]:
from sklearn.model_selection import train_test_split

# Stratified 80/20 hold-out split of the augmented data
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)
train_shapes = (X_train.shape, y_train.shape)
test_shapes = (X_test.shape, y_test.shape)
print('Training data and target sizes: \n{}, {}'.format(*train_shapes))
print('Test data and target sizes: \n{}, {}'.format(*test_shapes))

In [None]:
# Standardise with statistics from the training split only
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# 3-nearest-neighbour classifier with exhaustive (brute-force) search
classifier = KNeighborsClassifier(n_neighbors=3, algorithm='brute')
classifier.fit(X_train_scaled, y_train)

# Predict once and derive the accuracy from those predictions;
# classifier.score() would predict the whole test set a second time.
y_pred = classifier.predict(X_test_scaled)
test_accuracy = metrics.accuracy_score(y_test, y_pred)
print(test_accuracy)

In [None]:
# Confusion matrix (rows: true class, columns: predicted class), computed once
# and reused for the printout and the heatmap.
class_labels = np.union1d(y_test, y_pred)
conf_matrix = metrics.confusion_matrix(y_test, y_pred, labels=class_labels)
print(conf_matrix)
# fmt='d' prints the counts as plain integers (the default '.2g' abbreviates
# larger counts); the tick labels show the real ClassIds instead of 0..n-1.
ax = sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='YlGn',
                 xticklabels=class_labels, yticklabels=class_labels)
ax.set(xlabel='Predicted ClassId', ylabel='True ClassId');

In [None]:
# Per-class precision, recall and F1 for the predictions above
print(metrics.classification_report(y_true=y_test, y_pred=y_pred))

### Use augmented hog pictures for Training and test (split data) with Blur!

In [None]:
# HOG features of the augmented pictures, blur variant
# (augmented pictures are based on single defect pictures of classes 1-4)
hog_augmented_blur = pd.read_csv(filepath_or_buffer='data/train_HOG_augmented_blur.csv')
hog_augmented_blur.head()

In [None]:
# Attach the ClassId from df2 to every blurred HOG feature row (default inner join on ImageId)
hog_augmented_blur_complete = pd.merge(hog_augmented_blur, df2[['ClassId', 'ImageId']], on='ImageId')
hog_augmented_blur_complete.shape

In [None]:
# Target vector and feature matrix from the full blurred augmented table
# (variant without class 3: filter with .query('ClassId != 3') first)
y = hog_augmented_blur_complete['ClassId']
X = hog_augmented_blur_complete.drop(columns=['ClassId', 'ImageId'])

In [None]:
# Stratified 80/20 hold-out split of the blurred augmented data
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)
train_shapes = (X_train.shape, y_train.shape)
test_shapes = (X_test.shape, y_test.shape)
print('Training data and target sizes: \n{}, {}'.format(*train_shapes))
print('Test data and target sizes: \n{}, {}'.format(*test_shapes))

In [None]:
# Standardise with statistics from the training split only
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# 3-nearest-neighbour classifier with exhaustive (brute-force) search
classifier = KNeighborsClassifier(n_neighbors=3, algorithm='brute')
classifier.fit(X_train_scaled, y_train)

# Predict once and derive the accuracy from those predictions;
# classifier.score() would predict the whole test set a second time.
y_pred = classifier.predict(X_test_scaled)
test_accuracy = metrics.accuracy_score(y_test, y_pred)
print(test_accuracy)

In [None]:
# Confusion matrix (rows: true class, columns: predicted class), computed once
# and reused for the printout and the heatmap.
class_labels = np.union1d(y_test, y_pred)
conf_matrix = metrics.confusion_matrix(y_test, y_pred, labels=class_labels)
print(conf_matrix)
# fmt='d' prints the counts as plain integers (the default '.2g' abbreviates
# larger counts); the tick labels show the real ClassIds instead of 0..n-1.
ax = sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='YlGn',
                 xticklabels=class_labels, yticklabels=class_labels)
ax.set(xlabel='Predicted ClassId', ylabel='True ClassId');

In [None]:
# Per-class precision, recall and F1 for the predictions above
print(metrics.classification_report(y_true=y_test, y_pred=y_pred))

---

## SURF

In [None]:
# Load the precomputed SURF feature table
surf = pd.read_csv(filepath_or_buffer='data/train_surf.csv')

In [None]:
# NOTE(review): the return value is not assigned, so this only changes `surf` if
# util.isolate_single_defects mutates its argument in place — confirm against
# the helper; otherwise the result is merely shown as cell output.
util.isolate_single_defects(surf)

In [None]:
# Attach the ClassId of every image to its SURF feature row (default inner join on ImageId)
surf_complete = pd.merge(surf, df[['ClassId', 'ImageId']], on='ImageId')

In [None]:
# Target vector, and feature matrix without the label, image id and keypoints columns
y_surf = surf_complete['ClassId']
X_surf = surf_complete.drop(columns=['ClassId', 'ImageId', 'keypoints'])

In [None]:
# Stratified 80/20 hold-out split of the SURF data (no oversampling yet)
X_train, X_test, y_train, y_test = train_test_split(
    X_surf, y_surf,
    test_size=0.2,
    stratify=y_surf,
    random_state=42,
)

---

#### Without Oversampling

In [None]:
# Standardise with statistics from the training split only
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# 3-nearest-neighbour classifier with exhaustive (brute-force) search
classifier = KNeighborsClassifier(n_neighbors=3, algorithm='brute')
classifier.fit(X_train_scaled, y_train)

# Predict once and derive the accuracy from those predictions;
# classifier.score() would predict the whole test set a second time.
y_pred = classifier.predict(X_test_scaled)
test_accuracy = metrics.accuracy_score(y_test, y_pred)
print(test_accuracy)

In [None]:
# Confusion matrix (rows: true class, columns: predicted class), computed once
# and reused for the printout and the heatmap.
class_labels = np.union1d(y_test, y_pred)
conf_matrix = metrics.confusion_matrix(y_test, y_pred, labels=class_labels)
print(conf_matrix)
# fmt='d' prints the counts as plain integers (the default '.2g' abbreviates
# larger counts); the tick labels show the real ClassIds instead of 0..n-1.
ax = sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='YlGn',
                 xticklabels=class_labels, yticklabels=class_labels)
ax.set(xlabel='Predicted ClassId', ylabel='True ClassId');

In [None]:
# Per-class precision, recall and F1 for the predictions above
print(metrics.classification_report(y_true=y_test, y_pred=y_pred))

In [None]:
# print out several misclassified images
# NOTE(review): the helper comes from the project-local surf_hog_analysis module;
# what exactly it prints or plots is not visible from this notebook.
surf_hog_analysis.print_false_classifications(df, surf_complete, y_test, y_pred)

---

#### Random Oversampling

In [None]:
# Oversample the training split with RandomOverSampler (fixed seed);
# the test split is not resampled.
ros = RandomOverSampler(random_state=42)
X_sros, y_sros = ros.fit_resample(X=X_train, y=y_train)

In [None]:
# Standardise with statistics from the oversampled training data only
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_sros)
X_test_scaled = scaler.transform(X_test)

# 3-nearest-neighbour classifier with exhaustive (brute-force) search
classifier = KNeighborsClassifier(n_neighbors=3, algorithm='brute')
classifier.fit(X_train_scaled, y_sros)

# Predict once and derive the accuracy from those predictions;
# classifier.score() would predict the whole test set a second time.
y_pred = classifier.predict(X_test_scaled)
test_accuracy = metrics.accuracy_score(y_test, y_pred)
print(test_accuracy)

In [None]:
# Confusion matrix (rows: true class, columns: predicted class), computed once
# and reused for the printout and the heatmap.
class_labels = np.union1d(y_test, y_pred)
conf_matrix = metrics.confusion_matrix(y_test, y_pred, labels=class_labels)
print(conf_matrix)
# fmt='d' prints the counts as plain integers (the default '.2g' abbreviates
# larger counts); the tick labels show the real ClassIds instead of 0..n-1.
ax = sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='YlGn',
                 xticklabels=class_labels, yticklabels=class_labels)
ax.set(xlabel='Predicted ClassId', ylabel='True ClassId');

In [None]:
# Per-class precision, recall and F1 for the predictions above
print(metrics.classification_report(y_true=y_test, y_pred=y_pred))

---

#### SMOTE Oversampling

In [None]:
# Oversampling with SMOTE.
# A fixed random_state makes the synthetic samples (and every metric derived
# from them) reproducible across runs, matching the seeded RandomOverSampler.
X_train_smo, y_train_smo = SMOTE(random_state=42).fit_resample(X_train, y_train)

In [None]:
# Standardise with statistics from the SMOTE-resampled training data only
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_smo)
X_test_scaled = scaler.transform(X_test)

# 3-nearest-neighbour classifier with exhaustive (brute-force) search
classifier = KNeighborsClassifier(n_neighbors=3, algorithm='brute')
classifier.fit(X_train_scaled, y_train_smo)

# Predict once and derive the accuracy from those predictions;
# classifier.score() would predict the whole test set a second time.
y_pred = classifier.predict(X_test_scaled)
test_accuracy = metrics.accuracy_score(y_test, y_pred)
print(test_accuracy)

In [None]:
# Confusion matrix (rows: true class, columns: predicted class), computed once
# and reused for the printout and the heatmap.
class_labels = np.union1d(y_test, y_pred)
conf_matrix = metrics.confusion_matrix(y_test, y_pred, labels=class_labels)
print(conf_matrix)
# fmt='d' prints the counts as plain integers (the default '.2g' abbreviates
# larger counts); the tick labels show the real ClassIds instead of 0..n-1.
ax = sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='YlGn',
                 xticklabels=class_labels, yticklabels=class_labels)
ax.set(xlabel='Predicted ClassId', ylabel='True ClassId');

In [None]:
# Per-class precision, recall and F1 for the predictions above
print(metrics.classification_report(y_true=y_test, y_pred=y_pred))

---

# SVM

In [None]:
# Support vector classifier with RBF kernel
svm_model = svm.SVC(C=100, gamma=0.001, kernel='rbf')

# Standardise with statistics from the (not oversampled) training split only
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Train on the standardised training data
svm_model.fit(X_train_scaled, y_train)

In [None]:
y_pred = svm_model.predict(X_test_scaled)
# Score the SVM's own predictions. The previous `classifier.score(...)` call
# evaluated the KNN model left over from the preceding section, so the printed
# accuracy did not belong to the SVM.
test_accuracy = metrics.accuracy_score(y_test, y_pred)
print(test_accuracy)

In [None]:
# Confusion matrix (rows: true class, columns: predicted class), computed once
# and reused for the printout and the heatmap.
class_labels = np.union1d(y_test, y_pred)
conf_matrix = metrics.confusion_matrix(y_test, y_pred, labels=class_labels)
print(conf_matrix)
# fmt='d' prints the counts as plain integers (the default '.2g' abbreviates
# larger counts); the tick labels show the real ClassIds instead of 0..n-1.
ax = sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='YlGn',
                 xticklabels=class_labels, yticklabels=class_labels)
ax.set(xlabel='Predicted ClassId', ylabel='True ClassId');

In [None]:
# Per-class precision, recall and F1 for the predictions above
print(metrics.classification_report(y_true=y_test, y_pred=y_pred))

---

### Random Oversampling

In [None]:
# Oversample the training split with RandomOverSampler (fixed seed);
# the test split is not resampled.
ros = RandomOverSampler(random_state=42)
X_sros, y_sros = ros.fit_resample(X=X_train, y=y_train)

In [None]:
# Standardise with statistics from the oversampled training data only
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_sros)
X_test_scaled = scaler.transform(X_test)

# Refit the SVM on the oversampled, standardised training data
svm_model.fit(X_train_scaled, y_sros)

y_pred = svm_model.predict(X_test_scaled)
# Score the SVM's own predictions. The previous `classifier.score(...)` call
# evaluated the leftover KNN model instead of `svm_model`.
test_accuracy = metrics.accuracy_score(y_test, y_pred)
print(test_accuracy)

In [None]:
# Confusion matrix (rows: true class, columns: predicted class), computed once
# and reused for the printout and the heatmap.
class_labels = np.union1d(y_test, y_pred)
conf_matrix = metrics.confusion_matrix(y_test, y_pred, labels=class_labels)
print(conf_matrix)
# fmt='d' prints the counts as plain integers (the default '.2g' abbreviates
# larger counts); the tick labels show the real ClassIds instead of 0..n-1.
ax = sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='YlGn',
                 xticklabels=class_labels, yticklabels=class_labels)
ax.set(xlabel='Predicted ClassId', ylabel='True ClassId');

In [None]:
# Per-class precision, recall and F1 for the predictions above
print(metrics.classification_report(y_true=y_test, y_pred=y_pred))

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

# Scaler and SVC are tuned as one pipeline, so every cross-validation fold is
# standardised with statistics from its own training part only. Previously the
# estimator was an undefined name (`model`, a NameError) and the search was
# fitted on unscaled features, unlike every other model in this notebook.
svm_pipeline = Pipeline([('scaler', StandardScaler()), ('svc', svm.SVC())])

# The `svc__` prefix routes each parameter to the pipeline step named 'svc'
param_grid = [{'svc__kernel': ['rbf'],
               'svc__gamma': [0.0001, 0.001, 0.01, 0.1, 1],
               'svc__C': [1, 10, 100, 1000]},
              {'svc__kernel': ['linear'],
               'svc__C': [1, 10, 100, 1000]}]

grid = GridSearchCV(svm_pipeline, param_grid, verbose=True, n_jobs=-1)

result = grid.fit(X_train, y_train)

In [None]:
# Report the best parameter combination found by the grid search and its score
for caption, value in (('Best Parameters:', result.best_params_),
                       ('Best Score:', result.best_score_)):
    print(caption, value)

In [None]:
# Mean number of keypoints per class. Selecting the column before aggregating
# avoids averaging every other column, which fails on non-numeric columns
# (e.g. ImageId) in pandas >= 2.0.
surf_complete.groupby('ClassId')['NumberKP'].mean()