---
## Building a `.csv` File with all Available Images

Tutorial for [writing csv in Python](https://www.pythontutorial.net/python-basics/python-write-csv-file/)

Useful information on [pathlib](https://www.atqed.com/python-current-path)

In [None]:
import numpy as np
import pandas as pd

import pathlib
import IPython.display as display
from PIL import Image
import cv2

from skimage.feature import hog
from skimage import io
from skimage.transform import resize

from sklearn.model_selection import train_test_split

import csv


# self-written scripts
import sys
sys.path.insert(0, 'Python_Scripts')

import util
import surf_hog_analysis
import surf_handling

SEED = 42

---

### File Path Construction

In [None]:
# home directory of the current user, kept as plain string
home_path = str(pathlib.Path.home())
# directory the notebook is executed from
cwd = pathlib.Path.cwd()

# complete paths of the train and test image folders;
# pathlib's `/` operator joins the segments in an operating system conform way
train_data_dir = cwd / 'data' / 'train_images'
test_data_dir = cwd / 'data' / 'test_images'

In [None]:
# show the resolved path of the test image folder
test_data_dir

In [None]:
# Count the `.jpg` files in the training image folder
image_count = sum(1 for _ in train_data_dir.glob('*.jpg'))
print("We have", image_count, "training images.")

In [None]:
# copy the first 3 lines of `train.csv` (header line plus 2 data rows) into a
# temporary file and print that file, using UNIX shell commands
!head -3 data/train.csv > /tmp/input.csv 
!cat /tmp/input.csv

In [None]:
# Collect all training images; show the first five together with their paths
images = list(train_data_dir.glob('*.jpg'))

for img_path in images[:5]:
    picture = Image.open(str(img_path))
    display.display(picture)
    print(img_path.as_posix())

`train.csv` only lists images with defects, one line per defect, so an image with more than one defect appears on more than one line. For our complete csv-file we therefore first extract all `ImageIds` from `train.csv`. To obtain the missing IDs, we construct a complete list of all images, eliminate every line whose `ImageId` already appears in `train.csv`, and then concatenate the remaining images to the data from `train.csv`.

---

### Prepare train.csv

In [None]:
df_defects = pd.read_csv('data/train.csv')
# create the full image path for every `ImageId` listed in `train.csv`
defect_paths = df_defects['ImageId'].map(train_data_dir.joinpath)
# put the paths as column `FilePath` to the left of the existing columns
df_defects = pd.concat([defect_paths.rename('FilePath'), df_defects], axis=1)
# show the first path as a quick check
df_defects['FilePath'][0]

In [None]:
# unique `ImageIds` of all images that are listed with a defect
defect_ids = df_defects['ImageId'].unique()

---

### Building the CSV-File

Create a csv file with all image paths, the respective `ImageId` and an initialisation for `ClassId` and `EncodedPixels`.

In [None]:
header = ['FilePath', 'ImageId', 'ClassId', 'EncodedPixels']

# one row per image:
#   `.as_posix()` -> complete path, `.name` -> image file name,
#   `ClassId` and `EncodedPixels` are initialised with 0 / '0'
rows = [[img.as_posix(), img.name, 0, '0'] for img in images]

# `.parent` of the image folder is the `data` directory;
# `newline=''` avoids blank lines between the written rows
with open(train_data_dir.parent.joinpath('train_raw.csv'), 'w',
          encoding='UTF8', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(header)   # column names first
    writer.writerows(rows)    # then all image rows at once

In [None]:
df_raw = pd.read_csv('data/train_raw.csv')

# index labels of all rows in `df_raw` whose `ImageId` is already listed in
# `defect_ids`; these rows are dropped in the next step.
# `isin` performs the membership test vectorised instead of scanning
# `defect_ids` once per row inside a Python loop.
indices = df_raw.index[df_raw['ImageId'].isin(defect_ids)].tolist()

In [None]:
# check whether the indices of all defective images were caught
len(indices)

In [None]:
# remove the rows of all images that already appear in `train.csv`
df_raw = df_raw.drop(index=indices)
df_raw

In [None]:
# stack the rows of the defective images and the rows of the remaining,
# unclassified images into one data frame
df_complete = pd.concat([df_defects, df_raw], axis=0, ignore_index=True)
# binary flag: 1 if the row carries a defect class (`ClassId` > 0), else 0
df_complete['Defect'] = df_complete['ClassId'].gt(0).astype('int64')
df_complete.to_csv('data/train_complete.csv', sep=',', index=False)

In [None]:
# delete the intermediate csv file `train_raw.csv`;
# `-f` keeps the command quiet if the file is already gone
!rm -f data/train_raw.csv

---

## Build `dfs` for Single Defects and Augmentations

In [None]:
"""Only execute once to create `.csv` file"""
df = pd.read_csv('data/train_complete.csv')

# keep only the rows flagged as defective and pass them to the project helper
# from the `util` module (its implementation is not part of this notebook)
df = util.add_blackness_attributes(df.query('Defect==1'), 'train_images')

# NOTE(review): the return value is discarded, so this helper presumably
# modifies `df` in place -- confirm in `util.isolate_single_defects`
util.isolate_single_defects(df)

df.to_csv('data/train_single_defects_with_blackness.csv', sep=',', index=False)

### Augmentations

In [None]:
import os
import albumentations
import cv2
import time
import random

# self-written scripts
import sys
sys.path.insert(0, 'Python_Scripts')

import data_preparation_cnn

In [None]:
def make_folder():
    """Create the `data/augmentations` folder below the current working directory.

    Missing parent folders are created as well. If the folder already
    exists, a short notice is printed and nothing else happens. Any other
    failure (e.g. missing write permission) is raised instead of being
    misreported as "Folder already exists.".
    """
    temp_path = os.path.join(os.getcwd(), "data", "augmentations")
    try:
        os.makedirs(temp_path)
    except FileExistsError:
        print('Folder already exists.')

In [None]:
import albumentations as A

# Augmentation pipeline used for every sampled training image:
# a horizontal flip (p=0.5) plus three photometric transforms
# (CLAHE, brightness/contrast, gamma), each applied with p=0.8.
# The commented-out entries are geometric transforms that are currently disabled.
augment = A.Compose([
    #A.VerticalFlip(p=0.5),
    A.HorizontalFlip(p=0.5),
    # A.OneOf([
    #     A.ElasticTransform(alpha=120, sigma=120 * 0.05, alpha_affine=120 * 0.03, p=0.5),
    #     A.GridDistortion(p=0.5),
    #     A.OpticalDistortion(distort_limit=2, shift_limit=0.5, p=1)                  
    #     ], p=0.8),
    A.CLAHE(p=0.8),
    A.RandomBrightnessContrast(p=0.8),    
    A.RandomGamma(p=0.8)
])

In [None]:
def augement_images(image_ids, num_augmentations, class_id):
    """Write randomly augmented copies of training images to `data/augmentations`.

    In each of the `num_augmentations` rounds one entry of `image_ids` is
    drawn at random (with replacement), read from `data/train_images/`,
    passed through the module-level `augment` pipeline and saved as
    `aug_<i>_<image_id>` in `<cwd>/data/augmentations/`.

    Args:
        image_ids: indexable collection of image file names to sample from.
        num_augmentations: number of augmented images to create.
        class_id: class label stored for every created image.

    Returns:
        pandas.DataFrame with the columns `FilePath`, `ImageId` and
        `ClassId`, one row per written image. `FilePath` is the path of the
        written augmented file, so it matches the `ImageId` of the same row.

    Raises:
        ValueError: if images are requested but `image_ids` is empty.
        FileNotFoundError: if a source image cannot be read.
        OSError: if an augmented image cannot be written.
    """
    print(f'beginning augmentation for ClassId {class_id}...')
    start = time.time()

    if num_augmentations > 0 and len(image_ids) == 0:
        raise ValueError('image_ids must not be empty when augmentations are requested')

    # write via the full target path instead of changing the working
    # directory, so an error cannot leave the process in the wrong folder
    target_dir = os.path.join(os.getcwd(), 'data', 'augmentations')

    aug_ids = []
    class_ids = []
    file_paths = []

    for i in range(1, num_augmentations + 1):
        image_id = image_ids[random.randint(0, len(image_ids) - 1)]
        aug_id = 'aug_' + str(i) + '_' + image_id
        aug_path = os.path.join(target_dir, aug_id)

        # `cv2.imread` signals a failed read by returning None
        original_image = cv2.imread('data/train_images/' + image_id)
        if original_image is None:
            raise FileNotFoundError('could not read data/train_images/' + image_id)

        transformed_image = augment(image=original_image)['image']

        # `cv2.imwrite` signals a failed write by returning False
        if not cv2.imwrite(aug_path, transformed_image):
            raise OSError('could not write ' + aug_path)

        aug_ids.append(aug_id)
        class_ids.append(class_id)
        file_paths.append(aug_path)

    temp = pd.DataFrame(
        list(zip(file_paths, aug_ids, class_ids)),
        columns=['FilePath', 'ImageId', 'ClassId'],
    )

    end = time.time()
    print(f'augmented {num_augmentations} images of ClassId {class_id}')
    print('time required for augmentation:', end - start)
    print()

    return temp

In [None]:
def create_df_aug(df):
    """Create the same number of augmented images for the classes 1 to 4.

    The number of augmentations per class equals the number of rows of
    `ClassId` 3 (with a non-null `ImageId`) in `df`. The target folder is
    prepared with `make_folder()` and the images are produced by
    `augement_images()`.

    Args:
        df: data frame with at least the columns `ImageId` and `ClassId`.

    Returns:
        pandas.DataFrame with the columns `FilePath`, `ImageId` and
        `ClassId` describing all written images, indexed 0..n-1.

    Raises:
        KeyError: if `df` contains no row with `ClassId` 3.
    """
    make_folder()

    # class 3 is the reference size for all classes
    # (presumably because it is the largest class -- confirm against the data)
    max_images = df.groupby('ClassId').ImageId.count().loc[3]

    # collect the per-class frames and concatenate once; concatenating onto
    # an empty frame inside the loop copies repeatedly and upcasts `ClassId`
    frames = []
    for class_id in [1, 2, 3, 4]:
        image_ids = df.loc[df['ClassId'] == class_id, 'ImageId'].values
        frames.append(
            augement_images(image_ids=image_ids,
                            num_augmentations=max_images,
                            class_id=class_id)
        )

    return pd.concat(frames, axis=0, ignore_index=True)

---

### Augmentation for Train-Test-Split

In [None]:
# load the csv written in the "Build `dfs` for Single Defects" step
df_sd = pd.read_csv('data/train_single_defects_with_blackness.csv')

In [None]:
# separate the target column `ClassId` from the remaining columns
X = df_sd.drop(columns='ClassId')
y = df_sd['ClassId']

# hold out 40 % of the rows as test set; the fixed seed makes the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.4, random_state=SEED
)

In [None]:
# re-attach the target as last column to get one data frame per split
df_train = X_train.assign(ClassId=y_train)
df_test = X_test.assign(ClassId=y_test)

In [None]:
"""Only execute ONCE"""
# apply augmentation to the train split only; each run writes new image
# files into `data/augmentations`
df_train_aug = create_df_aug(df_train)

In [None]:
# persist both splits as csv so the models can read them from disk
# (comma is the default separator of `to_csv`)
df_train_aug.to_csv('data/train_set_augmented.csv', index=False)
df_test.to_csv('data/test_set_for_augmented.csv', index=False)

---

## Histogram of Oriented Gradients (HOG)

### Create HoG for all images in `train_images`

In [None]:
# locate the training image folder relative to the current working directory
cwd = pathlib.Path.cwd()
train_data_dir = cwd / 'data' / 'train_images'

# all `.jpg` files of that folder
train_images = list(train_data_dir.glob('*.jpg'))

In [None]:
hog_images = []
hog_features = []
Image_Ids = []
for img_path in train_images:
    Image_Ids.append(img_path.name)
    # load the image and resize it to 64x128 so that every image yields a
    # feature vector of the same length
    pixels = io.imread("data/train_images/" + img_path.name)
    resized_img = resize(pixels, (64, 128))
    fd, hog_image = hog(
        resized_img,
        orientations=9,
        pixels_per_cell=(8, 8),
        cells_per_block=(2, 2),
        visualize=True,
        channel_axis=-1,
    )
    hog_images.append(hog_image)
    hog_features.append(fd)

# stack the per-image feature vectors into one array
hog_features = np.array(hog_features)

hog_features.shape


Once the HOG features are generated, we can build a data frame from them and save it as `.csv`.

In [None]:
# feature array -> data frame
hog_features = pd.DataFrame(hog_features)

# image names -> single-column data frame named `ImageId`
Image_Ids = pd.DataFrame(Image_Ids).rename(columns={0: 'ImageId'})

# put everything together: feature columns first, `ImageId` as last column
hog_complete = pd.concat([hog_features, Image_Ids], axis=1)

In [None]:
# save the HOG features of all training images (without the index column)
hog_complete.to_csv('data/train_HOG.csv', sep=',',index=False)

### Create HoG for all images in `train_single_defects_augmented`

In [None]:
# locate the folder with the augmented images relative to the current working directory
cwd = pathlib.Path.cwd()
train_data_dir2 = cwd / 'data' / 'augmentations'

# all `.jpg` files of that folder
train_images2 = list(train_data_dir2.glob('*.jpg'))

In [None]:
hog_images_augmented = []
hog_features_augmented = []
Image_Ids2 = []
for img_path in train_images2:
    Image_Ids2.append(img_path.name)
    # load the augmented image and resize it to 64x128 so that every image
    # yields a feature vector of the same length
    pixels = io.imread("data/augmentations/" + img_path.name)
    resized_img = resize(pixels, (64, 128))
    fd2, hog_image_augmented = hog(
        resized_img,
        orientations=9,
        pixels_per_cell=(8, 8),
        cells_per_block=(2, 2),
        visualize=True,
        channel_axis=-1,
    )
    hog_images_augmented.append(hog_image_augmented)
    hog_features_augmented.append(fd2)

# stack the per-image feature vectors into one array
hog_features_augmented = np.array(hog_features_augmented)

hog_features_augmented.shape



In [None]:
# feature array -> data frame
hog_features_augmented = pd.DataFrame(hog_features_augmented)

# image names -> single-column data frame named `ImageId`
Image_Ids2 = pd.DataFrame(Image_Ids2).rename(columns={0: 'ImageId'})

# feature columns first, `ImageId` as last column
hog_complete_augmented = pd.concat([hog_features_augmented, Image_Ids2], axis=1)

In [None]:
# save the HOG features of the augmented images (without the index column)
hog_complete_augmented.to_csv('data/train_HOG_augmented.csv', sep=',',index=False)

## with Gaussian blur

In [None]:
import cv2
hog_images_augmented_blur = []
hog_features_augmented_blur = []
Image_Ids2 = []
for image in train_images2:
    Image_Ids2.append(image.name)
    image = io.imread("data/augmentations/"+image.name)
    resized_img = resize(image, (64,128))
    # blur the *resized* image: blurring the full-size `image` left
    # `resized_img` unused and produced HOG descriptors of a different
    # length than the other (64x128 based) HOG feature sets
    blur = cv2.GaussianBlur(resized_img,(5,5),0)
    fd2,hog_image_augmented_blur = hog(blur, orientations=9, pixels_per_cell=(8,8),cells_per_block=(2,2),visualize=True,channel_axis=-1)
    hog_images_augmented_blur.append(hog_image_augmented_blur)
    hog_features_augmented_blur.append(fd2)

# stack the per-image feature vectors into one array
hog_features_augmented_blur = np.array(hog_features_augmented_blur)

hog_features_augmented_blur.shape

In [None]:
# convert the feature array of the blurred images into a data frame and display it
hog_features_augmented_blur= pd.DataFrame(hog_features_augmented_blur)
hog_features_augmented_blur

In [None]:
# image names -> single-column data frame named `ImageId`
Image_Ids2 = pd.DataFrame(Image_Ids2).rename(columns={0: 'ImageId'})
Image_Ids2

In [None]:
# feature columns first, `ImageId` as last column
hog_complete_augmented_blur = pd.concat(
    [hog_features_augmented_blur, Image_Ids2],
    axis=1,
)
hog_complete_augmented_blur

In [None]:
# save the HOG features of the blurred augmented images (without the index column)
hog_complete_augmented_blur.to_csv('data/train_HOG_augmented_blur.csv', sep=',',index=False)

In [None]:
# show the HOG visualisations of the first five blurred images
# NOTE(review): `plt` is not imported in any cell shown here; presumably
# `import matplotlib.pyplot as plt` is missing -- confirm and add the import
for img in hog_images_augmented_blur[:5]:
    plt.imshow(img)
    plt.show()

---

## SURF

In [None]:
"""Processing time: ~3 minutes and 40 seconds.
"""

# get current working directory
cwd = pathlib.Path.cwd()
train_data_dir = cwd.joinpath('data', 'train_images')
train_images = list(train_data_dir.glob('*.jpg'))

# Create the SURF object with a Hessian threshold of 400.
# NOTE(review): `cv2.xfeatures2d` is presumably only available in OpenCV
# builds that include the contrib / non-free modules -- confirm for the
# target environment
surf = cv2.xfeatures2d.SURF_create(400)

# project helper from `surf_handling` (not shown here); the cells below use
# the result as a data frame with a `NumberKP` column
temp = surf_handling.build_keypoints_from_list(train_images, surf)

In [None]:
# count the rows of `temp` with fewer than 50 keypoints; `.iloc[0]` takes the
# count of the first column explicitly by position (plain `[0]` on the
# label-indexed result of `count()` relies on a deprecated positional fallback)
print(f"We have {temp.query('NumberKP < 50').count().iloc[0]} keypoint vectors with less than 50 keypoints")

In [None]:
# display the rows ordered by number of keypoints, largest first
# (the sorted copy is only shown; `temp` itself stays unchanged)
temp.sort_values(by='NumberKP', ascending=False)

Adjust data frame and eliminate images that have more than 1 defect.

In [None]:
# NOTE(review): the return value is discarded, so this helper presumably
# modifies `temp` in place -- confirm in `util.isolate_single_defects`
util.isolate_single_defects(temp)

---

### Prepare data frame with (max) TOP50 Keypoints per image

In [None]:
"""Expected run-time: ~ 35 minutes
"""
# let the project helper from `surf_handling` (not shown here) process the
# data frame and keep its result as the new `temp`
temp = surf_handling.add_keypoint_parameters(temp)

---

### Save data frame for further processing

In [None]:
# save the SURF data frame (without the index column) for the following steps
temp.to_csv('data/train_surf.csv', sep=',', index=False)