In [None]:
import csv
import os
import cv2
import numpy as np
import pandas as pd
import imgaug.augmenters as iaa
from sklearn.model_selection import train_test_split, KFold
from sklearn.utils import shuffle
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_validate, StratifiedKFold
from sklearn import metrics
from sklearn.metrics import precision_score, recall_score, f1_score
import tensorflow as tf
from tensorflow.keras.applications.resnet50 import ResNet50
from tensorflow.keras.layers import Dense, Flatten
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import EarlyStopping

### Save the filenames to a CSV file and read it back

In [None]:
# Build the dataset index (relative image path + label) and split it.
#
# The index is generated from the same image root that the preprocessing
# cell reads from, written to the working directory, and read back from
# that very file, so the cell works from a fresh kernel without relying on
# a pre-uploaded copy of the CSV or on a machine-specific local path.
IMAGE_ROOT = "/kaggle/input/aic-hw1-3/"  # contains one sub-folder per class
CSV_PATH = "dataset_3.csv"               # written to the current working directory
SEED = 42                                # shared by the shuffle and the split

# save the filenames of the data to the csv file
with open(CSV_PATH, "w", newline="") as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(["filename", "ground_truth"])
    for cat, ans in zip(["not_my_cat", "my_cat"], ["0", "1"]):
        # os.listdir returns entries in arbitrary order; sort them so the
        # generated index (and therefore the seeded split) is stable.
        for filename in sorted(os.listdir(os.path.join(IMAGE_ROOT, cat))):
            # skip hidden entries such as .DS_Store, which are not images
            if filename.startswith("."):
                continue
            writer.writerow([cat + "/" + filename, ans])

# read the csv file that was just written
data = pd.read_csv(CSV_PATH)

# shuffle the data (seeded, otherwise the split below changes on every run
# even though train_test_split itself uses a fixed random_state)
data = shuffle(data, random_state=SEED)

# split the dataset: half for training, half for testing
train_data, test_data = train_test_split(data, train_size=0.5, random_state=SEED)
print(train_data.shape)
print(test_data.shape)

### Preprocess the data

In [None]:
# different amounts of training data
# train_data = train_data.sample(frac=0.5)

# preprocess the data
IMAGE_DIR = "/kaggle/input/aic-hw1-3/"  # prefix joined with the 'filename' column
IMAGE_SIZE = (100, 100)                 # (width, height) passed to cv2.resize
N_AUG_PER_IMAGE = 3                     # augmented copies generated per training image

# data augmentation
seq = iaa.Sequential([
    iaa.Fliplr(p=0.5), # horizontally flip 50% of all images
#     iaa.Flipud(p=0.5), # vertically flip 50% of all images
    iaa.Affine(rotate=(-20, 20), mode='symmetric'), # rotate the images between -20 and 20 degrees, use symmetric padding mode
    iaa.Crop(percent=(0, 0.2)), # crop images by 0 to 20% of their height/width
#     iaa.AddToHueAndSaturation(value=(-30, 30)) # change hue and saturation by -30 to 30
])


def load_images(frame, augmenter=None, n_aug=0):
    """Read, resize and optionally augment the images listed in ``frame``.

    Parameters
    ----------
    frame : pandas.DataFrame
        Must provide a 'filename' column (path relative to IMAGE_DIR) and a
        'ground_truth' column (label).
    augmenter : callable or None
        Called as ``augmenter(image=img)``; when None no augmentation is done.
    n_aug : int
        Number of augmented copies appended after each original image.

    Returns
    -------
    (numpy.ndarray, numpy.ndarray)
        Stacked images and their labels, in row order, each original
        immediately followed by its augmented copies.

    Raises
    ------
    FileNotFoundError
        If cv2.imread cannot read a listed file (it returns None in that
        case, which would otherwise surface as an opaque error in resize).
    """
    images, labels = [], []
    for _, row in frame.iterrows(): # iterate each row
        path = IMAGE_DIR + row['filename']
        img = cv2.imread(path)
        if img is None:
            raise FileNotFoundError(f"Could not read image file: {path}")
        img = cv2.resize(img, IMAGE_SIZE) # resize the image size
        images.append(img)
        labels.append(row['ground_truth'])
        if augmenter is None:
            continue
        for _ in range(n_aug): # generate n_aug augmented images per original image
            img_aug = augmenter(image=img)
            img_aug = cv2.resize(img_aug, IMAGE_SIZE)
            images.append(img_aug)
            labels.append(row['ground_truth'])
    return np.array(images), np.array(labels)


# training set: originals plus augmented copies
x_train, y_train = load_images(train_data, augmenter=seq, n_aug=N_AUG_PER_IMAGE)
print(x_train.shape)

# test set: originals only, never augmented
x_test, y_test = load_images(test_data)
print(x_test.shape)

### ResNet

In [None]:
# define the model: ImageNet-pretrained ResNet50 backbone + small dense head
base_model = ResNet50(weights="imagenet", include_top=False, input_shape=(100, 100, 3))
x = base_model.output
x = Flatten()(x)
x = Dense(256, activation='relu')(x)
predictions = Dense(1, activation='sigmoid')(x)  # single sigmoid unit -> binary output
model = Model(inputs=base_model.input, outputs=predictions)

# freeze the weight of ResNet50 so only the dense head is trained
for layer in base_model.layers:
    layer.trainable = False

# compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

EPOCHS = 5

# Snapshot of the weights BEFORE any training. Restoring this snapshot is
# what actually resets the model; `model.set_weights(model.get_weights())`
# just writes the current (already trained) weights back and resets nothing.
initial_weights = model.get_weights()


def reset_model():
    """Restore the untrained weights and re-compile the model.

    Re-compiling with the 'adam' string creates a new optimizer instance, so
    optimizer state from a previous fit does not carry over either.
    """
    model.set_weights(initial_weights)
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])


# define KFold (save the index of each fold)
# NOTE(review): x_train holds augmented copies next to their originals, so a
# fold's validation part can contain variants of images seen in training.
# Validation accuracy may therefore be optimistic - consider splitting before
# augmenting or using a grouped splitter.
kfold = KFold(n_splits=5, shuffle=True, random_state=42)
train_index_list = []
valid_index_list = []
for train_index, valid_index in kfold.split(x_train, y_train):
    train_index_list.append(train_index)
    valid_index_list.append(valid_index)

# cross validation: every fold starts from the same untrained state
valid_acc_list = []
for i, (train_index, valid_index) in enumerate(zip(train_index_list, valid_index_list)):
    print(f"Fold {i+1}")
    reset_model()
    x_train_fold, y_train_fold = x_train[train_index], y_train[train_index]
    x_valid_fold, y_valid_fold = x_train[valid_index], y_train[valid_index]
    model.fit(x_train_fold, y_train_fold, validation_data=(x_valid_fold, y_valid_fold), epochs=EPOCHS)
    valid_loss, valid_acc = model.evaluate(x_valid_fold, y_valid_fold)
    valid_acc_list.append(valid_acc)
    print(f"Valid accuracy: {valid_acc}")
print(f"Mean valid accuracy: {np.mean(valid_acc_list):.3f}")

# final model: start fresh and train on ALL training data before testing,
# so the test metrics do not depend on whichever fold happened to run last
reset_model()
model.fit(x_train, y_train, epochs=EPOCHS)

test_loss, test_acc = model.evaluate(x_test, y_test)
test_pred = model.predict(x_test)
# predict returns shape (n, 1); flatten to 1-D class labels for sklearn metrics
test_pred_classes = (test_pred > 0.5).astype(int).ravel()
test_precision = precision_score(y_test, test_pred_classes)
test_recall = recall_score(y_test, test_pred_classes)
test_f1 = f1_score(y_test, test_pred_classes)
print(f"Test accuracy: {test_acc:.3f}")
print(f"Test precision: {test_precision:.3f}")
print(f"Test recall: {test_recall:.3f}")
print(f"Test F1-score: {test_f1:.3f}")

### LogisticRegression, KNN, SVM, Decision Tree

In [None]:
# # logisticRegression model
# model = LogisticRegression(penalty='l1', C=0.01, solver='liblinear')

# # KNN model
# # model = KNeighborsClassifier(n_neighbors=5)

# # SVM model
# # model = SVC(kernel='linear', C=0.01, probability=True)

# # Decision Tree
# # model = DecisionTreeClassifier(max_depth=5, random_state=42)

# # cross validation
# cv_results = cross_validate(model, x_train, y_train, cv=5, scoring=['accuracy', 'precision', 'recall', 'f1'])

# # print scores
# for i in range(5):
#     print(f"Fold {i+1} - accuracy: {cv_results['test_accuracy'][i]:.3f}, Precision: {cv_results['test_precision'][i]:.3f}, Recall: {cv_results['test_recall'][i]:.3f}, F1: {cv_results['test_f1'][i]:.3f}")
# print(f"\nValidation: Average accuracy: {cv_results['test_accuracy'].mean():.3f}, Average Precision: {cv_results['test_precision'].mean():.3f}, Average Recall: {cv_results['test_recall'].mean():.3f}, Average F1: {cv_results['test_f1'].mean():.3f}\n")

# # train model on all training data
# model.fit(x_train, y_train)

# # predict test data
# y_pred = model.predict(x_test)

# # calculate and print the results
# accuracy = metrics.accuracy_score(y_test, y_pred)
# precision = metrics.precision_score(y_test, y_pred, average='weighted')
# recall = metrics.recall_score(y_test, y_pred, average='weighted')
# f1 = metrics.f1_score(y_test, y_pred, average='weighted')
# print("Test data:")
# print(f'Accuracy: {accuracy:.3f}')
# print(f'Precision: {precision:.3f}')
# print(f'Recall: {recall:.3f}')
# print(f'F1: {f1:.3f}')