# Import libraries

In [14]:
from tqdm import tqdm
import numpy as np
import pandas as pd
 
# Importing sklearn libraries
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
#from sklearn import datasets
from sklearn import preprocessing
 
# Importing Keras libraries
import keras
#from keras.utils import np_utils
#from keras.models import Sequential
from keras.applications import VGG16
#from keras.applications import imagenet_utils
#from keras.callbacks import ModelCheckpoint
from keras.preprocessing import image
from keras.preprocessing.image import load_img
from keras.preprocessing.image import img_to_array
#from keras.layers import Dense, Conv2D, MaxPooling2D
#from keras.layers import Dropout, Flatten, GlobalAveragePooling2D
from keras.applications.vgg16 import preprocess_input
from keras.layers import merge, Input

# Preprocessing data using VGG16 and extracting features

In [3]:
image_input = Input(shape=(224,224,3))

model = VGG16(include_top=False,weights="imagenet",input_tensor=image_input)

model.summary()

dataset = pd.read_csv('/Users/mel/Documents/MRes/Machine Learning/Assignment/AMLS_20-21_SN12345678/datasets/cartoon_set/labels.csv', sep='\\t', engine='python')

vgg16_feature_list = []
for i in tqdm(range(dataset.shape[0])):
    img = image.load_img('/Users/mel/Documents/MRes/Machine Learning/Assignment/AMLS_20-21_SN12345678/datasets/cartoon_set/img/'+dataset['file_name'][i], target_size=(224,224))
    img = image.img_to_array(img)
    img = np.expand_dims(img, axis=0)
    img = preprocess_input(img)

    vgg16_feature = model.predict(img)
    vgg16_feature_np = np.array(vgg16_feature)
    vgg16_feature_list.append(vgg16_feature_np.flatten())

vgg16_feature_list_np = np.array(vgg16_feature_list)

vgg16_feature_list_np.shape

  0%|          | 0/10000 [00:00<?, ?it/s]

Model: "vgg16"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         [(None, 224, 224, 3)]     0         
_________________________________________________________________
block1_conv1 (Conv2D)        (None, 224, 224, 64)      1792      
_________________________________________________________________
block1_conv2 (Conv2D)        (None, 224, 224, 64)      36928     
_________________________________________________________________
block1_pool (MaxPooling2D)   (None, 112, 112, 64)      0         
_________________________________________________________________
block2_conv1 (Conv2D)        (None, 112, 112, 128)     73856     
_________________________________________________________________
block2_conv2 (Conv2D)        (None, 112, 112, 128)     147584    
_________________________________________________________________
block2_pool (MaxPooling2D)   (None, 56, 56, 128)       0     

100%|██████████| 10000/10000 [54:22<00:00,  3.07it/s] 


(10000, 25088)

# Load and split data

In [4]:
# Load the data
X = np.array(vgg16_feature_list)
y = dataset['eye_color']
#X = X.astype('int')
#y = y.astype('int')

#Split the date into 75% train and 25% test (validation)
X_train, X_test, Y_train, Y_test = train_test_split(X, y, test_size=0.25, random_state=100)

# Optimise solver

In [6]:
#Evaluate using 'lbfgs' solver
model = LogisticRegression(solver = 'lbfgs', max_iter=10000)
model.fit(X_train, Y_train)
result = model.score(X_test, Y_test)
print("Accuracy: %.2f%%" % (result*100.0))

Accuracy: 73.68%


In [7]:
#Evaluate using 'newton-cg' solver
model = LogisticRegression(solver = 'newton-cg', max_iter=10000)
model.fit(X_train, Y_train)
result = model.score(X_test, Y_test)
print("Accuracy: %.2f%%" % (result*100.0))



Accuracy: 73.76%


In [8]:
#Evaluate using 'liblinear' solver
model = LogisticRegression(solver = 'liblinear', max_iter=10000)
model.fit(X_train, Y_train)
result = model.score(X_test, Y_test)
print("Accuracy: %.2f%%" % (result*100.0))

Accuracy: 74.12%


# Optimise C value

In [9]:
#Evaluate using C=10
model = LogisticRegression(solver = 'liblinear', max_iter=10000, C=10)
model.fit(X_train, Y_train)
result = model.score(X_test, Y_test)
print("Accuracy: %.2f%%" % (result*100.0))

Accuracy: 74.08%


In [10]:
#Evaluate using C=1.0
model = LogisticRegression(solver = 'liblinear', max_iter=10000, C=1.0)
model.fit(X_train, Y_train)
result = model.score(X_test, Y_test)
print("Accuracy: %.2f%%" % (result*100.0))

Accuracy: 74.12%


In [11]:
#Evaluate using C=0.1
model = LogisticRegression(solver = 'liblinear', max_iter=10000, C=0.1)
model.fit(X_train, Y_train)
result = model.score(X_test, Y_test)
print("Accuracy: %.2f%%" % (result*100.0))

Accuracy: 74.24%


In [12]:
#Evaluate using C=0.01
model = LogisticRegression(solver = 'liblinear', max_iter=10000, C=0.01)
model.fit(X_train, Y_train)
result = model.score(X_test, Y_test)
print("Accuracy: %.2f%%" % (result*100.0))

Accuracy: 74.32%


# Evaluate validation set using cross validation techniques

In [15]:
#Evaluate using KFold with 5 splits
kfold = KFold(n_splits=5, random_state=0, shuffle=True)
model_kfold = LogisticRegression(solver = 'liblinear', max_iter=10000, C=0.01)
results_kfold = cross_val_score(model_kfold, X, y, cv=kfold)
print("Accuracy: %.2f%%" % (results_kfold.mean()*100.0))

Accuracy: 75.72%


# Find the test set accuracy

In [17]:
image_input = Input(shape=(224,224,3))

model = VGG16(include_top=False,weights="imagenet",input_tensor=image_input)

model.summary()

test_dataset = pd.read_csv('/Users/mel/Documents/MRes/Machine Learning/Assignment/AMLS_20-21_SN12345678/test/cartoon_set_test/labels.csv', sep='\\t', engine='python')

vgg16_test_feature_list = []
for i in tqdm(range(test_dataset.shape[0])):
    img = image.load_img('/Users/mel/Documents/MRes/Machine Learning/Assignment/AMLS_20-21_SN12345678/test/cartoon_set_test/img/'+test_dataset['file_name'][i], target_size=(224,224))
    img = image.img_to_array(img)
    img = np.expand_dims(img, axis=0)
    img = preprocess_input(img)

    vgg16_feature = model.predict(img)
    vgg16_feature_np = np.array(vgg16_feature)
    vgg16_test_feature_list.append(vgg16_feature_np.flatten())


vgg16_test_feature_list_np = np.array(vgg16_test_feature_list)

vgg16_test_feature_list_np.shape

Model: "vgg16"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_3 (InputLayer)         [(None, 224, 224, 3)]     0         
_________________________________________________________________
block1_conv1 (Conv2D)        (None, 224, 224, 64)      1792      
_________________________________________________________________
block1_conv2 (Conv2D)        (None, 224, 224, 64)      36928     
_________________________________________________________________
block1_pool (MaxPooling2D)   (None, 112, 112, 64)      0         
_________________________________________________________________
block2_conv1 (Conv2D)        (None, 112, 112, 128)     73856     
_________________________________________________________________
block2_conv2 (Conv2D)        (None, 112, 112, 128)     147584    
_________________________________________________________________
block2_pool (MaxPooling2D)   (None, 56, 56, 128)       0     

100%|██████████| 2500/2500 [19:19<00:00,  2.16it/s]    


(2500, 25088)

In [18]:
# Labelling the X and Y test data
X_TEST = np.array(vgg16_test_feature_list)
Y_TEST = test_dataset['eye_color']

In [19]:
#Using the model with the optimised paramaters
model = LogisticRegression(solver = 'liblinear', max_iter=10000, C=0.1)
model.fit(X_train, Y_train)
result = model.score(X_TEST, Y_TEST)
print("Accuracy: %.2f%%" % (result*100.0))

Accuracy: 74.76%


# Find the train set accuracy

In [20]:
#Using the model with the optimised paramaters to find train set accuracy
model = LogisticRegression(solver = 'lbfgs', max_iter=10000, C=0.1)
model.fit(X_train, Y_train)
result = model.score(X_train, Y_train)
print("Accuracy: %.2f%%" % (result*100.0))

Accuracy: 100.00%
