# Import libraries

In [1]:
import pandas as pd
import cv2
import os
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report,accuracy_score
import numpy as np
from tqdm import tqdm
import keras
from keras.preprocessing import image
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

# Load csv file

In [2]:
p = os.getcwd()
print(p)

os.chdir("..")
d = os.getcwd()
print(d)

dirname = os.path.dirname(d)
csvfile = os.path.join(d, 'datasets/cartoon_set/labels.csv')
dirname = os.path.dirname(d)
imgfile = os.path.join(d, 'datasets/cartoon_set/img/')

dataset = pd.read_csv(csvfile, sep='\\t', engine='python')

dataset.head()

/Users/mel/Documents/MRes/Machine Learning/Assignment/AMLS_20-21_SN12345678/B1
/Users/mel/Documents/MRes/Machine Learning/Assignment/AMLS_20-21_SN12345678


Unnamed: 0,eye_color,face_shape,file_name
0,1,4,0.png
1,2,4,1.png
2,2,3,2.png
3,2,0,3.png
4,0,2,4.png


# Load and preprocess data

In [3]:
dataset_image = []
for i in tqdm(range(dataset.shape[0])):
    img = image.load_img(imgfile+dataset['file_name'][i], target_size=(50,50))
    img = img.convert('L')
    img = image.img_to_array(img)
    img = img/255
    img = img.flatten() 

    dataset_image.append(img)

100%|██████████| 10000/10000 [01:54<00:00, 87.64it/s]


# Splitting data into 75% train and 25% test

In [4]:
# Label the data
X = np.array(dataset_image)
Y = dataset['face_shape']

# Split the data into training and testing(75% training and 25% testing data)
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.25, random_state=100)

# Optimising the solver

In [5]:
#Accuracy using lbfgs solver
model = LogisticRegression(solver = 'lbfgs', max_iter=10000)
model.fit(X_train, Y_train)
result = model.score(X_test, Y_test)
print("Accuracy: %.2f%%" % (result*100.0))

Accuracy: 99.64%


In [6]:
#Accuracy using newton-cg solver
model = LogisticRegression(solver = 'newton-cg', max_iter=10000)
model.fit(X_train, Y_train)
result = model.score(X_test, Y_test)
print("Accuracy: %.2f%%" % (result*100.0))

Accuracy: 99.64%




In [7]:
#Accuracy using liblinear solver
model = LogisticRegression(solver = 'liblinear', max_iter=10000)
model.fit(X_train, Y_train)
result = model.score(X_test, Y_test)
print("Accuracy: %.2f%%" % (result*100.0))

Accuracy: 99.64%


# Optimising the C value

In [19]:
#Accuracy using C value of 10
model = LogisticRegression(solver = 'lbfgs', max_iter=10000, C=10)
model.fit(X_train, Y_train)
result = model.score(X_test, Y_test)
print("Accuracy: %.2f%%" % (result*100.0))

Accuracy: 99.84%


In [20]:
#Accuracy using C value of 1.0
model = LogisticRegression(solver = 'lbfgs', max_iter=10000, C=1.0)
model.fit(X_train, Y_train)
result = model.score(X_test, Y_test)
print("Accuracy: %.2f%%" % (result*100.0))

Accuracy: 99.64%


In [21]:
#Accuracy using C value of 0.1
model = LogisticRegression(solver = 'lbfgs', max_iter=10000, C=0.1)
model.fit(X_train, Y_train)
result = model.score(X_test, Y_test)
print("Accuracy: %.2f%%" % (result*100.0))

Accuracy: 99.04%


In [22]:
#Accuracy using C value of 0.01
model = LogisticRegression(solver = 'lbfgs', max_iter=10000, C=0.01)
model.fit(X_train, Y_train)
result = model.score(X_test, Y_test)
print("Accuracy: %.2f%%" % (result*100.0))

Accuracy: 97.64%


# KFold cross validation on validation set

In [23]:
#KFold with 5 splits
kfold = KFold(n_splits=5, random_state=0, shuffle=True)
model_kfold = LogisticRegression(solver = 'liblinear', max_iter=10000, C=10)
results_kfold = cross_val_score(model_kfold, X, Y, cv=kfold)
print("Accuracy: %.2f%%" % (results_kfold.mean()*100.0))

Accuracy: 99.80%


# Accuracy on train set

In [8]:
model = LogisticRegression(solver = 'lbfgs', max_iter=10000, C=10)
model.fit(X_train, Y_train)
result = model.score(X_train, Y_train)
print("Accuracy: %.2f%%" % (result*100.0))

Accuracy: 100.00%


# Accuracy on test set

In [9]:
dirname = os.path.dirname(d)
csvtest = os.path.join(d, 'test/cartoon_set_test/labels.csv')
dirname = os.path.dirname(d)
imgtest = os.path.join(d, 'test/cartoon_set_test/img/')


dataset = pd.read_csv(csvtest, sep='\\t', engine='python')

dataset.head()

Unnamed: 0,eye_color,face_shape,file_name
0,2,1,0.png
1,1,4,1.png
2,0,2,2.png
3,0,2,3.png
4,3,3,4.png


In [10]:
test_dataset_image = []
for i in tqdm(range(test_dataset.shape[0])):
    img = image.load_img(imgtest+test_dataset['file_name'][i], target_size=(50,50))
    img = img.convert('L')
    img = image.img_to_array(img)
    img = img/255
    img = img.flatten() 

    test_dataset_image.append(img)

100%|██████████| 2500/2500 [00:29<00:00, 84.41it/s]


In [11]:
# Label the data
X_TEST = np.array(test_dataset_image)
Y_TEST = test_dataset['face_shape']

In [12]:
model = LogisticRegression(solver = 'lbfgs', max_iter=10000, C=10)
model.fit(X_train, Y_train)
result = model.score(X_TEST, Y_TEST)
print("Accuracy: %.2f%%" % (result*100.0))

Accuracy: 99.92%
