# Import libraries

In [2]:
import pandas as pd
import cv2
import os
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report,accuracy_score
import numpy as np
from tqdm import tqdm
import keras
from keras.preprocessing import image
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

# Load csv file

In [3]:
# Resolve the project root (the directory that holds 'datasets/') and load the
# label table for the training images.
#
# The step up to the parent directory is taken only when the labels file is
# not already reachable from the current working directory. An unconditional
# os.chdir("..") would climb one more level every time this cell is re-run in
# the same kernel and silently break every path built from `d` below.
p = os.getcwd()
print(p)

if os.path.isfile(os.path.join(p, 'datasets/celeba/labels.csv')):
    # Already at the project root (e.g. this cell ran earlier in this kernel).
    d = p
else:
    # Started from the notebook's own sub-folder: the root is one level up.
    os.chdir("..")
    d = os.getcwd()
print(d)

# Parent of the project root. Kept so the name stays defined; none of the
# cells visible in this notebook read it.
dirname = os.path.dirname(d)
csvfile = os.path.join(d, 'datasets/celeba/labels.csv')
imgfile = os.path.join(d, 'datasets/celeba/img/')

# Read dataset. The separator is a two-character pattern handled by the
# python parsing engine (it matches a tab between fields).
dataset = pd.read_csv(csvfile, sep='\\t', engine='python')

dataset.head()

/Users/mel/Documents/MRes/Machine Learning/Assignment/AMLS_20-21_SN12345678/A1
/Users/mel/Documents/MRes/Machine Learning/Assignment/AMLS_20-21_SN12345678


Unnamed: 0,img_name,gender,smiling
0,0.jpg,-1,1
1,1.jpg,-1,1
2,2.jpg,1,-1
3,3.jpg,-1,-1
4,4.jpg,-1,-1


# Loading and preprocessing data

In [12]:
# Convert every image listed in the label table into a flat feature vector:
# load at 50x50, reduce to a single grayscale channel, divide by 255, flatten.
dataset_image = []
for row in tqdm(range(len(dataset))):
    path = imgfile + dataset['img_name'][row]
    gray = image.load_img(path, target_size=(50, 50)).convert('L')
    pixels = image.img_to_array(gray) / 255
    dataset_image.append(pixels.flatten())


100%|██████████| 5000/5000 [00:05<00:00, 958.16it/s] 


# Splitting the data into 75% train and 25% test set

In [13]:
# Stack the per-image vectors into one 2-D feature matrix (one row per image).
X = np.asarray(dataset_image)
X.shape

(5000, 2500)

In [14]:
# Shift and halve the 'gender' column so that labels -1 / 1 become 0.0 / 1.0.
Y = dataset['gender'].add(1).div(2)

In [15]:
# Hold out 25% of the samples for evaluation; the fixed random_state keeps the
# partition identical between runs.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.25,
    random_state=100,
)

# Optimising the solver

In [10]:
# Solver comparison, candidate 1: 'lbfgs'.
# Fit on the training split and report accuracy on the held-out split.
model = LogisticRegression(max_iter=10000, solver='lbfgs').fit(X_train, Y_train)
result = model.score(X_test, Y_test)
print("Accuracy: %.2f%%" % (100.0 * result))

Accuracy: 86.64%


In [13]:
# Solver comparison, candidate 2: 'newton-cg'.
# Fit on the training split and report accuracy on the held-out split.
model = LogisticRegression(max_iter=10000, solver='newton-cg').fit(X_train, Y_train)
result = model.score(X_test, Y_test)
print("Accuracy: %.2f%%" % (100.0 * result))

Accuracy: 86.64%




In [14]:
# Solver comparison, candidate 3: 'liblinear'.
# Fit on the training split and report accuracy on the held-out split.
model = LogisticRegression(max_iter=10000, solver='liblinear').fit(X_train, Y_train)
result = model.score(X_test, Y_test)
print("Accuracy: %.2f%%" % (100.0 * result))

Accuracy: 86.96%


# Optimising the C value

In [15]:
# Regularisation sweep with the 'liblinear' solver: C = 1.0.
model = LogisticRegression(
    C=1.0,
    solver='liblinear',
    max_iter=10000,
).fit(X_train, Y_train)
result = model.score(X_test, Y_test)
print("Accuracy: %.2f%%" % (100.0 * result))

Accuracy: 86.96%


In [16]:
# Regularisation sweep with the 'liblinear' solver: C = 0.1.
model = LogisticRegression(
    C=0.1,
    solver='liblinear',
    max_iter=10000,
).fit(X_train, Y_train)
result = model.score(X_test, Y_test)
print("Accuracy: %.2f%%" % (100.0 * result))

Accuracy: 88.08%


In [17]:
# Regularisation sweep with the 'liblinear' solver: C = 0.01.
model = LogisticRegression(
    C=0.01,
    solver='liblinear',
    max_iter=10000,
).fit(X_train, Y_train)
result = model.score(X_test, Y_test)
print("Accuracy: %.2f%%" % (100.0 * result))

Accuracy: 84.00%


# Evaluate validation set accuracy using cross-validation techniques

In [19]:
# 5-fold cross-validation of the chosen configuration over the full X / Y,
# with shuffling under a fixed seed; prints the mean fold accuracy.
kfold = KFold(n_splits=5, shuffle=True, random_state=0)
model_kfold = LogisticRegression(C=0.1, solver='liblinear', max_iter=10000)
results_kfold = cross_val_score(estimator=model_kfold, X=X, y=Y, cv=kfold)
print("Accuracy: %.2f%%" % (100.0 * results_kfold.mean()))

Accuracy: 87.20%


In [21]:
# 10-fold cross-validation of the chosen configuration over the full X / Y,
# with shuffling under a fixed seed; prints the mean fold accuracy.
kfold = KFold(n_splits=10, shuffle=True, random_state=0)
model_kfold = LogisticRegression(C=0.1, solver='liblinear', max_iter=10000)
results_kfold = cross_val_score(estimator=model_kfold, X=X, y=Y, cv=kfold)
print("Accuracy: %.2f%%" % (100.0 * results_kfold.mean()))

Accuracy: 87.68%


# Finding the accuracy on the test set using the optimised model

In [4]:
# Paths for the separate test set, built from the project root `d` that the
# CSV-loading cell near the top of the notebook defines.
dirname = os.path.dirname(d)
csvtest = os.path.join(d, 'test/celeba_test/labels.csv')
imgtest = os.path.join(d, 'test/celeba_test/img/')

# Parse the test label table with the same separator and engine as the
# training labels.
test_dataset = pd.read_csv(csvtest, sep='\\t', engine='python')

test_dataset.head()

Unnamed: 0,img_name,gender,smiling
0,0.jpg,-1,-1
1,1.jpg,-1,1
2,2.jpg,1,1
3,3.jpg,1,1
4,4.jpg,-1,-1


In [4]:
# Apply the training preprocessing to the test images: load at 50x50, reduce
# to a single grayscale channel, divide by 255, flatten.
test_dataset_image = []
for row in tqdm(range(len(test_dataset))):
    path = imgtest + test_dataset['img_name'][row]
    gray = image.load_img(path, target_size=(50, 50)).convert('L')
    pixels = image.img_to_array(gray) / 255
    test_dataset_image.append(pixels.flatten())

100%|██████████| 1000/1000 [00:03<00:00, 326.50it/s]


In [5]:
# Feature matrix and 0.0 / 1.0 gender labels for the separate test set.
X_TEST = np.asarray(test_dataset_image)
Y_TEST = (test_dataset['gender'].to_numpy() + 1) / 2

In [16]:
# Refit the selected configuration (liblinear, C=0.1) on the training split
# and score it on the separate test set.
model = LogisticRegression(
    C=0.1,
    solver='liblinear',
    max_iter=10000,
).fit(X_train, Y_train)
result = model.score(X_TEST, Y_TEST)
print("Accuracy: %.2f%%" % (100.0 * result))

Accuracy: 88.40%


# Finding the accuracy on the train set using the optimised model

In [33]:
# Accuracy of the fitted model on the data it was trained on, for comparison
# with the test-set figure.
result = model.score(X_train, Y_train)
print("Accuracy: %.2f%%" % (100.0 * result))

Accuracy: 97.07%
