# Import libraries

In [2]:
from tqdm import tqdm
import numpy as np
import os
import pandas as pd
 
# Importing sklearn libraries
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn import datasets
from sklearn import preprocessing
 
# Importing Keras libraries
import keras
from keras.utils import np_utils
from keras.models import Sequential
from keras.applications import VGG16
from keras.applications import imagenet_utils
from keras.callbacks import ModelCheckpoint
from keras.preprocessing import image
from keras.preprocessing.image import load_img
from keras.preprocessing.image import img_to_array
from keras.layers import Dense, Conv2D, MaxPooling2D
from keras.layers import Dropout, Flatten, GlobalAveragePooling2D
from keras.preprocessing import image
from keras.applications.vgg16 import VGG16
from keras.applications.vgg16 import preprocess_input
from keras.layers import merge, Input


# Preprocessing the data using vgg16 and extracting features

In [2]:
# Build the VGG16 network used to extract features from each image.
# include_top=False as our input shape is different from the default for the VGG16 model
image_input = Input(shape=(218,178,3))
model = VGG16(include_top=False,weights="imagenet",input_tensor=image_input)

#print VGG16 model summary below
model.summary()

# Remember the directory the notebook started in the FIRST time this cell runs.
# Previously the cell called os.chdir("..") unconditionally, so every re-run
# climbed one more level up the tree and silently changed `d` (and therefore
# every data path built from it). Deriving the parent from the remembered start
# directory makes the cell idempotent while keeping the first-run behaviour
# (working directory ends up one level above the start directory).
p = globals().setdefault("_notebook_start_dir", os.getcwd())
print(p)

os.chdir(os.path.dirname(p))
d = os.getcwd()
print(d)

# Locations of the label file and the image folder, relative to `d`
dirname = os.path.dirname(d)
csvfile = os.path.join(d, 'datasets/celeba/labels.csv')
imgfile = os.path.join(d, 'datasets/celeba/img/')

#use pandas to read CSV file
dataset = pd.read_csv(csvfile, sep='\\t', engine='python')

#preprocess data to input into the VGG16 model
vgg16_feature_list = []
for i in tqdm(range(dataset.shape[0])):
    img = image.load_img(imgfile+dataset['img_name'][i])
    img = image.img_to_array(img)
    img = np.expand_dims(img, axis=0)  # add a leading batch axis for predict()
    img = preprocess_input(img)

    # run the image through VGG16 and store its output as one flat vector
    vgg16_feature = model.predict(img)
    vgg16_feature_np = np.array(vgg16_feature)
    vgg16_feature_list.append(vgg16_feature_np.flatten())


# convert feature list to NumPy array (one row per image)
vgg16_feature_list_np = np.array(vgg16_feature_list)

vgg16_feature_list_np.shape

  0%|          | 0/5000 [00:00<?, ?it/s]

Model: "vgg16"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 218, 178, 3)]     0         
_________________________________________________________________
block1_conv1 (Conv2D)        (None, 218, 178, 64)      1792      
_________________________________________________________________
block1_conv2 (Conv2D)        (None, 218, 178, 64)      36928     
_________________________________________________________________
block1_pool (MaxPooling2D)   (None, 109, 89, 64)       0         
_________________________________________________________________
block2_conv1 (Conv2D)        (None, 109, 89, 128)      73856     
_________________________________________________________________
block2_conv2 (Conv2D)        (None, 109, 89, 128)      147584    
_________________________________________________________________
block2_pool (MaxPooling2D)   (None, 54, 44, 128)       0     

100%|██████████| 5000/5000 [26:13<00:00,  3.18it/s]


(5000, 15360)

# Splitting the data into 75% train and 25% validation set

In [3]:
# Feature matrix X: one row per entry of vgg16_feature_list
X = np.array(vgg16_feature_list)
# Target y: each 'gender' value g becomes (g + 1) / 2
# (presumably maps -1/1 labels onto 0/1 — TODO confirm against labels.csv)
gender_plus_one = dataset['gender'] + 1
y = np.array(gender_plus_one) / 2

In [4]:
# Hold out 25% of the samples; random_state fixes the shuffle so the split is repeatable
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=100,
)

# Optimising the solver

In [5]:
# Solver comparison: fit with 'lbfgs' on the training split, score on the held-out split
model = LogisticRegression(max_iter=10000, solver='lbfgs').fit(X_train, Y_train)
result = model.score(X_test, Y_test)
print("Accuracy: %.2f%%" % (100.0 * result))

Accuracy: 93.44%


In [6]:
# Solver comparison: fit with 'newton-cg' on the training split, score on the held-out split
model = LogisticRegression(max_iter=10000, solver='newton-cg').fit(X_train, Y_train)
result = model.score(X_test, Y_test)
print("Accuracy: %.2f%%" % (100.0 * result))

Accuracy: 93.60%




In [7]:
# Solver comparison: fit with 'liblinear' on the training split, score on the held-out split
model = LogisticRegression(max_iter=10000, solver='liblinear').fit(X_train, Y_train)
result = model.score(X_test, Y_test)
print("Accuracy: %.2f%%" % (100.0 * result))

Accuracy: 93.12%


In [1]:
#'lbfgs' solver selected as newton-cg didn't converge

# Optimising the C value

In [19]:
# Regularisation sweep: C=10 with the 'lbfgs' solver
model = LogisticRegression(C=10, max_iter=10000, solver='lbfgs').fit(X_train, Y_train)
result = model.score(X_test, Y_test)
print("Accuracy: %.2f%%" % (100.0 * result))

Accuracy: 93.36%


In [20]:
# Regularisation sweep: C=1.0 with the 'lbfgs' solver
model = LogisticRegression(C=1.0, max_iter=10000, solver='lbfgs').fit(X_train, Y_train)
result = model.score(X_test, Y_test)
print("Accuracy: %.2f%%" % (100.0 * result))

Accuracy: 93.44%


In [21]:
# Regularisation sweep: C=0.1 with the 'lbfgs' solver
model = LogisticRegression(C=0.1, max_iter=10000, solver='lbfgs').fit(X_train, Y_train)
result = model.score(X_test, Y_test)
print("Accuracy: %.2f%%" % (100.0 * result))

Accuracy: 93.76%


In [22]:
# Regularisation sweep: C=0.01 with the 'lbfgs' solver
model = LogisticRegression(C=0.01, max_iter=10000, solver='lbfgs').fit(X_train, Y_train)
result = model.score(X_test, Y_test)
print("Accuracy: %.2f%%" % (100.0 * result))

Accuracy: 93.68%


In [None]:
#C value of 0.1 selected (highest accuracy above; used by the later cells)

# Cross validation of validation set accuracy

In [23]:
# 5-fold cross-validation (shuffled, fixed seed) over the full X / y arrays;
# the reported figure is the mean of the per-fold scores
model_kfold = LogisticRegression(C=0.1, max_iter=30000, solver='lbfgs')
kfold = KFold(n_splits=5, shuffle=True, random_state=0)
results_kfold = cross_val_score(model_kfold, X, y, cv=kfold)
print("Accuracy: %.2f%%" % (100.0 * results_kfold.mean()))

Accuracy: 92.74%


# Finding the test set accuracy

In [26]:
# Rebuild the VGG16 network for feature extraction on the test images
# (same input shape and weights as the extractor built for the training images)
image_input = Input(shape=(218,178,3))
model = VGG16(include_top=False,weights="imagenet",input_tensor=image_input)
model.summary()

# Test label file and image folder, both located relative to `d`
dirname = os.path.dirname(d)
csvtest = os.path.join(d, 'test/celeba_test/labels.csv')
imgtest = os.path.join(d, 'test/celeba_test/img/')

test_dataset = pd.read_csv(csvtest, sep='\\t', engine='python')

# One flattened VGG16 output vector per row of the test label file
vgg16_test_feature_list = []
n_test_images = test_dataset.shape[0]
for i in tqdm(range(n_test_images)):
    img_path = imgtest + test_dataset['img_name'][i]
    pixels = image.img_to_array(image.load_img(img_path))
    # add a leading batch axis, then apply the VGG16 input preprocessing
    img = preprocess_input(np.expand_dims(pixels, axis=0))

    vgg16_feature = model.predict(img)
    vgg16_feature_np = np.array(vgg16_feature)
    vgg16_test_feature_list.append(vgg16_feature_np.flatten())


# Stack the per-image vectors into a single 2-D array and show its shape
vgg16_test_feature_list_np = np.array(vgg16_test_feature_list)

vgg16_test_feature_list_np.shape

  0%|          | 0/1000 [00:00<?, ?it/s]

Model: "vgg16"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_3 (InputLayer)         [(None, 218, 178, 3)]     0         
_________________________________________________________________
block1_conv1 (Conv2D)        (None, 218, 178, 64)      1792      
_________________________________________________________________
block1_conv2 (Conv2D)        (None, 218, 178, 64)      36928     
_________________________________________________________________
block1_pool (MaxPooling2D)   (None, 109, 89, 64)       0         
_________________________________________________________________
block2_conv1 (Conv2D)        (None, 109, 89, 128)      73856     
_________________________________________________________________
block2_conv2 (Conv2D)        (None, 109, 89, 128)      147584    
_________________________________________________________________
block2_pool (MaxPooling2D)   (None, 54, 44, 128)       0     

100%|██████████| 1000/1000 [04:14<00:00,  3.93it/s]


(1000, 15360)

In [30]:
# Test-set features and targets. NOTE: this rebinds X_test / Y_test, replacing
# the hold-out split produced earlier by train_test_split.
X_test = np.array(vgg16_test_feature_list)
# Same label transform as the training data: each 'gender' value g becomes (g + 1) / 2
test_gender_plus_one = test_dataset['gender'] + 1
Y_test = np.array(test_gender_plus_one) / 2

In [29]:
# Fit with the optimised parameters ('lbfgs', C=0.1) on the training split
# and report accuracy on the current X_test / Y_test
model = LogisticRegression(C=0.1, max_iter=10000, solver='lbfgs').fit(X_train, Y_train)
result = model.score(X_test, Y_test)
print("Accuracy: %.2f%%" % (100.0 * result))

Accuracy: 93.10%


# Finding the train set accuracy

In [31]:
# Fit with the optimised parameters ('lbfgs', C=0.1) and score on the same
# training split to obtain the train set accuracy
model = LogisticRegression(C=0.1, max_iter=10000, solver='lbfgs').fit(X_train, Y_train)
result = model.score(X_train, Y_train)
print("Accuracy: %.2f%%" % (100.0 * result))

Accuracy: 100.00%
