# Import libraries

In [6]:
import os

import numpy as np
import pandas as pd
from tqdm import tqdm

# Importing sklearn libraries
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn import datasets
from sklearn import preprocessing

# Importing Keras libraries
import keras
from keras.utils import np_utils
from keras.models import Sequential
from keras.applications import VGG16
from keras.applications import imagenet_utils
from keras.callbacks import ModelCheckpoint
from keras.preprocessing import image
from keras.preprocessing.image import load_img
from keras.preprocessing.image import img_to_array
from keras.layers import Dense, Conv2D, MaxPooling2D
from keras.layers import Dropout, Flatten, GlobalAveragePooling2D
from keras.preprocessing import image
from keras.applications.vgg16 import VGG16
from keras.applications.vgg16 import preprocess_input
from keras.layers import merge, Input


# Import and preprocess data

In [7]:
# Build the VGG16 convolutional base (ImageNet weights, no classifier head)
# on a 218x178 RGB input tensor. It is only used to produce features here.
image_input = Input(shape=(218,178,3))

model = VGG16(include_top=False,weights="imagenet",input_tensor=image_input)

model.summary()

# Locate the project root: the parent of the current working directory.
p = os.getcwd()
print(p)

# The original cell called os.system(p) at this point, which asked the shell
# to execute the working-directory path as a command; that call is removed.
# NOTE(review): os.chdir("..") moves up one level every time this cell runs,
# so re-running it without restarting the kernel changes `d`.
os.chdir("..")
d = os.getcwd()
print(d)

# `dirname` is not read in this cell; it is kept as a notebook global.
dirname = os.path.dirname(d)
csvfile = os.path.join(d, 'datasets/celeba/labels.csv')
imgfile = os.path.join(d, 'datasets/celeba/img/')

# With engine='python' the two-character separator is treated as the regular
# expression \t, i.e. the file is split on tab characters.
dataset = pd.read_csv(csvfile, sep='\\t', engine='python')

# Run every image through the network and keep one flattened feature vector
# per image. Iterating over the column values (instead of looking rows up by
# integer label) does not depend on how the DataFrame index is laid out.
vgg16_feature_list = []
for img_name in tqdm(dataset['img_name']):
    img = image.load_img(os.path.join(imgfile, img_name))
    img = image.img_to_array(img)
    img = np.expand_dims(img, axis=0)  # add the leading batch axis
    img = preprocess_input(img)

    vgg16_feature = model.predict(img)
    vgg16_feature_list.append(np.asarray(vgg16_feature).flatten())


vgg16_feature_list_np = np.array(vgg16_feature_list)

vgg16_feature_list_np.shape

  0%|          | 0/5000 [00:00<?, ?it/s]

Model: "vgg16"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 218, 178, 3)]     0         
_________________________________________________________________
block1_conv1 (Conv2D)        (None, 218, 178, 64)      1792      
_________________________________________________________________
block1_conv2 (Conv2D)        (None, 218, 178, 64)      36928     
_________________________________________________________________
block1_pool (MaxPooling2D)   (None, 109, 89, 64)       0         
_________________________________________________________________
block2_conv1 (Conv2D)        (None, 109, 89, 128)      73856     
_________________________________________________________________
block2_conv2 (Conv2D)        (None, 109, 89, 128)      147584    
_________________________________________________________________
block2_pool (MaxPooling2D)   (None, 54, 44, 128)       0     

100%|██████████| 5000/5000 [19:49<00:00,  4.20it/s]


(5000, 15360)

# Label and split data

In [8]:
# Assemble the feature matrix and the target vector.
# Each 'smiling' value v is mapped to (v + 1) / 2
# (presumably -1/+1 labels becoming 0/1 -- confirm against the CSV).
X = np.asarray(vgg16_feature_list)
smiling_shifted = dataset['smiling'] + 1
y = np.array(smiling_shifted) / 2

In [9]:
# Hold out 25% of the samples as a validation set; the fixed seed makes the
# split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=100,
)

# Optimise solver

In [4]:
# Validation accuracy with the lbfgs solver
model = LogisticRegression(max_iter=10000, solver='lbfgs')
# fit() returns the estimator itself, so scoring can be chained onto it.
result = model.fit(X_train, Y_train).score(X_test, Y_test)
print("Accuracy: %.2f%%" % (100.0 * result))

Accuracy: 76.96%


In [5]:
# Validation accuracy with the newton-cg solver
model = LogisticRegression(max_iter=10000, solver='newton-cg')
# fit() returns the estimator itself, so scoring can be chained onto it.
result = model.fit(X_train, Y_train).score(X_test, Y_test)
print("Accuracy: %.2f%%" % (100.0 * result))

Accuracy: 76.96%




In [6]:
# Validation accuracy with the liblinear solver
model = LogisticRegression(max_iter=10000, solver='liblinear')
# fit() returns the estimator itself, so scoring can be chained onto it.
result = model.fit(X_train, Y_train).score(X_test, Y_test)
print("Accuracy: %.2f%%" % (100.0 * result))

Accuracy: 76.16%


# Optimise C value

In [7]:
# Validation accuracy with C = 10
model = LogisticRegression(C=10, max_iter=10000, solver='lbfgs')
result = model.fit(X_train, Y_train).score(X_test, Y_test)
print("Accuracy: %.2f%%" % (100.0 * result))

Accuracy: 76.80%


In [8]:
# Validation accuracy with C = 1.0
model = LogisticRegression(C=1.0, max_iter=10000, solver='lbfgs')
result = model.fit(X_train, Y_train).score(X_test, Y_test)
print("Accuracy: %.2f%%" % (100.0 * result))

Accuracy: 76.96%


In [9]:
# Validation accuracy with C = 0.1
model = LogisticRegression(C=0.1, max_iter=10000, solver='lbfgs')
result = model.fit(X_train, Y_train).score(X_test, Y_test)
print("Accuracy: %.2f%%" % (100.0 * result))

Accuracy: 77.60%


In [10]:
# Validation accuracy with C = 0.01
model = LogisticRegression(C=0.01, max_iter=10000, solver='lbfgs')
result = model.fit(X_train, Y_train).score(X_test, Y_test)
print("Accuracy: %.2f%%" % (100.0 * result))

Accuracy: 78.32%


# Accuracy of validation set using KFold CV

In [18]:
# Shuffled 5-fold cross-validation on the full labelled set (X, y)
kfold = KFold(n_splits=5, shuffle=True, random_state=0)
model_kfold = LogisticRegression(C=0.01, max_iter=20000, solver='lbfgs')
results_kfold = cross_val_score(model_kfold, X, y, cv=kfold)
# Report the mean of the per-fold scores.
print("Accuracy: %.2f%%" % (100.0 * results_kfold.mean()))

Accuracy: 78.24%


In [19]:
# Shuffled 10-fold cross-validation on the full labelled set (X, y)
kfold = KFold(n_splits=10, shuffle=True, random_state=0)
model_kfold = LogisticRegression(C=0.01, max_iter=20000, solver='lbfgs')
results_kfold = cross_val_score(model_kfold, X, y, cv=kfold)
# Report the mean of the per-fold scores.
print("Accuracy: %.2f%%" % (100.0 * results_kfold.mean()))

Accuracy: 78.70%


In [20]:
# Shuffled, stratified 3-fold cross-validation on the full labelled set (X, y)
skfold = StratifiedKFold(n_splits=3, shuffle=True, random_state=100)
model_skfold = LogisticRegression(C=0.01, max_iter=20000, solver='lbfgs')
results_skfold = cross_val_score(model_skfold, X, y, cv=skfold)
# Report the mean of the per-fold scores.
print("Accuracy: %.2f%%" % (100.0 * results_skfold.mean()))

Accuracy: 78.10%


# Finding train set accuracy

In [21]:
# Train-set accuracy of the model with the optimised parameters:
# the classifier is scored on the same samples it was fitted on.
model = LogisticRegression(C=0.01, max_iter=10000, solver='lbfgs')
result = model.fit(X_train, Y_train).score(X_train, Y_train)
print("Accuracy: %.2f%%" % (100.0 * result))

Accuracy: 100.00%


# Accuracy of test set

In [11]:
# Rebuild the VGG16 convolutional base (ImageNet weights, no classifier head);
# this rebinds `model`, which the preceding cells bound to LogisticRegression.
image_input = Input(shape=(218,178,3))
model = VGG16(include_top=False,weights="imagenet",input_tensor=image_input)
model.summary()

# Test-set locations, resolved against `d` (set by the earlier data-loading cell).
dirname = os.path.dirname(d)
csvtest = os.path.join(d, 'test/celeba_test/labels.csv')
imgtest = os.path.join(d, 'test/celeba_test/img/')

test_dataset = pd.read_csv(csvtest, sep='\\t', engine='python')

# One flattened VGG16 feature vector per test image, in CSV row order.
vgg16_test_feature_list = []
for row in tqdm(range(len(test_dataset))):
    pixels = image.img_to_array(image.load_img(imgtest+test_dataset['img_name'][row]))
    batch = preprocess_input(np.expand_dims(pixels, axis=0))  # leading batch axis
    features = np.array(model.predict(batch))
    vgg16_test_feature_list.append(features.flatten())

vgg16_test_feature_list_np = np.array(vgg16_test_feature_list)

vgg16_test_feature_list_np.shape

  0%|          | 0/1000 [00:00<?, ?it/s]

Model: "vgg16"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         [(None, 218, 178, 3)]     0         
_________________________________________________________________
block1_conv1 (Conv2D)        (None, 218, 178, 64)      1792      
_________________________________________________________________
block1_conv2 (Conv2D)        (None, 218, 178, 64)      36928     
_________________________________________________________________
block1_pool (MaxPooling2D)   (None, 109, 89, 64)       0         
_________________________________________________________________
block2_conv1 (Conv2D)        (None, 109, 89, 128)      73856     
_________________________________________________________________
block2_conv2 (Conv2D)        (None, 109, 89, 128)      147584    
_________________________________________________________________
block2_pool (MaxPooling2D)   (None, 54, 44, 128)       0     

100%|██████████| 1000/1000 [04:08<00:00,  4.03it/s]


(1000, 15360)

In [16]:
# Feature matrix and target vector for the held-out test set.
# Each 'smiling' value v is mapped to (v + 1) / 2, as for the training labels.
X_TEST = np.asarray(vgg16_test_feature_list)
test_smiling_shifted = test_dataset['smiling'] + 1
Y_TEST = np.array(test_smiling_shifted) / 2

In [17]:
# Fit the model with the optimised parameters on the training split and
# score it on the separate test set.
model = LogisticRegression(C=0.01, max_iter=10000, solver='lbfgs')
result = model.fit(X_train, Y_train).score(X_TEST, Y_TEST)
print("Accuracy: %.2f%%" % (100.0 * result))

Accuracy: 78.10%
