In [9]:
from facenet_pytorch import MTCNN, InceptionResnetV1
from data import LipReadingData
import torch
from torch.utils.data import DataLoader
import matplotlib.pyplot as plt
from PIL import Image
import torchvision.transforms.functional as F
import numpy as np

from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

In [2]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# modules are picked up on the next cell run without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [3]:
# Face detector used to crop each input image before embedding.
# All detector settings are spelled out explicitly here.
mtcnn = MTCNN(
    image_size=160,
    margin=0,
    min_face_size=20,
    thresholds=[0.6, 0.7, 0.7],
    factor=0.709,
    post_process=True,
)

# Inception-ResNet embedder loaded with the 'vggface2' pretrained weights.
resnet = InceptionResnetV1(pretrained='vggface2')
# Switch to eval mode; re-assigning keeps the cell from displaying the model.
resnet = resnet.eval()

In [4]:
# Dataset built from the label CSV and the image directory.
training_data = LipReadingData(
    'data/labels.csv',
    'data/images',
)

# One sample per batch, unshuffled, so samples arrive in dataset order.
train_dataloader = DataLoader(
    training_data,
    batch_size=1,
    shuffle=False,
)

In [5]:
# Number of samples available in the dataset (displayed as the cell output).
len(training_data)

1405

In [11]:
# Calculate embedding matrix
#
# Builds X (one 512-wide embedding row per image) and y (the matching label)
# from the first `num_images` samples yielded by `train_dataloader`.
num_images = 300  # Run to 1405 given more time
# Never ask for more rows than the loader can supply; otherwise the surplus
# rows would stay all-zero with label 0 and silently pollute the training set.
num_images = min(num_images, len(train_dataloader))

X = np.zeros((num_images, 512))
y = np.zeros(num_images)
# Tracks which rows actually received an embedding.
has_face = np.zeros(num_images, dtype=bool)

# Inference only: no_grad avoids building an autograd graph for every image.
with torch.no_grad():
    # zip with range stops after num_images samples without pulling an extra
    # sample from the loader.
    for i, (img, label) in zip(range(num_images), train_dataloader):
        img = F.to_pil_image(img.squeeze())
        img_cropped = mtcnn(img)  # TODO: fix image saving
        if img_cropped is None:
            # The detector found no face; skip instead of crashing on None.
            continue
        img_embedding = resnet(img_cropped.unsqueeze(0))
        # Drop the batch dimension so the row assignment is shape-exact.
        X[i, :] = img_embedding.squeeze(0).numpy()
        y[i] = label[0]
        has_face[i] = True

if not has_face.all():
    # Remove rows for which no embedding could be computed so that X and y
    # contain only real samples.
    num_skipped = int((~has_face).sum())
    print(f'No face detected in {num_skipped} of {num_images} images; dropping those rows.')
    X, y = X[has_face], y[has_face]

print(X)

[[ 0.03505049  0.01917475  0.05368248 ... -0.00045949  0.01423905
  -0.02000737]
 [ 0.02781379  0.01840598  0.05715199 ... -0.01897375  0.01132588
  -0.02340254]
 [ 0.03245189  0.01203464  0.04270403 ... -0.01583792  0.00617604
  -0.01759676]
 ...
 [ 0.00022129  0.05210192  0.0401425  ... -0.0315658   0.03115914
  -0.02125592]
 [-0.00865102  0.05984825  0.04916072 ... -0.02013059  0.02772941
  -0.01559018]
 [ 0.0019372   0.05788144  0.04553071 ... -0.01368961  0.03738432
  -0.00693679]]


In [13]:
# Show the label vector that pairs with the embedding matrix X.
# NOTE(review): the saved output below lists 100 labels while num_images is
# set to 300 in the embedding cell — it looks stale; re-run to refresh.
print(y)

[0. 1. 1. 0. 0. 0. 0. 0. 1. 0. 1. 0. 1. 0. 1. 0. 0. 0. 0. 1. 0. 0. 0. 0.
 1. 0. 1. 0. 0. 0. 1. 1. 0. 0. 1. 0. 1. 0. 1. 0. 0. 0. 0. 0. 0. 0. 1. 0.
 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 1. 0. 0. 0. 0. 1.
 0. 0. 0. 0.]


In [18]:
# Hold out a test split with a fixed seed so the split is repeatable.
# NOTE(review): the labels printed above look imbalanced; consider a
# stratified split and comparing against a majority-class baseline.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=0,
)

# Standardise the embedding features, then classify with an SVM.
pipe = Pipeline(
    [
        ('scaler', StandardScaler()),
        ('svc', SVC()),
    ]
).fit(X_train, y_train)

# Held-out predictions and the pipeline's score on the same split.
predictions = pipe.predict(X_test)
score = pipe.score(X_test, y_test)
print('Accuracy:', score)

Accuracy: 0.72
