# This notebook is going over and trying to recreate the model used the paper LIPNET: END-TO-END SENTENCE-LEVEL LIPREADING. this notebook was also inspired by Nicholas Renotte

## in this we are only using one speaker in the grid data set. This will work for more speakers, but the time to train the model would take too long. This already took 6 min an epoch using a gpu on kaggle for only data for one speaker

In [3]:
# importing required libraries
import numpy as np 
import pandas as pd 
import os
from tensorflow.nn import ctc_loss
import tensorflow as tf
from keras.models import load_model
import cv2
import keras
from tensorflow.keras.models import Sequential 
from tensorflow.keras.layers import Input,Conv3D, Dense, Dropout, Bidirectional, MaxPool3D, Activation, Reshape, BatchNormalization,GRU, TimeDistributed, Flatten
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import ModelCheckpoint


In [4]:
# this function takes a video and loads the images. It also crops around the lips with a set range of pixels
# in the paper they use a lip detector to extract the lips. This is something that isn't implemented in this notebook
    outputframes = []
    video = cv2.VideoCapture(path)
    countframes = video.get(cv2.CAP_PROP_FRAME_COUNT)
    for i in range(int(countframes)):
        video.set(cv2.CAP_PROP_POS_FRAMES, 1)
        ret, frame = video.read()
        frame = tf.image.rgb_to_grayscale(frame)

        outputframes.append(tf.image.per_image_standardization(frame[190:236,80:220,:]))
    return outputframes

In [5]:
# creating the vocabulary
vocab = [x for x in "abcdefghijklmnopqrstuvwxyz'?!123456789 "]

In [6]:
# these are creating a way to transform a charcter to a number and number to letter
characternum = tf.keras.layers.StringLookup(vocabulary = vocab,oov_token="")
numcharacter = tf.keras.layers.StringLookup(vocabulary = vocab,oov_token="",invert = True)

2023-02-18 02:50:16.304196: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2023-02-18 02:50:16.400626: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2023-02-18 02:50:16.401414: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2023-02-18 02:50:16.403000: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compil

In [7]:
# this is loading the aligned labels for what the speaker is saying
# we get rid of the parts for silence
def load_align(path):
    #print(path)
    with open(path, 'r') as f: 
        lines = f.readlines() 
    sentence = []
    
    for line in lines:
        L =line.split()[2]
        if L == 'sil':
            continue
        else:
            if len(sentence)>1:
                sentence.append(' ')
            for j in L:
                sentence.append(j)
    return characternum(sentence)


In [8]:
# takes in video file name and ruetruns the frames and the alignments of words
def load(path):
    
    finalpath =bytes.decode(path.numpy())
   
    filename = finalpath.split('/')[-1]
    filename = filename.split('.')[0]
    
    framepath = os.path.join('/kaggle','input','grid-1-speaker','s4.mpg_vcd','s4',finalpath)
    alignpath = os.path.join('/kaggle','input','grid-1-speaker','s4','align',filename+'.align')
    frames = load_frames(framepath)
    align = load_align(alignpath)
    return frames, align

In [9]:
# this is needed to create the load function to work with tensorflow
def final_function(path:str):
    return tf.py_function(func=load, inp=[path], Tout=(tf.float32,tf.int64))

In [10]:
# this takes the data and shuffles, then runs our load function
# then makes a batch of size 2 and prefetchs some data for faster training.
# saving 450 videos for training and the rest for testing
data = tf.data.Dataset.list_files('/kaggle/input/grid-1-speaker/s4.mpg_vcd/s4/*.mpg', shuffle=None, seed=42)
data = data.shuffle(500, reshuffle_each_iteration=False)
data = data.map(final_function)
data = data.padded_batch(2, padded_shapes=([75,None,None,None],[40]))
data = data.prefetch(tf.data.AUTOTUNE)
# Added for split 
train = data.take(450)
test = data.skip(450)

In [11]:
frames, alignments = data.as_numpy_iterator().next()
frames.shape

2023-02-18 02:50:19.689577: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:185] None of the MLIR Optimization Passes are enabled (registered 2)


(2, 75, 46, 140, 1)

In [12]:
# this is for the ctc loss. This code was copied from an example on the tensorflow website.
# this loss is needed since our model will predict characters from our vocab for 
# each of the 75 frames. but the real alignments don't have 75 characters
# thus this loss deals with this difference
def CTCLoss(y_true, y_pred):
    batch_len = tf.cast(tf.shape(y_true)[0], dtype="int64")
    input_length = tf.cast(tf.shape(y_pred)[1], dtype="int64")
    label_length = tf.cast(tf.shape(y_true)[1], dtype="int64")

    input_length = input_length * tf.ones(shape=(batch_len, 1), dtype="int64")
    label_length = label_length * tf.ones(shape=(batch_len, 1), dtype="int64")

    loss = tf.keras.backend.ctc_batch_cost(y_true, y_pred, input_length, label_length)
    return loss


In [13]:
# this part is here to load a partialy trained model. Since it took a long time
# i needed to train over multiple sessions. 
model = load_model('/kaggle/input/final-lip-read/saved-model-v3-45.hdf5',compile=False)
model.summary()

Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv3d_9 (Conv3D)            (None, 75, 46, 140, 32)   832       
_________________________________________________________________
activation_9 (Activation)    (None, 75, 46, 140, 32)   0         
_________________________________________________________________
max_pooling3d_9 (MaxPooling3 (None, 75, 23, 70, 32)    0         
_________________________________________________________________
conv3d_10 (Conv3D)           (None, 75, 23, 70, 64)    51264     
_________________________________________________________________
activation_10 (Activation)   (None, 75, 23, 70, 64)    0         
_________________________________________________________________
max_pooling3d_10 (MaxPooling (None, 75, 11, 35, 64)    0         
_________________________________________________________________
conv3d_11 (Conv3D)           (None, 75, 11, 35, 96)   

In [14]:
# this is the code for creating the model like the one from the paper
'''model = Sequential()
model.add(Input(shape=(75,46,140,1)))
model.add(Conv3D(32,kernel_size = (5,5,1),padding='same'))
model.add(Activation('relu'))
model.add(MaxPool3D((1,2,2)))

model.add(Conv3D(64,kernel_size = (5,5,1),padding='same'))
model.add(Activation('relu'))
model.add(MaxPool3D((1,2,2)))

model.add(Conv3D(96,kernel_size = (3,3,1),padding='same'))
model.add(Activation('relu'))
model.add(MaxPool3D((1,2,2)))


model.add(TimeDistributed(Flatten()))
model.add(GRU(256,recurrent_initializer='orthogonal',return_sequences=True))
model.add(Dropout(0.5))

model.add(GRU(256,recurrent_initializer='orthogonal',return_sequences=True))
model.add(Dropout(0.5))

model.add(Dense(characternum.vocabulary_size()+1, kernel_initializer='he_normal', activation='softmax'))

model.summary()
'''

"model = Sequential()\nmodel.add(Input(shape=(75,46,140,1)))\nmodel.add(Conv3D(32,kernel_size = (5,5,1),padding='same'))\nmodel.add(Activation('relu'))\nmodel.add(MaxPool3D((1,2,2)))\n\nmodel.add(Conv3D(64,kernel_size = (5,5,1),padding='same'))\nmodel.add(Activation('relu'))\nmodel.add(MaxPool3D((1,2,2)))\n\nmodel.add(Conv3D(96,kernel_size = (3,3,1),padding='same'))\nmodel.add(Activation('relu'))\nmodel.add(MaxPool3D((1,2,2)))\n\n\nmodel.add(TimeDistributed(Flatten()))\nmodel.add(GRU(256,recurrent_initializer='orthogonal',return_sequences=True))\nmodel.add(Dropout(0.5))\n\nmodel.add(GRU(256,recurrent_initializer='orthogonal',return_sequences=True))\nmodel.add(Dropout(0.5))\n\nmodel.add(Dense(characternum.vocabulary_size()+1, kernel_initializer='he_normal', activation='softmax'))\n\nmodel.summary()\n"

In [15]:
fr ,r =data.as_numpy_iterator().next()
print(fr.shape)
y = model.predict(fr)
y.shape

(2, 75, 46, 140, 1)


2023-02-18 02:50:25.124814: I tensorflow/stream_executor/cuda/cuda_dnn.cc:369] Loaded cuDNN version 8005


(2, 75, 41)

In [16]:
model.compile(optimizer=Adam(learning_rate=0.0001), loss=CTCLoss)

In [17]:
filepath = "saved-model-v3-{epoch:02d}.hdf5"
checkpoint_callback = ModelCheckpoint(filepath, monitor='loss', verbose=1, save_best_only=False, mode='min')

#checkpoint_callback = ModelCheckpoint(filepath ='my_best_model.hdf5', monitor='loss',save_best_only=False, mode=’min’) 

In [28]:

# this code decodes the predictions for the model. it takes the batch input
# it decodes the ctc results and then using a number to character function
# takes the number and decodes it to its character and combines all of the 
# results for the same output
def decode_batch_predictions(pred):
    input_len = np.ones(pred.shape[0]) * pred.shape[1]
    
    results = keras.backend.ctc_decode(pred, input_length=input_len, greedy=True)[0][0]
    # Iterate over the results and get back the text
    output_text = []
    for result in results:
        result = tf.strings.reduce_join(numcharacter(result)).numpy().decode("utf-8")
        output_text.append(result)
    return output_text

In [17]:
fr ,r =data.as_numpy_iterator().next()

In [20]:
a = fr
output = model.predict(a)


array([[[3.92715505e-04, 7.06533086e-04, 1.54368937e-01, ...,
         2.32615207e-07, 1.55888556e-04, 2.01194052e-04],
        [1.15997891e-05, 5.15149091e-04, 1.32713700e-04, ...,
         6.10261119e-10, 1.64611185e-06, 3.40751831e-05],
        [1.51826850e-06, 9.92895305e-01, 1.06430662e-05, ...,
         1.36125311e-09, 1.27989244e-06, 1.12719827e-04],
        ...,
        [9.98312473e-01, 2.01988737e-08, 8.01099560e-08, ...,
         3.29071874e-11, 1.67962195e-08, 1.68275111e-03],
        [3.36059695e-03, 6.39720099e-08, 1.63123937e-09, ...,
         1.48678999e-14, 5.75921311e-10, 9.96638298e-01],
        [9.98790205e-01, 1.34166243e-08, 1.05079216e-07, ...,
         2.63463071e-11, 2.43913156e-08, 1.20611303e-03]],

       [[1.39042968e-04, 1.45364625e-04, 5.68868266e-03, ...,
         1.09336282e-08, 1.35872624e-05, 1.79015144e-04],
        [2.82467790e-05, 5.76017017e-04, 2.40705976e-05, ...,
         5.82730086e-10, 1.19791548e-06, 3.44191387e-04],
        [3.62313108e-06, 

## below takes two examples and runs them through the model to see results and decodes the results.

## we can see that it does a pretty good job at getting the correct words. It does have some errors. But overall does a good job

In [35]:
print('real output:',tf.strings.reduce_join(numcharacter(r[0])).numpy().decode("utf-8"))
print('predicted output',decode_batch_predictions(output)[0])

print('second video')
print('real output:',tf.strings.reduce_join(numcharacter(r[1])).numpy().decode("utf-8"))
print('predicted output',decode_batch_predictions(output)[1])

real output: lay red in q six again
predicted output lay red in x six again
second video
real output: lay red with z zero soon
predicted output lay red with z zero soon
