In [1]:
import gdown

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import os
import cv2
import tensorflow as tf
from typing import List
from matplotlib import pyplot as plt
import imageio
import numpy as np

In [4]:
def CTCLoss(y_true, y_pred):
    batch_len = tf.cast(tf.shape(y_true)[0], dtype="int64")
    input_length = tf.cast(tf.shape(y_pred)[1], dtype="int64")
    label_length = tf.cast(tf.shape(y_true)[1], dtype="int64")

    input_length = input_length * tf.ones(shape=(batch_len, 1), dtype="int64")
    label_length = label_length * tf.ones(shape=(batch_len, 1), dtype="int64")

    loss = tf.keras.backend.ctc_batch_cost(y_true, y_pred, input_length, label_length)
    return loss

In [5]:
model = tf.keras.models.load_model("/content/drive/MyDrive/lipnet01.h5", custom_objects={'CTCLoss': CTCLoss})

In [None]:
model.summary()

In [14]:
mouthxml = '/content/drive/MyDrive/haarcascade_mcs_mouth.xml'
mouth_cascade = cv2.CascadeClassifier(mouthxml)

In [81]:
mouth_cascade.empty()

False

In [107]:
def load_video(path:str) -> List[float]:
  ds_factor = 0.5
  cap = cv2.VideoCapture(path)
  frames = []

  while True:
      ret, frame = cap.read()

      if not ret:
          break

      frame = cv2.resize(frame, None, fx=ds_factor, fy=ds_factor, interpolation=cv2.INTER_AREA)

      gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)

      mouth_rects = mouth_cascade.detectMultiScale(frame, 1.7, 11)
      for x, y, w, h in mouth_rects:
          y = int(y - 0.15 * h)
          a = (100-h)/2
          b = (200-w)/2
          mouth_roi = tf.image.rgb_to_grayscale(frame[int(y-a):int(y+h+a), int(x-b):int(x+w+b)])
          frames.append(mouth_roi)

  cap.release()

  # Convert the list of NumPy arrays to a NumPy array
  frames_np = np.array(frames)

  # Calculate mean and standard deviation
  mean = np.mean(frames_np)
  std = np.std(frames_np)

  result = (frames_np - mean) / std

  result = tf.convert_to_tensor(result, dtype=tf.float32)

  return result

In [108]:
result = load_video('/content/drive/MyDrive/test07.mp4')#replace with sample path

In [109]:
result.shape

TensorShape([149, 100, 200, 1])

In [None]:
for x in range(len(result)):
  plt.imshow(result[x])
  plt.show()

In [None]:
plt.imshow(result[120])
plt.show()

In [95]:
result = np.expand_dims(result, axis=0)

In [None]:
# save clip as gif
result_np_list = [frame.numpy() for frame in result]

result_np_list = [np.squeeze(frame, axis=-1) for frame in result_np_list]

imageio.mimsave('/content/drive/MyDrive/example.gif', result_np_list, fps=10)

In [97]:
model.input_shape

(None, None, 100, 200, 1)

In [96]:
result.shape

(1, 149, 100, 200, 1)

In [98]:
pred = model.predict(result)



In [99]:
pred

array([[[2.1164941e-04, 1.9699565e-04, 6.3117141e-01, ...,
         4.7296556e-08, 5.5390684e-04, 8.9820666e-04],
        [6.5767963e-04, 7.6233624e-03, 3.3555835e-02, ...,
         3.6350801e-08, 5.5706041e-04, 3.2721624e-02],
        [1.0199876e-03, 1.0877026e-01, 2.8275617e-03, ...,
         1.8010788e-08, 3.9134407e-03, 4.3451291e-02],
        ...,
        [9.6942329e-01, 5.6036555e-07, 7.1630261e-08, ...,
         1.6275263e-09, 2.8182029e-07, 3.0426813e-02],
        [6.3897215e-02, 1.2776152e-06, 1.5133725e-07, ...,
         2.1523352e-09, 1.0015519e-06, 9.3575621e-01],
        [9.8423886e-01, 1.0250112e-06, 2.2371279e-07, ...,
         4.4527678e-08, 4.1456988e-07, 1.5639573e-02]]], dtype=float32)

In [100]:
vocab = [x for x in "abcdefghijklmnopqrstuvwxyz'?!123456789 "]

In [101]:
char_to_num = tf.keras.layers.StringLookup(vocabulary=vocab, oov_token="")
num_to_char = tf.keras.layers.StringLookup(
    vocabulary=char_to_num.get_vocabulary(), oov_token="", invert=True
)

In [102]:
tf.strings.reduce_join([num_to_char(x) for x in tf.argmax(pred[0],axis=1)])

<tf.Tensor: shape=(), dtype=string, numpy=b'ben    iieeee ainn     iieee   aggnnn'>