#Imports

In [None]:
import os
import tensorflow as tf
import numpy as np
import cv2
import gdown
from matplotlib import pyplot as plt

In [None]:
# Ask TensorFlow to grow GPU memory on demand instead of reserving it all up
# front. The setting is applied to every visible GPU because TensorFlow
# requires memory growth to be configured identically across GPUs.
physical_devices = tf.config.list_physical_devices('GPU')
for gpu in physical_devices:
    try:
        tf.config.experimental.set_memory_growth(gpu, True)
    except (ValueError, RuntimeError) as err:
        # Best effort: memory growth cannot be changed once the runtime has
        # been initialised. Report the reason instead of hiding it.
        print(f"Could not enable memory growth on {gpu.name}: {err}")

#Collecting GRID dataset

In [None]:
# Download the input-video archive from Google Drive and extract it.
url = 'https://drive.google.com/uc?id=1YlvpDLix3S-U8fd-gqRwPcWXAXm8JwjL'
output = 'data.zip'
gdown.download(url, output, quiet=False)
# Reuse `output` so the extracted archive is always the one just downloaded.
gdown.extractall(output)

#Data Preprocessing

In [None]:
#mouth region image parsing

# Haar cascade used by load_video() to locate the mouth in each frame.
MOUTH_CASCADE_PATH = "/content/mouth.xml"
mouthCascade = cv2.CascadeClassifier(MOUTH_CASCADE_PATH)
if mouthCascade.empty():
    # OpenCV can return an empty classifier instead of raising when the XML
    # file is missing or invalid; fail here with a clear message rather than
    # later inside detectMultiScale().
    raise FileNotFoundError(
        f"Could not load mouth cascade from {MOUTH_CASCADE_PATH!r}"
    )

def load_video(path: str, mouth_size: tuple[int, int] = (140, 46)) -> "tf.Tensor":
    """Load a video and return its standardised, mouth-cropped grayscale frames.

    Each frame is converted to grayscale, the mouth is located with the
    module-level ``mouthCascade`` classifier, and the detected region is
    cropped and resized to ``mouth_size`` so that all frames share one shape.
    The stacked frames are then standardised (zero mean, unit variance over
    the whole clip).

    Args:
        path: Path of the video file to read.
        mouth_size: ``(width, height)`` every mouth crop is resized to.

    Returns:
        A float32 tensor of shape ``(frames, height, width, 1)``.

    Raises:
        ValueError: If no frame could be read or no mouth was detected in
            any frame.
    """
    cap = cv2.VideoCapture(path)
    gray_frames = []
    boxes = []
    try:
        for _ in range(int(cap.get(cv2.CAP_PROP_FRAME_COUNT))):
            ret, frame = cap.read()
            if not ret:
                # The reported frame count can exceed what is decodable.
                break
            # OpenCV decodes frames as BGR NumPy arrays, and the cascade
            # needs a NumPy image, so the grayscale conversion stays in cv2.
            gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
            detections = mouthCascade.detectMultiScale(gray, 1.3, 5)
            if len(detections) > 0:
                # TODO: when several regions are detected the largest one is
                # used; this is a heuristic, not a verified mouth selection.
                boxes.append(max(detections, key=lambda box: box[2] * box[3]))
            else:
                boxes.append(None)
            gray_frames.append(gray)
    finally:
        # Release the capture even if decoding or detection raised.
        cap.release()

    # Frames without a detection reuse the most recent box; frames before the
    # first detection reuse the first box, so the frame count is preserved.
    current = next((box for box in boxes if box is not None), None)
    if current is None:
        raise ValueError(
            f"No mouth region could be detected in {path!r} "
            "(unreadable video or no detections)"
        )

    crops = []
    for gray, box in zip(gray_frames, boxes):
        if box is not None:
            current = box
        x, y, w, h = current
        # Image arrays are indexed [row (y), column (x)].
        crop = cv2.resize(gray[y:y + h, x:x + w], mouth_size)
        crops.append(crop[..., np.newaxis])

    # Cast before computing statistics so the mean is not an integer mean.
    frames = tf.cast(np.stack(crops), tf.float32)
    mean = tf.math.reduce_mean(frames)
    std = tf.math.reduce_std(frames)
    return (frames - mean) / std

In [None]:
# Try the preprocessing on a single sample clip; as the last expression of the
# cell its result is displayed by the notebook.
load_video("/content/bbaf2n.mpg")

In [None]:
# Characters the label encoder knows about; anything else is handled through
# the empty-string OOV token.
vocab = list("abcdefghijklmnopqrstuvwxyz'?!123456789 ")

# Forward lookup: character -> integer id.
char_to_num = tf.keras.layers.StringLookup(vocabulary=vocab, oov_token="")

# Inverse lookup built from the same vocabulary: integer id -> character.
num_to_char = tf.keras.layers.StringLookup(
    vocabulary=char_to_num.get_vocabulary(),
    oov_token="",
    invert=True,
)

In [None]:
def load_alignments(path: str) -> "tf.Tensor":
    """Read an alignment file and encode its words as character ids.

    The third whitespace-separated field of every line is taken as the word;
    lines whose word is ``'sil'`` are dropped. The remaining words are joined
    with single spaces, split into characters and mapped through the
    module-level ``char_to_num`` lookup.

    Args:
        path: Path of the alignment file.

    Returns:
        A 1-D tensor of character ids produced by ``char_to_num``.
    """
    words = []
    with open(path, 'r') as f:
        for line in f:
            fields = line.split()
            # Skip blank or truncated lines instead of raising IndexError.
            if len(fields) >= 3 and fields[2] != 'sil':
                words.append(fields[2])
    # Joining first yields exactly "w1 w2 ...", so there is no leading
    # separator to strip and a scalar string splits into a flat 1-D tensor.
    chars = tf.strings.unicode_split(' '.join(words), input_encoding='UTF-8')
    return char_to_num(chars)

In [None]:
def load_data(path: str) -> "tuple[tf.Tensor, tf.Tensor]":
    """Load the frames and the encoded alignment for one sample.

    Only the file's base name (up to its first dot) is taken from ``path``;
    the video is read from ``data/s1/<name>.mpg`` and the alignment from
    ``data/alignments/s1/<name>.align``.

    Args:
        path: Path of a sample file; both ``/`` and ``\\`` separators are
            accepted.

    Returns:
        ``(frames, alignments)`` as returned by ``load_video`` and
        ``load_alignments``.
    """
    # Normalise separators so POSIX paths do not leak their directory
    # components into the derived file name.
    base_name = path.replace("\\", "/").rsplit("/", 1)[-1]
    file_name = base_name.split(".")[0]
    video_path = os.path.join('data', 's1', f'{file_name}.mpg')
    alignment_path = os.path.join('data', 'alignments', 's1', f'{file_name}.align')
    frames = load_video(video_path)
    alignments = load_alignments(alignment_path)
    return frames, alignments

In [None]:
def mappable_function(path: str) -> "tuple[tf.Tensor, tf.Tensor]":
    """Load one sample and cast it to the dtypes used downstream.

    The sample is loaded once. Frames and alignments are returned as a tuple
    rather than stacked, because they differ in dtype and shape and therefore
    cannot be combined with ``tf.stack``.

    Args:
        path: Path of the sample, forwarded to ``load_data``.

    Returns:
        ``(frames, alignments)`` with frames cast to ``tf.float32`` and
        alignments cast to ``tf.int64``.
    """
    # NOTE(review): load_data works on a Python str; if this is used inside
    # tf.data.Dataset.map it presumably needs a tf.py_function wrapper — confirm
    # against the caller.
    frames, alignments = load_data(path)
    return tf.cast(frames, tf.float32), tf.cast(alignments, tf.int64)