## Installing and Importing Dependencies

In [2]:
!pip install opencv-python matplotlib imageio gdown tensorflow

# opencv - preprocessing
# matplotlib - rendering the results (seeing outputs of preprocessed videos)
# imageio - to see other frames stacked together
# gdown - downloading dataset
# tensorflow - building neural network

Collecting opencv-python
  Downloading opencv_python-4.10.0.84-cp37-abi3-win_amd64.whl.metadata (20 kB)
Collecting gdown
  Downloading gdown-5.2.0-py3-none-any.whl.metadata (5.8 kB)
Collecting tensorflow
  Downloading tensorflow-2.18.0-cp312-cp312-win_amd64.whl.metadata (3.3 kB)
Collecting tensorflow-intel==2.18.0 (from tensorflow)
  Downloading tensorflow_intel-2.18.0-cp312-cp312-win_amd64.whl.metadata (4.9 kB)
Collecting absl-py>=1.0.0 (from tensorflow-intel==2.18.0->tensorflow)
  Downloading absl_py-2.1.0-py3-none-any.whl.metadata (2.3 kB)
Collecting astunparse>=1.6.0 (from tensorflow-intel==2.18.0->tensorflow)
  Downloading astunparse-1.6.3-py2.py3-none-any.whl.metadata (4.4 kB)
Collecting flatbuffers>=24.3.25 (from tensorflow-intel==2.18.0->tensorflow)
  Downloading flatbuffers-24.12.23-py2.py3-none-any.whl.metadata (876 bytes)
Collecting gast!=0.5.0,!=0.5.1,!=0.5.2,>=0.2.1 (from tensorflow-intel==2.18.0->tensorflow)
  Downloading gast-0.6.0-py3-none-any.whl.metadata (1.3 kB)
Coll

In [4]:
!pip list

Package                           Version
--------------------------------- ------------------
absl-py                           2.1.0
aext-assistant                    4.0.15
aext-assistant-server             4.0.15
aext-core                         4.0.15
aext-core-server                  4.0.15
aext-panels                       4.0.15
aext-panels-server                4.0.15
aext-share-notebook               4.0.15
aext-share-notebook-server        4.0.15
aext-shared                       4.0.15
aiobotocore                       2.12.3
aiohappyeyeballs                  2.4.0
aiohttp                           3.10.5
aioitertools                      0.7.1
aiosignal                         1.2.0
alabaster                         0.7.16
altair                            5.0.1
anaconda-anon-usage               0.4.4
anaconda-catalogs                 0.2.0
anaconda-client                   1.12.3
anaconda-cloud-auth               0.5.1
anaconda-navigator                2.6.3
anaconda-pro

In [6]:
import os # navigating through file systems in different operating systems
import cv2 # preprocessing and loading videos
import tensorflow as tf # modelling (tf.data - good data pipeline)
import numpy as np 
from typing import List 
import matplotlib.pyplot as plt
import imageio # array to a gif for preprocess

In [16]:
# Show which GPUs TensorFlow can see (an empty list means none were detected)
tf.config.list_physical_devices(device_type='GPU')

[]

In [26]:
# Enable memory growth on every visible GPU so TensorFlow allocates GPU memory
# as it is needed rather than claiming it all at start-up.
physical_devices = tf.config.list_physical_devices('GPU')
physical_device = physical_devices  # singular name kept for any cell that still refers to it
for gpu in physical_devices:
    try:
        tf.config.experimental.set_memory_growth(gpu, True)
    except RuntimeError as err:
        # Memory growth has to be configured before the GPU is initialised;
        # report the problem instead of silently ignoring it.
        print(f"Could not enable memory growth for {gpu.name}: {err}")

## Building Data Loading Functions

1. Load the videos.
2. Preprocess the annotations (the sentences spoken in each video).

In [22]:
import gdown

In [28]:
url = 'https://drive.google.com/uc?id=1YlvpDLix3S-U8fd-gqRwPcWXAXm8Jwjl'
output = 'data.zip'

# Only download when the archive is not already on disk, so re-running the
# notebook does not request the file from Google Drive again.
if not os.path.exists(output):
    gdown.download(url, output, quiet=False)

# Unpack the archive next to the notebook
gdown.extractall(output)

FileURLRetrievalError: Failed to retrieve file url:

	Cannot retrieve the public link of the file. You may need to change
	the permission to 'Anyone with the link', or have had many accesses.
	Check FAQ in https://github.com/wkentaro/gdown?tab=readme-ov-file#faq.

You may still be able to access the file from the browser:

	https://drive.google.com/uc?id=1YlvpDLix3S-U8fd-gqRwPcWXAXm8Jwjl

but Gdown can't. Please check connections and permissions.

In [30]:
# Data loading function

# Takes the path to a video file and returns its frames as a normalised float32 tensor
def load_video(path:str) -> List[float]:
    """Load a video, crop every frame to the mouth region and standardise it.

    Args:
        path: Path of the video file to open with OpenCV.

    Returns:
        A float32 tensor holding the cropped grayscale frames after
        subtracting their mean and dividing by their standard deviation.
        (The ``List[float]`` annotation is kept unchanged for compatibility.)

    Raises:
        IOError: If OpenCV cannot open the video at ``path``.
    """
    cap = cv2.VideoCapture(path)
    if not cap.isOpened():
        cap.release()
        raise IOError(f"Could not open video: {path}")

    frames = []
    try:
        for _ in range(int(cap.get(cv2.CAP_PROP_FRAME_COUNT))):
            ret, frame = cap.read()
            if not ret:
                # The reported frame count can be larger than the number of
                # frames that actually decode; stop at the first failed read.
                break
            frame = tf.image.rgb_to_grayscale(frame)
            # Keep rows 190-235 and columns 80-219: the fixed mouth region
            frames.append(frame[190:236, 80:220, :])
    finally:
        # Always free the capture handle, even if a frame fails to process
        cap.release()

    # Standardise the clip: subtract the mean and divide by the standard deviation.
    # NOTE(review): the mean is computed and subtracted in the frames' original
    # dtype, before the cast to float32. Kept as-is so results match the
    # existing pipeline; confirm whether float arithmetic was intended.
    mean = tf.math.reduce_mean(frames)
    std = tf.math.reduce_std(tf.cast(frames, tf.float32))
    return tf.cast((frames - mean), tf.float32) / std

In [32]:
# Every character the labels may contain: a-z, apostrophe, '?', '!', digits 1-9 and space
vocab = list("abcdefghijklmnopqrstuvwxyz'?!123456789 ")

In [52]:
# Character <-> integer lookup layers (tokenizing each character).
# Pattern taken from https://keras.io/examples/audio/ctc_asr/
#
# The integer ids produced here are the form in which the labels are passed
# to the loss function. Index 0 is the out-of-vocabulary token "".
char_to_num = tf.keras.layers.StringLookup(
    vocabulary=vocab,
    oov_token="",
)

# Inverse mapping: integer ids back to characters, sharing the same vocabulary
num_to_char = tf.keras.layers.StringLookup(
    vocabulary=char_to_num.get_vocabulary(),
    oov_token="",
    invert=True,
)

In [54]:
# Sanity check: encode a few in-vocabulary letters
char_to_num(list('alma'))

<tf.Tensor: shape=(4,), dtype=int64, numpy=array([ 1, 12, 13,  1], dtype=int64)>

In [56]:
# Sanity check with digits: '0' is not in vocab, so it maps to the OOV index 0
char_to_num(list('2014'))

<tf.Tensor: shape=(4,), dtype=int64, numpy=array([31,  0, 30, 33], dtype=int64)>

In [60]:
# Function to load alignments

def load_alignments(path:str) -> List[str]:
    """Read an alignment file and encode its words as character ids.

    Every line is split on whitespace and its third field is taken as the
    word. Entries equal to 'sil' (silence) and lines with fewer than three
    fields are skipped. The remaining words are joined with single spaces,
    with no leading or trailing space, and encoded character by character.

    Args:
        path: Path of the alignment text file.

    Returns:
        A 1-D integer tensor produced by ``char_to_num``, one id per character.
        (The ``List[str]`` annotation is kept unchanged for compatibility.)
    """
    # Collect the spoken words, ignoring silence markers and malformed lines
    words = []
    with open(path, 'r') as f:
        for line in f:
            fields = line.split()
            if len(fields) >= 3 and fields[2] != 'sil':
                words.append(fields[2])

    # One space between words; an empty file simply yields an empty sentence
    sentence = ' '.join(words)

    # Split into individual characters and flatten to a 1-D vector before lookup
    chars = tf.strings.unicode_split(sentence, input_encoding='UTF-8')
    return char_to_num(tf.reshape(chars, [-1]))

In [62]:
# NOTE(review): bare literal with no visible purpose in this notebook — confirm whether this cell can be removed
21.55

21.55