In [1]:
import os
import cv2
import torch
import gdown
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import torch.nn as nn
import torch.nn.functional as F
from torch.nn.functional import ctc_loss
from torch.utils.data import Dataset, DataLoader
from torchvision.transforms import Compose, Lambda
import torch.optim as optim
from typing import List
from tqdm import tqdm

In [2]:
from utils import return_vocab_size, char_to_num, num_to_char


In [3]:
df = pd.read_csv("face_landmarks.csv", index_col=0)

In [4]:
vocab_len = len(list("abcdefghijklmnopqrstuvwxyz'?!123456789 "))
video_path_counts = df['video_path'].value_counts()

video_paths_to_keep = video_path_counts[video_path_counts == 3000].index

df = df[df['video_path'].isin(video_paths_to_keep)]

In [5]:
df.groupby('video_path').count()

Unnamed: 0_level_0,frame,x,y,z,visibility
video_path,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
data/s1\bbaf2n.mpg,3000,3000,3000,3000,3000
data/s1\bbaf3s.mpg,3000,3000,3000,3000,3000
data/s1\bbaf4p.mpg,3000,3000,3000,3000,3000
data/s1\bbaf5a.mpg,3000,3000,3000,3000,3000
data/s1\bbal6n.mpg,3000,3000,3000,3000,3000
...,...,...,...,...,...
data/s1\swwp5a.mpg,3000,3000,3000,3000,3000
data/s1\swwv6n.mpg,3000,3000,3000,3000,3000
data/s1\swwv7s.mpg,3000,3000,3000,3000,3000
data/s1\swwv8p.mpg,3000,3000,3000,3000,3000


In [6]:
df_filtered = df.copy()
chunks = []
num_frames_per_chunk = 3000
for video_path, group in df_filtered.groupby('video_path'):
    num_frames = len(group)
    num_chunks = num_frames // num_frames_per_chunk
    for i in range(num_chunks):
        chunk = group.iloc[i*num_frames_per_chunk:(i+1)*num_frames_per_chunk]
        chunk_reshaped = chunk[['x', 'y']].values.reshape(-1, 75, 40*2)
        chunks.append(chunk_reshaped)

input_data = np.concatenate(chunks, axis=0)

In [7]:
input_data.shape

(984, 75, 80)

In [8]:
import os
from typing import List
import tensorflow as tf
import pandas as pd

def char_to_num(char):
    vocab = "abcdefghijklmnopqrstuvwxyz'?!123456789 "
    return vocab.index(char) if char in vocab else -1

def load_alignments(path:str) -> List[str]:
    with open(path, 'r') as f:
        lines = f.readlines()
    tokens = []
    for line in lines:
        line = line.split()
        if line[2] != 'sil':
            tokens.extend([*line[2]])
    return [char_to_num(token) for token in tokens]

all_alignments = []
for video_path in df['video_path'].unique():
    video_path = video_path.split('/')[-1].split('\\')[-1].split('.')[0]
    
    alignment_path = os.path.join('data','alignments','s1',f'{video_path}.align')
    alignments = load_alignments(alignment_path) 
    print(alignments)
    all_alignments.append(alignments)


[1, 8, 13, 1, 11, 20, 4, 0, 19, 5, 19, 22, 14, 13, 14, 22]
[1, 8, 13, 1, 11, 20, 4, 0, 19, 5, 19, 7, 17, 4, 4, 18, 14, 14, 13]
[1, 8, 13, 1, 11, 20, 4, 0, 19, 5, 5, 14, 20, 17, 15, 11, 4, 0, 18, 4]
[1, 8, 13, 1, 11, 20, 4, 0, 19, 5, 5, 8, 21, 4, 0, 6, 0, 8, 13]
[1, 8, 13, 1, 11, 20, 4, 0, 19, 11, 18, 8, 23, 13, 14, 22]
[1, 8, 13, 1, 11, 20, 4, 0, 19, 11, 18, 4, 21, 4, 13, 18, 14, 14, 13]
[1, 8, 13, 1, 11, 20, 4, 0, 19, 11, 4, 8, 6, 7, 19, 15, 11, 4, 0, 18, 4]
[1, 8, 13, 1, 11, 20, 4, 0, 19, 11, 13, 8, 13, 4, 0, 6, 0, 8, 13]
[1, 8, 13, 1, 11, 20, 4, 0, 19, 18, 14, 13, 4, 18, 14, 14, 13]
[1, 8, 13, 1, 11, 20, 4, 0, 19, 18, 19, 22, 14, 15, 11, 4, 0, 18, 4]
[1, 8, 13, 1, 11, 20, 4, 0, 19, 18, 19, 7, 17, 4, 4, 0, 6, 0, 8, 13]
[1, 8, 13, 1, 11, 20, 4, 0, 19, 18, 25, 4, 17, 14, 13, 14, 22]
[1, 8, 13, 1, 11, 20, 4, 0, 19, 25, 5, 14, 20, 17, 13, 14, 22]
[1, 8, 13, 1, 11, 20, 4, 0, 19, 25, 5, 8, 21, 4, 18, 14, 14, 13]
[1, 8, 13, 1, 11, 20, 4, 0, 19, 25, 18, 8, 23, 15, 11, 4, 0, 18, 4]
[1, 8, 13,

In [9]:
len(all_alignments)

984

In [10]:
video_path.split('/')[-1].split('\\')[-1].split('.')[0]

'swwv9a'

In [12]:
import numpy as np

max_seq_length = 30  

def pad_sequences(sequences, maxlen, value=-1):
    # Pad each sequence to the same length
    return [seq + [value] * (maxlen - len(seq)) if len(seq) < maxlen else seq[:maxlen] for seq in sequences]

padded_alignments = pad_sequences(all_alignments, max_seq_length)

# Convert to a numpy array for use in TensorFlow
label_data = np.array(padded_alignments)

# Example of shaping your input data and label data for model training
input_data = np.stack(chunks)  # Ensure that chunks are correctly reshaped and stacked

# Check shapes (important for debugging)
print("Input data shape:", input_data.shape)
print("Label data shape:", label_data.shape)



Input data shape: (984, 1, 75, 80)
Label data shape: (984, 30)


In [13]:
class LipReadingDataset(Dataset):
    def __init__(self, input_features, labels):
        """
        Args:
            input_features (numpy array): Input features with shape (984, 1, 75, 80)
            labels (numpy array): Labels with shape (984, 30)
        """
        self.input_features = input_features
        self.labels = labels

    def __len__(self):
        return len(self.input_features)

    def __getitem__(self, idx):
        # Convert numpy arrays to torch tensors
        video_frames = torch.from_numpy(self.input_features[idx]).float()  # Ensure dtype is float for input
        character_labels = torch.from_numpy(self.labels[idx]).long()  # Ensure dtype is long for labels
        
        return video_frames, character_labels

In [14]:
dataset = LipReadingDataset(input_data, label_data)

In [15]:
data_loader = DataLoader(dataset, batch_size=10, shuffle=True)

for video_frames, character_labels in data_loader:
    print(video_frames.shape, character_labels.shape)

torch.Size([10, 1, 75, 80]) torch.Size([10, 30])
torch.Size([10, 1, 75, 80]) torch.Size([10, 30])
torch.Size([10, 1, 75, 80]) torch.Size([10, 30])
torch.Size([10, 1, 75, 80]) torch.Size([10, 30])
torch.Size([10, 1, 75, 80]) torch.Size([10, 30])
torch.Size([10, 1, 75, 80]) torch.Size([10, 30])
torch.Size([10, 1, 75, 80]) torch.Size([10, 30])
torch.Size([10, 1, 75, 80]) torch.Size([10, 30])
torch.Size([10, 1, 75, 80]) torch.Size([10, 30])
torch.Size([10, 1, 75, 80]) torch.Size([10, 30])
torch.Size([10, 1, 75, 80]) torch.Size([10, 30])
torch.Size([10, 1, 75, 80]) torch.Size([10, 30])
torch.Size([10, 1, 75, 80]) torch.Size([10, 30])
torch.Size([10, 1, 75, 80]) torch.Size([10, 30])
torch.Size([10, 1, 75, 80]) torch.Size([10, 30])
torch.Size([10, 1, 75, 80]) torch.Size([10, 30])
torch.Size([10, 1, 75, 80]) torch.Size([10, 30])
torch.Size([10, 1, 75, 80]) torch.Size([10, 30])
torch.Size([10, 1, 75, 80]) torch.Size([10, 30])
torch.Size([10, 1, 75, 80]) torch.Size([10, 30])
torch.Size([10, 1, 7

In [76]:
import torch
import torch.nn as nn
import torch.optim as optim

class LipReadingModel(nn.Module):
    def __init__(self, input_dim, hidden_dim, num_classes, num_layers=2):
        super(LipReadingModel, self).__init__()
        self.lstm = nn.LSTM(input_dim, hidden_dim, num_layers=num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_dim, num_classes)

    def forward(self, x):
        print(x.shape)
        x, _ = self.lstm(x)  # LSTM output shape: (batch, seq_len, hidden_dim)
        x = self.fc(x)  # Shape after FC: (batch, seq_len, num_classes)
        x = x.log_softmax(dim=2)  # Apply log softmax at the correct dimension
        x = x.transpose(0, 1)  # CTC expects: (seq_len, batch, num_classes)
        print(x.shape)
        return x

vocab = "abcdefghijklmnopqrstuvwxyz'?!123456789 "
vocab_len = len(vocab)  
num_classes = vocab_len + 1 

model = LipReadingModel(input_dim=80, hidden_dim=64, num_classes=num_classes)
criterion = nn.CTCLoss(blank=38, zero_infinity=True)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)



In [57]:
import torch

def train_model(model, data_loader, criterion, optimizer, device, num_epochs=10):
    model.to(device)
    model.train()

    for epoch in range(num_epochs):
        for inputs, targets in data_loader:
            # Ensure inputs and targets are on the same device as the model
            inputs = inputs.to(device).squeeze()  # Assuming inputs is a batch of videos, remove unnecessary dimensions if any
            targets = targets.to(device)  # Move targets to the same device

            input_lengths = torch.full((inputs.size(0),), inputs.size(1), dtype=torch.long, device=device)
            target_lengths = torch.tensor([len(t) for t in targets], dtype=torch.long, device=device)

            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, targets, input_lengths, target_lengths)

            loss.backward()
            # Optional: Gradient clipping
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=2.0)
            optimizer.step()

            print(f'Epoch {epoch+1}, Loss: {loss.item()}')

# Example of using the function
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
train_model(model, data_loader, criterion, optimizer, device, num_epochs=2)


Epoch 1, Loss: 7.362927436828613
Epoch 1, Loss: 7.241091251373291
Epoch 1, Loss: 7.2109832763671875
Epoch 1, Loss: 7.142787456512451
Epoch 1, Loss: 7.132290840148926
Epoch 1, Loss: 7.046170234680176
Epoch 1, Loss: 7.013072967529297
Epoch 1, Loss: 6.93849515914917
Epoch 1, Loss: 6.880620002746582
Epoch 1, Loss: 6.803167819976807
Epoch 1, Loss: 6.729270935058594
Epoch 1, Loss: 6.612252712249756
Epoch 1, Loss: 6.4839043617248535
Epoch 1, Loss: 6.346290588378906
Epoch 1, Loss: 6.206050395965576
Epoch 1, Loss: 6.0354204177856445
Epoch 1, Loss: 5.8424577713012695
Epoch 1, Loss: 5.648653984069824
Epoch 1, Loss: 5.415228366851807
Epoch 1, Loss: 5.186831474304199
Epoch 1, Loss: 5.036067962646484
Epoch 1, Loss: 4.811801433563232
Epoch 1, Loss: 4.609724044799805
Epoch 1, Loss: 4.402998924255371
Epoch 1, Loss: 4.298404693603516
Epoch 1, Loss: 4.128207683563232
Epoch 1, Loss: 4.099191188812256
Epoch 1, Loss: 3.9635777473449707
Epoch 1, Loss: 3.8655974864959717
Epoch 1, Loss: 3.793588399887085
Epoch

In [77]:
model.eval()


LipReadingModel(
  (lstm): LSTM(80, 64, num_layers=2, batch_first=True)
  (fc): Linear(in_features=64, out_features=40, bias=True)
)

In [78]:
first_batch = next(iter(data_loader))
first_input, labels = first_batch 

first_input_sample = first_input[0].unsqueeze(0) 
first_input_sample = first_input_sample.squeeze(1)
print("Shape of the input sample for inference:", first_input_sample.shape)

Shape of the input sample for inference: torch.Size([1, 75, 80])


In [82]:
with torch.no_grad():
    log_probs = model(first_input_sample)


torch.Size([1, 75, 80])
torch.Size([75, 1, 40])


In [75]:
labels[0].shape

torch.Size([30])

In [79]:
probabilities[0].shape

torch.Size([1, 40])

In [54]:
max_indices = [np.argmax(log_prob) for log_prob in probabilities]
max_indices

[tensor(38),
 tensor(38),
 tensor(38),
 tensor(38),
 tensor(38),
 tensor(38),
 tensor(38),
 tensor(38),
 tensor(38),
 tensor(38),
 tensor(38),
 tensor(38),
 tensor(38),
 tensor(38),
 tensor(38),
 tensor(38),
 tensor(38),
 tensor(38),
 tensor(38),
 tensor(38),
 tensor(38),
 tensor(38),
 tensor(38),
 tensor(38),
 tensor(38),
 tensor(38),
 tensor(38),
 tensor(38),
 tensor(38),
 tensor(38),
 tensor(38),
 tensor(38),
 tensor(38),
 tensor(38),
 tensor(38),
 tensor(38),
 tensor(38),
 tensor(38),
 tensor(38),
 tensor(38),
 tensor(38),
 tensor(38),
 tensor(38),
 tensor(38),
 tensor(38),
 tensor(38),
 tensor(38),
 tensor(38),
 tensor(38),
 tensor(38),
 tensor(38),
 tensor(38),
 tensor(38),
 tensor(38),
 tensor(38),
 tensor(38),
 tensor(38),
 tensor(38),
 tensor(38),
 tensor(38),
 tensor(38),
 tensor(38),
 tensor(38),
 tensor(38),
 tensor(38),
 tensor(38),
 tensor(38),
 tensor(38),
 tensor(38),
 tensor(38),
 tensor(38),
 tensor(38),
 tensor(38),
 tensor(38),
 tensor(38)]

In [47]:
import torch
import itertools

def greedy_decoder(ctc_output, blank_label=0):
    # softmax
    probabilities = torch.exp(ctc_output)
    # get prob at every time step
    _, max_indices = torch.max(probabilities, 2)
    # remove duplicates
    indices = max_indices.squeeze(1)  # Remove the batch dimension
    decoded_sequence = [index for index, group in itertools.groupby(indices) if index != blank_label]
    return decoded_sequence

decoded_sequence = greedy_decoder(log_probs)
print("Decoded sequence indices:", decoded_sequence)


Decoded sequence indices: [tensor(38)]


#### Data pipeline

In [4]:
class MyVideoDataset(Dataset):
    # TODO: Refactor this whole class
    def __init__(self, file_paths):
        self.file_paths = file_paths
        self.max_frames = 75
        self.max_alignments = 40

    def __len__(self):
        return len(self.file_paths)

    def __getitem__(self, idx):
        video_path = self.file_paths[idx]
        file_name = os.path.basename(video_path).split('.')[0]
        alignment_path = os.path.join('data', 'alignments', 's1', f'{file_name}.align')
        
        frames = self.load_video(video_path)
        alignments = self.load_alignments(alignment_path)
        
        return frames, alignments

    def load_video(self, path):
        cap = cv2.VideoCapture(path)
        frames = []
        for _ in range(min(int(cap.get(cv2.CAP_PROP_FRAME_COUNT)), self.max_frames)):
            ret, frame = cap.read()
            frame = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
            frame = frame[190:236, 80:220]
            frames.append(frame)
        cap.release()

        frames = np.array(frames, dtype=np.float32)
        mean = np.mean(frames)
        std = np.std(frames)
        frames = (frames - mean) / std

        if len(frames) < self.max_frames:
            frames = np.pad(frames, [(0, self.max_frames - len(frames)), (0, 0), (0, 0)], mode='constant')
        # channel
        frames = np.expand_dims(frames, axis=-1)
        return torch.from_numpy(frames)

    def load_alignments(self, path):
        with open(path, 'r') as f:
            lines = f.readlines()
        tokens = []
        for line in lines:
            line = line.split()
            if line[2] != 'sil':
                tokens.append(line[2])

        tokens = ' '.join(tokens)
        alignment_tensor = torch.tensor([char_to_num(char) for char in tokens], dtype=torch.int64)

        if len(alignment_tensor) < self.max_alignments:
            alignment_tensor = F.pad(alignment_tensor, (0, self.max_alignments - len(alignment_tensor)), value=-1)

        return alignment_tensor

def collate_fn(batch):
    # required for padding
    frames, alignments = zip(*batch)
    
    frames = torch.stack(frames, dim=0)
    alignments = torch.stack(alignments, dim=0)

    return frames, alignments

video_files = [f'data/s1/{file}' for file in os.listdir('data/s1') if file.endswith('.mpg')]

dataset = MyVideoDataset(video_files)
