In [1]:
from typing import List

import cv2
import gdown
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
from tqdm import tqdm

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import Dataset, DataLoader
from torchvision.transforms import Compose, Lambda

#import tensorflow as tf

In [3]:
# !jupyter nbconvert lip_keypoint_crossentropy.ipynb --to python

[NbConvertApp] Converting notebook lip_keypoint_crossentropy.ipynb to python
[NbConvertApp] Writing 9858 bytes to lip_keypoint_crossentropy.py


In [2]:
df3 = pd.read_csv("face_landmarks_data3.csv", index_col=0)
df4 = pd.read_csv("face_landmarks_data4.csv", index_col=0)
df5 = pd.read_csv("face_landmarks_data5.csv", index_col=0)
df6 = pd.read_csv("face_landmarks_data6.csv", index_col=0)
df = pd.concat([df3, df4, df5, df6], ignore_index=True)

In [3]:
video_path_counts = df['video_path'].value_counts()

video_paths_to_keep = video_path_counts[video_path_counts == 3000].index

df = df[df['video_path'].isin(video_paths_to_keep)]

In [4]:
df

Unnamed: 0,frame,x,y,z,visibility,video_path
0,1,0.528295,0.712854,-0.031539,0.0,data3/s3\bbaf1s.mpg
1,1,0.543726,0.710494,-0.030296,0.0,data3/s3\bbaf1s.mpg
2,1,0.559562,0.715571,-0.022568,0.0,data3/s3\bbaf1s.mpg
3,1,0.570321,0.721982,-0.012750,0.0,data3/s3\bbaf1s.mpg
4,1,0.528245,0.728252,-0.018017,0.0,data3/s3\bbaf1s.mpg
...,...,...,...,...,...,...
9008875,75,0.511908,0.742837,-0.020150,0.0,data6/s6\bbal1n.mpg
9008876,75,0.489571,0.741860,-0.010639,0.0,data6/s6\bbal1n.mpg
9008877,75,0.482818,0.756543,-0.012075,0.0,data6/s6\bbal1n.mpg
9008878,75,0.482979,0.741705,-0.004377,0.0,data6/s6\bbal1n.mpg


In [5]:
mean_x = np.mean(df['x'])
std_x = np.std(df['x'])
mean_y = np.mean(df['y'])
std_y = np.std(df['y'])
mean_z = np.mean(df['z'])
std_z = np.std(df['z'])

df['x'] = (df['x'] - mean_x) / std_x
df['y'] = (df['y'] - mean_y) / std_y
df['z'] = (df['z'] - mean_z) / std_z

In [6]:
# df = df[:4000]

In [7]:
df.head(5)

Unnamed: 0,frame,x,y,z,visibility,video_path
0,1,0.427591,-0.291086,-1.585347,0.0,data3/s3\bbaf1s.mpg
1,1,0.795735,-0.335998,-1.484981,0.0,data3/s3\bbaf1s.mpg
2,1,1.173533,-0.239353,-0.861128,0.0,data3/s3\bbaf1s.mpg
3,1,1.430211,-0.117326,-0.068493,0.0,data3/s3\bbaf1s.mpg
4,1,0.4264,0.002025,-0.493689,0.0,data3/s3\bbaf1s.mpg


In [8]:
df.groupby('video_path').count()

Unnamed: 0_level_0,frame,x,y,z,visibility
video_path,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
data3/s3\bbaf1s.mpg,3000,3000,3000,3000,3000
data3/s3\bbaf2p.mpg,3000,3000,3000,3000,3000
data3/s3\bbaf3a.mpg,3000,3000,3000,3000,3000
data3/s3\bbafzn.mpg,3000,3000,3000,3000,3000
data3/s3\bbal4n.mpg,3000,3000,3000,3000,3000
...,...,...,...,...,...
data6/s6\swwpza.mpg,3000,3000,3000,3000,3000
data6/s6\swwv1n.mpg,3000,3000,3000,3000,3000
data6/s6\swwv2s.mpg,3000,3000,3000,3000,3000
data6/s6\swwv3p.mpg,3000,3000,3000,3000,3000


In [5]:
df_filtered = df.copy()
chunks = []
num_frames_per_chunk = 3000
for video_path, group in df_filtered.groupby('video_path'):
    num_frames = len(group)
    num_chunks = num_frames // num_frames_per_chunk
    for i in range(num_chunks):
        chunk = group.iloc[i*num_frames_per_chunk:(i+1)*num_frames_per_chunk]
        chunk_reshaped = chunk[['x', 'y', 'z']].values.reshape(-1, 75, 40*3)
        chunks.append(chunk_reshaped)

input_data = np.concatenate(chunks, axis=0)

In [None]:
input_data[]

In [8]:
input_data[0].shape

(75, 120)

In [9]:
def create_vocab():
    vocab = "abcdefghijklmnopqrstuvwxyz123456789 "
    return vocab


def char_to_int(char):
    # shift 1 
    vocab = "abcdefghijklmnopqrstuvwxyz123456789 "
    return vocab.index(char) + 1 if char in vocab else -1

def int_to_char(index):
    # shift 1 
    vocab = "abcdefghijklmnopqrstuvwxyz123456789 "
    return vocab[index - 1] if 1 <= index <= len(vocab) else ''

def load_alignments(path:str) -> List[str]:
    with open(path, 'r') as f:
        lines = f.readlines()
    tokens = []
    for line in lines:
        line = line.split()
        if line[2] != 'sil':
            tokens.extend([*line[2]])
            tokens.append(' ')
    return [char_to_int(token) for token in tokens]

all_alignments = []
for video_path in df['video_path'].unique():
    datapath = video_path.split('/')[0]
    speaker_path = video_path.split('/')[-1].split('\\')[0]
    vid_path = video_path.split('/')[-1].split('\\')[-1].split('.')[0]
    
    alignment_path = os.path.join(f'{datapath}','align',f'{vid_path}.align')
    alignments = load_alignments(alignment_path) 
    all_alignments.append(alignments)


In [10]:
len(all_alignments)

2982

In [11]:
# video_path = df['video_path'].iloc[0]
# datapath = video_path.split('/')
# speakert_path = video_path.split('/')[-1].split('\\')[0]
# vid_path = video_path.split('/')[-1].split('\\')[-1].split('.')[0]

In [12]:

label_data = [np.array(label) for label in all_alignments if label]
print("Input data shape:", input_data.shape)
print("Number of label sequences:", len(label_data))



Input data shape: (2982, 75, 120)
Number of label sequences: 2982


In [13]:
class LipReadingDataset(Dataset):
    def __init__(self, input_features, labels):
        """
        Args:
            input_features (numpy array): Input features with shape (984, 1, 75, 80)
            labels (numpy array): Labels with shape (984, 30)
        """
        self.input_features = input_features
        self.labels = labels

    def __len__(self):
        return len(self.input_features)

    def __getitem__(self, idx):
        # Convert numpy arrays to torch tensors
        video_frames = torch.from_numpy(self.input_features[idx]).float() 
        character_labels = torch.from_numpy(self.labels[idx]).long()
        return video_frames, character_labels

In [14]:
input_data.shape

(2982, 75, 120)

In [15]:
from torch.utils.data import random_split

def collate_fn(batch):
    inputs, targets = zip(*batch)

    inputs_tensor = [torch.tensor(input).float() for input in inputs]
    targets_tensor = [torch.tensor(target).long() for target in targets]

    # max len
    input_lengths = torch.tensor([len(input) for input in inputs_tensor], dtype=torch.long)
    target_lengths = torch.tensor([len(target) for target in targets_tensor], dtype=torch.long)
    max_length = max(max([len(input) for input in inputs_tensor]), 75)
    # padding
    inputs_padded = pad_sequence(inputs_tensor, batch_first=True, padding_value=0) 
    targets_padded = pad_sequence([torch.cat([target, torch.tensor([-1] * (max_length - len(target)))] if len(target) < max_length else target) for target in targets_tensor], batch_first=True, padding_value=-1)

    return inputs_padded, targets_padded, input_lengths, target_lengths

dataset = LipReadingDataset(input_data, label_data)

total_size = len(dataset)
train_size = int(0.8 * total_size)
test_size = total_size - train_size


train_dataset, test_dataset = random_split(dataset, [train_size, test_size])

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True, collate_fn=collate_fn)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=True, collate_fn=collate_fn)


In [18]:
next(iter(train_loader))

  inputs_tensor = [torch.tensor(input).float() for input in inputs]
  targets_tensor = [torch.tensor(target).long() for target in targets]


(tensor([[[ 4.6673e-01,  7.1324e-01, -3.3092e-02,  ...,  5.1191e-01,
            7.4153e-01,  1.8019e-03],
          [ 4.6870e-01,  7.1356e-01, -3.3321e-02,  ...,  5.1306e-01,
            7.4056e-01,  1.2250e-03],
          [ 4.6821e-01,  7.1345e-01, -3.3636e-02,  ...,  5.1286e-01,
            7.4119e-01,  5.9725e-04],
          ...,
          [ 4.7122e-01,  7.0895e-01, -3.4431e-02,  ...,  5.1533e-01,
            7.3546e-01, -1.8254e-03],
          [ 4.7153e-01,  7.0848e-01, -3.4342e-02,  ...,  5.1584e-01,
            7.3475e-01, -2.1324e-03],
          [ 4.7228e-01,  7.0780e-01, -3.4673e-02,  ...,  5.1635e-01,
            7.3460e-01, -2.0181e-03]],
 
         [[ 4.6164e-01,  7.1405e-01, -3.3575e-02,  ...,  5.0821e-01,
            7.3560e-01, -2.6842e-03],
          [ 4.6331e-01,  7.1494e-01, -3.3346e-02,  ...,  5.1006e-01,
            7.3519e-01, -1.1624e-03],
          [ 4.6300e-01,  7.1422e-01, -3.3552e-02,  ...,  5.0860e-01,
            7.3693e-01, -1.5794e-03],
          ...,
    

In [20]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

class LipReadingModel(nn.Module):
    def __init__(self, input_dim, hidden_dim, num_classes, num_lstm_layers=2, dropout_rate=0.2):
        super(LipReadingModel, self).__init__()
        self.input_layer = nn.Sequential(
            nn.Linear(input_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, hidden_dim),
            nn.ReLU()
        )
        self.lstm = nn.LSTM(hidden_dim, hidden_dim, num_layers=num_lstm_layers, batch_first=True, dropout=dropout_rate)
        self.output_layer = nn.Sequential(
            nn.Linear(hidden_dim, num_classes)
        )

    def forward(self, x):
        x = self.input_layer(x)
        x, _ = self.lstm(x)
        x = self.output_layer(x)
        return x  # Shape: (batch_size, seq_len, num_classes)


vocab = create_vocab()
vocab_len = len(vocab)
num_classes = vocab_len + 1 

In [21]:
vocab

'abcdefghijklmnopqrstuvwxyz123456789 '

In [22]:
vocab_len

36

In [23]:
num_classes

37

In [24]:
input_data.shape

(2982, 75, 120)

In [25]:
input_data[0].shape

(75, 120)

In [26]:
input_data[0]

array([[ 5.28294802e-01,  7.12853551e-01, -3.15389112e-02, ...,
         5.76892495e-01,  7.35325933e-01,  2.36975471e-03],
       [ 5.29577672e-01,  7.11253285e-01, -3.28786299e-02, ...,
         5.78541338e-01,  7.36495733e-01,  1.19938678e-03],
       [ 5.29968917e-01,  7.12010741e-01, -3.30490768e-02, ...,
         5.78672171e-01,  7.37926543e-01,  1.21604651e-03],
       ...,
       [ 5.26558101e-01,  7.04166114e-01, -3.39030139e-02, ...,
         5.75963020e-01,  7.31105328e-01, -1.15881430e-03],
       [ 5.26525080e-01,  7.05153167e-01, -3.42276581e-02, ...,
         5.76362848e-01,  7.32087970e-01, -1.06557133e-03],
       [ 5.26649058e-01,  7.05285549e-01, -3.37997042e-02, ...,
         5.76122642e-01,  7.32028484e-01, -3.55579919e-04]])

In [22]:
import torch
import random
import numpy as np
from torch.utils.data import DataLoader, Subset


def train_model(model, train_loader, test_loader, criterion, optimizer, device, num_epochs=100):
    model.to(device)
    for epoch in range(num_epochs):
        model.train()
        total_loss = 0
        for inputs, targets, input_lengths, target_lengths in train_loader:
            inputs, targets = inputs.to(device), targets.to(device)
            optimizer.zero_grad()
            outputs = model(inputs)
            outputs = outputs.transpose(1, 2)
            
            loss = criterion(outputs, targets)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        avg_train_loss = total_loss / len(train_loader)

        model.eval()
        total_val_loss = 0
        with torch.no_grad():
            for inputs, targets, input_lengths, target_lengths in test_loader:
                inputs, targets = inputs.to(device), targets.to(device)
                outputs = model(inputs)
                outputs = outputs.transpose(1, 2)
                val_loss = criterion(outputs, targets)
                total_val_loss += val_loss.item()
        avg_val_loss = total_val_loss / len(test_loader)

        print(f'Epoch {epoch+1}, Training Loss: {avg_train_loss}, Validation Loss: {avg_val_loss}')

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


model = LipReadingModel(input_dim=120, hidden_dim=256, num_classes=num_classes, num_lstm_layers=2, dropout_rate=0.5).to(device)
criterion = nn.CrossEntropyLoss(ignore_index=-1)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)


train_model(model, train_loader, test_loader, criterion, optimizer, device, num_epochs=100)

  inputs_tensor = [torch.tensor(input).float() for input in inputs]
  targets_tensor = [torch.tensor(target).long() for target in targets]


Epoch 1, Training Loss: 2.677615716457367, Validation Loss: 2.477437763214111
Epoch 2, Training Loss: 2.343436715602875, Validation Loss: 2.2327731800079347
Epoch 3, Training Loss: 2.195141487121582, Validation Loss: 2.14636435508728
Epoch 4, Training Loss: 2.1290117168426512, Validation Loss: 2.0877509117126465
Epoch 5, Training Loss: 2.095003356933594, Validation Loss: 2.073266191482544
Epoch 6, Training Loss: 2.078658764362335, Validation Loss: 2.0635959339141845
Epoch 7, Training Loss: 2.072378553152084, Validation Loss: 2.063204736709595
Epoch 8, Training Loss: 2.068823471069336, Validation Loss: 2.055733962059021
Epoch 9, Training Loss: 2.0569102287292482, Validation Loss: 2.0418472051620484
Epoch 10, Training Loss: 2.0500200533866884, Validation Loss: 2.0361515426635743
Epoch 11, Training Loss: 2.0456985306739806, Validation Loss: 2.045603322982788
Epoch 12, Training Loss: 2.0434894037246703, Validation Loss: 2.0324061059951783
Epoch 13, Training Loss: 2.0402947556972504, Valida

KeyboardInterrupt: 

In [23]:
model.eval()

LipReadingModel(
  (input_layer): Sequential(
    (0): Linear(in_features=120, out_features=256, bias=True)
    (1): ReLU()
    (2): Linear(in_features=256, out_features=256, bias=True)
    (3): ReLU()
  )
  (lstm): LSTM(256, 256, num_layers=2, batch_first=True, dropout=0.5)
  (output_layer): Sequential(
    (0): Linear(in_features=256, out_features=37, bias=True)
  )
)

In [36]:
model.eval()

with torch.no_grad():
    inputs_padded, targets_padded, input_lengths, target_lengths = next(iter(test_loader))

    inputs_padded = inputs_padded.to(device)
    
    outputs = model(inputs_padded)
    outputs = outputs.transpose(1, 2)

    predicted_indices = outputs.argmax(dim=1)
    for i in range(31):
        print("original:", ''.join(int_to_char(idx) for idx in targets_padded[i]))
        print("".join([int_to_char(index) for index in predicted_indices[i][:35]]))

  inputs_tensor = [torch.tensor(input).float() for input in inputs]
  targets_tensor = [torch.tensor(target).long() for target in targets]


original: lay green with m four again 
bin blue  t   nie  again     e e   
original: place green by y two now 
pay white wn n nix nown        e   
original: lay green by f six please 
let whiee t    eie   o             
original: lay blue in x one soon 
bin blue it    eeee ooon     e  e  
original: set red in t four soon 
bin e ien  n s sen  oow  i         
original: set blue in m four now 
pla e den ty   ire soon e   e      
original: lay blue in p nine please 
plt blue by h thre  noa e          
original: lay green in z zero please 
pey bed   t    eee again           
original: bin green with u three soon 
place blue  in j znre noo          
original: lay red with r five please 
lay r den t    t nep please        
original: bin red at s five again 
sey g ien  n z tou  aaan           
original: bin white in g zero again 
lly r den ii   teo ngoin           
original: bin white by g seven again 
bin glue by h t ne oon     aee     
original: bin white in s five now 
bin e den it   noe so