# End-to-End Audio-Visual Speech Recognition With Conformers

In [None]:
# Start from the Colab content root and fetch the upstream project; its
# `pipelines` package is imported further down in this notebook.
%cd "/content/"
!git clone https://github.com/mpc001/Visual_Speech_Recognition_for_Multiple_Languages.git
# Work from inside the clone (presumably so `pipelines` resolves as a
# top-level package from the current directory).
%cd "Visual_Speech_Recognition_for_Multiple_Languages"

/content
Cloning into 'Visual_Speech_Recognition_for_Multiple_Languages'...
remote: Enumerating objects: 277, done.[K
remote: Counting objects: 100% (26/26), done.[K
remote: Compressing objects: 100% (14/14), done.[K
remote: Total 277 (delta 14), reused 12 (delta 12), pack-reused 251 (from 1)[K
Receiving objects: 100% (277/277), 69.76 MiB | 29.89 MiB/s, done.
Resolving deltas: 100% (69/69), done.
/content/Visual_Speech_Recognition_for_Multiple_Languages


## Import Relevant Libraries

In [None]:
# Install the third-party packages this notebook (and the cloned project)
# relies on. `ffmpeg-python` is the package that provides the `ffmpeg`
# module imported below.
!pip install torch torchvision torchaudio
!pip install opencv-python
!pip install scipy
!pip install scikit-image
!pip install av
!pip install six
!pip install mediapipe
!pip install ffmpeg-python

import os
import random
import ffmpeg
import torch
import torchaudio
import IPython.display as ipd
import os
import torch
import cv2
import torchvision
from pipelines.model import AVSR
from pipelines.data.data_module import AVSRDataLoader
from pipelines.detectors.mediapipe.detector import LandmarksDetector



In [None]:
# Pick the compute device once: the GPU when CUDA is available, else the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print(f'Using device: {device}')

Using device: cpu


## Utilities

### Load a Sample Video With Clear Audio

In [None]:
# Create the data directory (no error if it already exists) and download the
# demo video into it as clip.mp4.
!mkdir -p /content/data/
!wget --content-disposition http://www.doc.ic.ac.uk/~pm4115/autoAVSR/autoavsr_demo_video.mp4 -O /content/data/clip.mp4

--2024-11-03 13:52:08--  http://www.doc.ic.ac.uk/~pm4115/autoAVSR/autoavsr_demo_video.mp4
Resolving www.doc.ic.ac.uk (www.doc.ic.ac.uk)... 146.169.13.6
Connecting to www.doc.ic.ac.uk (www.doc.ic.ac.uk)|146.169.13.6|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 3644186 (3.5M) [video/mp4]
Saving to: ‘/content/data/clip.mp4’


2024-11-03 13:52:10 (4.45 MB/s) - ‘/content/data/clip.mp4’ saved [3644186/3644186]



In [None]:
# Show the clean demo clip inline; embed=True stores the video data in the
# notebook output instead of linking to the file.
ipd.Video('/content/data/clip.mp4', embed=True, width=400)

### Load a Noisy Clip
* A randomly chosen segment of this noise recording is mixed into the sample video's audio track.

In [None]:
# Make sure the data directory exists, then download the babble-noise
# recording that will be mixed into the demo clip's audio.
!mkdir -p /content/data/
!wget http://www.doc.ic.ac.uk/~pm4115/autoAVSR/babble_noise.wav -O /content/data/babble_noise.wav

--2024-11-03 13:55:03--  http://www.doc.ic.ac.uk/~pm4115/autoAVSR/babble_noise.wav
Resolving www.doc.ic.ac.uk (www.doc.ic.ac.uk)... 146.169.13.6
Connecting to www.doc.ic.ac.uk (www.doc.ic.ac.uk)|146.169.13.6|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 15054806 (14M) [audio/x-wav]
Saving to: ‘/content/data/babble_noise.wav’


2024-11-03 13:55:04 (15.0 MB/s) - ‘/content/data/babble_noise.wav’ saved [15054806/15054806]



In [None]:
def create_noisy_clip(src_filename, dst_filename, noise, snr_level):
    """Write a copy of a video whose audio track has background noise mixed in.

    The audio of ``src_filename`` is mixed with a randomly positioned segment
    of the noise recording at the requested signal-to-noise ratio, and the
    result is muxed with the original video stream into ``dst_filename``.

    Args:
        src_filename: Path to the source media file (video with an audio track).
        dst_filename: Path of the media file to write; overwritten if present.
        noise: Path to the noise audio file.
        snr_level: Signal-to-noise ratio (in dB, as expected by
            ``torchaudio.functional.add_noise``). Lower values give a
            noisier result.

    Raises:
        ValueError: If the noise recording contains no samples.
    """
    speech, sample_rate = torchaudio.load(src_filename)
    noise_wave, noise_rate = torchaudio.load(noise)
    if noise_rate != sample_rate:
        # Bring the noise to the speech sample rate before mixing.
        noise_wave = torchaudio.functional.resample(noise_wave, noise_rate, sample_rate)

    num_frames = speech.shape[1]
    noise_frames = noise_wave.shape[1]
    if noise_frames == 0:
        raise ValueError(f"noise file {noise} contains no audio samples.")
    if noise_frames < num_frames:
        # The noise is shorter than the speech: loop it until it is long
        # enough, otherwise the random offset below has an empty range.
        repeats = -(-num_frames // noise_frames)  # ceiling division
        noise_wave = noise_wave.repeat(1, repeats)
        noise_frames = noise_wave.shape[1]

    # Cut a random window of the noise that is exactly as long as the speech.
    start_idx = random.randint(0, noise_frames - num_frames)
    noise_wave = noise_wave[:, start_idx:start_idx + num_frames]
    noisy_speech = torchaudio.functional.add_noise(speech, noise_wave, torch.tensor([snr_level]))

    # Intermediate audio file next to the output. The distinct suffix keeps
    # it from colliding with dst_filename, whatever extension that has.
    wav_filename = os.path.splitext(dst_filename)[0] + ".noisy_audio.wav"
    try:
        torchaudio.save(wav_filename, noisy_speech, sample_rate)

        # Combine the original video stream with the noisy audio stream.
        video_input = ffmpeg.input(src_filename)
        audio_input = ffmpeg.input(wav_filename)
        out = ffmpeg.output(video_input['v'], audio_input['a'], dst_filename, loglevel="panic")
        out.overwrite_output().run()
    finally:
        # Always clean up the intermediate file, even when muxing fails.
        if os.path.exists(wav_filename):
            os.remove(wav_filename)

In [None]:
# Inputs for the noise-mixing step: the babble-noise recording, the path the
# noisy copy is written to, and the clean demo video it is derived from.
noise = "/content/data/babble_noise.wav"
dst_filename = "/content/data/noisy_clip.mp4"
src_filename = "/content/data/clip.mp4"

# SNR is the ratio of speech power to noise power, so a lower value means a
# noisier clip.
create_noisy_clip(
    src_filename=src_filename,
    dst_filename=dst_filename,
    noise=noise,
    snr_level=-5,
)

In [None]:
# Show the generated noisy clip inline to check the mix by ear.
ipd.Video('/content/data/noisy_clip.mp4', embed=True, width=400)

## Inference Pipeline for Audio, Visual and Audio-Visual Media

In [None]:
class InferencePipeline(torch.nn.Module):
    """Speech-recognition pipeline for audio, video, or audio-visual input.

    Wraps an ``AVSRDataLoader`` (input loading), an ``AVSR`` model
    (transcription / encoding) and, optionally, a ``LandmarksDetector``
    that is run on the input file when the modality has a visual component.
    """

    def __init__(self, modality, model_path, model_conf, detector="mediapipe", face_track=False, device=device):
        """Build the data loader, the model and (optionally) the landmark detector.

        Args:
            modality: One of "audio", "video" or "audiovisual".
            model_path: Path to the model weights file.
            model_conf: Path to the model configuration file.
            detector: Detector name forwarded to ``AVSRDataLoader``.
            face_track: When True and the modality is "video" or
                "audiovisual", create a ``LandmarksDetector`` so landmarks
                can be computed from the input file.
            device: Device the model runs on; features are moved to it in
                ``extract_features``.
        """
        super().__init__()
        self.device = device
        # modality configuration
        self.modality = modality
        self.dataloader = AVSRDataLoader(modality, detector=detector)
        self.model = AVSR(modality, model_path, model_conf, rnnlm=None, rnnlm_conf=None, penalty=0.0, ctc_weight=0.1, lm_weight=0.0, beam_size=40, device=device)
        if face_track and self.modality in ["video", "audiovisual"]:
            self.landmarks_detector = LandmarksDetector()
        else:
            self.landmarks_detector = None

    def process_landmarks(self, data_filename, landmarks_filename):
        """Return landmarks for the input, or None for non-visual modalities.

        Args:
            data_filename: Path to the media file.
            landmarks_filename: Optional path to precomputed landmarks stored
                as a pickle file. NOTE(review): the content is assumed to have
                the same structure the detector returns -- confirm against
                ``AVSRDataLoader.load_data``.

        Returns:
            The landmarks, or None when the modality is not "video" or
            "audiovisual".

        Raises:
            RuntimeError: If landmarks are needed but neither a landmarks
                file nor a detector (``face_track=True``) is available.
        """
        if self.modality not in ["video", "audiovisual"]:
            return None
        if isinstance(landmarks_filename, str):
            # Use the precomputed landmarks instead of running the detector.
            # WARNING: unpickling can execute arbitrary code; only load
            # landmark files from a trusted source.
            import pickle
            with open(landmarks_filename, "rb") as landmarks_file:
                return pickle.load(landmarks_file)
        if self.landmarks_detector is None:
            raise RuntimeError(
                f"modality '{self.modality}' needs landmarks: construct the pipeline "
                "with face_track=True or pass a landmarks_filename."
            )
        return self.landmarks_detector(data_filename)

    def _load_data(self, data_filename, landmarks_filename):
        """Check the input file exists, resolve landmarks and load the data."""
        # Kept as an assert so the exception type seen by callers is unchanged.
        assert os.path.isfile(data_filename), f"data_filename: {data_filename} does not exist."
        landmarks = self.process_landmarks(data_filename, landmarks_filename)
        return self.dataloader.load_data(data_filename, landmarks)

    def forward(self, data_filename, landmarks_filename=None):
        """Transcribe a media file.

        Checks that the file exists, obtains landmarks when the modality
        requires them, loads the data through the dataloader and runs the
        model's ``infer`` on it.

        Args:
            data_filename: Path to the audio or video file.
            landmarks_filename: Optional path to precomputed landmarks.

        Returns:
            The transcript produced by the model.
        """
        data = self._load_data(data_filename, landmarks_filename)
        transcript = self.model.infer(data)
        return transcript

    def extract_features(self, data_filename, landmarks_filename=None, extract_resnet_feats=False):
        """Encode a media file into features without decoding a transcript.

        Loads the input exactly like ``forward`` and passes it through the
        model's encoder with gradients disabled.

        Args:
            data_filename: Path to the audio or video file.
            landmarks_filename: Optional path to precomputed landmarks.
            extract_resnet_feats: Forwarded to the encoder; the notebook uses
                True to obtain the ResNet output rather than the Conformer
                output.

        Returns:
            The encoder features.
        """
        data = self._load_data(data_filename, landmarks_filename)
        with torch.no_grad():
            if isinstance(data, tuple):
                # A tuple carries two input streams that are encoded jointly.
                enc_feats = self.model.model.encode(data[0].to(self.device), data[1].to(self.device), extract_resnet_feats)
            else:
                enc_feats = self.model.model.encode(data.to(self.device), extract_resnet_feats)
        return enc_feats

## Inference on Sample Media
* Audio
* Visual
* Audio-Visual

### Inference on Audio Stream

In [None]:
# Download the LRS3 audio-only checkpoint archive and unpack it into
# /content/data/ (`-o` overwrites a previous extraction). The archive
# provides LRS3_A_WER1.0/model.json and LRS3_A_WER1.0/model.pth.
%mkdir -p /content/data/
!wget http://www.doc.ic.ac.uk/~pm4115/autoAVSR/LRS3_A_WER1.0.zip -O /content/data/LRS3_A_WER1.0.zip
!unzip -o /content/data/LRS3_A_WER1.0.zip -d /content/data/

--2024-11-03 14:14:55--  http://www.doc.ic.ac.uk/~pm4115/autoAVSR/LRS3_A_WER1.0.zip
Resolving www.doc.ic.ac.uk (www.doc.ic.ac.uk)... 146.169.13.6
Connecting to www.doc.ic.ac.uk (www.doc.ic.ac.uk)|146.169.13.6|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 902649180 (861M) [application/zip]
Saving to: ‘/content/data/LRS3_A_WER1.0.zip’


2024-11-03 14:15:20 (35.8 MB/s) - ‘/content/data/LRS3_A_WER1.0.zip’ saved [902649180/902649180]

Archive:  /content/data/LRS3_A_WER1.0.zip
  inflating: /content/data/LRS3_A_WER1.0/model.json  
  inflating: /content/data/LRS3_A_WER1.0/model.pth  


In [None]:
# Build the audio-only speech recognition pipeline from the unpacked
# LRS3 audio checkpoint (weights + configuration).
model_path = "/content/data/LRS3_A_WER1.0/model.pth"
model_conf = "/content/data/LRS3_A_WER1.0/model.json"
modality = "audio"
pipeline = InferencePipeline(
    modality=modality,
    model_path=model_path,
    model_conf=model_conf,
)

In [None]:
# Transcribe the noisy clip with the audio pipeline and show the result.
transcript = pipeline(data_filename="/content/data/noisy_clip.mp4")
print(transcript)

COMPLETELY UNCONSTRAINED ENVIRONMENTS WHERE WE HAVE LARGE CHANGES IN CATHOLES AND


### Inference on Video Stream

In [None]:
# Download the LRS3 video-only checkpoint archive and unpack it into
# /content/data/ (`-o` overwrites a previous extraction). The archive
# provides LRS3_V_WER19.1/model.json and LRS3_V_WER19.1/model.pth.
%mkdir -p /content/data/
!wget http://www.doc.ic.ac.uk/~pm4115/autoAVSR/LRS3_V_WER19.1.zip -O /content/data/LRS3_V_WER19.1.zip
!unzip -o /content/data/LRS3_V_WER19.1.zip -d /content/data/

--2024-11-03 14:19:55--  http://www.doc.ic.ac.uk/~pm4115/autoAVSR/LRS3_V_WER19.1.zip
Resolving www.doc.ic.ac.uk (www.doc.ic.ac.uk)... 146.169.13.6
Connecting to www.doc.ic.ac.uk (www.doc.ic.ac.uk)|146.169.13.6|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 937274463 (894M) [application/zip]
Saving to: ‘/content/data/LRS3_V_WER19.1.zip’


2024-11-03 14:20:21 (34.6 MB/s) - ‘/content/data/LRS3_V_WER19.1.zip’ saved [937274463/937274463]

Archive:  /content/data/LRS3_V_WER19.1.zip
  inflating: /content/data/LRS3_V_WER19.1/model.json  
  inflating: /content/data/LRS3_V_WER19.1/model.pth  


In [None]:
# Build the video-only speech recognition pipeline from the unpacked LRS3
# visual checkpoint; face_track=True enables the landmark detector.
model_path = "/content/data/LRS3_V_WER19.1/model.pth"
model_conf = "/content/data/LRS3_V_WER19.1/model.json"
modality = "video"
pipeline = InferencePipeline(
    modality=modality,
    model_path=model_path,
    model_conf=model_conf,
    face_track=True,
)

  self.model.load_state_dict(torch.load(model_path, map_location=lambda storage, loc: storage))


In [None]:
# Transcribe the noisy clip with the video pipeline and show the result.
transcript = pipeline(data_filename="/content/data/noisy_clip.mp4")
print(transcript)

COMPLETELY CONCENTRATED ENVIRONMENTS WHERE WE HAVE LARGE CHANGES IN GET POSTS AND


### Inference on Audio-Visual Stream

In [None]:
# Download the LRS3 audio-visual checkpoint archive and unpack it into
# /content/data/ (`-o` overwrites a previous extraction). The archive
# provides LRS3_AV_WER0.9/model.json and LRS3_AV_WER0.9/model.pth.
%mkdir -p /content/data/
!wget http://www.doc.ic.ac.uk/~pm4115/autoAVSR/LRS3_AV_WER0.9.zip -O /content/data/LRS3_AV_WER0.9.zip
!unzip -o /content/data/LRS3_AV_WER0.9.zip -d /content/data/

--2024-11-03 14:22:13--  http://www.doc.ic.ac.uk/~pm4115/autoAVSR/LRS3_AV_WER0.9.zip
Resolving www.doc.ic.ac.uk (www.doc.ic.ac.uk)... 146.169.13.6
Connecting to www.doc.ic.ac.uk (www.doc.ic.ac.uk)|146.169.13.6|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1655043546 (1.5G) [application/zip]
Saving to: ‘/content/data/LRS3_AV_WER0.9.zip’


2024-11-03 14:22:56 (37.1 MB/s) - ‘/content/data/LRS3_AV_WER0.9.zip’ saved [1655043546/1655043546]

Archive:  /content/data/LRS3_AV_WER0.9.zip
  inflating: /content/data/LRS3_AV_WER0.9/model.json  
  inflating: /content/data/LRS3_AV_WER0.9/model.pth  


In [None]:
# Build the audio-visual speech recognition pipeline from the unpacked LRS3
# audio-visual checkpoint; face_track=True enables the landmark detector.
model_path = "/content/data/LRS3_AV_WER0.9/model.pth"
model_conf = "/content/data/LRS3_AV_WER0.9/model.json"
modality = "audiovisual"
pipeline = InferencePipeline(
    modality=modality,
    model_path=model_path,
    model_conf=model_conf,
    face_track=True,
)

In [None]:
# Transcribe the noisy clip with the audio-visual pipeline and show the result.
transcript = pipeline(data_filename="/content/data/noisy_clip.mp4")
print(transcript)

COMPLETELY UNCONSTRAINED ENVIRONMENTS WHERE WE HAVE LARGE CHANGES IN GET POLES


## Feature Extraction from Conformers.

### Visual Features.

In [None]:
# Recreate the video-only pipeline (with landmark detection) for feature
# extraction.
model_path = "/content/data/LRS3_V_WER19.1/model.pth"
model_conf = "/content/data/LRS3_V_WER19.1/model.json"
modality = "video"
pipeline = InferencePipeline(
    modality=modality,
    model_path=model_path,
    model_conf=model_conf,
    face_track=True,
)

  self.model.load_state_dict(torch.load(model_path, map_location=lambda storage, loc: storage))


In [None]:
# Option 1: features taken at the output of the Conformer.
features1 = pipeline.extract_features(data_filename="/content/data/clip.mp4")
print(features1.size())

# Option 2: features taken at the output of the ResNet.
features2 = pipeline.extract_features(
    data_filename="/content/data/clip.mp4",
    extract_resnet_feats=True,
)
print(features2.size())

torch.Size([178, 768])
torch.Size([178, 512])


### Audio Features

In [None]:
# Recreate the audio-only pipeline for feature extraction.
model_path = "/content/data/LRS3_A_WER1.0/model.pth"
model_conf = "/content/data/LRS3_A_WER1.0/model.json"
modality = "audio"
pipeline = InferencePipeline(
    modality=modality,
    model_path=model_path,
    model_conf=model_conf,
)

In [None]:
# Option 1: features taken at the output of the Conformer.
features3 = pipeline.extract_features(data_filename="/content/data/clip.mp4")
print(features3.size())

# Option 2: features taken at the output of the ResNet.
features4 = pipeline.extract_features(
    data_filename="/content/data/clip.mp4",
    extract_resnet_feats=True,
)
print(features4.size())

torch.Size([185, 768])
torch.Size([185, 512])


### Audio-Visual Features

In [None]:
# Recreate the audio-visual pipeline (with landmark detection) for feature
# extraction.
model_path = "/content/data/LRS3_AV_WER0.9/model.pth"
model_conf = "/content/data/LRS3_AV_WER0.9/model.json"
modality = "audiovisual"
pipeline = InferencePipeline(
    modality=modality,
    model_path=model_path,
    model_conf=model_conf,
    face_track=True,
)

In [None]:
# Option 1: features taken at the output of the Conformer.
features5 = pipeline.extract_features(data_filename="/content/data/clip.mp4")
print(features5.size())

# Option 2: features taken at the output of the ResNet.
features6 = pipeline.extract_features(
    data_filename="/content/data/clip.mp4",
    extract_resnet_feats=True,
)
print(features6.size())

torch.Size([178, 768])
torch.Size([178, 512])
