# Collab preliminaries

In [1]:
# This code utilizes the NVIDIA CUDA Compiler (nvcc) to compile CUDA programs
# for parallel computing on NVIDIA GPUs. Make sure you have the CUDA Toolkit
# installed in your environment to use CUDA-related functionalities.
!nvcc --version

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2023 NVIDIA Corporation
Built on Mon_Apr__3_17:16:06_PDT_2023
Cuda compilation tools, release 12.1, V12.1.105
Build cuda_12.1.r12.1/compiler.32688072_0


In [2]:
!cp -ri "/kaggle/input/wave2lip-utils" /kaggle/working/

In [None]:
# # Mounts Google Drive to access files and directories stored on your Google Drive
# from google.colab import drive
# drive.mount('/content/gdrive')

Mounted at /content/gdrive


# Get the code and models

In [3]:
# Clones the Wav2Lip GitHub repository to access the code and resources for running the Wav2Lip model
!git clone https://github.com/Rudrabha/Wav2Lip.git

Cloning into 'Wav2Lip'...
remote: Enumerating objects: 381, done.[K
remote: Counting objects: 100% (3/3), done.[K
remote: Compressing objects: 100% (3/3), done.[K
remote: Total 381 (delta 0), reused 1 (delta 0), pack-reused 378[K
Receiving objects: 100% (381/381), 534.01 KiB | 12.42 MiB/s, done.
Resolving deltas: 100% (210/210), done.


In [4]:
# Lists the contents of the 'Wav2Lip' directory from Google Drive
!ls /kaggle/working/wave2lip-utils/Wave2Lip_Videos

kennedy.mp4	 kennedy1min.wav  mona.jpg  mona1min.mp4
kennedy1min.mp4  korean.wav	  mona.mp4


In [5]:
cd /kaggle/working/

/kaggle/working


In [6]:
!ls

Wav2Lip  wave2lip-utils


In [7]:
# Copies the pre-trained model file 'wav2lip_gan.pth' from Google Drive to the 'checkpoints' directory
!cp -ri "/kaggle/working/wave2lip-utils/Wave2Lip_pretrained/Wave2Lip_pretrained/wav2lip_gan.pth" /kaggle/working/Wav2Lip/checkpoints

# Get the pre-requisites

In [8]:
# Uninstalls TensorFlow and TensorFlow-GPU packages
# Useful for updating or reinstalling TensorFlow dependencies in the Colab environment
!pip uninstall tensorflow -y
!pip uninstall tensorflow-gpu -y

Found existing installation: tensorflow 2.15.0
Uninstalling tensorflow-2.15.0:
  Successfully uninstalled tensorflow-2.15.0
[0m

In [9]:
# Changes directory to 'Wav2Lip' and installs the required Python packages listed in 'requirements.txt'
# Ensures all necessary dependencies are installed for running the Wav2Lip model
!cd Wav2Lip && pip install -r requirements.txt

Collecting librosa==0.7.0 (from -r requirements.txt (line 1))
  Downloading librosa-0.7.0.tar.gz (1.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.6/1.6 MB[0m [31m36.9 MB/s[0m eta [36m0:00:00[0m00:01[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
[?25hCollecting numpy==1.17.1 (from -r requirements.txt (line 2))
  Downloading numpy-1.17.1.zip (6.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.5/6.5 MB[0m [31m85.5 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
[31mERROR: Ignored the following yanked versions: 3.4.11.39[0m[31m
[0m[31mERROR: Could not find a version that satisfies the requirement opencv-python==4.1.0.25 (from versions: 3.4.0.14, 3.4.10.37, 3.4.11.41, 3.4.11.43, 3.4.11.45, 3.4.13.47, 3.4.15.55, 3.4.16.57, 3.4.16.59, 3.4.17.61, 3.4.17.63, 3.4.18.65, 4.3.0.38, 4.4.0.40, 4.4.0.42, 4.4.0.44, 4.4.0.46, 4.5.1.48, 4.5.3.56, 4.5.4.58, 4.5.4.60, 4.5.5.62, 4.

In [15]:
!pip install tqdm



In [16]:
# Downloads the pre-trained face detection model 's3fd.pth' from Adrian Bulat's website
# This model is used for face detection in the Wav2Lip project
!wget "https://www.adrianbulat.com/downloads/python-fan/s3fd-619a316812.pth" -O "Wav2Lip/face_detection/detection/sfd/s3fd.pth"

--2024-04-18 18:22:57--  https://www.adrianbulat.com/downloads/python-fan/s3fd-619a316812.pth
Resolving www.adrianbulat.com (www.adrianbulat.com)... 45.136.29.207
Connecting to www.adrianbulat.com (www.adrianbulat.com)|45.136.29.207|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 89843225 (86M) [application/octet-stream]
Saving to: 'Wav2Lip/face_detection/detection/sfd/s3fd.pth'


2024-04-18 18:22:57 (217 MB/s) - 'Wav2Lip/face_detection/detection/sfd/s3fd.pth' saved [89843225/89843225]



# Now lets try!

In [17]:
import os
os.mkdir("sample_data")
!cp "/kaggle/working/wave2lip-utils/Wave2Lip_Videos/kennedy1min.wav" sample_data/
!ls sample_data/

kennedy1min.wav


In [20]:
os.remove("/kaggle/working/Wav2Lip/audio.py")

In [27]:
!cp "/kaggle/working/wave2lip-utils/Wave2Lip_Videos/mona.mp4" /kaggle/working/sample_data/
!ls /kaggle/working/sample_data

kennedy1min.wav  mona.mp4


In [22]:
!cp "/kaggle/input/audio/audio.py" /kaggle/working/Wav2Lip/

In [23]:
!ls /kaggle/working/Wav2Lip/

README.md		face_detection	     models	       temp
audio.py		filelists	     mona.mp4	       wav2lip_train.py
checkpoints		hparams.py	     preprocess.py
color_syncnet_train.py	hq_wav2lip_train.py  requirements.txt
evaluation		inference.py	     results


In [29]:
# os.remove("/kaggle/working/Wav2Lip/audio.py")

In [28]:
!cd Wav2Lip && python inference.py --checkpoint_path checkpoints/wav2lip_gan.pth --face "../sample_data/mona.mp4" --audio "../sample_data/kennedy1min.wav"

Using cuda for inference.
Reading video frames...
Number of frames available for inference: 479
(80, 6402)
Length of mel chunks: 2394
  0%|                                                    | 0/19 [00:00<?, ?it/s]
  0%|                                                    | 0/30 [00:00<?, ?it/s][A
  3%|█▍                                          | 1/30 [00:19<09:31, 19.69s/it][A
  7%|██▉                                         | 2/30 [00:21<04:17,  9.19s/it][A
 10%|████▍                                       | 3/30 [00:23<02:35,  5.77s/it][A
 13%|█████▊                                      | 4/30 [00:24<01:48,  4.17s/it][A
 17%|███████▎                                    | 5/30 [00:26<01:22,  3.28s/it][A
 20%|████████▊                                   | 6/30 [00:28<01:06,  2.75s/it][A
 23%|██████████▎                                 | 7/30 [00:30<00:55,  2.41s/it][A
 27%|███████████▋                                | 8/30 [00:31<00:48,  2.19s/it][A
 30%|█████████████▏          

In [None]:
# use the "files" button on the left to download the result in the Wav2Lip/results/ folder.

## **Variations to try**


1.   Use more padding to include the chin region

In [29]:
!cd Wav2Lip && python inference.py --checkpoint_path checkpoints/wav2lip_gan.pth --face "../sample_data/mona.mp4" --audio "../sample_data/kennedy1min.wav" --pads 0 20 0 0 --resize_factor 2

Using cuda for inference.
Reading video frames...
Number of frames available for inference: 479
(80, 6402)
Length of mel chunks: 2394
  0%|                                                    | 0/19 [00:00<?, ?it/s]
  0%|                                                    | 0/30 [00:00<?, ?it/s][A
  3%|█▍                                          | 1/30 [00:05<02:37,  5.42s/it][A
  7%|██▉                                         | 2/30 [00:05<01:10,  2.52s/it][A
 10%|████▍                                       | 3/30 [00:06<00:43,  1.60s/it][A
 13%|█████▊                                      | 4/30 [00:06<00:30,  1.17s/it][A
 17%|███████▎                                    | 5/30 [00:07<00:23,  1.08it/s][A
 20%|████████▊                                   | 6/30 [00:07<00:18,  1.28it/s][A
 23%|██████████▎                                 | 7/30 [00:08<00:15,  1.46it/s][A
 27%|███████████▋                                | 8/30 [00:08<00:13,  1.60it/s][A
 30%|█████████████▏          

2.   Use resize_factor to reduce the video resolution, as there is a change you might get better results for lower resolution videos. Why? Because the model was trained on low resolution faces.

In [30]:
!cd Wav2Lip && python inference.py --checkpoint_path checkpoints/wav2lip_gan.pth --face "../sample_data/mona.mp4" --audio "../sample_data/kennedy1min.wav" --resize_factor 2

Using cuda for inference.
Reading video frames...
Number of frames available for inference: 479
(80, 6402)
Length of mel chunks: 2394
  0%|                                                    | 0/19 [00:00<?, ?it/s]
  0%|                                                    | 0/30 [00:00<?, ?it/s][A
  3%|█▍                                          | 1/30 [00:05<02:37,  5.44s/it][A
  7%|██▉                                         | 2/30 [00:05<01:10,  2.53s/it][A
 10%|████▍                                       | 3/30 [00:06<00:43,  1.61s/it][A
 13%|█████▊                                      | 4/30 [00:06<00:30,  1.17s/it][A
 17%|███████▎                                    | 5/30 [00:07<00:23,  1.08it/s][A
 20%|████████▊                                   | 6/30 [00:07<00:18,  1.29it/s][A
 23%|██████████▎                                 | 7/30 [00:08<00:15,  1.46it/s][A
 27%|███████████▋                                | 8/30 [00:08<00:13,  1.61it/s][A
 30%|█████████████▏          

# Let's adapt the inference code for Wav2Lip to export the PyTorch model and then convert it to Core ML:**bold text**

In [38]:
!zip -r file.zip /kaggle/working/

  adding: kaggle/working/ (stored 0%)
  adding: kaggle/working/Wav2Lip/ (stored 0%)
  adding: kaggle/working/Wav2Lip/README.md (deflated 58%)
  adding: kaggle/working/Wav2Lip/filelists/ (stored 0%)
  adding: kaggle/working/Wav2Lip/filelists/README.md (stored 0%)
  adding: kaggle/working/Wav2Lip/wav2lip_train.py (deflated 72%)
  adding: kaggle/working/Wav2Lip/.git/ (stored 0%)
  adding: kaggle/working/Wav2Lip/.git/config (deflated 31%)
  adding: kaggle/working/Wav2Lip/.git/objects/ (stored 0%)
  adding: kaggle/working/Wav2Lip/.git/objects/pack/ (stored 0%)
  adding: kaggle/working/Wav2Lip/.git/objects/pack/pack-180dd72707bcd892560f4775164c5d10c25b402d.idx (deflated 6%)
  adding: kaggle/working/Wav2Lip/.git/objects/pack/pack-180dd72707bcd892560f4775164c5d10c25b402d.pack (deflated 1%)
  adding: kaggle/working/Wav2Lip/.git/objects/info/ (stored 0%)
  adding: kaggle/working/Wav2Lip/.git/description (deflated 14%)
  adding: kaggle/working/Wav2Lip/.git/branches/ (stored 0%)
  adding: kaggle/w

In [39]:
from IPython.display import FileLink 
FileLink(r'file.zip')

In [32]:
!pip install coremltools

Collecting coremltools
  Downloading coremltools-7.1-cp310-none-manylinux1_x86_64.whl.metadata (2.4 kB)
Collecting cattrs (from coremltools)
  Downloading cattrs-23.2.3-py3-none-any.whl.metadata (10 kB)
Downloading coremltools-7.1-cp310-none-manylinux1_x86_64.whl (1.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.9/1.9 MB[0m [31m40.2 MB/s[0m eta [36m0:00:00[0m00:01[0m
[?25hDownloading cattrs-23.2.3-py3-none-any.whl (57 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m57.5/57.5 kB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: cattrs, coremltools
Successfully installed cattrs-23.2.3 coremltools-7.1


In [38]:
!cd /kaggle/working/Wav2Lip/Wave2Lip

/bin/bash: line 0: cd: /kaggle/working/Wav2Lip/Wave2Lip: No such file or directory


In [41]:
%cd Wav2Lip

/kaggle/working/Wav2Lip


In [42]:
!ls

README.md		evaluation	     inference.py      results
__pycache__		face_detection	     models	       temp
audio.py		filelists	     mona.mp4	       wav2lip_train.py
checkpoints		hparams.py	     preprocess.py
color_syncnet_train.py	hq_wav2lip_train.py  requirements.txt


In [43]:
import torch
import coremltools as ct
from models.wav2lip import Wav2Lip

In [46]:
checkpoint_path = "/kaggle/working/Wav2Lip/checkpoints/wav2lip_gan.pth"
wav2lip_model = Wav2Lip()
checkpoint = torch.load(checkpoint_path)
wav2lip_model.load_state_dict(checkpoint['state_dict'])
wav2lip_model.eval()

Wav2Lip(
  (face_encoder_blocks): ModuleList(
    (0): Sequential(
      (0): Conv2d(
        (conv_block): Sequential(
          (0): Conv2d(6, 16, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3))
          (1): BatchNorm2d(16, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (act): ReLU()
      )
    )
    (1): Sequential(
      (0): Conv2d(
        (conv_block): Sequential(
          (0): Conv2d(16, 32, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
          (1): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (act): ReLU()
      )
      (1): Conv2d(
        (conv_block): Sequential(
          (0): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
          (1): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (act): ReLU()
      )
      (2): Conv2d(
        (conv_block): Sequential(
          (0): Conv2d(32, 32, kernel_size=(3, 3

In [47]:
# Define example input shapes for face and audio (adjust according to your model's input shape)
example_face_input = torch.randn(1, 6, 96, 96)  # Example face input shape: [batch_size, num_channels, height, width]
example_audio_input = torch.randn(1, 1, 100, 40)  # Example audio input shape: [batch_size, num_channels, time_steps, features]

# Convert to CoreML format
coreml_model = ct.convert(wav2lip_model, inputs=[
    ct.TensorType(name="face_input", shape=example_face_input.shape),
    ct.TensorType(name="audio_input", shape=example_audio_input.shape)
])

# Save the CoreML model to a file
coreml_model.save("converted_wav2lip_model.mlmodel")


ValueError: Unable to determine the type of the model, i.e. the source framework. Please provide the value of argument "source", from one of ["tensorflow", "pytorch", "milinternal"]. Note that model conversion requires the source package that generates the model. Please make sure you have the appropriate version of source package installed. E.g., if you're converting model originally trained with TensorFlow 1.14, make sure you have `tensorflow==1.14` installed.

In [48]:
import torch
import coremltools as ct
from models.wav2lip import Wav2Lip  # Import the Wav2Lip model definition from the cloned repository

# Load the Wav2Lip model checkpoint
checkpoint_path = "checkpoints/wav2lip_gan.pth"
wav2lip_model = Wav2Lip()
checkpoint = torch.load(checkpoint_path)
wav2lip_model.load_state_dict(checkpoint['state_dict'])
wav2lip_model.eval()

# Define example input shapes for face and audio (adjust according to your model's input shape)
example_face_input = torch.randn(1, 6, 96, 96)  # Example face input shape: [batch_size, num_channels, height, width]
example_audio_input = torch.randn(1, 1, 100, 40)  # Example audio input shape: [batch_size, num_channels, time_steps, features]

# Convert to CoreML format
coreml_model = ct.convert(wav2lip_model, inputs=[
    ct.TensorType(name="face_input", shape=example_face_input.shape),
    ct.TensorType(name="audio_input", shape=example_audio_input.shape)
], source='pytorch')

# Save the CoreML model to a file
coreml_model.save("converted_wav2lip_model.mlmodel")


TypeError: @model must either be a TorchScript object (or .pt or .pth file) or an ExportedProgram object (if using torch.export based API), received: <class 'models.wav2lip.Wav2Lip'>

In [49]:
import torch
import coremltools as ct
from models.wav2lip import Wav2Lip  # Import the Wav2Lip model definition from the cloned repository

# Load the Wav2Lip model checkpoint
checkpoint_path = "checkpoints/wav2lip_gan.pth"
wav2lip_model = Wav2Lip()
checkpoint = torch.load(checkpoint_path)
wav2lip_model.load_state_dict(checkpoint['state_dict'])
wav2lip_model.eval()

# Convert the PyTorch model to TorchScript
torchscript_model = torch.jit.trace(wav2lip_model, (torch.randn(1, 6, 96, 96), torch.randn(1, 1, 100, 40)))

# Define example input shapes for face and audio (adjust according to your model's input shape)
example_face_input = torch.randn(1, 6, 96, 96)  # Example face input shape: [batch_size, num_channels, height, width]
example_audio_input = torch.randn(1, 1, 100, 40)  # Example audio input shape: [batch_size, num_channels, time_steps, features]

# Convert to CoreML format
coreml_model = ct.convert(torchscript_model, inputs=[
    ct.TensorType(name="face_input", shape=example_face_input.shape),
    ct.TensorType(name="audio_input", shape=example_audio_input.shape)
], source='pytorch')

# Save the CoreML model to a file
coreml_model.save("converted_wav2lip_model.mlmodel")


RuntimeError: Given groups=1, weight of size [32, 1, 3, 3], expected input[1, 6, 96, 96] to have 1 channels, but got 6 channels instead

In [50]:
import torch
import coremltools as ct
from models.wav2lip import Wav2Lip  # Import the Wav2Lip model definition from the cloned repository

# Load the Wav2Lip model checkpoint
checkpoint_path = "checkpoints/wav2lip_gan.pth"
wav2lip_model = Wav2Lip()
checkpoint = torch.load(checkpoint_path)
wav2lip_model.load_state_dict(checkpoint['state_dict'])
wav2lip_model.eval()

# Convert the PyTorch model to TorchScript
torchscript_model = torch.jit.trace(wav2lip_model, (torch.randn(1, 6, 96, 96), torch.randn(1, 1, 100, 40)))

# Define example input shapes for face and audio (adjust according to your model's input shape)
example_face_input = torch.randn(1, 6, 96, 96)  # Example face input shape: [batch_size, num_channels, height, width]
example_audio_input = torch.randn(1, 1, 100, 40)  # Example audio input shape: [batch_size, num_channels, time_steps, features]

# Convert to CoreML format
coreml_model = ct.convert(torchscript_model, inputs=[
    ct.TensorType(name="face_input", shape=example_face_input.shape),
    ct.TensorType(name="audio_input", shape=example_audio_input.shape)
], source='pytorch')

# Save the CoreML model to a file
coreml_model.save("converted_wav2lip_model.mlmodel")


RuntimeError: Given groups=1, weight of size [32, 1, 3, 3], expected input[1, 6, 96, 96] to have 1 channels, but got 6 channels instead

In [51]:
import torch
import coremltools as ct
from models.wav2lip import Wav2Lip  # Import the Wav2Lip model definition from the cloned repository

# Load the Wav2Lip model checkpoint
checkpoint_path = "checkpoints/wav2lip_gan.pth"
wav2lip_model = Wav2Lip()
checkpoint = torch.load(checkpoint_path)
wav2lip_model.load_state_dict(checkpoint['state_dict'])
wav2lip_model.eval()

# Define a new model with the correct input channels for the audio_encoder
class FixedAudioEncoder(torch.nn.Module):
    def __init__(self):
        super(FixedAudioEncoder, self).__init__()
        self.audio_encoder = wav2lip_model.audio_encoder
        self.audio_encoder[0] = torch.nn.Conv2d(6, 32, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)

    def forward(self, x):
        return self.audio_encoder(x)

# Convert the PyTorch model to TorchScript
torchscript_model = torch.jit.trace(FixedAudioEncoder(), (torch.randn(1, 6, 96, 96), torch.randn(1, 1, 100, 40)))

# Define example input shapes for face and audio (adjust according to your model's input shape)
example_face_input = torch.randn(1, 6, 96, 96)  # Example face input shape: [batch_size, num_channels, height, width]
example_audio_input = torch.randn(1, 1, 100, 40)  # Example audio input shape: [batch_size, num_channels, time_steps, features]

# Convert to CoreML format
coreml_model = ct.convert(torchscript_model, inputs=[
    ct.TensorType(name="face_input", shape=example_face_input.shape),
    ct.TensorType(name="audio_input", shape=example_audio_input.shape)
], source='pytorch')

# Save the CoreML model to a file
coreml_model.save("converted_wav2lip_model.mlmodel")


TypeError: FixedAudioEncoder.forward() takes 2 positional arguments but 3 were given

In [72]:
import torch
import coremltools as ct
from models.wav2lip import Wav2Lip  # Import the Wav2Lip model definition from the cloned repository

# Load the Wav2Lip model checkpoint
checkpoint_path = "checkpoints/wav2lip_gan.pth"
wav2lip_model = Wav2Lip()
checkpoint = torch.load(checkpoint_path)
wav2lip_model.load_state_dict(checkpoint['state_dict'])
wav2lip_model.eval()



Wav2Lip(
  (face_encoder_blocks): ModuleList(
    (0): Sequential(
      (0): Conv2d(
        (conv_block): Sequential(
          (0): Conv2d(6, 16, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3))
          (1): BatchNorm2d(16, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (act): ReLU()
      )
    )
    (1): Sequential(
      (0): Conv2d(
        (conv_block): Sequential(
          (0): Conv2d(16, 32, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
          (1): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (act): ReLU()
      )
      (1): Conv2d(
        (conv_block): Sequential(
          (0): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
          (1): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (act): ReLU()
      )
      (2): Conv2d(
        (conv_block): Sequential(
          (0): Conv2d(32, 32, kernel_size=(3, 3

In [86]:
# Set the model in evaluation mode.
# Trace the model with random data.

example_audio_input = torch.rand(16, 1,  80, 16)  # Adjust parameters as needed (B, T, 1, 80, 16)
example_face_input = torch.rand(16, 3, 224, 224)  # Adjust parameters as needed, replace 3 with appropriate number of channels for your face sequence
def wrapper_fn(inputs):
  audio_sequences = inputs["audio_sequences"]
  face_sequences = inputs["face_sequences"]
  return wav2lip_model(audio_sequences, face_sequences)


trace_model = torch.jit.trace(wrapper_fn, {"audio_sequences": example_audio_input, "face_sequences": example_face_input})


RuntimeError: Cannot insert a Tensor that requires grad as a constant. Consider making it a parameter or input, or detaching the gradient
Tensor:
(1,1,.,.) = 
 -0.2849  0.0494  0.1350
 -0.1719 -0.0786  0.3059
 -0.2688 -0.1040 -0.1412

(2,1,.,.) = 
  0.0833 -0.0769  0.0045
  0.1553 -0.1660 -0.2939
  0.3756  0.0214 -0.0273

(3,1,.,.) = 
  0.1263  0.1586  0.0319
 -0.2972  0.0231 -0.2831
  0.1649 -0.0896  0.2043

(4,1,.,.) = 
 -0.2625  0.2454  0.0833
  0.3284  0.0173  0.1407
  0.2964 -0.1439 -0.3059

(5,1,.,.) = 
 -0.2444  0.3121  0.0638
  0.0205  0.0466  0.0344
  0.0894 -0.2584 -0.0880

(6,1,.,.) = 
 -0.3399  0.3618  0.0560
 -0.0343  0.0467  0.2836
  0.1662  0.1220 -0.0975

(7,1,.,.) = 
  0.3211  0.0611 -0.1074
  0.3291  0.2271 -0.2872
  0.1964  0.1208  0.2410

(8,1,.,.) = 
 -0.0673  0.0224 -0.3329
  0.2593 -0.2240  0.3489
 -0.2244  0.1752  0.1792

(9,1,.,.) = 
 -0.0195 -0.2636  0.1705
 -0.2485 -0.0753  0.1470
  0.2069  0.1935  0.1350

(10,1,.,.) = 
  0.0277 -0.3266  0.2836
  0.2555 -0.0924  0.3044
 -0.0215  0.0128 -0.3055

(11,1,.,.) = 
 -0.1421  0.3573  0.1551
 -0.0064 -0.1837 -0.0043
 -0.0075 -0.2038 -0.0954

(12,1,.,.) = 
  0.1576  0.2724 -0.2466
 -0.2031  0.3189 -0.2337
  0.0005 -0.0293  0.1977

(13,1,.,.) = 
  0.1997 -0.1819  0.0354
 -0.2503  0.0650  0.3838
 -0.2805  0.3555 -0.2611

(14,1,.,.) = 
 -0.0901 -0.2149  0.1347
 -0.0054 -0.3018 -0.2811
 -0.1563 -0.0974  0.1974

(15,1,.,.) = 
 -0.0457 -0.0266 -0.3267
  0.0327  0.1611  0.3509
 -0.2634 -0.2615  0.2486

(16,1,.,.) = 
  0.3424 -0.0477 -0.2168
  0.3080 -0.0064 -0.0506
 -0.1473 -0.1706 -0.0963

(17,1,.,.) = 
  0.2850 -0.0511 -0.2159
 -0.1547 -0.0721  0.2407
 -0.0228 -0.0649  0.0787

(18,1,.,.) = 
 -0.0988  0.0618 -0.2121
 -0.2173 -0.2498  0.1344
 -0.3672 -0.1525  0.0710

(19,1,.,.) = 
 -0.0921  0.1980 -0.2857
  0.1227  0.2569 -0.2657
 -0.0266  0.1016  0.0310

(20,1,.,.) = 
 -0.0219  0.0017  0.2221
  0.1637  0.2949 -0.1231
 -0.1397 -0.2286 -0.1036

(21,1,.,.) = 
  0.1868 -0.2839 -0.1238
 -0.1917 -0.3410  0.2778
 -0.0039  0.1756 -0.1072

(22,1,.,.) = 
 -0.1192  0.0576  0.2964
  0.0853 -0.2826 -0.2292
  0.2484 -0.0447  0.1444

(23,1,.,.) = 
 -0.2743  0.0993  0.1740
 -0.1291  0.1637 -0.1940
  0.2016  0.0906 -0.1624

(24,1,.,.) = 
 -0.1450  0.2674  0.2532
 -0.0734  0.1401  0.0449
  0.0763  0.0858  0.0351

(25,1,.,.) = 
  0.0946  0.3474  0.0469
  0.2296  0.0645  0.2560
  0.1764 -0.1430 -0.2852

(26,1,.,.) = 
  0.2739  0.2612  0.1166
 -0.1493 -0.2555 -0.0334
 -0.0275 -0.0515 -0.1298

(27,1,.,.) = 
  0.1236 -0.2213 -0.3028
 -0.1496  0.1639  0.1085
  0.0094  0.1025  0.1916

(28,1,.,.) = 
  0.2775  0.3092  0.2545
 -0.0932  0.0014  0.2236
 -0.2240 -0.3402  0.2044

(29,1,.,.) = 
 -0.0491  0.1747 -0.0440
  0.2268 -0.2758 -0.1798
  0.0849 -0.0054 -0.1528

(30,1,.,.) = 
  0.0593  0.3520  0.0552
  0.2907 -0.2408 -0.0040
 -0.3094 -0.2979  0.1228

(31,1,.,.) = 
 -0.0011  0.3318 -0.2145
 -0.3360  0.1518  0.3259
  0.2134 -0.1470 -0.2856

(32,1,.,.) = 
  0.3212  0.0190  0.1875
 -0.1264 -0.2413 -0.2956
 -0.0581 -0.3145  0.1649
[ torch.FloatTensor{32,1,3,3} ]

In [67]:
os.remove("/kaggle/working/Wav2Lip/models/wav2lip.py")
!cp "/kaggle/input/modelfile/wav2lip.py" /kaggle/working/Wav2Lip/models
!ls /kaggle/working/Wav2Lip/models

__init__.py  __pycache__  conv.py  syncnet.py  wav2lip.py


In [71]:
import torch
import coremltools as ct

# Instantiate the model
model = Wav2Lip()

# Load the state dict if needed
# model.load_state_dict(model_state_dict)

# Convert the model to TorchScript if not already
if not isinstance(model, torch.jit.ScriptModule):
    model = torch.jit.script(model)

# Define input and output shapes
input_shape = (batch_size, sequence_length, num_channels_audio, height_audio, width_audio)
output_shape = (batch_size, num_channels_face, sequence_length, height_face, width_face)

# Define input and output names
input_name = "audio_sequences"
output_name = "output"

# Define placeholder values for input and output shapes
batch_size = 1
sequence_length = 10
num_channels_audio = 1
height_audio = 80
width_audio = 16
num_channels_face = 3
height_face = 96
width_face = 96

# Create a dummy input tensor with placeholder values
audio_sequences = torch.randn(input_shape)

# Define the CoreML model
coreml_model = ct.convert(
    model,
    inputs=[ct.TensorType(shape=input_shape, name=input_name)],
    outputs=[ct.TensorType(shape=output_shape, name=output_name)]
)

# Save the CoreML model
coreml_model.save("wav2lip.mlmodel")


RuntimeError: Can't redefine method: forward on class: __torch__.models.wav2lip.___torch_mangle_1702.Wav2Lip (of Python compilation unit at: 0x569d661453f0)