Automatic DeepFake Creation (Tortoise voice cloning + wav2lip)

Note: A video is required. If an audio file is also provided, the voice will be cloned from that audio; otherwise it is cloned from the video's own audio track.

# **The video must show a face looking at the camera at all times.**

## **The video (and optional audio) file must be in a Google Drive folder named 'deepfake'. No other files should exist there.**

wav2lip code taken from https://github.com/snehitvaddi/Deepfake-using-Wave2Lip

In [1]:
#@title Upload video.mp4 (video to overlay voice) & voice.mp3 (voice to clone) files - Should be mp3 and mp4, having any name
# Mount Google Drive so the folder holding the input files is reachable under /content/gdrive.
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [2]:
# Explicit %cd magic with an absolute path: the cell no longer depends on automagic
# being enabled, nor on the notebook still being in /content when it is (re-)run.
%cd /content/gdrive/MyDrive/deepfake

/content/gdrive/MyDrive/deepfake


In [3]:
# Google Drive folder with the input video/audio; later cells also read and write their files here.
base_path='/content/gdrive/MyDrive/deepfake' #Specify path of video/audio

In [4]:
#@title Install TTS, pydub to create folders with audio chunks, and moviepy to modify duration of audio/video
# %pip (rather than !pip) installs into the environment of the running kernel.
%pip install -q pydub==0.25.1 TTS==0.22.0 moviepy==1.0.3

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m938.0/938.0 kB[0m [31m24.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.1/12.1 MB[0m [31m65.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m289.9/289.9 kB[0m [31m30.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m71.1/71.1 kB[0m [31m8.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m85.7/85.7 kB[0m [31m10.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.1/12.1 MB[0m [31m68.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.2/51.2 kB[0m [31m7.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.4/1.4 MB[0m [31m55.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━

In [5]:
#@title English text that we want to read with the cloned voice - This will be inserted in the video too
# Separate the prompt into segments with '|' every one to two sentences
# (roughly every ~20 seconds of speech); a later cell splits the text on '|'.
text_to_read="""Joining two modalities results in a surprising increase in generalization!
What would happen if we combined them all? |"""

In [6]:
#@title Rename audio and video files to be used below
#Prompt: the user selects and uploads to google colab one audio and one video file. Rename the audio file to 'input_voice.mp3' and the video to 'input_video.mp4'
import os

# Loop over files in the directory
# Normalise the uploaded file names so the rest of the notebook can rely on them.
# The folder is addressed through base_path for both listing and renaming, so the
# cell does not depend on the notebook's current working directory.
for file in sorted(os.listdir(base_path)):
    filename = os.path.join(base_path, file)
    if not os.path.isfile(filename):
        continue  # ignore sub-folders

    extension = os.path.splitext(file)[1].lower()  # case-insensitive: also accept .MP3 / .MP4
    if extension == '.mp3':
        new_filename = os.path.join(base_path, 'input_voice.mp3')
    elif extension == '.mp4':
        new_filename = os.path.join(base_path, 'video_full.mp4')
    else:
        continue

    if filename == new_filename:
        continue  # already has the expected name (e.g. the cell is being re-run)
    if os.path.exists(new_filename):
        # On POSIX systems os.rename silently replaces an existing file; keep the
        # file that already carries the expected name instead of overwriting it.
        print(f"Skipping '{file}': '{os.path.basename(new_filename)}' already exists")
        continue
    os.rename(filename, new_filename)

#If only video is provided:
from moviepy.editor import VideoFileClip

def extract_audio(input_video, output_audio):
    """Write the audio track of a video file to a separate audio file.

    Args:
        input_video: Path of the video file to read.
        output_audio: Path of the audio file to create.

    Raises:
        ValueError: If the video has no audio track to extract.
    """
    video = VideoFileClip(input_video)
    try:
        audio = video.audio
        if audio is None:
            # Without this check the failure is an opaque AttributeError on None.
            raise ValueError(f"'{input_video}' has no audio track to clone the voice from")
        audio.write_audiofile(output_audio)
    finally:
        # Always release the underlying reader, even when writing fails.
        video.close()

# Provide the input video file path and desired output audio file path
# (both inside base_path, where the following cell looks for 'input_voice.mp3')
input_video = os.path.join(base_path, 'video_full.mp4')
output_audio = os.path.join(base_path, 'input_voice.mp3')

#Decide if voice will be cloned from video or audio
# Test the file extension itself: a substring test on the full path would also
# match names such as 'take.mp3.bak' or a parent folder containing '.mp3'.
mp3_check = 0
for file in os.listdir(base_path):
    if file.lower().endswith('.mp3'):
        mp3_check = 1
        break

if mp3_check == 0:
    print("Voice will be cloned from video")
    extract_audio(input_video, output_audio)

In [7]:
#@title Create folder with 10 secs chunks of audio to be used as input in Tortoise
from pydub import AudioSegment

def split_audio_to_clips(audio_file, output_dir, clip_length=10000, sample_rate=22050):
    """Split an MP3 file into fixed-length WAV clips.

    Args:
        audio_file: Path of the MP3 file to split.
        output_dir: Directory that receives the clips, named ``1.wav``, ``2.wav``, ...
            It is created when missing.
        clip_length: Clip duration in milliseconds (pydub slices audio by milliseconds).
        sample_rate: Frame rate in Hz that every clip is converted to.

    Raises:
        ValueError: If ``clip_length`` is not positive.

    Only whole clips are written; a trailing remainder shorter than ``clip_length``
    is dropped. When the recording is shorter than a single clip, the whole
    recording is saved as one clip so that non-empty audio never yields an empty
    output directory.
    """
    # Fail fast, before any file is read or created.
    if clip_length <= 0:
        raise ValueError(f"clip_length must be a positive number of milliseconds, got {clip_length!r}")

    # Load the audio file
    audio = AudioSegment.from_mp3(audio_file)

    # Create the output directory if it doesn't exist
    os.makedirs(output_dir, exist_ok=True)

    total_ms = len(audio)
    num_clips = total_ms // clip_length
    if num_clips == 0 and total_ms > 0:
        # Shorter than one clip: keep everything rather than saving nothing.
        boundaries = [(0, total_ms)]
    else:
        boundaries = [(i * clip_length, (i + 1) * clip_length) for i in range(num_clips)]

    # Split the audio into clips and save them as WAV files
    for index, (start_time, end_time) in enumerate(boundaries, start=1):
        clip = audio[start_time:end_time]

        # 2 bytes per sample, i.e. 16-bit PCM
        clip = clip.set_sample_width(2)

        # Convert to the requested sample rate (22050 Hz by default)
        clip = clip.set_frame_rate(sample_rate)

        clip.export(os.path.join(output_dir, f"{index}.wav"), format="wav")

    print(f"{len(boundaries)} clips saved in '{output_dir}'.")

if __name__ == "__main__":
    # MP3 holding the reference voice, and the sub-folder that receives its WAV clips;
    # both live directly under base_path.
    input_audio_file = f"{base_path}/input_voice.mp3"
    subdirectory_name = f"{base_path}/voice"

    split_audio_to_clips(audio_file=input_audio_file, output_dir=subdirectory_name)

121 clips saved in '/content/gdrive/MyDrive/deepfake/voice'.


In [8]:
#@title Download and run TTS tortoise model
from TTS.api import TTS
# Instantiating the model by name downloads its checkpoints to the local TTS cache;
# the synthesis cell later loads the weights from that cache directory.
# NOTE(review): `tts` itself is not referenced again in the cells visible here.
tts = TTS("tts_models/en/multi-dataset/tortoise-v2")

 > Downloading model to /root/.local/share/tts/tts_models--en--multi-dataset--tortoise-v2


100%|█████████▉| 1.71G/1.72G [00:24<00:00, 77.2MiB/s]
100%|██████████| 1.72G/1.72G [00:26<00:00, 64.0MiB/s]

  1%|          | 6.54M/976M [00:00<00:14, 65.4MiB/s][A
  2%|▏         | 15.3M/976M [00:00<00:12, 78.7MiB/s][A
  2%|▏         | 24.4M/976M [00:00<00:11, 84.0MiB/s][A
  3%|▎         | 33.0M/976M [00:00<00:11, 84.9MiB/s][A
  4%|▍         | 41.7M/976M [00:00<00:10, 85.7MiB/s][A
  5%|▌         | 50.4M/976M [00:00<00:10, 86.0MiB/s][A
  6%|▌         | 59.1M/976M [00:00<00:10, 86.5MiB/s][A
  7%|▋         | 67.8M/976M [00:00<00:10, 84.5MiB/s][A
  8%|▊         | 76.3M/976M [00:00<00:10, 83.0MiB/s][A
  9%|▊         | 84.8M/976M [00:01<00:10, 83.6MiB/s][A
 10%|▉         | 93.1M/976M [00:01<00:10, 81.8MiB/s][A
 10%|█         | 102M/976M [00:01<00:10, 82.6MiB/s] [A
 11%|█▏        | 110M/976M [00:01<00:10, 82.6MiB/s][A
 12%|█▏        | 118M/976M [00:01<00:12, 68.1MiB/s][A
 13%|█▎        | 125M/976M [00:01<00:12, 68.9MiB/s][A
 14%|█▎        | 133M/976M [00:01<00:11, 71.5MiB/s][A

 > Model's license - apache 2.0
 > Check https://choosealicense.com/licenses/apache-2.0/ for more info.
 > Using model: tortoise


In [9]:
#@title Install tortoise original repository to be used to merge cloned audio pieces together
# NOTE(review): scipy is upgraded without a version pin, so the resolved version
# (and the numpy it pulls in) can change between runs.
!pip3 install -U scipy

# The repository is cloned into the notebook's current working directory.
!git clone https://github.com/jnordberg/tortoise-tts.git
# %cd changes the working directory for all following cells; a later cell changes it back.
%cd tortoise-tts
!pip3 install -r requirements.txt
# Pinned versions installed after requirements.txt, so these pins take precedence over it.
!pip3 install transformers==4.19.0 einops==0.5.0 rotary_embedding_torch==0.1.5 unidecode==1.3.5
# Installs the cloned package itself; this is what makes `tortoise.utils.text` importable below.
!python3 setup.py install

Collecting scipy
  Downloading scipy-1.13.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (38.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m38.6/38.6 MB[0m [31m18.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting numpy<2.3,>=1.22.4 (from scipy)
  Downloading numpy-1.26.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (18.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m18.2/18.2 MB[0m [31m83.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: numpy, scipy
  Attempting uninstall: numpy
    Found existing installation: numpy 1.22.0
    Uninstalling numpy-1.22.0:
      Successfully uninstalled numpy-1.22.0
  Attempting uninstall: scipy
    Found existing installation: scipy 1.11.4
    Uninstalling scipy-1.11.4:
      Successfully uninstalled scipy-1.11.4
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the 

running install
!!

        ********************************************************************************
        Please avoid running ``setup.py`` directly.
        Instead, use pypa/build, pypa/installer, pypa/build or
        other standards-based tools.

        See https://blog.ganssle.io/articles/2021/10/setup-py-deprecated.html for details.
        ********************************************************************************

!!
  self.initialize_options()
!!

        ********************************************************************************
        Please avoid running ``setup.py`` and ``easy_install``.
        Instead, use pypa/build, pypa/installer, pypa/build or
        other standards-based tools.

        See https://github.com/pypa/setuptools/issues/917 for details.
        ********************************************************************************

!!
  self.initialize_options()
running bdist_egg
running egg_info
creating TorToiSe.egg-info
writing TorToi

In [10]:
from tortoise.utils.text import split_and_recombine_text
from time import time

# Process text
text = text_to_read
if '|' in text:
    print("Found the '|' character in your text, which I will use as a cue for where to split it up. If this was not "
          "your intent, please remove all '|' characters from the input.")
    # Strip surrounding whitespace and drop empty pieces (e.g. the one produced by a
    # trailing '|'), which would otherwise be passed to the model as empty prompts.
    texts = [piece.strip() for piece in text.split('|') if piece.strip()]
else:
    texts = split_and_recombine_text(text) #If only one piece of text (<20secs), convert it to list

if not texts:
    raise ValueError("text_to_read contains no text to synthesize")

Found the '|' character in your text, which I will use as a cue for where to split it up. If this was notyour intent, please remove all '|' characters from the input.


In [11]:
# Return to the working folder by name instead of relatively, so running this cell
# more than once cannot walk out of the Drive folder.
%cd {base_path}

/content/gdrive/MyDrive/deepfake


In [None]:
#Tortoise Fastest Inference from script (~2min per 20secs of audio in Colab)
#Can also be downloaded from https://huggingface.co/jbetker/tortoise-tts-v2/tree/main
import torch
import torchaudio

from TTS.tts.configs.tortoise_config import TortoiseConfig
from TTS.tts.models.tortoise import Tortoise
import IPython
import gc

# Build the Tortoise model from its default config and load the weights from the TTS cache directory.
config = TortoiseConfig()
model = Tortoise.init_from_config(config)
model.load_checkpoint(
    config,
    checkpoint_dir="/root/.local/share/tts/tts_models--en--multi-dataset--tortoise-v2",
    eval=True,
)  # Deepspeed doesn't work
model.cuda()

# Synthesize the text one piece at a time (around 20secs each), saving every piece
# on its own and collecting the waveforms so they can be joined afterwards.
all_parts = []
for part_index, part_text in enumerate(texts):
    # cloning a speaker (presumably from the clips in the "voice" folder under base_path)
    output_dict = model.synthesize(part_text, config, speaker_id="voice", voice_dirs=base_path)

    # Convert once and reuse the tensor for both the file and the combined list
    waveform = torch.tensor(output_dict["wav"]).squeeze(0)
    torchaudio.save(f"tortoise_v2_script_{part_index}.wav", waveform, 24000)
    all_parts.append(waveform)

    # Drop the model output and collect garbage before the next piece
    del output_dict
    gc.collect()

# Join all pieces along the time axis, save the result and show an audio player for it
torchaudio.save('combined.wav', torch.cat(all_parts, dim=-1), 24000)
IPython.display.Audio('combined.wav')

Free memory so that slightly longer texts can be processed.

In [13]:
# Release the list of synthesized waveform tensors; the combined audio is already saved to disk.
del all_parts

In [14]:
# Clear every user-defined name (model, tensors, helpers) with IPython's own reset.
# Unlike deleting entries from globals() by hand, this keeps the shell's built-in
# entries (In, Out, get_ipython, exit, quit) intact and also drops the cached cell
# outputs, which could otherwise keep large objects alive.
%reset -f

In [15]:
import gc
import torch

# Perform garbage collection first, so tensors that are no longer referenced are freed ...
gc.collect()

# ... then return PyTorch's cached (but unused) GPU memory to the driver. The model
# was moved to the GPU in this kernel, and Wav2Lip later runs as a separate process
# that cannot use memory still held by this process's caching allocator.
if torch.cuda.is_available():
    torch.cuda.empty_cache()

21

In [None]:
# Download the synthesized speech to the local machine. The Drive path is spelled out
# in full because the notebook's variables were cleared a few cells earlier.
from google.colab import files
files.download('/content/gdrive/MyDrive/deepfake/combined.wav')
#Individual pieces can also be found in this folder

In [16]:
#@title Confirm that audio same length as video. If not, keep the smallest one and cut the other (or cut them both to 20secs). This is needed for wav2lip to work
#ChatGPT Prompt: Create python code that compares an audio.wav with a video.mp4 files and if the duration of one is bigger than the other,
# it cuts the largest one to be the same duration as the smallest. If any of them is bigger than 20secs then raise an error

#Needed to avoid errors with encoding
import locale
locale.getpreferredencoding = lambda: "UTF-8"

from moviepy.editor import VideoFileClip, AudioFileClip
import os

def compare_audio_video_duration(audio_file, video_file):
    """Write 'input_audio.wav' and 'input_video.mp4' with matching durations.

    The longer of the two inputs is trimmed to the duration of the shorter one and
    the other input is copied unchanged. When both already have the same duration,
    both are copied, so the two output files always exist afterwards. The outputs
    are written to the current working directory and the inputs are left in place,
    which makes the cell safe to re-run.

    Args:
        audio_file: Path of the audio file (e.g. the synthesized 'combined.wav').
        video_file: Path of the video file.

    Note:
        No upper bound on the duration is enforced here. Either the video or the
        audio should be <20 secs; it might work for up to 30secs, but that is not
        guaranteed.
    """
    import shutil  # local import: only this function needs it

    def _copy_as(src, dst):
        # Copy rather than rename so the source file survives; skip when src is dst.
        if not (os.path.exists(dst) and os.path.samefile(src, dst)):
            shutil.copyfile(src, dst)

    audio = AudioFileClip(audio_file)
    video = VideoFileClip(video_file)
    try:
        audio_duration = audio.duration
        video_duration = video.duration
        min_duration = min(audio_duration, video_duration)

        if video_duration > min_duration:
            video.subclip(0, min_duration).write_videofile('input_video.mp4')
        else:
            _copy_as(video_file, 'input_video.mp4')

        if audio_duration > min_duration:
            audio.subclip(0, min_duration).write_audiofile('input_audio.wav')
        else:
            _copy_as(audio_file, 'input_audio.wav')
    finally:
        # Release the underlying readers even when writing fails.
        audio.close()
        video.close()

# Example usage
# Match the synthesized speech against the full-length video. The resulting
# 'input_audio.wav' / 'input_video.mp4' land in the current working directory
# (assumed to still be the Drive folder that the Wav2Lip inference cell reads from).
compare_audio_video_duration("combined.wav", "video_full.mp4") #input_voice.mp3

100%|██████████| 4.40k/4.40k [13:18<00:00, 5.51iB/s]

Moviepy - Building video input_video.mp4.
MoviePy - Writing audio in input_videoTEMP_MPY_wvf_snd.mp3



chunk:   0%|          | 0/831 [00:00<?, ?it/s, now=None][A
chunk:   9%|▉         | 77/831 [00:00<00:00, 768.08it/s, now=None][A
chunk:  19%|█▊        | 154/831 [00:01<00:05, 122.73it/s, now=None][A
chunk:  26%|██▋       | 220/831 [00:01<00:03, 185.86it/s, now=None][A
chunk:  36%|███▌      | 298/831 [00:01<00:01, 271.10it/s, now=None][A
chunk:  43%|████▎     | 359/831 [00:01<00:01, 258.97it/s, now=None][A
chunk:  51%|█████     | 424/831 [00:01<00:01, 319.96it/s, now=None][A
chunk:  57%|█████▋    | 477/831 [00:02<00:02, 176.23it/s, now=None][A
chunk:  66%|██████▋   | 551/831 [00:02<00:01, 241.15it/s, now=None][A
chunk:  75%|███████▍  | 622/831 [00:02<00:00, 307.26it/s, now=None][A
chunk:  84%|████████▍ | 702/831 [00:02<00:00, 389.88it/s, now=None][A
chunk:  92%|█████████▏| 766/831 [00:02<00:00, 435.94it/s, now=None][A
chunk: 100%|█████████▉| 829/831 [00:03<00:00, 279.72it/s, now=None][A
100%|██████████| 4.40k/4.40k [13:21<00:00, 5.49iB/s]

MoviePy - Done.
Moviepy - Writing video input_video.mp4




t:   0%|          | 0/1130 [00:00<?, ?it/s, now=None][A
t:   1%|          | 6/1130 [00:00<00:20, 53.94it/s, now=None][A
t:   1%|          | 12/1130 [00:00<00:31, 35.46it/s, now=None][A
t:   1%|▏         | 16/1130 [00:00<00:31, 35.80it/s, now=None][A
t:   2%|▏         | 20/1130 [00:00<00:30, 36.00it/s, now=None][A
t:   2%|▏         | 25/1130 [00:00<00:28, 38.21it/s, now=None][A
t:   3%|▎         | 29/1130 [00:00<00:31, 34.80it/s, now=None][A
t:   3%|▎         | 35/1130 [00:00<00:27, 39.68it/s, now=None][A
t:   4%|▎         | 40/1130 [00:01<00:26, 41.42it/s, now=None][A
t:   4%|▍         | 45/1130 [00:01<00:25, 41.77it/s, now=None][A
t:   4%|▍         | 50/1130 [00:01<00:26, 40.02it/s, now=None][A
t:   5%|▍         | 55/1130 [00:01<00:32, 33.06it/s, now=None][A
t:   5%|▌         | 59/1130 [00:02<01:15, 14.15it/s, now=None][A
t:   5%|▌         | 62/1130 [00:02<01:22, 12.92it/s, now=None][A
t:   6%|▌         | 65/1130 [00:02<01:36, 11.08it/s, now=None][A
t:   6%|▌         |

Moviepy - Done !
Moviepy - video ready input_video.mp4




---



In [18]:
# Explicit %cd magic: does not rely on automagic and stays valid if more lines are added to the cell.
%cd /content

/content


In [2]:
#@title <h1>Install Wav2Lip</h1>
#@markdown * Install dependencies
#@markdown * Download models
# !rm -rf /content/sample_data
# !mkdir /content/sample_data

# Clone into the current working directory; the absolute paths used below assume that is /content.
!git clone https://github.com/zabique/Wav2Lip

#download the pretrained model
# Two checkpoints are fetched: the GAN variant (wav2lip_gan.pth) and the plain one (wav2lip.pth).
!wget 'https://iiitaphyd-my.sharepoint.com/personal/radrabha_m_research_iiit_ac_in/_layouts/15/download.aspx?share=EdjI7bZlgApMqsVoEUUXpLsBxqXbn5z8VTmoxp55YNDcIA' -O '/content/Wav2Lip/checkpoints/wav2lip_gan.pth'
!wget 'https://iiitaphyd-my.sharepoint.com/:u:/g/personal/radrabha_m_research_iiit_ac_in/Eb3LEzbfuKlJiR600lQWRxgBIY27JZg80f7V9jtMfbNDaQ?e=TBFBVW' -O /content/Wav2Lip/checkpoints/wav2lip.pth
# NOTE(review): installs a wheel straight from a third-party GitHub repository; its purpose is not visible here.
!pip install https://raw.githubusercontent.com/AwaleSajil/ghc/master/ghc-1.0-py3-none-any.whl

# !pip uninstall tensorflow tensorflow-gpu
!cd Wav2Lip && pip install -r requirements.txt

#download pretrained model for face detection
!wget "https://www.adrianbulat.com/downloads/python-fan/s3fd-619a316812.pth" -O "/content/Wav2Lip/face_detection/detection/sfd/s3fd.pth"

# NOTE(review): these two installs are not pinned to a version.
!pip install -q youtube-dl
!pip install ffmpeg-python
from IPython.display import clear_output
# clear_output() wipes the logs of every step above, so a failed download or install
# is no longer visible once "Done" is printed.
clear_output()
print("\nDone")


Done




---



- The implementation below needs the audio and the video to be the same length. Only specific extensions work (mp4 for the video and wav for the audio).
- The target face in input_video.mp4 must be "detectable" in ALL video frames (so no black or blurry frames, etc.).
- wav2lip does not cope well with very long or high-resolution clips (1080p / 30 seconds recommended).
- The 'Wav2Lip' model gives more accurate lip-sync than 'Wav2Lip + GAN', but with inferior visual quality.

The install below is needed to fix an error when loading. It is done here rather than at the beginning because of a conflict in dependencies.

In [3]:
# Runs after the Wav2Lip requirements were installed, so this pinned version is the one left in effect.
!pip install librosa==0.9.1



In [None]:
# Clear all user-defined variables except built-in ones.
# %reset keeps IPython's own entries (In, Out, get_ipython, exit, quit) intact,
# which deleting entries from globals() by hand does not, and it also drops the
# cached cell outputs.
%reset -f

# Perform garbage collection to free up memory
import gc
gc.collect()

In [4]:
#@title Create Wav2Lip video (using wav2lip_gan.pth) GAN
# Runs Wav2Lip's inference script as a separate process on the duration-matched
# video and audio stored in the Drive folder.
!cd Wav2Lip && python inference.py --checkpoint_path checkpoints/wav2lip_gan.pth --face '/content/gdrive/MyDrive/deepfake/input_video.mp4' --audio '/content/gdrive/MyDrive/deepfake/input_audio.wav' --resize_factor 2

# Use --resize_factor 2, otherwise an out-of-memory (OOM) error occurs. resize_factor reduces the
# video resolution; there is a chance of getting better results with lower-resolution videos.
# This might be because the model was trained on low-resolution faces.

Using cuda for inference.
Reading video frames...
Number of frames available for inference: 1130
  return librosa.filters.mel(hp.sample_rate, hp.n_fft, n_mels=hp.num_mels,
(80, 3013)
Length of mel chunks: 1126
  0% 0/9 [00:00<?, ?it/s]
  0% 0/71 [00:00<?, ?it/s][A
  1% 1/71 [00:21<25:29, 21.86s/it][A
  3% 2/71 [00:23<11:28,  9.98s/it][A
  4% 3/71 [00:25<07:04,  6.24s/it][A
  6% 4/71 [00:27<05:01,  4.50s/it][A
  7% 5/71 [00:28<03:48,  3.46s/it][A
  8% 6/71 [00:30<03:04,  2.84s/it][A
 10% 7/71 [00:31<02:35,  2.43s/it][A
 11% 8/71 [00:33<02:16,  2.16s/it][A
 13% 9/71 [00:35<02:02,  1.98s/it][A
 14% 10/71 [00:36<01:54,  1.87s/it][A
 15% 11/71 [00:38<01:48,  1.81s/it][A
 17% 12/71 [00:40<01:47,  1.82s/it][A
 18% 13/71 [00:42<01:45,  1.82s/it][A
 20% 14/71 [00:43<01:39,  1.75s/it][A
 21% 15/71 [00:45<01:36,  1.72s/it][A
 23% 16/71 [00:47<01:33,  1.70s/it][A
 24% 17/71 [00:48<01:30,  1.68s/it][A
 25% 18/71 [00:50<01:28,  1.67s/it][A
 27% 19/71 [00:51<01:27,  1.68s/it][A
 2

In [None]:
#@title Play result video -  50% scaling
from IPython.display import HTML
from base64 import b64encode

# Read the rendered clip inside a context manager so the file handle is closed
# as soon as the bytes are in memory.
with open('/content/Wav2Lip/results/result_voice.mp4', 'rb') as result_file:
    mp4 = result_file.read()

# Embed the video as a base64 data URL so the HTML player needs no separate file server.
data_url = "data:video/mp4;base64," + b64encode(mp4).decode()
HTML(f"""
<video width="50%" height="50%" controls>
      <source src="{data_url}" type="video/mp4">
</video>""")

In [None]:
#@title Download Result.mp4 to your computer
from google.colab import files
# Video written by the Wav2Lip inference cell.
files.download('/content/Wav2Lip/results/result_voice.mp4') #Only after the last cell is executed this will start
# Audio file that was passed to Wav2Lip via --audio.
files.download('/content/gdrive/MyDrive/deepfake/input_audio.wav')

In [None]:
# #@title Delete old uploaded samples & result files, so you can start over again.
# # %rm /content/sample_data/*
# %rm /content/Wav2Lip/results/*
# from IPython.display import clear_output
# clear_output()
# print("\nDone! now press X")

# **Variations to try**


In [None]:
# #@title Create Wav2Lip video using wav2lip.pth
# !cd Wav2Lip && python inference.py --checkpoint_path checkpoints/wav2lip.pth --face "/content/sample_data/input_video.mp4" --audio "/content/sample_data/input_audio.wav"  --resize_factor 2

In [None]:
#@title Use more padding to include the chin region (you can manually edit pads dimensions viewing and changing the code)
# !cd Wav2Lip && python inference.py --checkpoint_path checkpoints/wav2lip_gan.pth --face "/content/sample_data/input_video.mp4" --audio "/content/sample_data/input_audio.wav" --pads 0 20 0 0 --resize_factor 2

In [None]:
# #@title Play result video -  50% scaling
# from IPython.display import HTML
# from base64 import b64encode
# mp4 = open('/content/Wav2Lip/results/result_voice.mp4','rb').read()
# data_url = "data:video/mp4;base64," + b64encode(mp4).decode()
# HTML(f"""
# <video width="50%" height="50%" controls>
#       <source src="{data_url}" type="video/mp4">
# </video>""")

In [None]:
#@title Download Result.mp4 to your computer
# from google.colab import files
# files.download('/content/Wav2Lip/results/result_voice.mp4')