Automatic DeepFake Creation (Tortoise voice cloning + wav2lip)

Note: A video is required. If an audio file is also provided, the voice will be cloned from that audio; otherwise it is cloned from the video's own audio track. The video should show a face that is looking at the camera at all times.

## **The video (and optional audio) file must be in Google Drive, in a folder named 'deepfake'. No other files should exist in that folder.**

wav2lip code taken from https://github.com/snehitvaddi/Deepfake-using-Wave2Lip

In [None]:
#@title Upload video.mp4 (video to overlay voice) & voice.mp3 (voice to clone) files - Should be mp3 and mp4, having any name
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [None]:
cd gdrive/MyDrive/deepfake

/content/gdrive/MyDrive/deepfake


In [None]:
base_path='/content/gdrive/MyDrive/deepfake' #Specify path of video/audio

In [None]:
#@title Install TTS, pydub to create folders with audio chunks, and moviepy to modify duration of audio/video
!pip install -q pydub==0.25.1 TTS==0.22.0 moviepy==1.0.3

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/253.7 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━[0m [32m245.8/253.7 kB[0m [31m7.3 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m253.7/253.7 kB[0m [31m5.9 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
#@title English text that we want to read with the cloned voice - This will be inserted in the video too
text_to_read="Joining two modalities results in a surprising increase in generalization! \
What would happen if we combined them all?" #Text to read - Greek text will result in error - will try to spell each letter.

In [None]:
#@title Rename audio and video files to be used below
#Prompt: the user selects and uploads to google colab one audio and one video file. Rename the audio file to 'input_voice.mp3' and the video to 'input_video.mp4'
import os

# Normalise the uploaded file names inside base_path: the .mp3 becomes
# 'input_voice.mp3' and the .mp4 becomes 'video_full.mp4'.
# Both the listing and the rename targets use base_path, so the result does
# not depend on the current working directory.
for file in sorted(os.listdir(base_path)):
    filename = os.path.join(base_path, file)
    if not os.path.isfile(filename):
        continue  # skip sub-folders

    # Compare extensions case-insensitively so e.g. 'clip.MP4' is accepted too
    extension = os.path.splitext(file)[1].lower()
    if extension == '.mp3':
        new_filename = os.path.join(base_path, 'input_voice.mp3')
    elif extension == '.mp4':
        new_filename = os.path.join(base_path, 'video_full.mp4')
    else:
        continue

    if filename == new_filename:
        continue  # already has the expected name
    if os.path.exists(new_filename):
        # os.rename would silently replace the existing file; keep the input
        # that is already prepared (matters when this cell is re-run after
        # other .mp4/.mp3 files have been written to the folder).
        print(f"Skipping '{file}': '{new_filename}' already exists")
        continue
    os.rename(filename, new_filename)

#If only video is provided:
from moviepy.editor import VideoFileClip

def extract_audio(input_video, output_audio):
    """Write the audio track of a video file to a separate audio file.

    Args:
        input_video: Path of the video file to read.
        output_audio: Path of the audio file to write.

    Raises:
        ValueError: If the video has no audio track.
    """
    video = VideoFileClip(input_video)
    try:
        audio = video.audio
        if audio is None:
            # Fail with a clear message instead of an AttributeError on None
            raise ValueError(f"'{input_video}' has no audio track to extract")
        audio.write_audiofile(output_audio)
    finally:
        # Release the reader even when writing fails
        video.close()

# File names used when the voice has to be taken from the video itself
input_video, output_audio = 'video_full.mp4', 'input_voice.mp3'

# mp3_check is 1 when any entry of the working directory (joined onto
# base_path) contains '.mp3' in its path, otherwise 0
mp3_check = int(any(
    '.mp3' in os.path.join(base_path, entry)
    for entry in os.listdir(os.getcwd())
))

# No mp3 found: fall back to the audio track of the video
if not mp3_check:
    print("Voice will be cloned from video")
    extract_audio(input_video, output_audio)

In [None]:
#@title Create folder with 10 secs chunks of audio to be used as input in Tortoise
from pydub import AudioSegment

def split_audio_to_clips(audio_file, output_dir, clip_length=10000, sample_rate=22050):
    """Split an MP3 file into consecutive fixed-length WAV clips.

    Clips are written to ``output_dir`` as ``1.wav``, ``2.wav``, ... Each clip
    is converted to a 2-byte sample width and resampled to ``sample_rate``.
    A trailing remainder shorter than ``clip_length`` is dropped, except when
    the whole recording is shorter than ``clip_length``: it is then saved as a
    single clip so that at least one reference clip exists.

    Args:
        audio_file: Path of the MP3 file to split.
        output_dir: Directory that receives the clips (created if missing).
        clip_length: Clip duration in milliseconds.
        sample_rate: Frame rate of the exported clips, in Hz.

    Raises:
        ValueError: If ``clip_length`` is not positive.
    """
    if clip_length <= 0:
        raise ValueError("clip_length must be a positive number of milliseconds")

    # Load the audio file (len(audio) is its duration in milliseconds)
    audio = AudioSegment.from_mp3(audio_file)

    # Create the output directory if it doesn't exist
    os.makedirs(output_dir, exist_ok=True)

    # Number of full-length clips; keep one (short) clip for short recordings
    num_clips = len(audio) // clip_length
    if num_clips == 0 and len(audio) > 0:
        num_clips = 1

    # Split the audio into clips and save them as WAV files
    for i in range(num_clips):
        start_time = i * clip_length
        end_time = start_time + clip_length
        clip = audio[start_time:end_time]

        # 2 bytes per sample, resampled to the requested rate
        clip = clip.set_sample_width(2).set_frame_rate(sample_rate)

        # Save the clip as a WAV file
        clip.export(os.path.join(output_dir, f"{i+1}.wav"), format="wav")

    print(f"{num_clips} clips saved in '{output_dir}'.")

if __name__ == "__main__":
    # Recording the voice is cloned from (inside base_path)
    input_audio_file = f"{base_path}/input_voice.mp3"

    # Folder inside base_path that receives the WAV clips
    subdirectory_name = f"{base_path}/voice"

    split_audio_to_clips(audio_file=input_audio_file, output_dir=subdirectory_name)

121 clips saved in '/content/gdrive/MyDrive/deepfake/voice'.


In [None]:
#@title Download and run TTS tortoise model
from TTS.api import TTS
tts = TTS("tts_models/en/multi-dataset/tortoise-v2")

 > tts_models/en/multi-dataset/tortoise-v2 is already downloaded.
 > Using model: tortoise


In [None]:
#Tortoise Fastest Inference from script (~2min in Colab) - Can also be downloaded from https://huggingface.co/jbetker/tortoise-tts-v2/tree/main
import torch
import torchaudio

from TTS.tts.configs.tortoise_config import TortoiseConfig
from TTS.tts.models.tortoise import Tortoise

# Build the model from the default config and load the checkpoint from the TTS download directory
config = TortoiseConfig()
model = Tortoise.init_from_config(config)
model.load_checkpoint(config, checkpoint_dir="/root/.local/share/tts/tts_models--en--multi-dataset--tortoise-v2", eval=True) #Deepspeed doesn't work
model.cuda()

# cloning a speaker: speaker_id "voice" is looked up under base_path
output_dict = model.synthesize(text_to_read, config, speaker_id="voice", voice_dirs=base_path)

#Save result
# as_tensor() accepts both tensors and arrays without copy-constructing, which
# avoids the warning torch.tensor() emits when given an existing tensor;
# detach()/cpu() make sure a plain CPU tensor is handed to torchaudio.
wav = torch.as_tensor(output_dict["wav"]).detach().cpu()
torchaudio.save("tortoise_v2_script.wav", wav.squeeze(0), 24000)

Note: you can still call torch.view_as_real on the complex output to recover the old return format. (Triggered internally at ../aten/src/ATen/native/SpectralOps.cpp:863.)
  return _VF.stft(input, n_fft, hop_length, win_length, window,  # type: ignore[attr-defined]



Generating autoregressive samples..


100%|██████████| 16/16 [01:21<00:00,  5.12s/it]
100%|██████████| 16/16 [00:01<00:00, 11.50it/s]


Transforming autoregressive outputs into audio..


  0%|          | 0/100 [00:00<?, ?it/s]

  torchaudio.save("tortoise_v2_script.wav", torch.tensor(output_dict["wav"]).squeeze(0), 24000)



In [None]:
# Download the synthesized speech file from base_path to the local machine.
# NOTE(review): the synthesis cell saves 'tortoise_v2_script.wav' with a relative
# path, so this assumes the working directory was base_path at that point.
from google.colab import files
files.download(base_path+'/tortoise_v2_script.wav')

In [None]:
#@title Confirm that audio same length as video. If not, keep the smallest one and cut the other or cut them both to 20secs. This is needed for wav2lip to work
#ChatGPT Prompt: Create python code that compares an audio.wav with a video.mp4 files and if the duration of one is bigger than the other,
# it cuts the largest one to be the same duration as the smallest. If any of them is bigger than 20secs then raise an error

#Needed to avoid errors with encoding
import locale
locale.getpreferredencoding = lambda: "UTF-8"

from moviepy.editor import VideoFileClip, AudioFileClip

def compare_audio_video_duration(audio_file, video_file):
    """Create 'input_audio.wav' and 'input_video.mp4' with matching durations.

    The longer of the two inputs is trimmed to the duration of the shorter
    one; the other input is copied unchanged. When both already have the same
    duration, both are copied. Outputs are written to the current working
    directory and the input files are left in place, so the cell can be
    re-run.

    No maximum duration is enforced here; the notes in this notebook suggest
    keeping clips to roughly 20-30 seconds for Wav2Lip.

    Args:
        audio_file: Path of the audio file.
        video_file: Path of the video file.
    """
    import shutil

    def _copy_if_different(source, target):
        # shutil.copyfile raises when source and target are the same file
        if os.path.abspath(source) != os.path.abspath(target):
            shutil.copyfile(source, target)

    audio = AudioFileClip(audio_file)
    video = VideoFileClip(video_file)
    try:
        audio_duration = audio.duration
        video_duration = video.duration
        min_duration = min(audio_duration, video_duration)

        # Video: trim only when it is the longer one
        if video_duration > min_duration:
            video.subclip(0, min_duration).write_videofile('input_video.mp4')
        else:
            _copy_if_different(video_file, 'input_video.mp4')

        # Audio: trim only when it is the longer one
        if audio_duration > min_duration:
            audio.subclip(0, min_duration).write_audiofile('input_audio.wav')
        else:
            _copy_if_different(audio_file, 'input_audio.wav')
    finally:
        # Release the readers even when writing fails
        audio.close()
        video.close()

# Example usage
compare_audio_video_duration("tortoise_v2_script.wav", "video_full.mp4") #input_voice.mp3

Moviepy - Building video input_video.mp4.
MoviePy - Writing audio in input_videoTEMP_MPY_wvf_snd.mp3




MoviePy - Done.
Moviepy - Writing video input_video.mp4





Moviepy - Done !
Moviepy - video ready input_video.mp4




---



In [None]:
cd /content

In [None]:
#@title <h1>Install Wav2Lip</h1>
#@markdown * Install dependencies
#@markdown * Download models
# !rm -rf /content/sample_data
# !mkdir /content/sample_data

# Clone the Wav2Lip code into ./Wav2Lip. The downloads below use absolute
# /content/Wav2Lip/... paths, so this cell expects the working directory to be /content.
!git clone https://github.com/zabique/Wav2Lip

#download the pretrained model
# Two checkpoints are saved: wav2lip_gan.pth and wav2lip.pth
!wget 'https://iiitaphyd-my.sharepoint.com/personal/radrabha_m_research_iiit_ac_in/_layouts/15/download.aspx?share=EdjI7bZlgApMqsVoEUUXpLsBxqXbn5z8VTmoxp55YNDcIA' -O '/content/Wav2Lip/checkpoints/wav2lip_gan.pth'
!wget 'https://iiitaphyd-my.sharepoint.com/:u:/g/personal/radrabha_m_research_iiit_ac_in/Eb3LEzbfuKlJiR600lQWRxgBIY27JZg80f7V9jtMfbNDaQ?e=TBFBVW' -O /content/Wav2Lip/checkpoints/wav2lip.pth
!pip install https://raw.githubusercontent.com/AwaleSajil/ghc/master/ghc-1.0-py3-none-any.whl

# !pip uninstall tensorflow tensorflow-gpu
# Install the Python requirements listed by the cloned repository
!cd Wav2Lip && pip install -r requirements.txt

#download pretrained model for face detection
!wget "https://www.adrianbulat.com/downloads/python-fan/s3fd-619a316812.pth" -O "/content/Wav2Lip/face_detection/detection/sfd/s3fd.pth"

# NOTE(review): these two installs are not version-pinned - consider pinning for reproducibility
!pip install -q youtube-dl
!pip install ffmpeg-python
# clear_output() removes everything printed above, including any install or
# download errors, before "Done" is shown.
from IPython.display import clear_output
clear_output()
print("\nDone")


Done




---



- Below implementation needs both audio and video to be of same length. Only specific extensions work (mp4 and wav)
- Target face in the input_video.mp4, must be "detectable" in ALL videoframes (So no black or blurry frames etc)
- wav2lip does not like very long and high res clips (1080p/30seconds max)
- 'Wav2Lip' model gives highly accurate lip-sync compared to 'Wav2Lip + GAN' but with inferior visual quality compared to the latter

Below is needed to fix an error in loading - Not added in the beginning due to conflict in dependencies

In [None]:
!pip install librosa==0.9.1

Collecting librosa==0.9.1
  Using cached librosa-0.9.1-py3-none-any.whl (213 kB)
Installing collected packages: librosa
  Attempting uninstall: librosa
    Found existing installation: librosa 0.10.0
    Uninstalling librosa-0.10.0:
      Successfully uninstalled librosa-0.10.0
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
tts 0.22.0 requires librosa>=0.10.0, but you have librosa 0.9.1 which is incompatible.[0m[31m
[0mSuccessfully installed librosa-0.9.1


In [None]:
#@title Create Wav2Lip video (using wav2lip_gan.pth) GAN
!cd Wav2Lip && python inference.py --checkpoint_path checkpoints/wav2lip_gan.pth --face '/content/gdrive/MyDrive/deepfake/input_video.mp4' --audio '/content/gdrive/MyDrive/deepfake/input_audio.wav' --resize_factor 2

#Use --resize_factor 2 otherwise OOM error. Use resize_factor to reduce the video resolution, as there is a chance you might get better results for lower resolution videos.
# This might be related with the model which might have been trained on low resolution faces.

Using cuda for inference.
Reading video frames...
Number of frames available for inference: 207
  return librosa.filters.mel(hp.sample_rate, hp.n_fft, n_mels=hp.num_mels,
(80, 550)
Length of mel chunks: 202
  0% 0/2 [00:00<?, ?it/s]Downloading: "https://www.adrianbulat.com/downloads/python-fan/s3fd-619a316812.pth" to /root/.cache/torch/hub/checkpoints/s3fd-619a316812.pth

  0% 0.00/85.7M [00:00<?, ?B/s][A
  0% 32.0k/85.7M [00:00<06:36, 226kB/s][A
  0% 80.0k/85.7M [00:00<05:07, 292kB/s][A
  0% 160k/85.7M [00:00<03:35, 416kB/s] [A
  0% 336k/85.7M [00:00<02:00, 741kB/s][A
  1% 688k/85.7M [00:00<01:05, 1.37MB/s][A
  1% 1.25M/85.7M [00:00<00:38, 2.32MB/s][A
  3% 2.53M/85.7M [00:01<00:19, 4.59MB/s][A
  6% 4.77M/85.7M [00:01<00:10, 8.26MB/s][A
  8% 6.84M/85.7M [00:01<00:07, 10.4MB/s][A
 11% 9.12M/85.7M [00:01<00:06, 12.3MB/s][A
 14% 11.6M/85.7M [00:01<00:05, 14.0MB/s][A
 17% 14.2M/85.7M [00:01<00:04, 15.4MB/s][A
 20% 16.7M/85.7M [00:01<00:04, 16.3MB/s][A
 23% 19.3M/85.7M [00:02<

In [None]:
#@title Play result video -  50% scaling
from IPython.display import HTML
from base64 import b64encode

# Read the result inside a context manager so the file handle is closed promptly
with open('/content/Wav2Lip/results/result_voice.mp4','rb') as result_file:
    mp4 = result_file.read()

# Embed the video in the page as a base64 data URL
data_url = "data:video/mp4;base64," + b64encode(mp4).decode()
HTML(f"""
<video width="50%" height="50%" controls>
      <source src="{data_url}" type="video/mp4">
</video>""")

In [None]:
#@title Download Result.mp4 to your computer
# Imported here so this cell does not depend on an earlier cell having been run
from google.colab import files
files.download('/content/Wav2Lip/results/result_voice.mp4') #Only after the last cell is executed this will start

In [None]:
# #@title Delete old uploaded samples & result files, so you can start over again.
# # %rm /content/sample_data/*
# %rm /content/Wav2Lip/results/*
# from IPython.display import clear_output
# clear_output()
# print("\nDone! now press X")

# **Variations to try**


In [None]:
# #@title Create Wav2Lip video using wav2lip.pth
# !cd Wav2Lip && python inference.py --checkpoint_path checkpoints/wav2lip.pth --face "/content/sample_data/input_video.mp4" --audio "/content/sample_data/input_audio.wav"  --resize_factor 2

In [None]:
#@title Use more padding to include the chin region (you can manually edit pads dimensions viewing and changing the code)
# !cd Wav2Lip && python inference.py --checkpoint_path checkpoints/wav2lip_gan.pth --face "/content/sample_data/input_video.mp4" --audio "/content/sample_data/input_audio.wav" --pads 0 20 0 0 --resize_factor 2

In [None]:
# #@title Play result video -  50% scaling
# from IPython.display import HTML
# from base64 import b64encode
# mp4 = open('/content/Wav2Lip/results/result_voice.mp4','rb').read()
# data_url = "data:video/mp4;base64," + b64encode(mp4).decode()
# HTML(f"""
# <video width="50%" height="50%" controls>
#       <source src="{data_url}" type="video/mp4">
# </video>""")

In [None]:
#@title Download Result.mp4 to your computer
# from google.colab import files
# files.download('/content/Wav2Lip/results/result_voice.mp4')