# Video translation with speaker diarization and voice cloning

In [1]:
!pip install kaleido
!pip install openai
!pip install tiktoken
!pip install cohere

!pip install -q -U gradio
!pip install git+https://github.com/m-bain/whisperx.git
!pip install -q -U moviepy
!pip install -q -U deepl
!pip install -q -U librosa
!pip install -q -U TTS

Collecting kaleido
  Downloading kaleido-0.2.1-py2.py3-none-manylinux1_x86_64.whl (79.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m79.9/79.9 MB[0m [31m11.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: kaleido
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
lida 0.0.10 requires fastapi, which is not installed.
lida 0.0.10 requires python-multipart, which is not installed.
lida 0.0.10 requires uvicorn, which is not installed.[0m[31m
[0mSuccessfully installed kaleido-0.2.1
Collecting openai
  Downloading openai-1.2.1-py3-none-any.whl (220 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m220.2/220.2 kB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m
Collecting httpx<1,>=0.23.0 (from openai)
  Downloading httpx-0.25.1-py3-none-any.whl (75 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m899.8/899.8 kB[0m [31m7.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.9/1.9 MB[0m [31m14.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.8/10.8 MB[0m [31m47.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m78.5/78.5 kB[0m [31m10.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m289.9/289.9 kB[0m [31m35.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m48.9/48.9 kB[0m [31m7.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m71.1/71.1 kB[0m [31m11.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m90.8/90.8 kB[0m [31m13.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setu

In [2]:
import os
import gradio as gr
import whisperx
import moviepy.editor as mp
import deepl
import torch
import librosa
from TTS.api import TTS

  torchaudio.set_audio_backend("soundfile")
  torchaudio.set_audio_backend("soundfile")


In [7]:
HF_TOKEN = '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'

from google.colab import drive

# This will prompt for authorization.
drive.mount('/content/drive')

# Define function to extract audio from video
def extract_audio(video_path):
  clip = mp.VideoFileClip(video_path)
  audio_path = os.path.splitext(video_path)[0] + ".wav"
  clip.audio.write_audiofile(audio_path)
  return audio_path

# Define function to perform speech diarization
def speech_diarization(audio_path, hf_token):
  device = "cuda"
  batch_size = 16
  compute_type = "float16"
  model = whisperx.load_model("large-v2", device, compute_type=compute_type)

  audio = whisperx.load_audio(audio_path)
  result = model.transcribe(audio, batch_size=batch_size)
  print(result["segments"])

  # delete model if low on GPU resources
  import gc; gc.collect(); torch.cuda.empty_cache(); del model

  # 2. Align whisper output
  model_a, metadata = whisperx.load_align_model(language_code=result["language"], device=device)
  result = whisperx.align(result["segments"], model_a, metadata, audio, device, return_char_alignments=False)

  print(result["segments"]) # after alignment

  # delete model if low on GPU resources
  import gc; gc.collect(); torch.cuda.empty_cache(); del model_a

  # 3. Assign speaker labels
  diarize_model = whisperx.DiarizationPipeline(model_name='pyannote/speaker-diarization@2.1', use_auth_token=hf_token, device=device)

  # add min/max number of speakers if known
  diarize_segments = diarize_model(audio)
  # diarize_model(audio, min_speakers=min_speakers, max_speakers=max_speakers)

  result = whisperx.assign_word_speakers(diarize_segments, result)
  print(diarize_segments)
  print(result["segments"])

  return result["segments"]

def speaker_voice_cuts(transcription, audio_path):
  # Find 3 uninterrupted snippets for each speaker in word transcriptions
  #

# Define function to perform text translation
def text_translation(text, target_language):
    translation = deepl.translate(text, target_language)
    return translation

# Define function to perform voice cloning
def voice_cloning(text):
  device = "cuda"
  tts = TTS("tts_models/multilingual/multi-dataset/xtts_v2").to(device)

  return audio

# Define function to adjust voice pace
def adjust_voice_pace(audio_path, target_duration):
    y, sr = librosa.load(audio_path, sr=None)
    duration = librosa.get_duration(y=y, sr=sr)
    tempo = duration / target_duration
    y_stretched = librosa.effects.time_stretch(y, tempo)
    librosa.output.write_wav(audio_path, y_stretched, sr)
    return audio_path

# Define function to perform video translation
def video_translation(video_path, target_language, hf_token, speaker_model_path=None):

  audio_path = extract_audio(video_path)

  transcription = speech_diarization(audio_path, hf_token)

  speakers_voice_cuts =

  translated_transcription = []
  for text in transcription:
    translated_text = text_translation(text, target_language)
    translated_transcription.append(translated_text)

  cloned_audio_paths = []
  for i, text in enumerate(translated_transcription):
    speaker_model_path_i = speaker_model_path[i] if len(speaker_model_path) > i else speaker_model_path[-1]
    cloned_audio_path = voice_cloning(text, speaker_model_path_i)
    cloned_audio_paths.append(cloned_audio_path)

  target_duration = mp.VideoFileClip(video_path).duration
  for i, audio_path in enumerate(cloned_audio_paths):
    cloned_audio_paths[i] = adjust_voice_pace(audio_path, target_duration)

  video = mp.VideoFileClip(video_path)
  audio_clips = [mp.AudioFileClip(audio_path) for audio_path in cloned_audio_paths]
  audio = mp.concatenate_audioclips(audio_clips)
  video_with_new_audio = video.set_audio(audio)

  video_with_new_audio_path = os.path.splitext(video_path)[0] + "_" + target_language + ".mp4"
  video_with_new_audio.write_videofile(video_with_new_audio_path)

  return video_with_new_audio_path

video_translation('/content/drive/MyDrive/Data/fridman-harris-demo-6min.mp4', 'ru', HF_TOKEN)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
MoviePy - Writing audio in /content/drive/MyDrive/Data/fridman-harris-demo-6min.wav




MoviePy - Done.
No language specified, language will be first be detected for each audio file (increases inference time).


INFO:pytorch_lightning.utilities.migration.utils:Lightning automatically upgraded your loaded checkpoint from v1.5.4 to v2.1.1. To apply the upgrade to your files permanently, run `python -m pytorch_lightning.utilities.upgrade_checkpoint ../root/.cache/torch/whisperx-vad-segmentation.bin`


Model was trained with pyannote.audio 0.0.1, yours is 3.0.1. Bad things might happen unless you revert pyannote.audio to 0.x.
Model was trained with torch 1.10.0+cu102, yours is 2.1.0+cu118. Bad things might happen unless you revert torch to 1.x.
Detected language: en (1.00) in first 30s of audio...
[{'text': " Is there a difference between intellectually knowing free will is an illusion and really experiencing it? What's the longest you've been able to experience, escape the illusion of free will?", 'start': 3.029, 'end': 15.469}, {'text': " Well, it's always obvious to me when I pay attention. Whenever I'm mindful, the term of jargon in the Buddhist and increasingly outside the Buddhist context is mindfulness, right? But there are sort of different levels of mindfulness and there's different", 'start': 16.169, 'end': 35.401}, {'text': " degrees of insight into this. But yes, I mean, what I'm calling evidence of lack of free will and lack of, you know, lack of the self. I've got two s

Downloading pytorch_model.bin:   0%|          | 0.00/17.7M [00:00<?, ?B/s]

Downloading (…)/2022.07/config.yaml:   0%|          | 0.00/318 [00:00<?, ?B/s]

INFO:pytorch_lightning.utilities.migration.utils:Lightning automatically upgraded your loaded checkpoint from v1.5.4 to v2.1.1. To apply the upgrade to your files permanently, run `python -m pytorch_lightning.utilities.upgrade_checkpoint ../root/.cache/torch/pyannote/models--pyannote--segmentation/snapshots/c4c8ceafcbb3a7a280c2d357aee9fbc9b0be7f9b/pytorch_model.bin`


Model was trained with pyannote.audio 0.0.1, yours is 3.0.1. Bad things might happen unless you revert pyannote.audio to 0.x.
Model was trained with torch 1.10.0+cu102, yours is 2.1.0+cu118. Bad things might happen unless you revert torch to 1.x.


Downloading (…)ain/hyperparams.yaml:   0%|          | 0.00/1.92k [00:00<?, ?B/s]

Downloading embedding_model.ckpt:   0%|          | 0.00/83.3M [00:00<?, ?B/s]

Downloading (…)an_var_norm_emb.ckpt:   0%|          | 0.00/1.92k [00:00<?, ?B/s]

Downloading classifier.ckpt:   0%|          | 0.00/5.53M [00:00<?, ?B/s]

Downloading (…)in/label_encoder.txt:   0%|          | 0.00/129k [00:00<?, ?B/s]

                              segment label     speaker       start  \
0   [ 00:00:03.029 -->  00:00:15.503]     X  SPEAKER_01    3.029010   
1   [ 00:00:08.967 -->  00:00:09.291]     A  SPEAKER_00    8.967577   
2   [ 00:00:11.356 -->  00:00:11.424]     B  SPEAKER_00   11.356655   
3   [ 00:00:16.168 -->  00:00:35.349]     C  SPEAKER_00   16.168942   
4   [ 00:00:36.851 -->  00:00:55.401]     D  SPEAKER_00   36.851536   
5   [ 00:00:42.943 -->  00:00:42.960]     Y  SPEAKER_01   42.943686   
6   [ 00:00:53.421 -->  00:00:53.711]     Z  SPEAKER_01   53.421502   
7   [ 00:00:56.663 -->  00:02:50.708]     E  SPEAKER_00   56.663823   
8   [ 00:02:52.465 -->  00:03:00.486]     F  SPEAKER_00  172.465870   
9   [ 00:03:04.018 -->  00:03:15.708]     G  SPEAKER_00  184.018771   
10  [ 00:03:17.738 -->  00:03:27.807]     H  SPEAKER_00  197.738908   
11  [ 00:03:31.629 -->  00:03:43.899]     I  SPEAKER_00  211.629693   
12  [ 00:03:31.749 -->  00:03:32.858]    AA  SPEAKER_01  211.749147   
13  [ 

TypeError: ignored

In [None]:


def translate_video(video_path, target_language, speaker_model_path):
    try:
      video_with_new_audio_path = video_translation(video_path, target_language, speaker_model_path)
    except Exception as e:
      print(f"An error occurred: {e}")
    return gr.components.Video(video_with_new_audio_path)


inputs = [
    gr.Video(label="Select a video file"),
    gr.Dropdown(["uk", "ru"], label="Select target language")
]

outputs = gr.Video(label="Translated video")
gr.Interface(fn=translate_video, inputs=inputs, outputs=outputs, title="AI Video Translation").launch()

