# Install dependencies

In [1]:
!pip install -r requirements.txt

Collecting dlib==19.24.6 (from -r requirements.txt (line 1))
  Downloading dlib-19.24.6.tar.gz (3.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.4/3.4 MB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
  Preparing metadata (setup.py) ... [?25ldone
[?25hCollecting iou==0.1.0 (from -r requirements.txt (line 2))
  Downloading iou-0.1.0.tar.gz (10 kB)
  Preparing metadata (setup.py) ... [?25ldone
Collecting lws==1.2.8 (from -r requirements.txt (line 4))
  Downloading lws-1.2.8.tar.gz (140 kB)
  Preparing metadata (setup.py) ... [?25ldone
[?25hCollecting numpy==2.2.3 (from -r requirements.txt (line 5))
  Using cached numpy-2.2.3-cp310-cp310-macosx_14_0_arm64.whl.metadata (62 kB)
Collecting python_speech_features==0.6 (from -r requirements.txt (line 8))
  Downloading python_speech_features-0.6.tar.gz (5.6 kB)
  Preparing metadata (setup.py) ... [?25ldone
INFO: pip is looking at multiple versions of numba to determine which version is compatible wit

In [7]:
import os

# Read the API key from the environment so it is never hard-coded in the notebook.
# The value is None when the variable is not set.
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")

In [10]:
# Narration script: this exact text is passed to the text-to-speech cells below.
speech_text = """Hello, my name is Mina, an AI Engineer passionate about building intelligent automation solutions.
In this notebook, I worked on generating a video using different technologies. Here's how I did it:

First, I used a text-to-speech model to generate synthesized speech from either transcribed or manually written text.
I also used DALL·E to create an avatar of myself.

Then, I applied the Wav2Lip model to generate an AVI file, ensuring my avatar's lip movements were synchronized with the audio.

Finally, I combined the generated speech with visuals, aligning the audio with the processed images or video clips.
The result is a fully synchronized and visually engaging video.

Thank you for watching!"""


# Reading the text aloud (text-to-speech)

In [11]:
import openai

# Authenticate the module-level OpenAI client with the key loaded earlier.
openai.api_key = OPENAI_API_KEY

# Parameters of the speech request, gathered in one place.
tts_request = {
    "model": "tts-1",
    "voice": "nova",  # Try "alloy", "echo", "fable", "onyx", "nova", or "shimmer"
    "input": speech_text,
}

# Ask the API to synthesize the narration.
response = openai.audio.speech.create(**tts_request)

# Write the returned audio bytes to disk as an MP3 file.
with open("audio_openai.mp3", "wb") as mp3_file:
    mp3_file.write(response.content)


In [5]:
!pip install gtts

Collecting gtts
  Downloading gTTS-2.5.4-py3-none-any.whl.metadata (4.1 kB)
Collecting click<8.2,>=7.1 (from gtts)
  Using cached click-8.1.8-py3-none-any.whl.metadata (2.3 kB)
Downloading gTTS-2.5.4-py3-none-any.whl (29 kB)
Using cached click-8.1.8-py3-none-any.whl (98 kB)
Installing collected packages: click, gtts
Successfully installed click-8.1.8 gtts-2.5.4


In [6]:
from gtts import gTTS

def generate_tts(text, output_path="output.mp3", lang="en"):
    """Synthesize ``text`` with gTTS and save the result as an MP3 file.

    Args:
        text: The text to be spoken.
        output_path: File path handed to ``gTTS.save``.
        lang: Language code forwarded to gTTS. Defaults to ``"en"``, the
            value that was previously hard-coded, so existing calls behave
            exactly as before.

    Returns:
        None. A confirmation line with the output path is printed.
    """
    tts = gTTS(text=text, lang=lang)
    tts.save(output_path)
    print(f"✅ TTS Audio Saved: {output_path}")

generate_tts(speech_text)


✅ TTS Audio Saved: output.mp3


# Generate an AI Avatar (Using DALL·E)

In [52]:
import openai
import requests
from PIL import Image
from io import BytesIO

# Set your OpenAI API key
openai.api_key = OPENAI_API_KEY

# Output file for the avatar. The Wav2Lip inference cell reads "../ai_avatar.png"
# from inside the Wav2Lip directory, so the image must be saved under this name.
AVATAR_PATH = "ai_avatar.png"

# Define the prompt for generating an AI avatar
prompt = "A professional AI-generated avatar of a young Iranian woman ai-engineer with black and medium hair length and a nice smile, high quality, fully realistic, professional headshot, neutral background"

# Generate AI avatar using the latest OpenAI API
response = openai.images.generate(
    model="dall-e-3",  # Use "dall-e-3" for the best quality
    prompt=prompt,
    size="1024x1024",  # dall-e-3 accepts "1024x1024", "1792x1024" or "1024x1792"
    n=1
)

# Get the image URL from the response
image_url = response.data[0].url
print(f"✅ AI Avatar URL: {image_url}")

# Download the image. The timeout keeps the cell from hanging on a stalled
# connection, and raise_for_status() surfaces an expired or invalid URL
# instead of passing an error page to PIL.
image_response = requests.get(image_url, timeout=60)
image_response.raise_for_status()

# Decode the downloaded bytes and save them as a PNG
img = Image.open(BytesIO(image_response.content))
img.save(AVATAR_PATH)
print(f"✅ AI Avatar Saved: {AVATAR_PATH}")


✅ AI Avatar URL: https://oaidalleapiprodscus.blob.core.windows.net/private/org-CKgDztnfYriS4kWl5F9IXwek/user-1hRsKaJsACFBgBIO3fEcabgu/img-lWnZvddYR94A1Q3cSQRCnJ4M.png?st=2025-02-20T02%3A07%3A39Z&se=2025-02-20T04%3A07%3A39Z&sp=r&sv=2024-08-04&sr=b&rscd=inline&rsct=image/png&skoid=d505667d-d6c1-4a0a-bac7-5c84a87759f8&sktid=a48cca56-e6da-484e-a814-9c849652bcb3&skt=2025-02-20T00%3A29%3A30Z&ske=2025-02-21T00%3A29%3A30Z&sks=b&skv=2024-08-04&sig=JQTgeqSw/xd6tkHRLCFtMRXcgK7WpUpBCyH7%2BjNO%2Bcs%3D
✅ AI Avatar Saved: ai_avatar.png


# Animate the Avatar's Lips with Wav2Lip

In [15]:
# Optional Wav2Lip dependencies, intentionally left disabled.
# NOTE(review): the recorded output of this cell shows that pinning numpy==1.23.5
# conflicts with moviepy 2.1.2 (requires numpy>=1.25.0) and numba 0.61.0
# (requires numpy>=1.24,<2.2) — confirm the intended numpy version before re-enabling.
# !pip install numpy scipy opencv-python moviepy tqdm librosa ffmpeg-python
# !pip install torch torchvision torchaudio
# !pip install numpy==1.23.5

Collecting numpy==1.23.5
  Downloading numpy-1.23.5-cp310-cp310-macosx_11_0_arm64.whl.metadata (2.3 kB)
Downloading numpy-1.23.5-cp310-cp310-macosx_11_0_arm64.whl (13.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.4/13.4 MB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Installing collected packages: numpy
  Attempting uninstall: numpy
    Found existing installation: numpy 2.1.3
    Uninstalling numpy-2.1.3:
      Successfully uninstalled numpy-2.1.3
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
moviepy 2.1.2 requires numpy>=1.25.0, but you have numpy 1.23.5 which is incompatible.
numba 0.61.0 requires numpy<2.2,>=1.24, but you have numpy 1.23.5 which is incompatible.[0m[31m
[0mSuccessfully installed numpy-1.23.5


## Convert MP3 to WAV

In [None]:
!ffmpeg -i audio_openai.mp3 -ar 16000 -ac 1 -c:a pcm_s16le audio_openai.wav

In [5]:
# !rm -f Wav2Lip/wav2lip_gan.pth

!git clone https://github.com/Rudrabha/Wav2Lip.git


!cd Wav2Lip && curl -L -o wav2lip_gan.pth "http://github.com/Rudrabha/Wav2Lip/releases/download/v1.0/wav2lip_gan.pth"


fatal: destination path 'Wav2Lip' already exists and is not an empty directory.
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0
100     9  100     9    0     0     11      0 --:--:-- --:--:-- --:--:--     0


In [35]:
# List the contents of the Wav2Lip checkout to verify the clone and the downloaded files.
!ls Wav2Lip

README.md              [34mface_detection[m[m         preprocess.py
[34m__pycache__[m[m            [34mfilelists[m[m              requirements.txt
audio.py               hparams.py             [34mresults[m[m
[34mcheckpoints[m[m            hq_wav2lip_train.py    [34mtemp[m[m
color_syncnet_train.py inference.py           wav2lip_gan.pth
[34mevaluation[m[m             [34mmodels[m[m                 wav2lip_train.py


In [87]:
# Run Wav2Lip's inference.py from inside the checkout.
# Inputs: the checkpoint at Wav2Lip/checkpoints/wav2lip_gan.pth, plus ai_avatar.png and
# audio_openai.wav from the notebook directory (one level above Wav2Lip).
# NOTE(review): --pads 0 20 0 0 is presumably the padding around the detected face
# (top, bottom, left, right) — confirm against `python inference.py --help`.
!cd Wav2Lip && python inference.py --checkpoint_path checkpoints/wav2lip_gan.pth --face "../ai_avatar.png" --audio "../audio_openai.wav" --pads 0 20 0 0

Using cpu for inference.
Reading video frames...
Number of frames available for inference: 1
(80, 3437)
Length of mel chunks: 1071
  0%|                                                     | 0/9 [00:00<?, ?it/s]
  0%|                                                     | 0/1 [00:00<?, ?it/s][A
100%|█████████████████████████████████████████████| 1/1 [00:01<00:00,  1.83s/it][A
Load checkpoint from: checkpoints/wav2lip_gan.pth
Model loaded
100%|█████████████████████████████████████████████| 9/9 [00:30<00:00,  3.37s/it]
ffmpeg version 7.1 Copyright (c) 2000-2024 the FFmpeg developers
  built with Apple clang version 16.0.0 (clang-1600.0.26.4)
  configuration: --prefix=/opt/homebrew/Cellar/ffmpeg/7.1_4 --enable-shared --enable-pthreads --enable-version3 --cc=clang --host-cflags= --host-ldflags='-Wl,-ld_classic' --enable-ffplay --enable-gnutls --enable-gpl --enable-libaom --enable-libaribb24 --enable-libbluray --enable-libdav1d --enable-libharfbuzz --enable-libjxl --enable-libmp3lame --ena

# Merge Video & Voice

In [88]:
!ffmpeg -i  Wav2Lip/temp/result.avi -i audio_openai.wav -c:v libx264 -preset slow -crf 22 -c:a aac -b:a 192k -map 0:v:0 -map 1:a:0 final_ai_video2.mp4

ffmpeg version 7.1 Copyright (c) 2000-2024 the FFmpeg developers
  built with Apple clang version 16.0.0 (clang-1600.0.26.4)
  configuration: --prefix=/opt/homebrew/Cellar/ffmpeg/7.1_4 --enable-shared --enable-pthreads --enable-version3 --cc=clang --host-cflags= --host-ldflags='-Wl,-ld_classic' --enable-ffplay --enable-gnutls --enable-gpl --enable-libaom --enable-libaribb24 --enable-libbluray --enable-libdav1d --enable-libharfbuzz --enable-libjxl --enable-libmp3lame --enable-libopus --enable-librav1e --enable-librist --enable-librubberband --enable-libsnappy --enable-libsrt --enable-libssh --enable-libsvtav1 --enable-libtesseract --enable-libtheora --enable-libvidstab --enable-libvmaf --enable-libvorbis --enable-libvpx --enable-libwebp --enable-libx264 --enable-libx265 --enable-libxml2 --enable-libxvid --enable-lzma --enable-libfontconfig --enable-libfreetype --enable-frei0r --enable-libass --enable-libopencore-amrnb --enable-libopencore-amrwb --enable-libopenjpeg --enable-libspeex --e