<a href="https://colab.research.google.com/github/niceGuangjia/ez-wav2lip/blob/main/Easy_Wav2Lip_v8.3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Welcome to my Easy Wav2Lip colab!

My goal is to make lip-syncing with this tool easy, fast, and great-looking!

Please see the GitHub README for instructions: [https://github.com/anothermartz/Easy-Wav2Lip](https://github.com/anothermartz/Easy-Wav2Lip?tab=readme-ov-file#best-practices)

In [None]:
version = 'v8.3'
#@title <h1>Step 1: Setup "Easy-Wav2Lip"</h1> With one button: it's really that easy!
#@markdown 👈 Click that little circle play button first - it will ask for Google Drive access: <br>
#@markdown > Accept if your files are on Google Drive (recommended).
#@markdown <br> Alternatively, you can click deny and upload files manually, but this is slower.

# check if already installed
import os
import sys
if os.path.exists('installed.txt'):
    with open('last_file.txt', 'r') as file:
        last_file = file.readline()
    if last_file == version:
        sys.exit('Easy-Wav2Lip '+version+' has already been run on this instance!')

# check GPU is enabled
print('checking for GPU')
import torch
if not torch.cuda.is_available():
    sys.exit('No GPU in runtime. Please go to the "Runtime" menu, "Change runtime type" and select "GPU".')

# prompt to mount google drive
print('requesting Google Drive access')
try:
    from google.colab import drive
    drive.mount('/content/drive')
except:
    print("google drive not linked")

# start timer
import time
start_time = time.time()

# clone git
giturl = 'https://github.com/anothermartz/Easy-Wav2Lip.git'
!git clone -b {version} {giturl}
%cd 'Easy-Wav2Lip'
working_directory = os.getcwd()
!mkdir 'face_alignment' 'temp'

# install prerequisites
print('installing batch_face')
import warnings
warnings.filterwarnings("ignore", category=UserWarning,
                        module='torchvision.transforms.functional_tensor')
!pip install batch_face --quiet
!pip install basicsr==1.4.2 --quiet

print('fixing basicsr degradations.py')
import site
site_packages = site.getsitepackages()[0]
!cp /content/Easy-Wav2Lip/degradations.py {site_packages}/basicsr/data/degradations.py

print('installing gfpgan')
!pip install gfpgan --quiet

!python install.py

from IPython.display import clear_output
clear_output()
print("Installation complete, move to Step 2!")

# end timer
elapsed_time = time.time() - start_time
from easy_functions import format_time
print(f"Execution time: {format_time(elapsed_time)}")


checking for GPU
requesting Google Drive access
Mounted at /content/drive
Cloning into 'Easy-Wav2Lip'...
remote: Enumerating objects: 1715, done.[K
remote: Counting objects: 100% (511/511), done.[K
remote: Compressing objects: 100% (200/200), done.[K
remote: Total 1715 (delta 455), reused 311 (delta 311), pack-reused 1204 (from 2)[K
Receiving objects: 100% (1715/1715), 20.30 MiB | 18.61 MiB/s, done.
Resolving deltas: 100% (839/839), done.
/content/Easy-Wav2Lip
installing batch_face
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m30.6/30.6 MB[0m [31m73.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m123.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m98.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━

In [13]:
# ====================== Step 2: Inputs & TTS (automatic duration alignment) ======================

# Installs edge-tts, which is imported further down when use_tts is enabled.
# Comment this line out if the package is already installed.
!pip install -q edge-tts

import os
import sys
import subprocess
import configparser

# ====================== Check that Step 1 has been run ======================
if not os.path.exists('installed.txt'):
    sys.exit('Step 1 has not been run in this instance! Please run Step 1 each time you disconnect from a runtime.')

# ====================== 1. User inputs ======================
# NOTE: the trailing `# @param ...` annotations are Colab form-field markers;
# keep them intact when editing the values.

# Path to the video file (copy and paste it from the file browser)
video_file = "/content/drive/MyDrive/video.mp4"  # @param {type:"string"}

# Path to an existing audio file, if you have one; otherwise leave it empty
# (used together with the TTS option below)
vocal_file = ""  # @param {type:"string"}

# Enable TTS? When True, audio is synthesized from tts_text below and
# replaces vocal_file
use_tts = False  # @param {type:"boolean"}

# Text to synthesize; only used when use_tts=True
tts_text = "你好，这是通过 TTS 生成并对齐时长的示例语音。"  # @param {type:"string"}

# ====================== 2. Quality and resolution options ======================
quality = "Enhanced"         # @param ["Fast", "Improved", "Enhanced"]
output_height = "full resolution"  # @param ["half resolution", "full resolution", "480"] {allow-input: true}
use_previous_tracking_data = True   # @param {type:"boolean"}

# ====================== 3. Easy-Wav2Lip parameters ======================
wav2lip_version = "Wav2Lip"  # @param ["Wav2Lip", "Wav2Lip_GAN"]
nosmooth = False            # @param {type:"boolean"}

# Padding (Up, Down, Left, Right)
U = 10  # @param {type:"slider", min:-100, max:100, step:1}
D = 10  # @param {type:"slider", min:-100, max:100, step:1}
L = 0   # @param {type:"slider", min:-100, max:100, step:1}
R = 0   # @param {type:"slider", min:-100, max:100, step:1}

# Mask settings
size = 1.3         # @param {type:"slider", min:1, max:6, step:0.1}
feathering = 1     # @param {type:"slider", min:0, max:3, step:1}
mouth_tracking = True   # @param {type:"boolean"}
debug_mask = False      # @param {type:"boolean"}

# Other options
batch_process = False            # @param {type:"boolean"}
output_suffix = "_Easy-Wav2Lip"  # @param {type:"string"}
include_settings_in_suffix = False  # @param {type:"boolean"}
preview_input = False   # @param {type:"boolean"}
preview_settings = False  # @param {type:"boolean"}
frame_to_preview = 100     # @param {type:"integer"}

# ========================================================================

# ====================== 4. Helper: read the duration of an audio/video file ======================

def get_media_duration(path):
    """
    Return the duration of a media file in seconds, as reported by ffprobe.

    Args:
        path: Location of the audio or video file to inspect.

    Returns:
        The duration as a float, or None when ffprobe cannot be run or its
        output cannot be parsed as a number (the reason is printed).
    """
    ffprobe_cmd = [
        'ffprobe',
        '-v', 'error',                                 # only print errors
        '-show_entries', 'format=duration',            # only report the duration entry
        '-of', 'default=noprint_wrappers=1:nokey=1',   # print the bare value only
        path,
    ]
    try:
        probe = subprocess.run(ffprobe_cmd, capture_output=True)
        return float(probe.stdout.decode().strip())
    except Exception as e:
        print(f"❌ 获取时长失败: {e}")
        return None

# ====================== 5. If TTS is enabled, synthesize a temporary track and align its duration ======================

if use_tts and tts_text.strip():
    print("🔊 开始 TTS 合成（初始）……")
    import asyncio
    import concurrent.futures
    import edge_tts  # installed by the pip line at the top of this cell

    temp_tts_path = "temp_tts.wav"       # raw synthesis output
    final_tts_path = "tts_output.wav"    # duration-aligned output

    async def generate_tts(text, save_path):
        """Synthesize `text` with the zh-CN-XiaoxiaoNeural voice and save it to `save_path`."""
        communicator = edge_tts.Communicate(text, "zh-CN-XiaoxiaoNeural")
        await communicator.save(save_path)

    def run_coroutine_blocking(coroutine_factory):
        """Run the coroutine built by `coroutine_factory` to completion and return its result.

        asyncio.run() raises RuntimeError when an event loop is already running
        in the current thread, which is the normal situation inside a
        Jupyter/Colab kernel. In that case the coroutine is executed with its
        own event loop on a short-lived worker thread instead.
        """
        try:
            asyncio.get_running_loop()
        except RuntimeError:
            # no loop running in this thread (plain Python): asyncio.run is fine
            return asyncio.run(coroutine_factory())
        with concurrent.futures.ThreadPoolExecutor(max_workers=1) as pool:
            return pool.submit(lambda: asyncio.run(coroutine_factory())).result()

    # 5.1 Generate the initial TTS audio
    try:
        run_coroutine_blocking(lambda: generate_tts(tts_text, temp_tts_path))
        print(f"✅ 初始 TTS 合成完成，已保存到：{temp_tts_path}")
    except Exception as e:
        print("❌ TTS 合成失败：", e)
        sys.exit("请检查网络或 TTS 配置。")

    # 5.2 Measure the video and the freshly synthesized audio
    print("⏱️ 正在获取视频与音频时长……")
    video_dur = get_media_duration(video_file)
    tts_dur = get_media_duration(temp_tts_path)

    if video_dur is None or tts_dur is None:
        sys.exit("❌ 无法获取时长，终止。")

    print(f"   ▶️ 视频时长: {video_dur:.2f} 秒")
    print(f"   ▶️ 初始 TTS 时长: {tts_dur:.2f} 秒")

    # 5.3 Compute the atempo factor.
    # atempo is a playback-speed multiplier: output duration = input duration / tempo.
    # To make the speech last exactly as long as the video we therefore need
    # tempo = tts_dur / video_dur (NOT video_dur / tts_dur, which would push
    # the duration even further away from the target).
    # Both durations must be positive, which also rules out a division by zero.
    if video_dur <= 0 or tts_dur <= 0:
        sys.exit("❌ 计算 atempo 因子无效。")
    factor = tts_dur / video_dur

    # A single atempo stage is kept within [0.5, 2.0]; a factor outside that
    # range is split into a chain of stages whose product is the factor, e.g.
    #   factor = 4    -> atempo=2.0,atempo=2.0
    #   factor = 0.25 -> atempo=0.5,atempo=0.5
    atempo_chain = []
    remaining = factor
    while remaining > 2.0:
        atempo_chain.append(2.0)
        remaining /= 2.0
    while remaining < 0.5:
        atempo_chain.append(0.5)
        remaining /= 0.5
    atempo_chain.append(round(remaining, 5))  # last stage carries the remainder
    # join the stages with commas to form the ffmpeg filter chain
    atempo_filter = ",".join([f"atempo={x}" for x in atempo_chain])
    print(f"   ▶️ 计算 atempo 过滤链：{atempo_filter}")

    # 5.4 Stretch/compress temp_tts_path with ffmpeg -> final_tts_path
    cmd = [
        "ffmpeg", "-y", "-i", temp_tts_path,
        "-filter:a", atempo_filter,
        final_tts_path
    ]
    print("▶️ 开始音频时长对齐，请稍候……")
    result = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    if result.returncode != 0:
        print("❌ ffmpeg 处理失败：", result.stderr.decode())
        sys.exit("TTS 时长对齐失败。")

    # 5.5 Verify the duration of the aligned file. get_media_duration returns
    # None on failure, which must not reach the ':.2f' format below.
    final_dur = get_media_duration(final_tts_path)
    if final_dur is None:
        print(f"⚠️ 对齐完成，但无法验证最终 TTS 时长（目标: {video_dur:.2f} 秒）")
    else:
        print(f"✅ 对齐完毕，最终 TTS 时长: {final_dur:.2f} 秒 （目标: {video_dur:.2f} 秒）")

    # use the aligned audio as the vocal track
    vocal_file = final_tts_path

else:
    if vocal_file.strip() == "":
        print("⚠️ 未填写已有音频路径，且未启用 TTS，将使用视频自带音轨（如果有）。")
    else:
        print(f"🔊 使用用户指定的音频文件：{vocal_file}")

# ====================== 6. 将所有参数写入 config.ini ======================
config = configparser.ConfigParser()
options = {
    'video_file': video_file,
    'vocal_file': vocal_file,
    'quality': quality,
    'output_height': output_height,
    'wav2lip_version': wav2lip_version,
    'use_previous_tracking_data': use_previous_tracking_data,
    'nosmooth': nosmooth
}
padding = {
    'U': U,
    'D': D,
    'L': L,
    'R': R
}
mask = {
    'size': size,
    'feathering': feathering,
    'mouth_tracking': mouth_tracking,
    'debug_mask': debug_mask
}
other = {
    'batch_process': batch_process,
    'output_suffix': output_suffix,
    'include_settings_in_suffix': include_settings_in_suffix,
    'preview_input': preview_input,
    'preview_settings': preview_settings,
    'frame_to_preview': frame_to_preview
}

config['OPTIONS'] = options
config['PADDING'] = padding
config['MASK'] = mask
config['OTHER'] = other

with open('config.ini', 'w') as f:
    config.write(f)
print("✅ 已生成 config.ini 配置文件。")

# ====================== 7. 调用 Easy-Wav2Lip 进行唇形合成 ======================
print("▶️ 开始调用 Easy-Wav2Lip 进行合成……")
!python run.py

# ====================== 8. 预览或展示结果 ======================
from easy_functions import show_video
from IPython.display import Image, display

if preview_settings:
    # 仅预览指定帧
    if os.path.isfile(os.path.join('temp', 'preview.jpg')):
        display(Image(os.path.join('temp', 'preview.jpg')))
    else:
        print("⚠️ 未找到预览图，请检查 preview_settings 是否正确。")
else:
    # 预览完整合成视频
    if os.path.isfile(os.path.join('temp', 'output.mp4')):
        print("✅ 合成完成，正在加载视频预览……")
        show_video(os.path.join('temp', 'output.mp4'))
    else:
        print("⚠️  没有找到输出视频，请检查 run.py 是否执行成功。")


Processing video.mp4 using audio.wav for audio
imports loaded!     
analysing audio...
800 frames to process
detecting face in every frame: 100%|██████████████████████████████| 800/800 [00:09<00:00, 80.26it/s]
mask size: 1.3, feathering: 1
Loading gfpgan
Starting...
Processing Wav2Lip: 100%|█████████████████████████████████████████| 809/809 [05:12<00:00,  2.59it/s]
converting to final video
video_audio successfully lip synced! It will be found here:
/content/drive/MyDrive/video_audio_Easy-Wav2Lip.mp4
Execution time: 6m 0s
Loading video preview...


In [9]:
# Stand-alone Google Drive mount. Step 1 already attempts this mount, so the
# cell is only needed when Drive access was denied there.
# NOTE(review): the saved execution count of this cell (9) is lower than the
# Step 2 cell's (13), i.e. the cells were not run top to bottom.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
