In [1]:
from os import listdir, path
import numpy as np
import scipy, cv2, os, sys, argparse, audio
import json, subprocess, random, string
from tqdm import tqdm
from glob import glob
import torch, face_detection
from models import Wav2Lip
import platform
from moviepy.editor import VideoFileClip, concatenate_videoclips, AudioFileClip, ImageSequenceClip, CompositeAudioClip
# hd after
""" from basicsr.archs.rrdbnet_arch import RRDBNet
from gfpgan import GFPGANer
from realesrgan import RealESRGANer """




EXTERNAL = False
ANIME = True

if EXTERNAL:
    args = {
        'checkpoint_path': '.\\lip\\checkpoints\\wav2lip_gan.pth',
        'face':'.\\result2.mp4',       
        'temp_folder': '.\\lip\\temp\\result.mp4',
        'static': False,
        'fps': 30.0,
        'pads':[0, 10, 0, 0],
        'face_det_batch_size':16,
        'wav2lip_batch_size':128,
        'resize_factor':1,
        'crop':[0, -1, 0, -1],
        'box':[-1, -1, -1, -1],
        'rotate': False,
        'nosmooth':False,
        'img_size': 96 
    }
    %store -r WIDTH
    %store -r HEIGHT
    %store -r TEXT
    %store -r VISUALIZE
    %store -r SAVE_FOLDER
    %store -r PROJECT_NAME
    %store -r SHOW_OUTPUT
    %store -r IMG_NUMBER
    %store -r THUMBNAIL
    %store -r SKIP_VIDEO
    %store -r MERGE_VIDEO
    %store -r TEXT_VOICE_GEN
    %store -r LANGUAGES
    %store -r GEN_VIDEO
    %store -r INSERT_BACKGROUND
    %store -r RECORD_FRAME_INTER_AFTER
else:


    args = {
        'checkpoint_path': '.\\lip\\checkpoints\\wav2lip_gan.pth',        
        #'face':'.\\lip\\input_video.mp4',
        'face':'.\\result2.mp4',
        'audio':'.\\lip\\input_audio.wav',
        'outfile': '.\\final.mp4',
        'temp_folder': '.\\lip\\temp\\result.mp4',
        'static': False,
        'fps': 30.0,
        'pads':[0, 10, 0, 0],
        'face_det_batch_size':16,
        'wav2lip_batch_size':128,
        'resize_factor':1,
        'crop':[0, -1, 0, -1],        
        'box':[-1, -1, -1, -1],
        'rotate': False,
        'nosmooth':False,
        'img_size': 96 
    }
    WIDTH = 768
    HEIGHT = 768
    VISUALIZE = True  
    SAVE_FOLDER = "D:\\Deletar\\p_gen"
    PROJECT_NAME = "THE FELLOWSHIP OF THE RING"
    STYLES_FOLDER = ".\\styles"
    SHOW_OUTPUT = True
    IMG_NUMBER = 2
    SKIP_VIDEO = []
    MERGE_VIDEO = False
    TEXT_VOICE_GEN = ['A','A','A','A','A','A','A','A','A','A','A','A','A','A','A','A','A','A','A']
    LANGUAGES = ['pt']
    GEN_VIDEO = True
    INSERT_BACKGROUND = False
    RECORD_FRAME_INTER_AFTER = 0
    RECORD_FRAME_LIP = -1


mel_step_size = 16
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print('Using {} for inference.'.format(device))


project_folder = f"{SAVE_FOLDER}//{PROJECT_NAME}"


Using cuda for inference.


In [2]:
def get_smoothened_boxes(boxes, T):
	"""Smooth a sequence of boxes with a forward-looking moving average.

	Each entry ``boxes[i]`` is replaced by the mean (over axis 0) of the window
	``boxes[i : i + T]``. When fewer than ``T`` entries remain, the window is the
	last ``T`` entries instead, or the whole sequence if it is shorter than ``T``.

	The update happens in place, so the tail windows include entries that were
	already smoothed earlier in the same pass, and the assigned mean is stored
	using the container's own element type.

	Args:
		boxes: indexable sequence of boxes supporting slicing and item assignment
			(a 2-D numpy array in this notebook).
		T: window length in entries.

	Returns:
		The same ``boxes`` object, modified in place.
	"""
	count = len(boxes)
	for i in range(count):
		if i + T > count:
			# Clamp the start at 0: with fewer than T boxes, ``count - T`` is negative
			# and would be read as an index from the end, shrinking the window to the
			# last few boxes instead of covering the whole sequence.
			window = boxes[max(0, count - T):]
		else:
			window = boxes[i : i + T]
		boxes[i] = np.mean(window, axis=0)
	return boxes

def face_detect(images):
	"""Detect one face per image and return the cropped faces with their coordinates.

	Detection runs in batches of ``args['face_det_batch_size']``; whenever the
	detector raises ``RuntimeError`` the batch size is halved and detection is
	restarted from the first image. Each detected rectangle is padded by
	``args['pads']`` (top, bottom, left, right), clipped to the image bounds and,
	unless ``args['nosmooth']`` is set, smoothed with ``get_smoothened_boxes``.

	Args:
		images: list of frames; each must expose ``.shape`` and 2-D slicing.

	Returns:
		A list with one ``[face_crop, (y1, y2, x1, x2)]`` entry per image.

	Raises:
		RuntimeError: detection still fails with a batch size of 1.
		ValueError: the detector returned no face for some image.
	"""
	detector = face_detection.FaceAlignment(face_detection.LandmarksType._2D,
											flip_input=False, device=device)
	batch_size = args['face_det_batch_size']

	while True:
		predictions = []
		try:
			for start in tqdm(range(0, len(images), batch_size)):
				chunk = np.array(images[start:start + batch_size])
				predictions.extend(detector.get_detections_for_batch(chunk))
		except RuntimeError:
			if batch_size == 1:
				raise RuntimeError('Image too big to run face detection on GPU. Please use the --resize_factor argument')
			batch_size //= 2
			print('Recovering from OOM error; New batch size: {}'.format(batch_size))
		else:
			# Every batch went through, stop retrying.
			break

	pad_top, pad_bottom, pad_left, pad_right = args['pads']
	padded = []
	for rect, image in zip(predictions, images):
		if rect is None:
			cv2.imwrite('temp/faulty_frame.jpg', image) # check this frame where the face was not detected.
			raise ValueError('Face not detected! Ensure the video contains a face in all the frames.')

		height, width = image.shape[0], image.shape[1]
		padded.append([
			max(0, rect[0] - pad_left),
			max(0, rect[1] - pad_top),
			min(width, rect[2] + pad_right),
			min(height, rect[3] + pad_bottom),
		])

	boxes = np.array(padded)
	if not args['nosmooth']:
		boxes = get_smoothened_boxes(boxes, T=5)

	results = []
	for image, (x1, y1, x2, y2) in zip(images, boxes):
		results.append([image[y1: y2, x1:x2], (y1, y2, x1, x2)])

	del detector
	return results

def datagen(frames, mels):
	"""Yield model-ready batches pairing each mel chunk with a face crop.

	Face crops come from ``face_detect`` (first frame only when ``args['static']``
	is set) unless ``args['box']`` specifies a fixed box. Mel chunk ``i`` is paired
	with frame ``i % len(frames)``, or always frame 0 in static mode. Batches hold
	up to ``args['wav2lip_batch_size']`` items; a final partial batch is yielded
	when items remain.

	Args:
		frames: list of video frames.
		mels: iterable of mel-spectrogram chunks, one per output frame.

	Yields:
		``(img_batch, mel_batch, frame_batch, coords_batch)`` where ``img_batch`` is
		the lower-half-masked faces concatenated with the original faces along
		axis 3 and scaled by 1/255, ``mel_batch`` has a trailing singleton axis,
		``frame_batch`` holds copies of the source frames and ``coords_batch`` the
		``(y1, y2, x1, x2)`` crop coordinates.
	"""
	def _pack(faces, mel_list):
		# Turn the accumulated lists into the arrays the model consumes.
		face_arr, mel_arr = np.asarray(faces), np.asarray(mel_list)

		lower_masked = face_arr.copy()
		lower_masked[:, args['img_size']//2:] = 0

		face_arr = np.concatenate((lower_masked, face_arr), axis=3) / 255.
		mel_arr = np.reshape(mel_arr, [len(mel_arr), mel_arr.shape[1], mel_arr.shape[2], 1])
		return face_arr, mel_arr

	img_batch, mel_batch, frame_batch, coords_batch = [], [], [], []

	detect_faces = args['box'][0] == -1
	if detect_faces and args['static']:
		face_det_results = face_detect([frames[0]])
	elif detect_faces:
		face_det_results = face_detect(frames) # BGR2RGB for CNN face detection
	else:
		print('Using the specified bounding box instead of face detection...')
		y1, y2, x1, x2 = args['box']
		face_det_results = [[f[y1: y2, x1:x2], (y1, y2, x1, x2)] for f in frames]

	for i, m in enumerate(mels):
		idx = i%len(frames) if not args['static'] else 0
		frame_to_save = frames[idx].copy()
		face, coords = face_det_results[idx].copy()

		face = cv2.resize(face, (args['img_size'], args['img_size']))

		img_batch.append(face)
		mel_batch.append(m)
		frame_batch.append(frame_to_save)
		coords_batch.append(coords)

		if len(img_batch) >= args['wav2lip_batch_size']:
			img_batch, mel_batch = _pack(img_batch, mel_batch)
			yield img_batch, mel_batch, frame_batch, coords_batch
			img_batch, mel_batch, frame_batch, coords_batch = [], [], [], []

	# Flush whatever did not fill a complete batch.
	if len(img_batch) > 0:
		img_batch, mel_batch = _pack(img_batch, mel_batch)
		yield img_batch, mel_batch, frame_batch, coords_batch


def _load(checkpoint_path):
	"""Deserialize a checkpoint file with ``torch.load``.

	When the module-level ``device`` is not ``'cuda'``, every storage is kept as
	loaded (``map_location`` returns the storage unchanged) so the checkpoint can
	be read without a GPU.

	Args:
		checkpoint_path: path of the checkpoint file.

	Returns:
		The object stored in the checkpoint.
	"""
	# NOTE(review): torch.load unpickles the file; only load checkpoints from a trusted source.
	keep_storage = None if device == 'cuda' else (lambda storage, loc: storage)
	return torch.load(checkpoint_path, map_location=keep_storage)

def load_model(path):
	"""Build a ``Wav2Lip`` network, load its weights and switch it to eval mode.

	Every ``'module.'`` occurrence is removed from the checkpoint's parameter names
	before loading (presumably left over from a DataParallel-wrapped model -- TODO confirm).

	Args:
		path: checkpoint file containing a ``"state_dict"`` entry.

	Returns:
		The model moved to the module-level ``device``, in eval mode.
	"""
	net = Wav2Lip()
	print("Load checkpoint from: {}".format(path))
	saved_weights = _load(path)["state_dict"]
	net.load_state_dict({
		name.replace('module.', ''): tensor
		for name, tensor in saved_weights.items()
	})
	return net.to(device).eval()

# Load the Wav2Lip weights once, when this cell runs; lip_sync() reads this
# module-level `model` for inference.
model = load_model(args['checkpoint_path'])

# Root folder of the current project (same expression as in the configuration cell).
project_folder = f"{SAVE_FOLDER}//{PROJECT_NAME}"

# The triple-quoted string below is disabled code kept for later use (Real-ESRGAN /
# GFPGAN upscaling), not a docstring. As the last expression of the cell it is echoed
# back as the cell output.
# NOTE(review): if re-enabled as written, it rebinds `model` to an RRDBNet, replacing
# the Wav2Lip model that lip_sync() calls -- rename one of the two before enabling.
""" netscale = 4
model_path_face = "./realesrgan/GFPGANv1.3.pth"
dni_weight = None
tile = 0
title_pad = 10
pre_pad = 0
half = None
gpu_id = None
upscale = 3.5
upscale = 4
img_mode = 'RGBA'

if ANIME:
  # R-ESRGAN + Anime
  model = RRDBNet(num_in_ch=3, num_out_ch=3, num_feat=64, num_block=6, num_grow_ch=32, scale=4)
  model_path_x4 = "./realesrgan/RealESRGAN_x4plus_anime_6B.pth"
else:
  # R-ESRGAN
  model = RRDBNet(num_in_ch=3, num_out_ch=3, num_feat=64, num_block=23, num_grow_ch=32, scale=4)
  model_path_x4 = "./realesrgan/RealESRGAN_x4plus.pth"

upsampler = RealESRGANer(
        scale=netscale,
        model_path=model_path_x4,
        dni_weight=dni_weight,
        model=model,
        tile=tile,
        tile_pad=title_pad,
        pre_pad=pre_pad,
        half= not half,
        gpu_id= gpu_id
)

face_enhancer = GFPGANer(
            model_path=model_path_face,
            upscale=upscale,
            arch='clean',
            channel_multiplier=2,
            bg_upsampler=upsampler) """


Load checkpoint from: .\checkpoints\wav2lip_gan.pth


' netscale = 4\nmodel_path_face = "./realesrgan/GFPGANv1.3.pth"\ndni_weight = None\ntile = 0\ntitle_pad = 10\npre_pad = 0\nhalf = None\ngpu_id = None\nupscale = 3.5\nupscale = 4\nimg_mode = \'RGBA\'\n\nif ANIME:\n  # R-ESRGAN + Anime\n  model = RRDBNet(num_in_ch=3, num_out_ch=3, num_feat=64, num_block=6, num_grow_ch=32, scale=4)\n  model_path_x4 = "./realesrgan/RealESRGAN_x4plus_anime_6B.pth"\nelse:\n  # R-ESRGAN\n  model = RRDBNet(num_in_ch=3, num_out_ch=3, num_feat=64, num_block=23, num_grow_ch=32, scale=4)\n  model_path_x4 = "./realesrgan/RealESRGAN_x4plus.pth"\n\nupsampler = RealESRGANer(\n        scale=netscale,\n        model_path=model_path_x4,\n        dni_weight=dni_weight,\n        model=model,\n        tile=tile,\n        tile_pad=title_pad,\n        pre_pad=pre_pad,\n        half= not half,\n        gpu_id= gpu_id\n)\n\nface_enhancer = GFPGANer(\n            model_path=model_path_face,\n            upscale=upscale,\n            arch=\'clean\',\n            channel_multiplie

In [3]:
# File extensions (without the dot) that are treated as a single still image.
_IMAGE_EXTENSIONS = ('jpg', 'png', 'jpeg')


def _is_image_file(file_path):
	"""Return True when file_path ends in one of the still-image extensions."""
	# splitext looks only at the final extension, so relative paths that start with
	# '.' (e.g. '.\\result2.mp4') are classified correctly.
	extension = os.path.splitext(file_path)[1]
	return extension.lower().lstrip('.') in _IMAGE_EXTENSIONS


def _read_face_frames(face_path):
	"""Load the face source and return ``(frames, fps)``.

	A still image yields a one-frame list and ``args['fps']``. A video is read
	frame by frame; each frame is optionally downscaled by ``args['resize_factor']``,
	rotated 90 degrees clockwise when ``args['rotate']`` is set, and cropped to
	``args['crop']`` (``y1, y2, x1, x2``; ``-1`` for ``y2``/``x2`` means "to the edge").
	"""
	if _is_image_file(face_path):
		image = cv2.imread(face_path)
		if image is None:
			raise ValueError('Could not read image file: {}'.format(face_path))
		return [image], args['fps']

	video_stream = cv2.VideoCapture(face_path)
	fps = video_stream.get(cv2.CAP_PROP_FPS)

	print('Reading video frames...')

	frames = []
	try:
		while True:
			still_reading, frame = video_stream.read()
			if not still_reading:
				break
			if args['resize_factor'] > 1:
				frame = cv2.resize(frame, (frame.shape[1]//args['resize_factor'], frame.shape[0]//args['resize_factor']))

			if args['rotate']:
				# cv2.ROTATE_90_CLOCKWISE is the public constant; the nested `cv2.cv2`
				# alias is not available in every OpenCV build.
				frame = cv2.rotate(frame, cv2.ROTATE_90_CLOCKWISE)

			y1, y2, x1, x2 = args['crop']
			if x2 == -1: x2 = frame.shape[1]
			if y2 == -1: y2 = frame.shape[0]

			frames.append(frame[y1:y2, x1:x2])
	finally:
		# Release the capture even if a frame operation fails.
		video_stream.release()

	return frames, fps


def _chunk_mel(mel, fps):
	"""Cut the mel spectrogram into one ``mel_step_size``-wide window per video frame.

	Window ``i`` starts at column ``int(i * 80 / fps)``. The first window that would
	run past the end is replaced by the last ``mel_step_size`` columns and ends the list.
	"""
	mel_chunks = []
	mel_idx_multiplier = 80./fps
	total_columns = len(mel[0])
	i = 0
	while True:
		start_idx = int(i * mel_idx_multiplier)
		if start_idx + mel_step_size > total_columns:
			mel_chunks.append(mel[:, total_columns - mel_step_size:])
			break
		mel_chunks.append(mel[:, start_idx : start_idx + mel_step_size])
		i += 1
	return mel_chunks


def lip_sync(video_location, audio_location, output_folder):
	"""Lip-sync a face video (or still image) to an audio track and write the result.

	The frames are run through the module-level ``model`` in batches, the predicted
	mouth regions are pasted back into the frames, the frames are written to
	``args['temp_folder']`` and finally muxed with the audio into the output file.

	Args:
		video_location: path of the face video or image; stored in ``args['face']``.
		audio_location: path of the audio; stored in ``args['audio']``. A non-``.wav``
			file is first converted with ffmpeg into ``temp.wav`` next to
			``args['temp_folder']`` and ``args['audio']`` is pointed at the converted file.
		output_folder: despite its name, the path of the output video *file*. An
			existing file at that path is deleted before writing.

	Raises:
		ValueError: the face path is not a file, no frame could be read from it, or
			the mel spectrogram contains NaN.
		RuntimeError: ffmpeg failed to convert the audio.
	"""
	args['face'] = video_location
	args['audio'] = audio_location

	if not os.path.isfile(args['face']):
		raise ValueError('--face argument must be a valid path to video/image file')

	# datagen() reads args['static'], so an image input has to switch it on; the
	# configured value is restored afterwards so one image call does not force
	# static mode on later video calls.
	configured_static = args['static']
	if _is_image_file(args['face']):
		args['static'] = True

	try:
		full_frames, fps = _read_face_frames(args['face'])

		print ("Number of frames available for inference: "+str(len(full_frames)))

		if not full_frames:
			raise ValueError('No frames could be read from {}'.format(args['face']))
		if not fps or fps <= 0:
			# Some containers report no frame rate; fall back to the configured one
			# instead of dividing by zero below.
			fps = args['fps']

		if not args['audio'].endswith('.wav'):
			print('Extracting raw audio...')
			# Write and read the converted audio at the same location.
			temp_wav = os.path.join(os.path.dirname(args['temp_folder']), 'temp.wav')
			# Argument list instead of a shell string: paths with spaces stay intact.
			exit_code = subprocess.call(['ffmpeg', '-y', '-i', args['audio'], '-strict', '-2', temp_wav])
			if exit_code != 0:
				raise RuntimeError('ffmpeg could not convert {} (exit code {})'.format(args['audio'], exit_code))
			args['audio'] = temp_wav

		# NOTE(review): 16000 is presumably the sample rate expected by the project's
		# `audio` module -- confirm against audio.load_wav.
		wav = audio.load_wav(args['audio'], 16000)
		mel = audio.melspectrogram(wav)
		print(mel.shape)

		if np.isnan(mel.reshape(-1)).sum() > 0:
			raise ValueError('Mel contains nan! Using a TTS voice? Add a small epsilon noise to the wav file and try again')

		mel_chunks = _chunk_mel(mel, fps)

		print("Length of mel chunks: {}".format(len(mel_chunks)))

		# Never use more frames than there are audio windows.
		full_frames = full_frames[:len(mel_chunks)]

		batch_size = args['wav2lip_batch_size']
		gen = datagen(full_frames.copy(), mel_chunks)

		out = None
		try:
			for img_batch, mel_batch, frames, coords in tqdm(gen,
												total=int(np.ceil(float(len(mel_chunks))/batch_size))):
				if out is None:
					# Opened on the first batch, i.e. only after face detection succeeded.
					frame_h, frame_w = full_frames[0].shape[:-1]
					out = cv2.VideoWriter(args['temp_folder'],
											cv2.VideoWriter_fourcc(*'DIVX'), fps, (frame_w, frame_h))

				img_batch = torch.FloatTensor(np.transpose(img_batch, (0, 3, 1, 2))).to(device)
				mel_batch = torch.FloatTensor(np.transpose(mel_batch, (0, 3, 1, 2))).to(device)

				with torch.no_grad():
					pred = model(mel_batch, img_batch)

				pred = pred.cpu().numpy().transpose(0, 2, 3, 1) * 255.

				for p, f, c in zip(pred, frames, coords):
					y1, y2, x1, x2 = c
					# Scale the prediction back to the size of the detected face box.
					p = cv2.resize(p.astype(np.uint8), (x2 - x1, y2 - y1))

					f[y1:y2, x1:x2] = p
					out.write(f)
		finally:
			if out is not None:
				out.release()

		# Close both clips when done: open readers keep the files locked, which
		# prevents callers from deleting their temporary inputs on Windows.
		audio_clip = AudioFileClip(args['audio'])
		try:
			video_clip = VideoFileClip(args['temp_folder'])
			try:
				final_video = video_clip.set_audio(audio_clip)
				if os.path.exists(output_folder):
					os.remove(output_folder)
				final_video.write_videofile(output_folder)
			finally:
				video_clip.close()
		finally:
			audio_clip.close()
	finally:
		args['static'] = configured_static





In [4]:
# Driver cell.
# EXTERNAL mode: for every .mp4 under <project>/video_sound_hd/<folder>/, take the
# clip's audio, cut the matching time span out of the low-quality face video
# (args['face']) and lip-sync that span to the audio. Results are written to
# <project>/lip/<name>/<name>.mp4. Clips whose running index is not greater than
# RECORD_FRAME_LIP are skipped, but their duration still advances the time offset.
# Otherwise: lip-sync the single face/audio pair configured in `args`.
if EXTERNAL:

    video = []  # only used by the disabled HD-merge snippet at the end of this branch
    cont = -1
    start_time = 0
    acc_time = 0  # start offset (seconds) of the next span inside the face video
    lip_root = f"{project_folder}//lip"
    sound_root = f"{project_folder}//video_sound_hd"
    os.makedirs(lip_root, exist_ok=True)

    video_face_lq = VideoFileClip(args['face'])
    try:
        # NOTE(review): os.listdir returns entries in arbitrary order while acc_time
        # depends on the processing order -- confirm the order is as intended.
        for folder in os.listdir(sound_root):
            for filename in os.listdir(f"{sound_root}//{folder}"):
                if filename[-3:] != 'mp4':
                    continue
                cont = cont + 1
                stem = filename[:-4]
                addr_video = f"{sound_root}//{folder}//{filename}"
                video_hq = VideoFileClip(addr_video)
                try:
                    original_audio = video_hq.audio
                    original_audio_duration = round(original_audio.duration,3)

                    if (RECORD_FRAME_LIP < cont):
                        print(stem)
                        output_dir = f"{lip_root}//{stem}"
                        output_file = f"{output_dir}//{stem}.mp4"
                        os.makedirs(output_dir, exist_ok=True)

                        subclip_face = video_face_lq.subclip(acc_time, acc_time + original_audio_duration)
                        temp_video_face_file_address = f'.\\lip\\temp\\{stem}.mp4'
                        temp_audio_file_address = f'.\\lip\\temp\\{stem}.wav'
                        subclip_face.write_videofile(temp_video_face_file_address)
                        original_audio.write_audiofile(temp_audio_file_address)

                        if os.path.exists(output_file):
                            os.remove(output_file)
                        print(output_file)
                        print(temp_audio_file_address)
                        print(temp_video_face_file_address)
                        lip_sync(temp_video_face_file_address, temp_audio_file_address, output_file)
                        os.remove(temp_video_face_file_address)
                        os.remove(temp_audio_file_address)
                finally:
                    # Each clip holds an open reader on its file; close it once the
                    # audio has been exported instead of leaking one per file.
                    video_hq.close()
                acc_time = round(acc_time + original_audio_duration,3)
    finally:
        video_face_lq.close()

    # Disabled code kept as a string: upscales every lip-synced clip with
    # face_enhancer and concatenates the results into full_lip.mp4.
    """ if (not os.path.exists(f"{project_folder}//lip_hd")):
                    os.makedirs(f"{project_folder}//lip_hd")
    for folder in os.listdir(f"{project_folder}//lip"):
        for filename in os.listdir(f"{project_folder}//lip//{folder}"):                   
            if (filename[-3:] == 'mp4'):
                frames_hd = []
                addr_video = f"{project_folder}//lip//{folder}//{filename}"                
                video_lq_lip = VideoFileClip(addr_video)  
                audio_lq_lip = video_lq_lip.audio        
                if (not os.path.exists(f"{project_folder}//lip_hd//{folder}")):
                    os.makedirs(f"{project_folder}//lip_hd//{folder}")           
                for i, frame in enumerate(video_lq_lip.iter_frames()):                    
                    _, _, output = face_enhancer.enhance(frame, has_aligned=False, only_center_face=False, paste_back=True)
                    frames_hd.append(output)
                if (not os.path.exists(f"{project_folder}//lip_hd//{filename[:-4]}")):
                    os.makedirs(f"{project_folder}//lip_hd//{filename[:-4]}")
                if os.path.exists(f"{project_folder}//lip_hd//{filename[:-4]}//{filename[:-4]}.mp4"):
                        os.remove(f"{project_folder}//lip_hd//{filename[:-4]}//{filename[:-4]}.mp4")   
                video_hd_lip = ImageSequenceClip(frames_hd, fps=30)
                video_hd_lip = video_hd_lip.set_audio(audio_lq_lip)                
                video_hd_lip.write_videofile(f"{project_folder}//lip_hd//{filename[:-4]}//{filename[:-4]}.mp4")
    for folder in os.listdir(f"{project_folder}//lip_hd"):
        for filename in os.listdir(f"{project_folder}//lip_hd//{folder}"):
             if (filename[-3:] == 'mp4'):
                  temp_lip_video = VideoFileClip(f"{project_folder}//lip_hd//{folder}//{filename}")
                  video.append(temp_lip_video)
    merged_video = concatenate_videoclips(video, method='chain')
    if os.path.exists(f"{project_folder}//full//full_lip.mp4"):
        os.remove(f"{project_folder}//full//full_lip.mp4")
    merged_video.write_videofile(f"{project_folder}//full//full_lip.mp4") """

else:
    lip_sync(args['face'], args['audio'], args['outfile'])

Reading video frames...
Number of frames available for inference: 751


  return f(*args, **kwargs)


FileNotFoundError: [Errno 2] No such file or directory: '.\\lip\\input_audio.wav'