In [13]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [14]:
import os
import numpy as np
import tensorflow as tf
import nltk
from nltk.corpus import cmudict
from PIL import Image
import cv2

# Download CMUdict if not already
nltk.download('cmudict')
cmu_dict = cmudict.dict()

# Phoneme to viseme class mapping
phoneme_to_mouth_shape = {
    # Closed lips
    "b": "01_Closed_Lips",
    "p": "01_Closed_Lips",
    "m": "01_Closed_Lips",

    # Teeth touching
    "f": "02_Teeth_Touching",
    "v": "02_Teeth_Touching",
    "th": "02_Teeth_Touching",
    "dh": "02_Teeth_Touching",

    # Open mouth vowels
    "aa": "03_Open_Mouth",
    "ae": "03_Open_Mouth",
    "ah": "03_Open_Mouth",
    "eh": "03_Open_Mouth",
    "ih": "03_Open_Mouth",
    "iy": "03_Open_Mouth",
    "er": "03_Open_Mouth",
    "ey": "03_Open_Mouth",
    "g": "03_Open_Mouth",
    "k": "03_Open_Mouth",
    "uh": "03_Open_Mouth",
    "uw": "03_Open_Mouth",
    "hh": "03_Open_Mouth",

    # Rounded lips vowels and glides
    "aw": "04_Rounded_Lips",
    "ow": "04_Rounded_Lips",
    "oy": "04_Rounded_Lips",
    "w": "04_Rounded_Lips",

    # Tongue behind teeth
    "t": "05_Tongue_Behind_Teeth",
    "d": "05_Tongue_Behind_Teeth",
    "n": "05_Tongue_Behind_Teeth",
    "s": "05_Tongue_Behind_Teeth",
    "z": "05_Tongue_Behind_Teeth",

    # Retroflex sounds
    "r": "06_Retroflex",
    "jh": "06_Retroflex",

    # Fricatives and sibilants
    "sh": "07_Fricative_Sibilant",
    "zh": "07_Fricative_Sibilant",
    "ch": "07_Fricative_Sibilant",

    # Nasal (back)
    "ng": "08_Nasal",

    # Lateral
    "l": "09_Lateral",

    # Semi-vowels (glides)
    "y": "10_Semi_Vowel",

    # Diphthongs and others (some overlapping)
    "ay": "03_Open_Mouth",
}

# --- Helpers ---

def get_phonemes_from_cmudict(word):
    word = word.lower()
    if word not in cmu_dict:
        raise ValueError(f"No phonemes found for word '{word}'")
    phonemes = cmu_dict[word][0]
    return [p.lower().strip("0123456789") for p in phonemes]

def load_gan_model(viseme_class_path, epoch=100):
    model_path = f"{viseme_class_path}/generator_epoch_{epoch}.model.keras"
    return tf.keras.models.load_model(model_path)

def generate_lip_frames(generator, latent_dim=100):
    z = np.random.normal(0, 1, (1, latent_dim))
    return generator.predict(z)

def save_as_png(image_array, save_path):
    image_array = np.clip(image_array, 0, 1)
    image_array = (image_array * 255).astype(np.uint8)
    img = Image.fromarray(image_array)
    img.save(save_path)

def create_gif_from_frames(folder_path, output_path="output.gif", duration=500):
    frame_files = sorted([f for f in os.listdir(folder_path) if f.endswith(".png")])
    frames = [Image.open(os.path.join(folder_path, f)) for f in frame_files]

    if frames:
        frames[0].save(
            output_path,
            format="GIF",
            save_all=True,
            append_images=frames[1:],
            duration=duration,
            loop=0
        )
        print(f"✅ GIF saved to: {output_path}")
    else:
        print("⚠️ No PNG files found to create GIF.")

def create_mp4_from_frames(folder_path, output_path="output.mp4", fps=1):
    frame_files = sorted([f for f in os.listdir(folder_path) if f.endswith(".png")])
    if not frame_files:
        print("⚠️ No PNG files found to create MP4.")
        return

    first_frame = cv2.imread(os.path.join(folder_path, frame_files[0]))
    height, width, _ = first_frame.shape
    fourcc = cv2.VideoWriter_fourcc(*'mp4v')
    video = cv2.VideoWriter(output_path, fourcc, fps, (width, height))

    for file_name in frame_files:
        frame = cv2.imread(os.path.join(folder_path, file_name))
        video.write(frame)

    video.release()
    print(f"✅ MP4 video saved to: {output_path}")

# --- Phoneme Duration Prediction ---

def predict_phoneme_durations(phonemes, base_duration=0.1):
    """
    Predict durations for each phoneme.
    This is a simple rule-based predictor assigning a base duration to each phoneme.
    """
    durations = []
    for phoneme in phonemes:
        # Assign longer duration to vowels
        if phoneme in ['aa', 'ae', 'ah', 'eh', 'ih', 'iy', 'er', 'ey', 'uh', 'uw', 'aw', 'ow', 'oy', 'ay']:
            durations.append(base_duration * 1.5)
        else:
            durations.append(base_duration)
    return durations

# --- Main generation logic ---

def generate_from_word(word, base_model_dir, save_dir, fps=25):
    phonemes = get_phonemes_from_cmudict(word)
    print(f"Phonemes: {phonemes}")

    durations = predict_phoneme_durations(phonemes)
    print(f"Predicted durations: {durations}")

    output_path = os.path.join(save_dir, word)
    os.makedirs(output_path, exist_ok=True)

    frame_idx = 0

    for phoneme, duration in zip(phonemes, durations):
        viseme_class = phoneme_to_mouth_shape.get(phoneme)
        if not viseme_class:
            print(f"Skipping unknown phoneme: {phoneme}")
            continue

        generator_path = os.path.join(base_model_dir, viseme_class)
        if not os.path.exists(generator_path):
            print(f"GAN model for {viseme_class} not found, skipping.")
            continue

        generator = load_gan_model(generator_path)

        # Calculate number of frames for this phoneme based on duration and fps
        num_frames = max(1, int(duration * fps))

        for _ in range(num_frames):
            generated_clip = generate_lip_frames(generator)

            if generated_clip.ndim == 5:
                frame = generated_clip[0, 0]
            elif generated_clip.ndim == 4:
                frame = generated_clip[0]
            else:
                frame = generated_clip

            save_path = os.path.join(output_path, f"frame_{frame_idx:03d}.png")
            save_as_png(frame, save_path)
            frame_idx += 1

    print(f"✅ PNG frames saved to: {output_path}")

    # Create GIF and MP4
    gif_path = os.path.join(save_dir, f"{word}.gif")
    mp4_path = os.path.join(save_dir, f"{word}.mp4")
    create_gif_from_frames(output_path, gif_path, duration=int(1000 / fps))
    create_mp4_from_frames(output_path, mp4_path, fps=fps)

    return output_path



[nltk_data] Downloading package cmudict to /root/nltk_data...
[nltk_data]   Package cmudict is already up-to-date!


In [15]:
# --- Run ---

if __name__ == "__main__":
    word = input("Enter a word: ").strip()
    base_model_dir = "/content/drive/MyDrive/All_outputs/gans_allfolders"
    save_dir = "/content/drive/MyDrive/All_outputs/mergeGANS"
    generate_from_word(word, base_model_dir, save_dir)


Enter a word: read
Phonemes: ['r', 'eh', 'd']
Predicted durations: [0.1, 0.15000000000000002, 0.1]
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 337ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 53ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 148ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 50ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 50ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 143ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 49ms/step
✅ PNG frames saved to: /content/drive/MyDrive/All_outputs/mergeGANS/read
✅ GIF saved to: /content/drive/MyDrive/All_outputs/mergeGANS/read.gif
✅ MP4 video saved to: /content/drive/MyDrive/All_outputs/mergeGANS/read.mp4


#FRONTEND

In [16]:
!pip install gradio --quiet


In [17]:
import gradio as gr
import os  # Just to be sure it's included

def frontend_generate(word):
    base_model_dir = "/content/drive/MyDrive/All_outputs/gans_allfolders"
    save_dir = "/content/drive/MyDrive/All_outputs/mergeGANS"

    try:
        output_path = generate_from_word(word, base_model_dir, save_dir)

        # Get all frame file paths sorted by filename
        frame_paths = sorted([
            os.path.join(output_path, fname)
            for fname in os.listdir(output_path)
            if fname.endswith(".png")
        ])

        return frame_paths
    except Exception as e:
        # Return an empty list and optionally print the error
        print(f"❌ Error: {e}")
        return []



In [18]:

interface = gr.Interface(
    fn=frontend_generate,
    inputs=gr.Textbox(label="Enter a word"),
    outputs=gr.Gallery(label="Generated Lip Frames", columns=5, height="auto"),
    title="Viseme GAN Generator",
    description="Enter a word to generate lip frame sequence using viseme-specific GANs."
)

In [19]:
interface.launch(share=True)


Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://e5d633435a36bbcab5.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)




_________________________________________

#WITH INPUT IMAGE

In [None]:
#install media pipe
!pip install mediapipe

Collecting mediapipe
  Downloading mediapipe-0.10.21-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (9.7 kB)
Collecting protobuf<5,>=4.25.3 (from mediapipe)
  Downloading protobuf-4.25.7-cp37-abi3-manylinux2014_x86_64.whl.metadata (541 bytes)
Collecting sounddevice>=0.4.4 (from mediapipe)
  Downloading sounddevice-0.5.1-py3-none-any.whl.metadata (1.4 kB)
Downloading mediapipe-0.10.21-cp311-cp311-manylinux_2_28_x86_64.whl (35.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m35.6/35.6 MB[0m [31m28.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading protobuf-4.25.7-cp37-abi3-manylinux2014_x86_64.whl (294 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m294.6/294.6 kB[0m [31m22.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading sounddevice-0.5.1-py3-none-any.whl (32 kB)
Installing collected packages: protobuf, sounddevice, mediapipe
  Attempting uninstall: protobuf
    Found existing installation: protobuf 5.29.4
    Uninstalling protobuf-5.29.4:
      

In [None]:
import os
import numpy as np
import tensorflow as tf
import nltk
from nltk.corpus import cmudict
from PIL import Image
import cv2

# Download CMUdict if not already
nltk.download('cmudict')
cmu_dict = cmudict.dict()

# Phoneme to viseme class mapping
phoneme_to_mouth_shape = {
    # Closed lips
    "b": "01_Closed_Lips",
    "p": "01_Closed_Lips",
    "m": "01_Closed_Lips",

    # Teeth touching
    "f": "02_Teeth_Touching",
    "v": "02_Teeth_Touching",
    "th": "02_Teeth_Touching",
    "dh": "02_Teeth_Touching",

    # Open mouth vowels
    "aa": "03_Open_Mouth",
    "ae": "03_Open_Mouth",
    "ah": "03_Open_Mouth",
    "eh": "03_Open_Mouth",
    "ih": "03_Open_Mouth",
    "iy": "03_Open_Mouth",
    "er": "03_Open_Mouth",
    "ey": "03_Open_Mouth",
    "g": "03_Open_Mouth",
    "k": "03_Open_Mouth",
    "uh": "03_Open_Mouth",
    "uw": "03_Open_Mouth",
    "hh": "03_Open_Mouth",

    # Rounded lips vowels and glides
    "aw": "04_Rounded_Lips",
    "ow": "04_Rounded_Lips",
    "oy": "04_Rounded_Lips",
    "w": "04_Rounded_Lips",

    # Tongue behind teeth
    "t": "05_Tongue_Behind_Teeth",
    "d": "05_Tongue_Behind_Teeth",
    "n": "05_Tongue_Behind_Teeth",
    "s": "05_Tongue_Behind_Teeth",
    "z": "05_Tongue_Behind_Teeth",

    # Retroflex sounds
    "r": "06_Retroflex",
    "jh": "06_Retroflex",

    # Fricatives and sibilants
    "sh": "07_Fricative_Sibilant",
    "zh": "07_Fricative_Sibilant",
    "ch": "07_Fricative_Sibilant",

    # Nasal (back)
    "ng": "08_Nasal",

    # Lateral
    "l": "09_Lateral",

    # Semi-vowels (glides)
    "y": "10_Semi_Vowel",

    # Diphthongs and others (some overlapping)
    "ay": "03_Open_Mouth",
}

# --- Helpers ---

def get_phonemes_from_cmudict(word):
    word = word.lower()
    if word not in cmu_dict:
        raise ValueError(f"No phonemes found for word '{word}'")
    phonemes = cmu_dict[word][0]
    return [p.lower().strip("0123456789") for p in phonemes]

def load_gan_model(viseme_class_path, epoch=100):
    model_path = f"{viseme_class_path}/generator_epoch_{epoch}.model.keras"
    return tf.keras.models.load_model(model_path)

def generate_lip_frames(generator, latent_dim=100):
    z = np.random.normal(0, 1, (1, latent_dim))
    return generator.predict(z)

def save_as_png(image_array, save_path):
    image_array = np.clip(image_array, 0, 1)
    image_array = (image_array * 255).astype(np.uint8)
    img = Image.fromarray(image_array)
    img.save(save_path)

def create_gif_from_frames(folder_path, output_path="output.gif", duration=500):
    frame_files = sorted([f for f in os.listdir(folder_path) if f.endswith(".png")])
    frames = [Image.open(os.path.join(folder_path, f)) for f in frame_files]

    if frames:
        frames[0].save(
            output_path,
            format="GIF",
            save_all=True,
            append_images=frames[1:],
            duration=duration,
            loop=0
        )
        print(f"✅ GIF saved to: {output_path}")
    else:
        print("⚠️ No PNG files found to create GIF.")

def create_mp4_from_frames(folder_path, output_path="output.mp4", fps=2):
    frame_files = sorted([f for f in os.listdir(folder_path) if f.endswith(".png")])
    if not frame_files:
        print("⚠️ No PNG files found to create MP4.")
        return

    first_frame = cv2.imread(os.path.join(folder_path, frame_files[0]))
    height, width, _ = first_frame.shape
    fourcc = cv2.VideoWriter_fourcc(*'mp4v')
    video = cv2.VideoWriter(output_path, fourcc, fps, (width, height))

    for file_name in frame_files:
        frame = cv2.imread(os.path.join(folder_path, file_name))
        video.write(frame)

    video.release()
    print(f"✅ MP4 video saved to: {output_path}")

# --- Phoneme Duration Prediction ---

def predict_phoneme_durations(phonemes, base_duration=0.1):
    """
    Predict durations for each phoneme.
    This is a simple rule-based predictor assigning a base duration to each phoneme.
    """
    durations = []
    for phoneme in phonemes:
        # Assign longer duration to vowels
        if phoneme in ['aa', 'ae', 'ah', 'eh', 'ih', 'iy', 'er', 'ey', 'uh', 'uw', 'aw', 'ow', 'oy', 'ay']:
            durations.append(base_duration * 1.5)
        else:
            durations.append(base_duration)
    return durations

# --- Main generation logic ---

def generate_from_word(word, base_model_dir, save_dir, fps=25):
    phonemes = get_phonemes_from_cmudict(word)
    print(f"Phonemes: {phonemes}")

    durations = predict_phoneme_durations(phonemes)
    print(f"Predicted durations: {durations}")

    output_path = os.path.join(save_dir, word)
    os.makedirs(output_path, exist_ok=True)

    frame_idx = 0

    for phoneme, duration in zip(phonemes, durations):
        viseme_class = phoneme_to_mouth_shape.get(phoneme)
        if not viseme_class:
            print(f"Skipping unknown phoneme: {phoneme}")
            continue

        generator_path = os.path.join(base_model_dir, viseme_class)
        if not os.path.exists(generator_path):
            print(f"GAN model for {viseme_class} not found, skipping.")
            continue

        generator = load_gan_model(generator_path)

        # Calculate number of frames for this phoneme based on duration and fps
        num_frames = max(1, int(duration * fps))

        for _ in range(num_frames):
            generated_clip = generate_lip_frames(generator)

            if generated_clip.ndim == 5:
                frame = generated_clip[0, 0]
            elif generated_clip.ndim == 4:
                frame = generated_clip[0]
            else:
                frame = generated_clip

            save_path = os.path.join(output_path, f"frame_{frame_idx:03d}.png")
            save_as_png(frame, save_path)
            frame_idx += 1

    print(f"✅ PNG frames saved to: {output_path}")

    # Create GIF and MP4
    gif_path = os.path.join(save_dir, f"{word}.gif")
    mp4_path = os.path.join(save_dir, f"{word}.mp4")
    create_gif_from_frames(output_path, gif_path, duration=int(1000 / fps))
    create_mp4_from_frames(output_path, mp4_path, fps=fps)

    return output_path

# --- Run ---

if __name__ == "__main__":
    word = input("Enter a word: ").strip()
    base_model_dir = "/content/drive/MyDrive/All_outputs/gans_allfolders"
    save_dir = "/content/drive/MyDrive/All_outputs/mergeGANS"
    generate_from_word(word, base_model_dir, save_dir)


[nltk_data] Downloading package cmudict to /root/nltk_data...
[nltk_data]   Package cmudict is already up-to-date!


Enter a word: map
Phonemes: ['m', 'ae', 'p']
Predicted durations: [0.1, 0.15000000000000002, 0.1]
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 145ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 53ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 140ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 51ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 51ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 137ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 50ms/step
✅ PNG frames saved to: /content/drive/MyDrive/All_outputs/mergeGANS/map
✅ GIF saved to: /content/drive/MyDrive/All_outputs/mergeGANS/map.gif
✅ MP4 video saved to: /content/drive/MyDrive/All_outputs/mergeGANS/map.mp4


In [None]:
# /content/drive/MyDrive/Dataset/Screenshot 2025-05-01 185610.png

In [None]:
generator_epoch_100.model.summary()


NameError: name 'generator_epoch_100' is not defined