<a href="https://colab.research.google.com/github/morechaitanya606/manga-ocr-video-generator/blob/main/manhwa.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install moviepy==1.0.3 imageio[ffmpeg]
!pip install pymupdf easyocr gTTS transformers torch
!pip install -q edge-tts asyncio nest-asyncio
!pip install -q opencv-python numpy scikit-image
!pip install -q sentence-transformers accelerate pillow


Collecting pymupdf
  Downloading pymupdf-1.26.4-cp39-abi3-manylinux_2_28_x86_64.whl.metadata (3.4 kB)
Collecting easyocr
  Downloading easyocr-1.7.2-py3-none-any.whl.metadata (10 kB)
Collecting gTTS
  Downloading gTTS-2.5.4-py3-none-any.whl.metadata (4.1 kB)
Collecting python-bidi (from easyocr)
  Downloading python_bidi-0.6.6-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.9 kB)
Collecting pyclipper (from easyocr)
  Downloading pyclipper-1.3.0.post6-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (9.0 kB)
Collecting ninja (from easyocr)
  Downloading ninja-1.13.0-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (5.1 kB)
Collecting click<8.2,>=7.1 (from gTTS)
  Downloading click-8.1.8-py3-none-any.whl.metadata (2.3 kB)
Downloading pymupdf-1.26.4-cp39-abi3-manylinux_2_28_x86_64.whl (24.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.1/24.1 MB[0m [31m51.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading ea

In [None]:
!pip install sacremoses

In [None]:
#!/usr/bin/env python3
"""
AI Manga Recap Test Script (Colab-Optimized)
- Extracts pages from a PDF
- Uses OCR to get text
- AI model generates Hindi narration
- TTS + video generation
"""

# ================================
# ✅ Install dependencies
# ================================
#!pip install moviepy pillow easyocr PyMuPDF gTTS transformers

import os
import fitz  # PyMuPDF
import easyocr
from gtts import gTTS
import moviepy.editor as mp
from transformers import AutoTokenizer, pipeline
from PIL import Image

# ---------- CONFIG ----------
PDF_PATH = "/content/Ch_199_Side_Story_20.pdf"   # Upload your PDF in Colab
OUTPUT_DIR = "output"
CHAPTER = 1
MODEL_NAME = "google/flan-t5-base"  # Lightweight AI model
# ----------------------------

os.makedirs(OUTPUT_DIR, exist_ok=True)

# ================================
# 1. Extract first few pages as images
# ================================
def extract_pdf_images(pdf_path, max_pages=2):
    doc = fitz.open(pdf_path)
    images = []
    for i in range(min(len(doc), max_pages)):
        page = doc.load_page(i)
        pix = page.get_pixmap(matrix=fitz.Matrix(2, 2))
        img_path = os.path.join(OUTPUT_DIR, f"page_{i+1}.jpg")
        pix.save(img_path)
        images.append(img_path)
    doc.close()
    return images

# ================================
# 2. OCR text from pages
# ================================
def ocr_text(images):
    reader = easyocr.Reader(['en', 'hi'])  # English + Hindi OCR
    all_text = []
    for img in images:
        result = reader.readtext(img, detail=0)
        all_text.append(" ".join(result))
    return " ".join(all_text)

# ================================
# 3. Use AI model to generate Hindi recap
# ================================
# ================================
# 3. Use AI model to generate Hindi recap (improved)
# ================================
from transformers import MarianMTModel, MarianTokenizer

# 3. Generate clean Hindi script with translation + summarization
def generate_hindi_script(english_text):
    # --- Translation (EN -> HI) ---
    model_name = "Helsinki-NLP/opus-mt-en-hi"
    tokenizer = MarianTokenizer.from_pretrained(model_name)
    model = MarianMTModel.from_pretrained(model_name)

    translated = model.generate(
        **tokenizer(english_text, return_tensors="pt", padding=True, truncation=True)
    )
    hindi_text = tokenizer.decode(translated[0], skip_special_tokens=True)

    # --- Summarization in Hindi ---
    summarizer = pipeline("summarization", model="google/pegasus-xsum")
    summary = summarizer(
        hindi_text,
        max_length=80,
        min_length=30,
        do_sample=False
    )[0]["summary_text"]

    # --- Clean output ---
    summary = summary.replace("\n", " ").strip()
    if not summary:
        summary = "यह अध्याय बहुत रोमांचक है और इसमें कहानी आगे बढ़ती है।"

    return summary



# ================================
# 4. Convert to speech
# ================================
def create_audio(text, out_file):
    if not text.strip():
        text = "यह अध्याय रोमांचक है।"  # default safe text
    tts = gTTS(text=text, lang="hi")
    tts.save(out_file)
    return out_file

# ================================
# 5. Resize oversized images
# ================================
def resize_image(input_path, max_width=1080, max_height=1920):
    img = Image.open(input_path)
    w, h = img.size
    if w > max_width or h > max_height:
        img.thumbnail((max_width, max_height), Image.Resampling.LANCZOS)
        img.save(input_path)
    return input_path

# ================================
# 6. Make video
# ================================
# 5. Make video in 16:9 format
# 5. Make video in 16:9 format
def make_video(images, audio_file, out_file="test_video.mp4"):
    audio_clip = mp.AudioFileClip(audio_file)
    duration = audio_clip.duration / len(images)

    clips = []
    for img in images:
        clip = mp.ImageClip(img)

        # Scale to fit within 1920x1080
        if clip.w > clip.h:
            clip = clip.resize(width=1920)
        else:
            clip = clip.resize(height=1080)

        # Pad to exactly 1920x1080 (letterbox)
        clip = clip.on_color(
            size=(1920, 1080),
            color=(0, 0, 0),  # black background
            pos=("center", "center")
        )

        clip = clip.set_duration(duration)
        clips.append(clip)

    video = mp.concatenate_videoclips(clips, method="compose")
    final = video.set_audio(audio_clip)
    final.write_videofile(
        out_file,
        fps=30,
        codec="libx264",
        audio_codec="aac",
        threads=4
    )
    return out_file



# ================================
# 🚀 MAIN
# ================================
if __name__ == "__main__":
    print("📖 Extracting images...")
    imgs = extract_pdf_images(PDF_PATH, max_pages=2)

    print("🔎 Running OCR...")
    raw_text = ocr_text(imgs)

    print("🤖 Generating Hindi script...")
    script_text = generate_hindi_script(raw_text)
    print("Generated Script:", script_text)

    print("🎙️ Creating audio...")
    audio = create_audio(script_text, os.path.join(OUTPUT_DIR, "chapter_audio.mp3"))

    print("🎬 Making video...")
    video_path = make_video(imgs, audio, os.path.join(OUTPUT_DIR, f"chapter_{CHAPTER}.mp4"))

    print(f"✅ Done! Video saved at: {video_path}")


📖 Extracting images...




🔎 Running OCR...






🤖 Generating Hindi script...



Some weights of PegasusForConditionalGeneration were not initialized from the model checkpoint at google/pegasus-xsum and are newly initialized: ['model.decoder.embed_positions.weight', 'model.encoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Device set to use cpu
Your max_length is set to 80, but your input_length is only 25. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=12)


Generated Script: यह अध्याय बहुत रोमांचक है और इसमें कहानी आगे बढ़ती है।
🎙️ Creating audio...
🎬 Making video...
Moviepy - Building video output/chapter_1.mp4.
MoviePy - Writing audio in chapter_1TEMP_MPY_wvf_snd.mp4




MoviePy - Done.
Moviepy - Writing video output/chapter_1.mp4





Moviepy - Done !
Moviepy - video ready output/chapter_1.mp4
✅ Done! Video saved at: output/chapter_1.mp4


In [6]:
#!/usr/bin/env python3
"""
Enhanced AI Manga Recap Generator (Colab-Optimized for T4 GPU)
- Extracts high-quality pages from PDF
- Advanced OCR with noise reduction
- Better AI models for content generation
- High-quality TTS with multiple voices
- Professional video generation with effects
"""

# ================================
# ✅ Install dependencies (run this cell first)
# ================================
"""
!pip install -q moviepy pillow easyocr PyMuPDF transformers torch accelerate
!pip install -q edge-tts asyncio nest-asyncio
!pip install -q opencv-python numpy scikit-image
!pip install -q sentence-transformers
"""

import os
import sys
import asyncio
import nest_asyncio
import fitz  # PyMuPDF
import easyocr
import edge_tts
import moviepy.editor as mp
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
from PIL import Image, ImageEnhance, ImageFilter
import cv2
import numpy as np
from sentence_transformers import SentenceTransformer
import torch
import re
import json

# Enable nested asyncio for Colab
nest_asyncio.apply()

# ================================
# 🎯 ENHANCED CONFIG
# ================================
PDF_PATH = "/content/Ch_199_Side_Story_20.pdf"   # Upload your PDF in Colab
OUTPUT_DIR = "output"
CHAPTER = 1

# AI Models (optimized for T4)
SUMMARIZATION_MODEL = "facebook/bart-large-cnn"  # Better summarization
TRANSLATION_MODEL = "Helsinki-NLP/opus-mt-en-hi"  # English to Hindi

# Voice settings (Edge TTS - high quality, free)
VOICE_OPTIONS = [
    "hi-IN-MadhurNeural",     # Male, clear
    "hi-IN-SwaraNeural",      # Female, expressive
    "hi-IN-AnanyaNeural"      # Female, warm
]
SELECTED_VOICE = VOICE_OPTIONS[0]  # Change index for different voice

# Video settings
VIDEO_WIDTH = 1920
VIDEO_HEIGHT = 1080
FPS = 30
BITRATE = "8000k"  # High quality for YouTube

# ================================
# 🔧 UTILITY FUNCTIONS
# ================================
def setup_environment():
    """Setup GPU and create directories"""
    os.makedirs(OUTPUT_DIR, exist_ok=True)
    device = "cuda" if torch.cuda.is_available() else "cpu"
    print(f"🚀 Using device: {device}")
    return device

def preprocess_image(img_path):
    """Enhance image quality for better OCR"""
    img = cv2.imread(img_path)

    # Convert to grayscale
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

    # Denoise
    denoised = cv2.fastNlMeansDenoising(gray)

    # Enhance contrast
    clahe = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8,8))
    enhanced = clahe.apply(denoised)

    # Sharpen
    kernel = np.array([[-1,-1,-1], [-1,9,-1], [-1,-1,-1]])
    sharpened = cv2.filter2D(enhanced, -1, kernel)

    # Save enhanced image
    enhanced_path = img_path.replace('.jpg', '_enhanced.jpg')
    cv2.imwrite(enhanced_path, sharpened)
    return enhanced_path

# ================================
# 📖 PDF EXTRACTION (ENHANCED)
from PIL import Image
Image.MAX_IMAGE_PIXELS = None  # disable DecompressionBombWarning

# ================================
def extract_pdf_images(pdf_path, max_pages=5):
    """Extract high-quality images from PDF (skip if already extracted)."""
    if not os.path.exists(pdf_path):
        raise FileNotFoundError(f"PDF not found: {pdf_path}")

    images = []

    doc = fitz.open(pdf_path)
    for i in range(min(len(doc), max_pages)):
        img_path = os.path.join(OUTPUT_DIR, f"page_{i+1}.jpg")
        enhanced_path = img_path.replace(".jpg", "_enhanced.jpg")

        if os.path.exists(enhanced_path):
            print(f"⏩ Skipped extraction, using cached: page {i+1}")
            images.append(enhanced_path)
            continue

        # Reduce zoom to avoid gigantic images
        mat = fitz.Matrix(2.0, 2.0)
        pix = doc.load_page(i).get_pixmap(matrix=mat, alpha=False)
        pix.save(img_path)

        # Auto-resize if image is still too large
        img = cv2.imread(img_path)
        h, w = img.shape[:2]
        max_dim = 4000  # keep under 4k px per side
        if max(h, w) > max_dim:
            scale = max_dim / max(h, w)
            resized = cv2.resize(img, (int(w*scale), int(h*scale)), interpolation=cv2.INTER_AREA)
            cv2.imwrite(img_path, resized)

        enhanced_path = preprocess_image(img_path)
        images.append(enhanced_path)
        print(f"✅ Extracted page {i+1} ({w}x{h} → resized if needed)")

    doc.close()
    return images


# ================================
# 🔍 ADVANCED OCR
# ================================
def advanced_ocr(images):
    """Perform OCR with EasyOCR (auto downscale large images)."""
    reader = easyocr.Reader(["en"], gpu=True)
    all_text = []

    for idx, img_path in enumerate(images, 1):
        print(f"🔎 Processing OCR for page {idx}...")

        img = cv2.imread(img_path)

        # ✅ Safety resize if image too big for OpenCV warp
        h, w = img.shape[:2]
        if h > 3000 or w > 3000:
            scale = 2000 / max(h, w)
            img = cv2.resize(img, (int(w * scale), int(h * scale)), interpolation=cv2.INTER_AREA)
            print(f"⚠️ Resized page {idx} for OCR: {w}x{h} → {img.shape[1]}x{img.shape[0]}")

        try:
            results = reader.readtext(img, detail=1, paragraph=True)
            page_text = " ".join([res[1] for res in results])
            print(f"📄 Page {idx}: {len(page_text)} characters extracted")
            all_text.append(page_text)
        except Exception as e:
            print(f"❌ OCR failed on page {idx}: {e}")
            all_text.append("")

    return "\n".join(all_text)



# ================================
# 🤖 ENHANCED AI CONTENT GENERATION
# ================================
class ContentGenerator:
    def __init__(self, device):
        self.device = device
        print("🤖 Loading AI models...")

        # Summarization model
        self.summarizer = pipeline(
            "summarization",
            model=SUMMARIZATION_MODEL,
            device=0 if device == "cuda" else -1,
            torch_dtype=torch.float16 if device == "cuda" else torch.float32
        )

        # Translation model
        self.translator_tokenizer = AutoTokenizer.from_pretrained(TRANSLATION_MODEL)
        self.translator_model = AutoModelForSeq2SeqLM.from_pretrained(TRANSLATION_MODEL)

        if device == "cuda":
            self.translator_model = self.translator_model.half().cuda()

        print("✅ AI models loaded successfully!")

    def generate_engaging_script(self, raw_text):
        """Generate engaging Hindi narration"""
        if not raw_text or len(raw_text) < 50:
            return "इस अध्याय में एक रोमांचक कहानी है जो आपको बहुत पसंद आएगी।"

        try:
            # Step 1: Summarize in English (better quality)
            print("📝 Generating summary...")

            # Split text if too long
            max_chunk = 1000
            if len(raw_text) > max_chunk:
                chunks = [raw_text[i:i+max_chunk] for i in range(0, len(raw_text), max_chunk)]
                summaries = []
                for chunk in chunks:
                    if len(chunk) > 50:
                        summary = self.summarizer(
                            chunk,
                            max_length=100,
                            min_length=30,
                            do_sample=False
                        )[0]['summary_text']
                        summaries.append(summary)
                english_summary = " ".join(summaries)
            else:
                english_summary = self.summarizer(
                    raw_text,
                    max_length=150,
                    min_length=40,
                    do_sample=False
                )[0]['summary_text']

            # Step 2: Enhance for storytelling
            storytelling_prompt = f"""
            Transform this manga summary into an engaging Hindi narration for YouTube:
            {english_summary}

            Make it dramatic and exciting for viewers.
            """

            # Step 3: Translate to Hindi
            print("🌍 Translating to Hindi...")
            inputs = self.translator_tokenizer(
                english_summary,
                return_tensors="pt",
                padding=True,
                truncation=True,
                max_length=512
            )

            if self.device == "cuda":
                inputs = {k: v.cuda() for k, v in inputs.items()}

            with torch.no_grad():
                outputs = self.translator_model.generate(
                    **inputs,
                    max_length=200,
                    num_beams=4,
                    temperature=0.7,
                    do_sample=True
                )

            hindi_text = self.translator_tokenizer.decode(outputs[0], skip_special_tokens=True)

            # Step 4: Add engaging elements
            hindi_text = self.enhance_narration(hindi_text)

            # Save generated script
            with open(os.path.join(OUTPUT_DIR, "generated_script.txt"), "w", encoding="utf-8") as f:
                f.write(f"English Summary:\n{english_summary}\n\nHindi Script:\n{hindi_text}")

            return hindi_text

        except Exception as e:
            print(f"⚠️ AI generation failed: {e}")
            return "इस अध्याय में बहुत ही रोमांचक घटनाएं हैं जो आपको हैरान कर देंगी। कहानी में नए मोड़ आते हैं और किरदार अपनी यात्रा में आगे बढ़ते हैं।"

    def enhance_narration(self, text):
        """Add dramatic elements to Hindi narration"""
        # Add pauses and emphasis
        text = text.replace("।", "... ")
        text = text.replace(",", "... ")

        # Add engaging intro/outro
        intros = [
            "आज के इस रोमांचक अध्याय में... ",
            "इस शानदार कहानी में आगे... ",
            "अब देखते हैं क्या होता है... "
        ]

        outros = [
            "... यह कहानी कितनी दिलचस्प है!",
            "... आगे और भी रोमांच का इंतजार है!",
            "... यह अध्याय वाकई में शानदार था!"
        ]

        enhanced = f"{intros[0]}{text}{outros[0]}"
        return enhanced

# ================================
# 🎙️ HIGH-QUALITY TTS
# ================================
async def create_high_quality_audio(text, output_path):
    """Generate high-quality audio using Edge TTS"""
    if not text.strip():
        text = "यह अध्याय बहुत ही रोमांचक और दिलचस्प है।"

    print(f"🎙️ Generating audio with voice: {SELECTED_VOICE}")

    # Configure speech parameters for better quality
    communicate = edge_tts.Communicate(
        text=text,
        voice=SELECTED_VOICE,
        rate="-10%",  # Slightly slower for clarity
        volume="+0%",
        pitch="+0Hz"
    )

    await communicate.save(output_path)
    print(f"✅ Audio saved: {output_path}")
    return output_path

# ================================
# 🎬 PROFESSIONAL VIDEO CREATION
# ================================
import cv2
import moviepy.editor as mp
from PIL import Image

# Define your constants somewhere in your script
VIDEO_WIDTH = 1920
VIDEO_HEIGHT = 1080
FPS = 30
BITRATE = "5000k"

def create_professional_video(images, audio_file, output_file):
    """Create high-quality video with effects"""
    print("🎬 Creating professional video...")

    # Load audio
    audio_clip = mp.AudioFileClip(audio_file)
    total_duration = audio_clip.duration

    # Calculate timing
    num_images = len(images)
    base_duration = total_duration / num_images

    clips = []
    for i, img_path in enumerate(images):
        print(f"📸 Processing image {i+1}/{num_images}")

        # ✅ Force RGB to fix grayscale issues
        img = Image.open(img_path).convert("RGB")
        safe_path = img_path.replace(".jpg", "_rgb.jpg").replace(".png", "_rgb.png")
        img.save(safe_path)

        # Create image clip
        img_clip = mp.ImageClip(safe_path, duration=base_duration)

        # Resize and pad to exact dimensions
        img_clip = img_clip.resize(height=VIDEO_HEIGHT)
        if img_clip.w > VIDEO_WIDTH:
            img_clip = img_clip.resize(width=VIDEO_WIDTH)

        # Center the image with black padding
        img_clip = img_clip.on_color(
            size=(VIDEO_WIDTH, VIDEO_HEIGHT),
            color=(0, 0, 0),
            pos="center"
        )

        # Add subtle zoom effect
        def zoom_effect(get_frame, t):
            frame = get_frame(t)
            zoom_factor = 1 + (t / base_duration) * 0.1  # 10% zoom over duration
            h, w = frame.shape[:2]
            new_h, new_w = int(h * zoom_factor), int(w * zoom_factor)

            if new_h > h and new_w > w:
                frame = cv2.resize(frame, (new_w, new_h))
                start_x = (new_w - w) // 2
                start_y = (new_h - h) // 2
                frame = frame[start_y:start_y+h, start_x:start_x+w]

            return frame

        img_clip = img_clip.fl(zoom_effect)

        # Add fade transitions (except for first and last)
        if i > 0:
            img_clip = img_clip.fadein(0.5)
        if i < num_images - 1:
            img_clip = img_clip.fadeout(0.5)

        clips.append(img_clip)

    # Concatenate clips
    final_video = mp.concatenate_videoclips(clips, method="compose")

    # Add audio
    final_video = final_video.set_audio(audio_clip)

    # Export with high quality settings
    final_video.write_videofile(
        output_file,
        fps=FPS,
        codec="libx264",
        audio_codec="aac",
        temp_audiofile=f"{output_file}_temp_audio.m4a",
        remove_temp=True,
        bitrate=BITRATE,
        preset="medium",  # Good balance of quality and encoding speed
        ffmpeg_params=[
            "-crf", "18",      # High quality (lower = better quality)
            "-profile:v", "high",
            "-level", "4.0",
            "-pix_fmt", "yuv420p"  # YouTube compatibility
        ],
        threads=4
    )

    print(f"✅ Professional video created: {output_file}")
    return output_file

# ================================
# 🚀 MAIN EXECUTION
# ================================
async def main():
    print("🎬 AI Manga Recap Generator - Professional Edition")
    print("=" * 60)

    # Setup
    device = setup_environment()

    # Check if PDF exists
    if not os.path.exists(PDF_PATH):
        print(f"❌ Please upload your PDF to: {PDF_PATH}")
        return

    try:
        # Step 1: Extract images
        print("\n📖 Step 1: Extracting PDF pages...")
        images = extract_pdf_images(PDF_PATH, max_pages=4)
        print(f"✅ Extracted {len(images)} pages")

        # Step 2: OCR
        print("\n🔍 Step 2: Advanced OCR processing...")
        raw_text = advanced_ocr(images)
        print(f"✅ Extracted {len(raw_text)} characters of text")

        if len(raw_text) < 20:
            print("⚠️ Limited text found. Using fallback content.")

        # Step 3: Generate content
        print("\n🤖 Step 3: AI content generation...")
        generator = ContentGenerator(device)
        hindi_script = generator.generate_engaging_script(raw_text)
        print(f"✅ Generated script: {len(hindi_script)} characters")
        print(f"Preview: {hindi_script[:100]}...")

        # Step 4: Create audio
        print("\n🎙️ Step 4: High-quality TTS generation...")
        audio_path = os.path.join(OUTPUT_DIR, "narration.wav")
        await create_high_quality_audio(hindi_script, audio_path)

        # Step 5: Create video
        print("\n🎬 Step 5: Professional video creation...")
        video_path = os.path.join(OUTPUT_DIR, f"manga_recap_chapter_{CHAPTER}.mp4")
        create_professional_video(images, audio_path, video_path)

        # Summary
        print("\n" + "=" * 60)
        print("🎉 SUCCESS! Video generation complete!")
        print(f"📁 Output directory: {OUTPUT_DIR}")
        print(f"🎥 Video file: {video_path}")
        print(f"📝 Script file: {os.path.join(OUTPUT_DIR, 'generated_script.txt')}")
        print(f"🔊 Audio file: {audio_path}")

        # File size info
        if os.path.exists(video_path):
            size_mb = os.path.getsize(video_path) / (1024 * 1024)
            print(f"📊 Video size: {size_mb:.2f} MB")

        print("🚀 Ready for YouTube upload!")

    except Exception as e:
        print(f"❌ Error during processing: {e}")
        import traceback
        traceback.print_exc()

# ================================
# 🏃 RUN
# ================================
if __name__ == "__main__":
    # Run the async main function
    asyncio.run(main())

🎬 AI Manga Recap Generator - Professional Edition
🚀 Using device: cuda

📖 Step 1: Extracting PDF pages...
⏩ Skipped extraction, using cached: page 1
⏩ Skipped extraction, using cached: page 2
⏩ Skipped extraction, using cached: page 3
⏩ Skipped extraction, using cached: page 4
✅ Extracted 4 pages

🔍 Step 2: Advanced OCR processing...
🔎 Processing OCR for page 1...
⚠️ Resized page 1 for OCR: 2316x43197 → 107x2000
📄 Page 1: 5 characters extracted
🔎 Processing OCR for page 2...
⚠️ Resized page 2 for OCR: 2400x42645 → 112x2000
📄 Page 2: 12 characters extracted
🔎 Processing OCR for page 3...
⚠️ Resized page 3 for OCR: 2400x15801 → 303x2000
📄 Page 3: 0 characters extracted
🔎 Processing OCR for page 4...
⚠️ Resized page 4 for OCR: 2304x43197 → 106x2000
📄 Page 4: 10 characters extracted
✅ Extracted 30 characters of text

🤖 Step 3: AI content generation...
🤖 Loading AI models...


Device set to use cuda:0


✅ AI models loaded successfully!
✅ Generated script: 57 characters
Preview: इस अध्याय में एक रोमांचक कहानी है जो आपको बहुत पसंद आएगी।...

🎙️ Step 4: High-quality TTS generation...
🎙️ Generating audio with voice: hi-IN-MadhurNeural
✅ Audio saved: output/narration.wav

🎬 Step 5: Professional video creation...
🎬 Creating professional video...
📸 Processing image 1/4
📸 Processing image 2/4
📸 Processing image 3/4
📸 Processing image 4/4
Moviepy - Building video output/manga_recap_chapter_1.mp4.
MoviePy - Writing audio in output/manga_recap_chapter_1.mp4_temp_audio.m4a




MoviePy - Done.
Moviepy - Writing video output/manga_recap_chapter_1.mp4





Moviepy - Done !
Moviepy - video ready output/manga_recap_chapter_1.mp4
✅ Professional video created: output/manga_recap_chapter_1.mp4

🎉 SUCCESS! Video generation complete!
📁 Output directory: output
🎥 Video file: output/manga_recap_chapter_1.mp4
📝 Script file: output/generated_script.txt
🔊 Audio file: output/narration.wav
📊 Video size: 0.46 MB
🚀 Ready for YouTube upload!


In [10]:
#!/usr/bin/env python3
"""
Enhanced AI Manga Recap Generator (Colab-Optimized for T4 GPU)
- Extracts high-quality pages from PDF
- Advanced OCR with noise reduction
- Better AI models for content generation
- High-quality TTS with multiple voices
- Professional video generation with effects
"""

# ================================
# ✅ Install dependencies (run this cell first)
# ================================
"""
!pip install -q moviepy pillow easyocr PyMuPDF transformers torch accelerate
!pip install -q edge-tts asyncio nest-asyncio
!pip install -q opencv-python numpy scikit-image
!pip install -q sentence-transformers
"""

import os
import sys
import asyncio
import nest_asyncio
import fitz  # PyMuPDF
import easyocr
import edge_tts
import moviepy.editor as mp
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
from PIL import Image, ImageEnhance, ImageFilter
import cv2
import numpy as np
from sentence_transformers import SentenceTransformer
import torch
import re
import json

# Enable nested asyncio for Colab
nest_asyncio.apply()

# ================================
# 🎯 ENHANCED CONFIG
# ================================
PDF_PATH = "/content/Ch_199_Side_Story_20.pdf"   # Upload your PDF in Colab
OUTPUT_DIR = "output"
CHAPTER = 1

# AI Models (optimized for T4)
SUMMARIZATION_MODEL = "facebook/bart-large-cnn"  # Better summarization
TRANSLATION_MODEL = "Helsinki-NLP/opus-mt-en-hi"  # English to Hindi

# Voice settings (Edge TTS - high quality, free)
VOICE_OPTIONS = [
    "hi-IN-MadhurNeural",     # Male, clear
    "hi-IN-SwaraNeural",      # Female, expressive
    "hi-IN-AnanyaNeural"      # Female, warm
]
SELECTED_VOICE = VOICE_OPTIONS[0]  # Change index for different voice

# Video settings
VIDEO_WIDTH = 1920
VIDEO_HEIGHT = 1080
FPS = 30
BITRATE = "8000k"  # High quality for YouTube

# ================================
# 🔧 UTILITY FUNCTIONS
# ================================
def setup_environment():
    """Setup GPU and create directories"""
    os.makedirs(OUTPUT_DIR, exist_ok=True)
    device = "cuda" if torch.cuda.is_available() else "cpu"
    print(f"🚀 Using device: {device}")
    return device

def preprocess_image(img_path):
    """Enhance image quality for better OCR"""
    img = cv2.imread(img_path)

    # Convert to grayscale
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

    # Denoise
    denoised = cv2.fastNlMeansDenoising(gray)

    # Enhance contrast
    clahe = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8,8))
    enhanced = clahe.apply(denoised)

    # Sharpen
    kernel = np.array([[-1,-1,-1], [-1,9,-1], [-1,-1,-1]])
    sharpened = cv2.filter2D(enhanced, -1, kernel)

    # Save enhanced image
    enhanced_path = img_path.replace('.jpg', '_enhanced.jpg')
    cv2.imwrite(enhanced_path, sharpened)
    return enhanced_path

# ================================
# 📖 PDF EXTRACTION (ENHANCED)
# ================================
def extract_pdf_images(pdf_path, max_pages=5):
    """Extract high-quality images from PDF"""
    if not os.path.exists(pdf_path):
        raise FileNotFoundError(f"PDF not found: {pdf_path}")

    doc = fitz.open(pdf_path)
    images = []

    for i in range(min(len(doc), max_pages)):
        page = doc.load_page(i)

        # High-resolution matrix for better quality
        mat = fitz.Matrix(3.0, 3.0)  # 3x zoom for crisp images
        pix = page.get_pixmap(matrix=mat, alpha=False)

        img_path = os.path.join(OUTPUT_DIR, f"page_{i+1}.jpg")
        pix.save(img_path)

        # Enhance image for OCR
        enhanced_path = preprocess_image(img_path)
        images.append(enhanced_path)

        print(f"✅ Extracted page {i+1}")

    doc.close()
    return images

# ================================
# 🔍 ADVANCED OCR
# ================================
def advanced_ocr(images):
    """Enhanced OCR with text cleaning"""
    reader = easyocr.Reader(['en', 'hi'], gpu=torch.cuda.is_available())
    all_text = []

    for idx, img in enumerate(images):
        print(f"🔎 Processing OCR for page {idx+1}...")

        # Get OCR results with confidence scores
        results = reader.readtext(img, detail=1, paragraph=True)

        # Filter by confidence threshold
        filtered_text = []
        for (bbox, text, confidence) in results:
            if confidence > 0.5:  # Only high-confidence text
                # Clean text
                clean_text = re.sub(r'[^\w\s\u0900-\u097F]', ' ', text)
                clean_text = re.sub(r'\s+', ' ', clean_text).strip()
                if len(clean_text) > 2:  # Skip very short text
                    filtered_text.append(clean_text)

        page_text = " ".join(filtered_text)
        all_text.append(page_text)
        print(f"📄 Page {idx+1}: {len(page_text)} characters extracted")

    combined_text = " ".join(all_text)

    # Save extracted text for debugging
    with open(os.path.join(OUTPUT_DIR, "extracted_text.txt"), "w", encoding="utf-8") as f:
        f.write(combined_text)

    return combined_text

# ================================
# 🤖 ENHANCED AI CONTENT GENERATION
# ================================
class ContentGenerator:
    def __init__(self, device):
        self.device = device
        print("🤖 Loading AI models...")

        # Summarization model
        self.summarizer = pipeline(
            "summarization",
            model=SUMMARIZATION_MODEL,
            device=0 if device == "cuda" else -1,
            torch_dtype=torch.float16 if device == "cuda" else torch.float32
        )

        # Translation model
        self.translator_tokenizer = AutoTokenizer.from_pretrained(TRANSLATION_MODEL)
        self.translator_model = AutoModelForSeq2SeqLM.from_pretrained(TRANSLATION_MODEL)

        if device == "cuda":
            self.translator_model = self.translator_model.half().cuda()

        print("✅ AI models loaded successfully!")

    def generate_engaging_script(self, raw_text):
        """Generate engaging Hindi narration"""
        if not raw_text or len(raw_text) < 50:
            return "इस अध्याय में एक रोमांचक कहानी है जो आपको बहुत पसंद आएगी।"

        try:
            # Step 1: Summarize in English (better quality)
            print("📝 Generating summary...")

            # Split text if too long
            max_chunk = 1000
            if len(raw_text) > max_chunk:
                chunks = [raw_text[i:i+max_chunk] for i in range(0, len(raw_text), max_chunk)]
                summaries = []
                for chunk in chunks:
                    if len(chunk) > 50:
                        summary = self.summarizer(
                            chunk,
                            max_length=100,
                            min_length=30,
                            do_sample=False
                        )[0]['summary_text']
                        summaries.append(summary)
                english_summary = " ".join(summaries)
            else:
                english_summary = self.summarizer(
                    raw_text,
                    max_length=150,
                    min_length=40,
                    do_sample=False
                )[0]['summary_text']

            # Step 2: Enhance for storytelling
            storytelling_prompt = f"""
            Transform this manga summary into an engaging Hindi narration for YouTube:
            {english_summary}

            Make it dramatic and exciting for viewers.
            """

            # Step 3: Translate to Hindi
            print("🌍 Translating to Hindi...")
            inputs = self.translator_tokenizer(
                english_summary,
                return_tensors="pt",
                padding=True,
                truncation=True,
                max_length=512
            )

            if self.device == "cuda":
                inputs = {k: v.cuda() for k, v in inputs.items()}

            with torch.no_grad():
                outputs = self.translator_model.generate(
                    **inputs,
                    max_length=200,
                    num_beams=4,
                    temperature=0.7,
                    do_sample=True
                )

            hindi_text = self.translator_tokenizer.decode(outputs[0], skip_special_tokens=True)

            # Step 4: Add engaging elements
            hindi_text = self.enhance_narration(hindi_text)

            # Save generated script
            with open(os.path.join(OUTPUT_DIR, "generated_script.txt"), "w", encoding="utf-8") as f:
                f.write(f"English Summary:\n{english_summary}\n\nHindi Script:\n{hindi_text}")

            return hindi_text

        except Exception as e:
            print(f"⚠️ AI generation failed: {e}")
            return "इस अध्याय में बहुत ही रोमांचक घटनाएं हैं जो आपको हैरान कर देंगी। कहानी में नए मोड़ आते हैं और किरदार अपनी यात्रा में आगे बढ़ते हैं।"

    def enhance_narration(self, text):
        """Add dramatic elements to Hindi narration"""
        # Add pauses and emphasis
        text = text.replace("।", "... ")
        text = text.replace(",", "... ")

        # Add engaging intro/outro
        intros = [
            "आज के इस रोमांचक अध्याय में... ",
            "इस शानदार कहानी में आगे... ",
            "अब देखते हैं क्या होता है... "
        ]

        outros = [
            "... यह कहानी कितनी दिलचस्प है!",
            "... आगे और भी रोमांच का इंतजार है!",
            "... यह अध्याय वाकई में शानदार था!"
        ]

        enhanced = f"{intros[0]}{text}{outros[0]}"
        return enhanced

# ================================
# 🎙️ HIGH-QUALITY TTS
# ================================
async def create_high_quality_audio(text, output_path):
    """Generate high-quality audio using Edge TTS"""
    if not text.strip():
        text = "यह अध्याय बहुत ही रोमांचक और दिलचस्प है।"

    print(f"🎙️ Generating audio with voice: {SELECTED_VOICE}")

    # Configure speech parameters for better quality
    communicate = edge_tts.Communicate(
        text=text,
        voice=SELECTED_VOICE,
        rate="-10%",  # Slightly slower for clarity
        volume="+0%",
        pitch="+0Hz"
    )

    await communicate.save(output_path)
    print(f"✅ Audio saved: {output_path}")
    return output_path

# ================================
# 🎬 PROFESSIONAL VIDEO CREATION
# ================================
def create_professional_video(images, audio_file, output_file):
    """Create high-quality video with effects"""
    print("🎬 Creating professional video...")

    # Load audio
    audio_clip = mp.AudioFileClip(audio_file)
    total_duration = audio_clip.duration

    # Calculate timing
    num_images = len(images)
    base_duration = total_duration / num_images

    clips = []
    for i, img_path in enumerate(images):
        print(f"📸 Processing image {i+1}/{num_images}")

        # Create image clip
        img_clip = mp.ImageClip(img_path, duration=base_duration)

        # Resize and pad to exact dimensions
        img_clip = img_clip.resize(height=VIDEO_HEIGHT)

        if img_clip.w > VIDEO_WIDTH:
            img_clip = img_clip.resize(width=VIDEO_WIDTH)

        # Center the image with black padding
        img_clip = img_clip.on_color(
            size=(VIDEO_WIDTH, VIDEO_HEIGHT),
            color=(0, 0, 0),
            pos='center'
        )

        # Add subtle zoom effect
        def zoom_effect(get_frame, t):
            frame = get_frame(t)
            zoom_factor = 1 + (t / base_duration) * 0.1  # 10% zoom over duration
            h, w = frame.shape[:2]
            new_h, new_w = int(h * zoom_factor), int(w * zoom_factor)

            if new_h > h and new_w > w:
                frame = cv2.resize(frame, (new_w, new_h))
                start_x = (new_w - w) // 2
                start_y = (new_h - h) // 2
                frame = frame[start_y:start_y+h, start_x:start_x+w]

            return frame

        img_clip = img_clip.fl(zoom_effect)

        # Add fade transitions (except for first and last)
        if i > 0:
            img_clip = img_clip.fadein(0.5)
        if i < num_images - 1:
            img_clip = img_clip.fadeout(0.5)

        clips.append(img_clip)

    # Concatenate clips
    final_video = mp.concatenate_videoclips(clips, method="compose")

    # Add audio
    final_video = final_video.set_audio(audio_clip)

    # Export with high quality settings
    final_video.write_videofile(
        output_file,
        fps=FPS,
        codec='libx264',
        audio_codec='aac',
        temp_audiofile=f"{output_file}_temp_audio.m4a",
        remove_temp=True,
        bitrate=BITRATE,
        preset='medium',  # Good balance of quality and encoding speed
        ffmpeg_params=[
            '-crf', '18',  # High quality (lower = better quality)
            '-profile:v', 'high',
            '-level', '4.0',
            '-pix_fmt', 'yuv420p'  # YouTube compatibility
        ],
        threads=4
    )

    print(f"✅ Professional video created: {output_file}")
    return output_file

# ================================
# 🚀 MAIN EXECUTION
# ================================
async def main():
    print("🎬 AI Manga Recap Generator - Professional Edition")
    print("=" * 60)

    # Setup
    device = setup_environment()

    # Check if PDF exists
    if not os.path.exists(PDF_PATH):
        print(f"❌ Please upload your PDF to: {PDF_PATH}")
        return

    try:
        # Step 1: Extract images
        print("\n📖 Step 1: Extracting PDF pages...")
        images = extract_pdf_images(PDF_PATH, max_pages=6)  # More pages for longer content
        print(f"✅ Extracted {len(images)} pages")

        # Step 2: OCR
        print("\n🔍 Step 2: Advanced OCR processing...")
        raw_text = advanced_ocr(images)
        print(f"✅ Extracted {len(raw_text)} characters of text")

        if len(raw_text) < 20:
            print("⚠️ Limited text found. Using fallback content.")

        # Step 3: Generate content
        print("\n🤖 Step 3: AI content generation...")
        generator = ContentGenerator(device)
        hindi_script = generator.generate_engaging_script(raw_text)
        print(f"✅ Generated script: {len(hindi_script)} characters")
        print(f"Preview: {hindi_script[:100]}...")

        # Step 4: Create audio
        print("\n🎙️ Step 4: High-quality TTS generation...")
        audio_path = os.path.join(OUTPUT_DIR, "narration.wav")
        await create_high_quality_audio(hindi_script, audio_path)

        # Step 5: Create video
        print("\n🎬 Step 5: Professional video creation...")
        video_path = os.path.join(OUTPUT_DIR, f"manga_recap_chapter_{CHAPTER}.mp4")
        create_professional_video(images, audio_path, video_path)

        # Summary
        print("\n" + "=" * 60)
        print("🎉 SUCCESS! Video generation complete!")
        print(f"📁 Output directory: {OUTPUT_DIR}")
        print(f"🎥 Video file: {video_path}")
        print(f"📝 Script file: {os.path.join(OUTPUT_DIR, 'generated_script.txt')}")
        print(f"🔊 Audio file: {audio_path}")

        # File size info
        if os.path.exists(video_path):
            size_mb = os.path.getsize(video_path) / (1024 * 1024)
            print(f"📊 Video size: {size_mb:.2f} MB")

        print("🚀 Ready for YouTube upload!")

    except Exception as e:
        print(f"❌ Error during processing: {e}")
        import traceback
        traceback.print_exc()

# ================================
# 🏃 RUN
# ================================
if __name__ == "__main__":
    # Run the async main function
    asyncio.run(main())

🎬 AI Manga Recap Generator - Professional Edition
🚀 Using device: cuda
❌ Please upload your PDF to: /content/Ch_199_Side_Story_20.pdf
