In [None]:
import cv2
import torch
from transformers import LlavaProcessor, LlavaForConditionalGeneration, AutoTokenizer, AutoModelForCausalLM
from PIL import Image
import os
import json
import re
from collections import defaultdict

# ==============================================================================
# 1. CONFIGURATION
# ==============================================================================
# Input video to analyze.
VIDEO_PATH = "/content/test chunk.mp4" # IMPORTANT: Update this path to your video file
# Hugging Face access token; passed to login() and to every from_pretrained() call.
# NOTE(review): avoid committing a real token here -- consider reading it from the environment.
HF_TOKEN = ""  # Your Hugging Face Token
# NOTE(review): DESCRIPTION_FILE is not referenced by any code visible in this file.
DESCRIPTION_FILE = "video_description.txt"
# JSON file that main() writes the final recommendation to.
OUTPUT_FILE = "modifications.json"
# Vision-language checkpoint used by VideoAnalyzer to describe the video.
LLAVA_MODEL = "llava-hf/llava-1.5-7b-hf"
# Causal language model checkpoint used by EnhancementRecommender.
RECOMMENDER_MODEL = "microsoft/DialoGPT-large"

# ==============================================================================
# 2. KNOWLEDGE DATABASES (Filters & Transitions)
# ==============================================================================
# Catalogue of available transitions, keyed by display name. Each entry holds:
#   "tags"          - style keywords; recommend_detailed_transition() scores on these
#                     ("high-energy" for fast pacing, "gentle" for slow pacing)
#   "seo_match"     - narrative/tone keywords the transition suits
#   "geo_match"     - motion keywords the transition suits ("any" presumably means no restriction)
#   "justification" - human-readable rationale
# NOTE(review): only "tags" is read by the code visible in this file.
TRANSITIONS_DB = {
    "Cut": {"tags": ["standard"], "seo_match": ["continuation"], "geo_match": ["any"], "justification": "A standard, instant change for maintaining pace."},
    "Cross Dissolve": {"tags": ["gentle", "emotional"], "seo_match": ["time_passage", "reflective"], "geo_match": ["low_motion"], "justification": "A soft fade, ideal for showing passage of time."},
    "Wipe (Left to Right)": {"tags": ["stylized", "clean"], "seo_match": ["location_change", "progression"], "geo_match": ["any"], "justification": "A clean transition for showing a clear progression or scene change."},
    "Glitch": {"tags": ["high-energy", "modern", "abrupt"], "seo_match": ["tense", "action"], "geo_match": ["high_motion"], "justification": "A modern, chaotic effect for high-energy or tense sequences."},
    "Fade to Black": {"tags": ["dramatic", "ending"], "seo_match": ["end_of_act", "dramatic_pause"], "geo_match": ["any"], "justification": "A powerful transition to signify an end or major dramatic shift."}
}

FILTERS_DB = {
    # Keyed by lowercase filter name. "geo_match" is optional (only "motion blur" defines it).
    "grayscale": {"tags": ["color_removal", "moody"], "seo_match": ["serious", "dramatic", "flashback", "historical"], "justification": "Enhances a serious or dramatic tone by removing color."},
    "invert colors": {"tags": ["stylized", "surreal"], "seo_match": ["dream_sequence", "psychedelic", "shock_effect"], "justification": "Creates a surreal or shocking effect by inverting the color palette."},
    "sepia": {"tags": ["warm", "vintage"], "seo_match": ["nostalgic", "historical", "memory", "flashback"], "justification": "Gives a warm, aged look, classic for flashbacks or historical scenes."},
    "pencil sketch": {"tags": ["artistic", "soft"], "seo_match": ["creative", "playful", "dream_sequence"], "justification": "Transforms the video into an artistic pencil sketch animation."},
    "cartoon effect": {"tags": ["artistic", "playful"], "seo_match": ["humorous", "lighthearted", "kids_content"], "justification": "Adds a fun, playful aesthetic by mimicking a cartoon style."},
    "color tint": {"tags": ["moody", "color_shift"], "seo_match": ["cold", "warm", "danger", "romantic"], "justification": "Sets a specific mood by overlaying a color (e.g., blue for cold, red for danger)."},
    "edge detection": {"tags": ["technical", "abstract"], "seo_match": ["surveillance", "tech_theme", "abstract"], "justification": "Highlights edges for a technical or abstract visual style."},
    "sobel x": {"tags": ["technical", "directional"], "seo_match": ["tech_theme", "data_visualization"], "justification": "Detects vertical edges, useful for technical or abstract sequences."},
    "sobel y": {"tags": ["technical", "directional"], "seo_match": ["tech_theme", "data_visualization"], "justification": "Detects horizontal edges, useful for technical or abstract sequences."},
    "laplacian": {"tags": ["technical", "high_contrast"], "seo_match": ["tech_theme", "abstract"], "justification": "A stronger edge detection method that finds edges in all directions."},
    "gaussian blur": {"tags": ["blur", "soften"], "seo_match": ["dream_sequence", "obscure_identity", "focus_shift"], "justification": "A standard blur to soften the image, create a dreamy look, or obscure details."},
    "median blur": {"tags": ["blur", "noise_reduction"], "seo_match": ["corrective", "clean_up"], "justification": "Effectively removes salt-and-pepper noise while preserving edges."},
    "bilateral filter": {"tags": ["blur", "edge_preserving"], "seo_match": ["corrective", "stylized_softness"], "justification": "Blurs the image while keeping edges sharp, good for a soft but clear look."},
    "motion blur": {"tags": ["blur", "high-energy"], "geo_match": ["high_motion", "action"], "seo_match": ["action", "speed"], "justification": "Adds blur to moving objects to emphasize speed and action."},
    "emboss filter": {"tags": ["stylized", "3d_effect"], "seo_match": ["artistic", "historical", "stone_carving_effect"], "justification": "Creates a 3D embossed or chiseled effect on the image."},
    "sharpen": {"tags": ["corrective", "clarity"], "seo_match": ["focus_enhancement", "detail_highlight"], "justification": "Increases the clarity and sharpness of the image to highlight details."},
    "hsv filter": {"tags": ["color_manipulation", "vibrant"], "seo_match": ["music_video", "surreal", "celebration"], "justification": "Manipulates Hue, Saturation, and Value for vibrant, custom color effects."},
    "negative hsv filter": {"tags": ["color_manipulation", "surreal"], "seo_match": ["psychedelic", "dream_sequence"], "justification": "Inverts the HSV values for a surreal and psychedelic color effect."},
    "thresholding": {"tags": ["binary", "high_contrast", "stark"], "seo_match": ["noir_style", "graphic_novel_look"], "justification": "Converts the image to pure black and white, creating a stark, high-contrast look."},
    "adaptive thresholding": {"tags": ["binary", "detailed"], "seo_match": ["noir_style", "technical"], "justification": "A more advanced thresholding that preserves details in varying light conditions."},
    "dilation": {"tags": ["binary", "bold"], "seo_match": ["abstract", "graphic_effect"], "justification": "Thickens the white areas of an image, often used for bold graphic or text effects."}
}

# ==============================================================================
# 3. VIDEO-TO-TEXT ANALYSIS (from Notebook)
# ==============================================================================
class VideoAnalyzer:
    """Produce a natural-language description of a video with a LLaVA model.

    Frames are sampled from the video with OpenCV, a handful of them are sent
    to the vision-language model together with a text prompt, and the model's
    answer is returned as the description.
    """

    def __init__(self, model_name=LLAVA_MODEL, hf_token=None):
        """Load the processor and the model weights.

        Args:
            model_name: Hugging Face checkpoint id of the LLaVA model.
            hf_token: Optional Hugging Face access token forwarded to
                ``from_pretrained``.
        """
        use_cuda = torch.cuda.is_available()
        self.model_name = model_name
        self.processor = LlavaProcessor.from_pretrained(model_name, token=hf_token)
        self.model = LlavaForConditionalGeneration.from_pretrained(
            model_name,
            token=hf_token,
            # Half precision on GPU, full precision on CPU.
            torch_dtype=torch.float16 if use_cuda else torch.float32,
            device_map="auto" if use_cuda else None,
            low_cpu_mem_usage=True
        )

    def extract_frames(self, video_path, frames_per_second=1):
        """Sample frames from a video at roughly ``frames_per_second``.

        Args:
            video_path: Path of the video file to read.
            frames_per_second: Desired sampling rate; must be positive.

        Returns:
            A list of RGB ``PIL.Image`` frames in playback order (possibly
            empty if the file contains no decodable frames).

        Raises:
            ValueError: If ``frames_per_second`` is not positive or the video
                cannot be opened.
        """
        if frames_per_second <= 0:
            raise ValueError(f"frames_per_second must be positive, got {frames_per_second}")

        cap = cv2.VideoCapture(video_path)
        if not cap.isOpened():
            cap.release()
            raise ValueError(f"Could not open video file: {video_path}")

        frames = []
        try:
            fps = cap.get(cv2.CAP_PROP_FPS)
            # Clamp to 1: when the requested rate exceeds the video's FPS the
            # integer division yields 0, which would break the modulo below.
            frame_interval = max(1, int(fps / frames_per_second)) if fps > 0 else 1
            frame_count = 0

            while True:
                ret, frame = cap.read()
                if not ret:
                    break
                if frame_count % frame_interval == 0:
                    # OpenCV decodes to BGR; PIL expects RGB.
                    frames.append(Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)))
                frame_count += 1
        finally:
            # Always free the capture handle, even if decoding raised.
            cap.release()
        return frames

    def get_video_description(self, video_path, frames_per_second=1):
        """Describe what happens in the video.

        Up to four sampled frames are all sent to the model; for longer
        samples only the first, middle and last frame are used.

        Args:
            video_path: Path of the video file to describe.
            frames_per_second: Sampling rate passed to ``extract_frames``.

        Returns:
            The model's description, or the string
            ``"Error: No frames could be extracted from video"`` when no frame
            could be read.

        Raises:
            ValueError: Propagated from ``extract_frames``.
        """
        frames = self.extract_frames(video_path, frames_per_second)
        if not frames:
            return "Error: No frames could be extracted from video"

        # The prompt separates the image placeholders from the instruction with
        # a real newline (the original used "\\n", i.e. a literal backslash-n).
        if len(frames) <= 4:
            images = frames
            image_tags = "<image>" * len(frames)
            prompt = f"USER: {image_tags}\nAnalyze these video frames in chronological sequence and provide a comprehensive description of what happens throughout the entire video.\nASSISTANT:"
        else:
            images = [frames[0], frames[len(frames) // 2], frames[-1]]
            prompt = "USER: <image><image><image>\nAnalyze these video frames (beginning, middle, end) and provide a comprehensive description of what happens throughout the entire video.\nASSISTANT:"

        inputs = self.processor(text=prompt, images=images, return_tensors="pt")
        if torch.cuda.is_available():
            inputs = {k: v.to(self.model.device) for k, v in inputs.items()}

        with torch.no_grad():
            outputs = self.model.generate(**inputs, max_new_tokens=300)
        generated_text = self.processor.decode(outputs[0], skip_special_tokens=True)
        # The decoded text echoes the prompt; keep only the assistant's answer.
        return generated_text.split("ASSISTANT:")[-1].strip()

# ==============================================================================
# 4. RECOMMENDATION ENGINE
# ==============================================================================
class EnhancementRecommender:
    """
    Combines high-level decision making with detailed recommendation logic.

    A causal language model is asked (a) to extract structured properties from
    the video description and (b) to choose between a filter and a transition.
    Rule-based scoring against FILTERS_DB / TRANSITIONS_DB then picks the
    concrete effect.
    """

    def __init__(self, model_name=RECOMMENDER_MODEL, hf_token=None):
        """Load the tokenizer and language model.

        Args:
            model_name: Hugging Face checkpoint id of the causal LM.
            hf_token: Optional Hugging Face access token forwarded to
                ``from_pretrained``.
        """
        use_cuda = torch.cuda.is_available()
        self.tokenizer = AutoTokenizer.from_pretrained(model_name, token=hf_token)
        self.model = AutoModelForCausalLM.from_pretrained(
            model_name,
            token=hf_token,
            torch_dtype=torch.float16 if use_cuda else torch.float32,
            device_map="auto" if use_cuda else None
        )
        # Fall back to EOS for padding when the tokenizer defines no pad token.
        if self.tokenizer.pad_token is None:
            self.tokenizer.pad_token = self.tokenizer.eos_token

    def _generate_continuation(self, prompt, max_new_tokens):
        """Return only the text generated after ``prompt``.

        ``generate`` returns the prompt tokens followed by the new tokens, so
        the prompt is sliced off before decoding. Without this, callers would
        end up matching keywords and JSON that they wrote into the prompt.
        """
        inputs = self.tokenizer(prompt, return_tensors="pt").to(self.model.device)
        with torch.no_grad():
            outputs = self.model.generate(
                **inputs, max_new_tokens=max_new_tokens, pad_token_id=self.tokenizer.eos_token_id
            )
        prompt_length = inputs["input_ids"].shape[1]
        return self.tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)

    @staticmethod
    def _fallback_properties(scores):
        """Derive properties from the motion score alone (used when the LLM output is unusable)."""
        low_motion = scores['motion_score'] < 0.5
        return {
            "seo": {"tone": "neutral", "pacing": "slow" if low_motion else "fast", "narrative": "none"},
            "geo": {"motion": "low" if low_motion else "high"}
        }

    def analyze_properties_from_description(self, description, scores):
        """Use the LLM to extract structured SEO/GEO properties from raw text.

        Args:
            description: Free-text description of the video.
            scores: Mapping that must contain ``'motion_score'``.

        Returns:
            A dict with ``"seo"`` and ``"geo"`` sub-dicts. When the model does
            not produce a JSON object of that shape, a fallback derived from
            the motion score is returned instead.
        """
        prompt = f"""
        Analyze the following video description and attention scores.
        Extract the key properties: tone, pacing, narrative, and motion.
        - tone: Choose one from [serious, humorous, nostalgic, tense, action, lighthearted, dramatic, reflective].
        - pacing: Choose one from [slow, fast].
        - narrative: Choose one from [flashback, progression, story_moment, none].
        - motion: Choose one from [low, high].

        Description: "{description}"
        Motion Score: {scores['motion_score']:.2f}

        Based on this, output ONLY a JSON object with the keys "seo" and "geo".
        Example: {{"seo": {{"tone": "action", "pacing": "fast", "narrative": "story_moment"}}, "geo": {{"motion": "high"}}}}

        JSON:
        """
        generated_text = self._generate_continuation(prompt, max_new_tokens=100)

        # Extract the outermost {...} span from the model's own output.
        start = generated_text.find('{')
        end = generated_text.rfind('}')
        json_str = generated_text[start:end + 1] if 0 <= start < end else ""

        try:
            parsed = json.loads(json_str)
        except json.JSONDecodeError:
            parsed = None

        # Accept the result only if it has the shape the recommenders rely on.
        if (
            isinstance(parsed, dict)
            and isinstance(parsed.get("seo"), dict)
            and isinstance(parsed.get("geo"), dict)
        ):
            return parsed

        print("Warning: LLM failed to generate valid JSON for properties. Using fallback.")
        return self._fallback_properties(scores)

    def get_high_level_recommendation(self, video_description, attention_scores):
        """Decide between 'filter' and 'transition'.

        Args:
            video_description: Free-text description of the video.
            attention_scores: Mapping with ``'emotion_score'``,
                ``'motion_score'`` and ``'overall_attention'``.

        Returns:
            ``"transition"`` or ``"filter"``. The model's answer is used when
            it contains one of those words; otherwise the choice falls back to
            a threshold rule on the motion and overall-attention scores.
        """
        prompt = f"""You are an AI video editor. Based on the video description and attention scores, should you recommend a TRANSITION to improve flow or a FILTER to improve mood?

        VIDEO DESCRIPTION: {video_description}
        ATTENTION SCORES:
        - Emotion Score: {attention_scores['emotion_score']:.2f}
        - Motion Score: {attention_scores['motion_score']:.2f}
        - Overall Attention: {attention_scores['overall_attention']:.2f}

        DECISION: If motion or overall attention is low, a TRANSITION is better. If emotion is low, a FILTER is better.

        RECOMMENDATION (one word: transition or filter):"""

        # Only the continuation is inspected: the prompt itself contains both
        # keywords, so searching the full decoded text would always match.
        generated_text = self._generate_continuation(prompt, max_new_tokens=5).lower()

        if "transition" in generated_text:
            return "transition"
        if "filter" in generated_text:
            return "filter"
        # Fallback logic from notebook
        if attention_scores['motion_score'] < 0.5 or attention_scores['overall_attention'] < 0.5:
            return "transition"
        return "filter"

    def recommend_detailed_filter(self, properties, chunk_id=0):
        """Generate a specific filter recommendation.

        Each filter in FILTERS_DB is scored: +2 when the tone is listed in its
        ``seo_match``, +3 when the narrative is, +1 when the motion level is
        listed in its ``geo_match``.

        Args:
            properties: Dict with ``"seo"`` (tone, narrative) and ``"geo"``
                (motion) sub-dicts; missing keys simply score nothing.
            chunk_id: Index of the chunk the recommendation applies to.

        Returns:
            A recommendation dict, or ``None`` when no filter scored.
        """
        seo = properties.get('seo', {})
        geo = properties.get('geo', {})
        tone = seo.get('tone')
        narrative = seo.get('narrative')
        motion = geo.get('motion')
        # The database spells motion levels as "high_motion"/"low_motion",
        # while the extracted property is just "high"/"low"; accept both.
        motion_keys = {motion, f"{motion}_motion"} if motion else set()

        scores = defaultdict(int)
        for name, props in FILTERS_DB.items():
            seo_match = props.get('seo_match', [])
            if tone in seo_match:
                scores[name] += 2
            if narrative in seo_match:
                scores[name] += 3
            if motion_keys.intersection(props.get('geo_match', [])):
                scores[name] += 1

        if not scores:
            return None
        best_name = max(scores, key=scores.get)
        reason = f"The video has a {tone} tone and {motion} motion. The '{best_name}' filter enhances this."
        return {"chunk": chunk_id, "type": "filter", "name": best_name, "reason": reason}

    def recommend_detailed_transition(self, properties, chunk_id=0):
        """Generate a specific transition recommendation.

        Fast pacing favours transitions tagged "high-energy", slow pacing
        favours those tagged "gentle".

        Args:
            properties: Dict with a ``"seo"`` sub-dict holding ``"pacing"``.
            chunk_id: Index of the chunk the transition starts from; the
                target is ``chunk_id + 1``.

        Returns:
            A recommendation dict, or ``None`` when no transition scored.
        """
        # For a single video, we assume a transition would be for a conceptual cut.
        # This logic is simplified; in a multi-chunk system, it would compare two chunks.
        pacing = properties.get('seo', {}).get('pacing')
        wanted_tag = {"fast": "high-energy", "slow": "gentle"}.get(pacing)

        scores = defaultdict(int)
        if wanted_tag is not None:
            for name, props in TRANSITIONS_DB.items():
                if wanted_tag in props['tags']:
                    scores[name] += 2

        if not scores:
            return None
        best_name = max(scores, key=scores.get)
        reason = f"The video has a {pacing} pace. A '{best_name}' transition would match this energy."
        return {"chunk": chunk_id, "type": "transition", "target_chunk": chunk_id + 1, "name": best_name, "reason": reason}

# ==============================================================================
# 5. MAIN EXECUTION
# ==============================================================================
def main():
    """End-to-end recommendation pipeline.

    Describes the video at VIDEO_PATH, extracts structured properties from the
    description, chooses between a filter and a transition, and writes the
    detailed recommendation to OUTPUT_FILE as JSON. Problems are reported on
    stdout and end the run early; nothing is returned.
    """
    import gc

    print("--- STEP 1: Analyzing video to generate description ---")
    if not os.path.exists(VIDEO_PATH):
        print(f"Error: Video file not found at '{VIDEO_PATH}'. Please update the path.")
        return

    # Treat an empty token as "no token" so an unset HF_TOKEN does not trigger
    # a login attempt; authentication is optional for public checkpoints.
    hf_token = HF_TOKEN or None
    if hf_token:
        try:
            from huggingface_hub import login
            login(token=hf_token)
        except ImportError:
            print("huggingface_hub not found. Please install it: pip install huggingface_hub")
            return
        except Exception as e:
            print(f"Hugging Face login failed: {e}")
            return
    else:
        print("No Hugging Face token configured; continuing without authentication.")

    video_analyzer = VideoAnalyzer(hf_token=hf_token)
    description = video_analyzer.get_video_description(VIDEO_PATH)
    # get_video_description signals failure with an "Error: ..." string; do
    # not feed that to the recommender as if it were a real description.
    if description.startswith("Error:"):
        print(description)
        return
    print(f"Video Description: {description}\n")

    # The vision model is no longer needed; drop it before loading the second
    # model so both do not have to fit in memory at once.
    del video_analyzer
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

    print("--- STEP 2: Simulating attention scores ---")
    # In a real system, these would be calculated by other modules
    attention_scores = {
        'emotion_score': 0.45,
        'motion_score': 0.85, # Increased to reflect a backflip video
        'saliency_score': 0.60,
        'object_detection_score': 0.55,
        'overall_attention': 0.62 # Adjusted
    }
    print(f"Simulated Scores: {attention_scores}\n")

    recommender = EnhancementRecommender(hf_token=hf_token)

    print("--- STEP 3: Extracting structured properties from description ---")
    video_properties = recommender.analyze_properties_from_description(description, attention_scores)
    print(f"Extracted Properties: {json.dumps(video_properties, indent=2)}\n")

    print("--- STEP 4: Getting high-level recommendation (Filter or Transition) ---")
    high_level_choice = recommender.get_high_level_recommendation(description, attention_scores)
    print(f"High-level choice: {high_level_choice}\n")

    print("--- STEP 5: Generating detailed recommendation ---")
    if high_level_choice == "filter":
        final_recommendation = recommender.recommend_detailed_filter(video_properties)
    else: # transition
        final_recommendation = recommender.recommend_detailed_transition(video_properties)

    if final_recommendation is None:
        print("Could not generate a specific recommendation.")
        return

    print(f"Detailed Recommendation: {final_recommendation}\n")

    # Assemble and save the final JSON output
    output_data = {"modifications": [final_recommendation]}
    with open(OUTPUT_FILE, 'w', encoding="utf-8") as f:
        json.dump(output_data, f, indent=2)

    print("--- COMPLETE ---")
    print(f"Final recommendations saved to '{OUTPUT_FILE}'")

if __name__ == "__main__":
    # Run the pipeline only when executed as a script / notebook cell.
    main()
    # Afterwards, hand cached GPU memory back when a CUDA device is present.
    cuda_present = torch.cuda.is_available()
    if cuda_present:
        torch.cuda.empty_cache()

--- STEP 1: Analyzing video to generate description ---


preprocessor_config.json:   0%|          | 0.00/505 [00:00<?, ?B/s]

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


tokenizer_config.json:   0%|          | 0.00/1.45k [00:00<?, ?B/s]

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/3.62M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/41.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/552 [00:00<?, ?B/s]

chat_template.jinja:   0%|          | 0.00/674 [00:00<?, ?B/s]

processor_config.json:   0%|          | 0.00/173 [00:00<?, ?B/s]

chat_template.json:   0%|          | 0.00/701 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/950 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/70.1k [00:00<?, ?B/s]

Fetching 3 files:   0%|          | 0/3 [00:00<?, ?it/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/4.99G [00:00<?, ?B/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/4.18G [00:00<?, ?B/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/4.96G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/141 [00:00<?, ?B/s]

Video Description: The scene takes place on a race track, where a group of people is gathered around a row of parked cars.Ъе. The cars are lined up, and some of them have numbers on them, indicating that they are likely race cars. The people are standing near the cars, possibly discussing or preparing for a race.

There are several individuals in the scene, with some standing closer to the cars and others further away. A few cars are parked in the foreground, while others are positioned further back in the scene. The overall atmosphere suggests an event or gathering related to racing or automotive enthusiasts.

--- STEP 2: Simulating attention scores ---
Simulated Scores: {'emotion_score': 0.45, 'motion_score': 0.85, 'saliency_score': 0.6, 'object_detection_score': 0.55, 'overall_attention': 0.62}



tokenizer_config.json:   0%|          | 0.00/614 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/642 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.75G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.75G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

--- STEP 3: Extracting structured properties from description ---
Extracted Properties: {
  "seo": {
    "tone": "action",
    "pacing": "fast",
    "narrative": "story_moment"
  },
  "geo": {
    "motion": "high"
  }
}

--- STEP 4: Getting high-level recommendation (Filter or Transition) ---
High-level choice: transition

--- STEP 5: Generating detailed recommendation ---
Detailed Recommendation: {'chunk': 0, 'type': 'transition', 'target_chunk': 1, 'name': 'Glitch', 'reason': "The video has a fast pace. A 'Glitch' transition would match this energy."}

--- COMPLETE ---
Final recommendations saved to 'modifications.json'


In [None]:
import torch, gc

def clear_gpu_memory():
    """Drop global tensors and models, then release cached GPU memory.

    Every module-level name bound to a ``torch.Tensor`` or ``torch.nn.Module``
    instance is removed from the global namespace so the object can be
    garbage-collected. Objects that merely hold a model in an attribute are
    not touched. CUDA caches are released only when CUDA is available.
    """
    module_globals = globals()
    # Remove the global *bindings*: `del obj` on a loop variable would only
    # unbind the local name and leave the global reference alive.
    for name, obj in list(module_globals.items()):
        # isinstance() skips model classes, which also expose `parameters`.
        if torch.is_tensor(obj) or isinstance(obj, torch.nn.Module):
            del module_globals[name]

    gc.collect()                             # Collect Python garbage
    if torch.cuda.is_available():
        torch.cuda.empty_cache()             # Return cached blocks to the GPU driver
        torch.cuda.ipc_collect()             # Collect inter-process memory (rare cases)

clear_gpu_memory()
