# Imports

In [None]:
!pip install ffmpeg-python
!pip install av
!pip install -q -U google-generativeai
!pip install --upgrade pip
!pip install --upgrade transformers datasets[audio] accelerate
!pip install scenedetect

In [None]:
import os
import cv2
import time
import torch
import random
import ffmpeg
import warnings
import numpy as np
import pandas as pd
from PIL import Image
from glob import glob
import soundfile as sf
from json import loads,dumps
#from pydub import AudioSegment
import matplotlib.pyplot as plt
import seaborn as sns 
from scipy.signal import resample
import typing_extensions as typing
from google.generativeai.types import HarmCategory, HarmBlockThreshold
from scenedetect import open_video, VideoStreamCv2, SceneManager
from scenedetect.detectors import ContentDetector

# Data Loading

In [None]:
# Dataset locations on Kaggle: one directory per source collection.

# Collections loaded with the 'Child Directed' ground truth
true1dir, true2dir, kids_dir = (
    '/kaggle/input/video-classification-data/Videos/Videos/videos-true',
    '/kaggle/input/video-classification-data/Videos/Videos/videos-true-2',
    '/kaggle/input/video-classification-data/made-for-kids/made-for-kids',
)

# Collections loaded with the 'Not Child Directed' ground truth
false1dir, false2dir, non_kids_dir = (
    '/kaggle/input/video-classification-data/Videos/Videos/videos-false',
    '/kaggle/input/video-classification-data/Videos/Videos/videos-false-2',
    '/kaggle/input/video-classification-data/non-made-for-kids/non-made-for-kids',
)

In [None]:
# Functions to extract corresponding transcriptions and ground truths 

def get_transcriptions_and_paths(vids_dir, ground_truths, transcriptions_path):
    """Collect the videos of one directory that also have a transcription.

    Parameters
    ----------
    vids_dir : str
        Directory whose entries are named after video IDs.
    ground_truths : str
        Label assigned to every video found in ``vids_dir``.
    transcriptions_path : str
        CSV file with at least the columns ``'Video Id'`` and
        ``'Transcription'``.

    Returns
    -------
    tuple[list[str], list[str], list[str]]
        ``(paths, transcriptions, labels)``, index-aligned. Only IDs present
        both in ``vids_dir`` and in the CSV are returned. Each transcription is
        cut at the first occurrence of ``"chunks"``; a missing transcription
        becomes ``''``.
    """
    # Sorted so the output order does not depend on the filesystem listing order
    video_ids = sorted(os.listdir(vids_dir))

    labels_df = pd.DataFrame({
        'IDs': video_ids,
        'Labels': [ground_truths] * len(video_ids)
    })
    transcriptions_df = pd.read_csv(transcriptions_path)

    # Inner join: IDs without a row in the transcription CSV are dropped
    df = pd.merge(labels_df, transcriptions_df, left_on='IDs', right_on='Video Id')

    matched_ids = list(df['IDs'])
    paths = [vids_dir + '/' + video_id for video_id in matched_ids]

    transcriptions = []
    for text in df['Transcription']:
        # An empty CSV cell is read back as NaN (a float), which has no .split()
        text = '' if pd.isna(text) else str(text)
        # Keep only the part before "chunks"
        # (presumably the stored value is a dict-like string whose trailing
        # 'chunks' entry is not wanted — TODO confirm against the CSV)
        transcriptions.append(text.split("chunks")[0])

    return paths, transcriptions, [ground_truths] * len(matched_ids)

In [None]:
# Load (paths, transcriptions, labels) for each of the six source directories,
# pairing every directory with its ground-truth label and its transcription CSV.

true1_paths, true1_transcriptions, true1_labels = get_transcriptions_and_paths(
    true1dir,
    'Child Directed',
    '/kaggle/input/video-classification-data/Translated Transcriptions/Translated Transcriptions/true-1-translated-transcriptions.csv',
)
true2_paths, true2_transcriptions, true2_labels = get_transcriptions_and_paths(
    true2dir,
    'Child Directed',
    '/kaggle/input/video-classification-data/Translated Transcriptions/Translated Transcriptions/true-2-translated-transcriptions.csv',
)
kids_paths, kids_transcriptions, kids_labels = get_transcriptions_and_paths(
    kids_dir,
    'Child Directed',
    '/kaggle/input/video-classification-data/Translated Transcriptions/Translated Transcriptions/made-for-kids_translated_transcriptions.csv',
)

false1_paths, false1_transcriptions, false1_labels = get_transcriptions_and_paths(
    false1dir,
    'Not Child Directed',
    '/kaggle/input/video-classification-data/Translated Transcriptions/Translated Transcriptions/false-1-translated-transcriptions.csv',
)
false2_paths, false2_transcriptions, false2_labels = get_transcriptions_and_paths(
    false2dir,
    'Not Child Directed',
    '/kaggle/input/video-classification-data/Translated Transcriptions/Translated Transcriptions/false-2-translated-transcriptions.csv',
)
non_kids_paths, non_kids_transcriptions, non_kids_labels = get_transcriptions_and_paths(
    non_kids_dir,
    'Not Child Directed',
    '/kaggle/input/video-classification-data/Translated Transcriptions/Translated Transcriptions/non-made-for-kids_translated_transcriptions.csv',
)

In [None]:
# Merge the six per-directory results into flat lists. Every list is built in
# the same directory order, so entry i of each list refers to the same video.

paths = (
    false1_paths + true1_paths
    + false2_paths + true2_paths
    + non_kids_paths + kids_paths
)

primary_labels = (
    false1_labels + true1_labels
    + false2_labels + true2_labels
    + non_kids_labels + kids_labels
)

transcriptions = (
    false1_transcriptions + true1_transcriptions
    + false2_transcriptions + true2_transcriptions
    + non_kids_transcriptions + kids_transcriptions
)

# The last path component is used as the video's ID
video_ids = [path.rsplit('/', 1)[-1] for path in paths]

In [None]:
# Sanity check: the four numbers printed here should be identical
print(*(len(seq) for seq in (paths, primary_labels, transcriptions, video_ids)))

**Extracting Images** 

In [None]:
def detect_scenes(video_path, threshold=30, min_scenes=6):
    """Detect scenes, lowering the detector threshold until enough are found.

    Parameters
    ----------
    video_path : str
        Path of the video to analyse.
    threshold : int
        Starting value for the ``ContentDetector`` threshold. It is halved
        (integer division) before every detection pass, including the first,
        so the default of 30 runs passes at 15, 7, 3, 1 and finally 0.
    min_scenes : int
        Stop as soon as a pass yields at least this many scenes.

    Returns
    -------
    list
        The result of ``SceneManager.get_scene_list()`` from the last pass:
        (start, end) pairs. Empty when no pass ran (``threshold <= 0`` or
        ``min_scenes <= 0``).
    """
    scene_list = []
    while len(scene_list) < min_scenes and threshold > 0:
        # NOTE(review): halving happens before the value is used, so the value
        # passed in is never applied as-is and the last pass runs with 0.
        # Kept as-is; confirm whether a first pass at `threshold` was intended.
        threshold //= 2

        # A fresh video object and manager are created for every pass
        video = open_video(video_path)
        scene_manager = SceneManager()
        scene_manager.add_detector(ContentDetector(threshold=threshold))

        scene_manager.detect_scenes(video)
        scene_list = scene_manager.get_scene_list()

    return scene_list


def get_top_n_longest_scenes(scene_list, n):
    '''Pick the n scenes with the greatest length.

    Takes (start, end) pairs and returns at most n (start, end) pairs, longest
    first. Scenes of equal length keep their original relative order.
    '''
    # Pair every scene start with its length, then rank by that length
    ranked = sorted(
        ((begin, finish - begin) for begin, finish in scene_list),
        key=lambda pair: pair[1],
        reverse=True,
    )

    # Rebuild each end point from start + length
    return [(begin, begin + span) for begin, span in ranked[:n]]


def sort_scenes_by_frame(scenes_list):
    '''Return a new list of the scenes ordered by the frame number of their start.'''
    def first_frame(scene):
        # scene[0] is the scene start; get_frames() gives its frame number
        return scene[0].get_frames()

    ordered = list(scenes_list)
    ordered.sort(key=first_frame)
    return ordered


def get_num_grids(video_path):
    '''Get the number of image grids to create for a video.

    One grid per started minute of video, capped at 5.

    Raises ZeroDivisionError when OpenCV reports an FPS of 0
    (presumably when the file cannot be opened — TODO confirm).
    '''
    cap = cv2.VideoCapture(video_path)
    try:
        fps = cap.get(cv2.CAP_PROP_FPS)
        total_frames = cap.get(cv2.CAP_PROP_FRAME_COUNT)
    finally:
        # Only metadata is needed, so free the capture handle right away
        cap.release()

    # Duration in seconds, rounded to two decimals
    duration = round(total_frames / fps, 2)

    # Started minutes = full minutes + 1, limited to 5 grids
    return int(min(duration // 60 + 1, 5))


def extract_k_frames_from_scene(video_path, scene, k):
    '''Extract up to k evenly spaced frames from one scene.

    Parameters:
        video_path: path of the video file.
        scene: (start, end) pair whose items provide get_frames().
        k: number of frame positions to sample.

    Returns:
        List of frames exactly as returned by cv2.VideoCapture.read()
        (OpenCV's default channel order is BGR). Positions that cannot be read
        are skipped, so the list may hold fewer than k frames.
    '''
    # Stay one frame inside the scene on both sides
    start_frame = scene[0].get_frames() + 1
    end_frame = scene[1].get_frames() - 1

    # k equally spaced frame indices; with k == 1 this is just start_frame
    frame_indices = np.linspace(start_frame, end_frame, k, dtype=int)

    frames = []
    cap = cv2.VideoCapture(video_path)
    try:
        for frame_no in frame_indices:
            # Seek to the wanted frame, then decode it
            cap.set(cv2.CAP_PROP_POS_FRAMES, frame_no)
            ret, frame = cap.read()
            if ret:
                frames.append(frame)
    finally:
        # Release the capture even if seeking or decoding raises
        cap.release()

    return frames


def create_image_grid(frames, grid_size=(1000, 1000)):
    '''Tile the first 6 frames into a grid of 3 rows x 2 columns and resize it.

    Every frame is first resized to 640x360 so the tiles can be joined; the
    finished mosaic is then resized to grid_size and returned as a NumPy array.
    '''
    uniform = [cv2.resize(img, (640, 360)) for img in frames]

    # Join the frames two at a time, side by side
    strips = []
    for left in (0, 2, 4):
        strips.append(np.concatenate(uniform[left:left + 2], axis=1))

    # Stack the three strips on top of each other
    mosaic = np.concatenate(strips, axis=0)

    resized = Image.fromarray(mosaic).resize(grid_size)
    return np.array(resized)

In [None]:
def get_images(video_path, n=6):
    ''' Build the image grids that summarise a video.

        1. Detect scenes
        2. Get k; where k = num_grids
        3. Get the n * k longest scenes
        4. Sort scenes wrt frame numbers
        5. Extract one frame per scene and convert it from BGR to RGB
        6. Create one image grid per run of n frames (at most k grids)

        When fewer than n * k frames are available, only the grids that have
        at least one frame are built, and a partly filled grid is completed by
        repeating its last frame.

        NOTE(review): create_image_grid tiles exactly 6 frames, so values of n
        other than 6 are presumably not supported — confirm before changing n.

        Returns a list of RGB image arrays.
        Raises ValueError if no frame could be extracted at all.
     '''
    scene_list = detect_scenes(video_path)
    k = get_num_grids(video_path)
    longest_scenes = get_top_n_longest_scenes(scene_list, n * k)
    scenes = sort_scenes_by_frame(longest_scenes)

    frames = []
    for scene in scenes:
        for frame in extract_k_frames_from_scene(video_path, scene, 1):
            # OpenCV decodes to BGR, but the grids are consumed by PIL and
            # matplotlib, which expect RGB; without this red and blue are swapped
            frames.append(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))

    if not frames:
        raise ValueError(f"No frames could be extracted from {video_path}")

    grids = []
    # At most n * k scenes were requested, so this yields at most k groups
    for start_idx in range(0, len(frames), n):
        grid_frames = frames[start_idx:start_idx + n]
        # Complete a short final group so the grid builder gets n frames
        grid_frames += [grid_frames[-1]] * (n - len(grid_frames))
        grids.append(create_image_grid(grid_frames, grid_size=(1000, 1000)))

    return grids

**Defining Model** 

In [None]:
# Gemini API key. It is read from the API_KEY environment variable so the secret
# does not have to be pasted into (and saved with) the notebook; when the
# variable is not set this falls back to the empty placeholder.
key_ = os.environ.get("API_KEY", '')

In [None]:
import google.generativeai as genai

# Publish the key as the API_KEY environment variable and configure the
# client library with that same value
os.environ["API_KEY"] = key_
genai.configure(api_key=key_)

In [None]:
# Gemini model used for every classification request in this notebook
model = genai.GenerativeModel(
    "gemini-1.5-flash",
    system_instruction="You are an expert content moderator.",
)

In [None]:
# Shape of the JSON answer requested from the model; this class is passed as
# `response_schema` in the generation config.
class LLM_Output(typing.TypedDict):
    # Predicted class; later compared against the string 'Not Child Directed'
    label: str
    # Language(s) the model reports (the prompt asks for the spoken language)
    language: list[str]
    # Free-text answer (the prompt asks for a brief justification)
    response: str

**Prompt** 

In [None]:
# Instruction text appended after the transcription and the image grids in every
# request. It defines what counts as child-directed, lists exclusions, and asks
# for a label, the spoken language and a short justification in JSON.
# NOTE: this text is a model input; changing any wording can change predictions.
prompt = """
    A piece of content is child-directed if it meets any of these criteria:

    Designed for Children:
    The content is explicitly created with children as the intended audience, such as:
      - Educational videos for kids.
      - Child-friendly video games with cartoonish graphics and non-violent gameplay and story-telling
      - Animated movies/clips suitable for family viewing
      - Simple crafts, activities, or demonstrations aimed at children.

    Child-Appealing Elements:
    The content includes features commonly enjoyed by children, such as:
    - Colorful, cartoonish, or animated visuals (e.g., animals, anthropomorphic characters, fantasy creatures).
    - Light-hearted portrayals of themes like mischief or conflict, but without mature visuals, strong language, or adult humor.
    - Whimsical, or playful themes.
    - Non-violent and simplified gameplay or narratives.
    - Content with rhythmic elements (e.g., chants, exclamations, or songs appealing to young viewers).

    Important Considerations:
    - Content can be child-directed even if it also appeals to older audiences.
    - Gaming content is child-directed if it features family-friendly gameplay and cartoonish visuals.
    - Movie/TV clips are child-directed if rated G/PG.
    - Presence of mild peril or conflict is acceptable if presented appropriately.

    Exclusions:
    - Do not label content as "Child Directed" if it features significant:
      - Dark, mature, or violent themes.
      - Romantic subplots or adult humor.
      - Complex dialogue or advanced vocabulary inappropriate for children.
      - Songs that are not nursery rhymes or explicitly created for children.
    - Recipe, DIY, or instructional videos unless simplified for children (e.g., "Cooking for Kids").

    Instructions:
    Labeling:
    If the video meets the child-directed criteria label it as "Child Directed".
    If it does not meet these criteria label it as "Not Child Directed".

    Indicate the spoken language if any.
    Provide a brief justification explaining why the video is considered child-directed or not.

    Format the output in JSON.
"""

**Running Model on Dataset** 

In [None]:
# Classify every video: build image grids, upload them, query the model with the
# transcription + grids + prompt, and record the parsed answer.
# Videos that fail at any stage are collected in `remaining`.

ids = []
predicted_labels = []
languages = []
responses = []
ground_truths = []
remaining = []

img_dir = '/kaggle/working/Images'
os.makedirs(img_dir, exist_ok=True)

for i, video_dir in enumerate(paths):
    try:
        # Each video folder holds 'audio.mp3' plus the video file itself;
        # a folder without 'audio.mp3' raises here and is reported below
        contents_of_ad = os.listdir(video_dir)
        contents_of_ad.remove('audio.mp3')
        video_path = video_dir + '/' + contents_of_ad[0]

        # Extract multiple images representative of the video
        images = get_images(video_path)

        # Save each image returned by get_images
        image_paths = []
        for idx, img in enumerate(images):
            # Convert NumPy array to PIL image
            image = Image.fromarray(img)

            # Save the image to a file
            image_name = f"{video_ids[i]}_{idx + 1}.png"
            image_path = os.path.join(img_dir, image_name)
            image.save(image_path)
            image_paths.append(image_path)

        # Display the images
        fig, axes = plt.subplots(1, len(images), figsize=(10, 3))
        if len(images) == 1:
            # With a single subplot, `axes` is one Axes object, not an array
            axes.imshow(images[0])
            axes.axis('off')
        else:
            for ax, img in zip(axes, images):
                ax.imshow(img)
                ax.axis('off')

        plt.tight_layout()
        plt.show()
        # Free the figure; one is created per video
        plt.close(fig)

        # Upload images and handle potential errors
        uploaded_files = []
        try:
            for image_path in image_paths:
                uploaded_file = genai.upload_file(path=image_path, resumable=False)
                uploaded_files.append(uploaded_file)
        except Exception as e:
            print(f"Error uploading images: {e}")
            remaining.append(video_ids[i])
            continue

        # Check if all images have uploaded
        try:
            for uploaded_file in uploaded_files:
                # Poll until the file leaves the PROCESSING state
                while uploaded_file.state.name == "PROCESSING":
                    print('.', end='')
                    time.sleep(10)
                    uploaded_file = genai.get_file(uploaded_file.name)
                if uploaded_file.state.name == "FAILED":
                    raise ValueError(uploaded_file.state.name)
        except Exception as e:
            print(f"Error during image processing: {e}")
            remaining.append(video_ids[i])
            continue

        # Make inference with the transcription text and the uploaded images
        audio = transcriptions[i]
        try:
            inputs_ = [audio]
            inputs_.extend(uploaded_files)
            inputs_.extend([prompt])
            response = model.generate_content(inputs_,
                                              generation_config=genai.GenerationConfig(
                                                  response_mime_type="application/json",
                                                  response_schema=LLM_Output,
                                                  temperature=0.0),
                                              safety_settings={
                                                HarmCategory.HARM_CATEGORY_HATE_SPEECH: HarmBlockThreshold.BLOCK_NONE,
                                                HarmCategory.HARM_CATEGORY_HARASSMENT: HarmBlockThreshold.BLOCK_NONE,
                                                HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT: HarmBlockThreshold.BLOCK_NONE,
                                                HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT: HarmBlockThreshold.BLOCK_NONE
                                              })
        except Exception as e:
            print(f"Error making inference: {e}")
            remaining.append(video_ids[i])
            continue

        # Wrap response.text access in try-except
        try:
            print("Completed for video number:", i, ' ', video_ids[i])

            dictionary = loads(response.text)
            print('True Label:', primary_labels[i], 'Response:', dictionary)

            # Read every field before touching the result lists: a missing key
            # must not leave the lists with different lengths
            label = dictionary['label']
            language = dictionary['language']
            justification = dictionary['response']

        except Exception as e:
            print(f"Error processing response.text: {e}")
            remaining.append(video_ids[i])
            continue

        # All fields are available, so the five lists stay index-aligned
        ids.append(video_ids[i])
        predicted_labels.append(label)
        languages.append(language)
        responses.append(justification)
        ground_truths.append(primary_labels[i])

    except Exception as e:
        print(f"Unexpected error: {e}")
        remaining.append(video_ids[i])
        continue

    # Pause between videos; only reached when a video was fully processed
    time.sleep(20)

In [None]:
# Number of videos for which a prediction was recorded
len(ids)

In [None]:
# IDs of the videos that hit an error at some stage of the loop and therefore
# have no prediction

print("Remaining videos with errors:", remaining) 

In [None]:
# One line per recorded response: ground truth next to the model's prediction
for idx, _ in enumerate(responses):
    print('True Label: ', ground_truths[idx], '\tPrediction: ', predicted_labels[idx])

In [None]:
# Gather the per-video results into one table, one column per result list
new_df = pd.DataFrame.from_dict(
    {
        'Video Id': ids,
        'Primary Label': ground_truths,
        'Predicted Label': predicted_labels,
        'Response': responses,
        'Languages': languages,
    }
)

new_df.head()

In [None]:
# Output directory for the result files; created if it does not exist yet
results_dir = '/kaggle/working/results'
os.makedirs(results_dir, exist_ok=True)

In [None]:
# Write the results table into results_dir (same location as before, now
# derived from the variable instead of repeating the directory path)
new_df.to_csv(os.path.join(results_dir, 'DAVSP_Gemini-1.5-Flash.csv'), index=False)

In [None]:
# Changing to binary lists: 1 = 'Not Child Directed', 0 = anything else

def _to_binary(label):
    """Encode a label as 1 ('Not Child Directed') or 0; 0/1 values pass through."""
    if isinstance(label, int):
        # Already encoded by an earlier run of this cell. `ground_truths` is
        # overwritten below, so without this a re-run would turn every label into 0.
        return label
    return 1 if label == 'Not Child Directed' else 0

predictions = [_to_binary(pred) for pred in predicted_labels]
ground_truths = [_to_binary(label) for label in ground_truths]

In [None]:
# Per-class precision / recall / F1 computed from the two binary lists
# (ground truth first, predictions second)
from sklearn.metrics import classification_report 

report = classification_report(ground_truths, predictions) 
print(report) 

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix

# The labels are encoded as 0 = 'Child Directed' and 1 = 'Not Child Directed'.
# Row/column 0 of the matrix is class 0 and row/column 1 is class 1, so the
# tick labels must be listed in that same order.
class_names = ['Child Directed', 'Not Child Directed']

# labels=[0, 1] pins the matrix to 2x2 in this order even if one class is absent
cm = confusion_matrix(ground_truths, predictions, labels=[0, 1])

plt.figure(figsize=(6,4))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=class_names, yticklabels=class_names)
plt.xlabel('Predicted Labels')
plt.ylabel('True Labels')
plt.title('Confusion Matrix')
plt.show()