Imports

In [None]:
!pip install openai
!pip install ffmpeg-python
!pip install av
!pip install scenedetect

In [None]:
import os
import cv2
import time
import json
import torch
import random
import ffmpeg
import warnings
import numpy as np
import pandas as pd
from PIL import Image
from glob import glob
import soundfile as sf
from openai import OpenAI
from json import loads,dumps
import matplotlib.pyplot as plt
#from pydub import AudioSegment
from scipy.signal import resample
import typing_extensions as typing
from google.generativeai.types import HarmCategory, HarmBlockThreshold
from scenedetect import open_video, VideoStreamCv2, SceneManager
from scenedetect.detectors import ContentDetector

In [None]:
# Load the label sheet and the translated transcriptions

labels_df = pd.read_csv('/kaggle/input/youtube-data/all_unique_codes3.csv')
transcriptions_df = pd.read_csv('/kaggle/input/youtube-data/Translated-transcriptions.csv')

# The video id column is named differently in each sheet, so join on explicit keys
df = labels_df.merge(transcriptions_df, left_on='Video link', right_on='Video Id')
df.head()

In [None]:
# Keep only the rows whose primary tag is one of the three classes studied here
df = df[df['Primary Tag'].isin(['irrelevant', 'inappropriate', 'child directed'])]
df['Primary Tag'].value_counts()

In [None]:
# Pull the id, label and transcription columns out as plain, index-aligned lists

video_ids, primary_labels, all_transcriptions = (
    list(df[column]) for column in ('Video link', 'Primary Tag', 'Transcription')
)

In [None]:
# Number of ads that remain after the primary-tag filter
len(video_ids)

In [None]:
# Keep only the part of each transcription that precedes the first "chunks" marker,
# and record how long that kept part is

transcriptions = [full_text.split("chunks")[0] for full_text in all_transcriptions]
lengths = [len(kept_text) for kept_text in transcriptions]

In [None]:
import os 

# Entry names found under Ads/Ads; the run loop only processes video ids present in this listing
available_ad_ids = os.listdir('/kaggle/input/youtube-data/Ads/Ads') 
len(available_ad_ids) 

**Mapping Ad Durations to Number of Images / Frames** 

In [None]:
# Probe every ad's video file for its duration (in seconds) and its frame count.
durations_in_seconds = []
num_frames = []

for i in range(len(video_ids)):

    ad_dir = '/kaggle/input/youtube-data/Ads/Ads/' + video_ids[i]
    # Everything except the extracted audio track; the first remaining entry is
    # taken as the video. Filtering (instead of list.remove) does not raise when
    # a folder has no audio.mp3.
    contents_of_ad = [name for name in os.listdir(ad_dir) if name != 'audio.mp3']
    video_path = ad_dir + '/' + contents_of_ad[0]

    cap = cv2.VideoCapture(video_path)
    try:
        # Frames per second and total number of frames reported by OpenCV
        fps = cap.get(cv2.CAP_PROP_FPS)
        total_frames = cap.get(cv2.CAP_PROP_FRAME_COUNT)
    finally:
        # Release the decoder handle for every video, even if a query fails
        cap.release()

    # A non-positive frame rate would otherwise surface as a bare ZeroDivisionError
    if fps <= 0:
        raise ValueError(f'Could not read a frame rate for {video_path}')

    duration = total_frames / fps

    durations_in_seconds.append(round(duration, 2))
    num_frames.append(total_frames)

In [None]:
# Number of image grids per ad: one per started minute, capped at five

# < 1 minute: 1 image
# 1 - 2 minutes: 2 images
# 2 - 3 minutes: 3 images
# 3 - 4 minutes: 4 images
# > 4 minutes: 5 images

num_images = [min(int(seconds // 60) + 1, 5) for seconds in durations_in_seconds]

In [None]:
from collections import Counter

# How many ads need 1, 2, ... 5 grids
frequency = Counter(num_images)
values = list(frequency)
counts = [frequency[grid_count] for grid_count in values]

plt.bar(values, counts)
plt.title("Number of grids for each ad")
plt.xlabel("Number of Grids for Ad")
plt.ylabel("Frequency")

# Render the chart
plt.show()

**Defining Model** 

In [None]:
# Read the OpenAI key from the OPENAI_API_KEY environment variable so that the
# secret is never saved inside the notebook; falls back to "" when it is unset.
apikey = os.environ.get("OPENAI_API_KEY", "")

In [None]:
# NOTE(review): `client` is not referenced again in the visible notebook; the requests are sent with `requests.post` inside classify_video_with_images.
client = OpenAI(api_key=apikey) 

**Prompt** 

In [None]:
# Instruction text passed as `text_input` to classify_video_with_images by the run loop.
# NOTE(review): the "elements of scariness (" parenthesis is never closed; the text is
# left untouched here so that results stay comparable with earlier runs.
prompt = """ A video can be considered inappropriate for children if it contains physical violence (cartoonish, realistic, inclusion of gore),
  interpersonal violence (such as bullying, pranks, meanness, humor at expense of others, belittling ,controlling, talking down to, or manipulating others),
  self-harm or suicide,
  elements of scariness (horror, scary themes, creepy or suspenseful themes, loud and violent content, zombies, skeletons, scary masks, scary clowns, blood, fire in dangerous settings,
  mentally distressing scenes (car crashes, medical procedures etc),
  extreme stunts (depictions of life endangering or high-risk physical activities/challenges),
  sexual content (innuendos, overt sexual behavior, nudity, sexual themes, sexual suggestiveness),
  discrimination (based on race/ethnicity/gender/ability),
  offensive language (from mild words such as "idiot" and "stupid" to severe swearing, hate speech or signs),
  drinking, drugs, smoking, gambling, crude humour,
  inappropriate clothing (with significant skin exposure),
  death,
  intimacy (sensual physical contact, marriage, romance, dating and relationships, apparent romantic conduct between subjects/actors).
  romantic songs and music.

  You have been given a image grid (3 x 2) representing a video and it's audio transcription.
  Consider that you are a Parent. Based on both the visual content and audio transcription, is the video ad provided inappropriate for children under 9 based on this definition?
  
  If it is, label it as "inappropriate".
  If it is not, label it as "appropriate".

  Justify your conclusion.

  Also, mention what language is being spoken if there is any.

  Make the outputs in JSON format.

  """

**Output Schema** 

In [None]:
# JSON schema attached to each request as the `response_format` of the chat completion
output_schema = {
    "type": "object",
    "properties": {
        # Binary verdict for the ad
        "label": {"type": "string", "enum": ["inappropriate", "appropriate"]},
        # Free-text reasoning, at least 10 characters long
        "justification": {"type": "string", "minLength": 10},
        # Spoken languages, as a list of strings
        "languages": {"type": "array", "items": {"type": "string"}},
    },
    "required": ["label", "justification", "languages"],
}

**Setting Up Image Data** 

In [None]:
import base64
import requests

def encode_image(image_path):
    """
    Reads the file at `image_path` in binary mode and returns its content
    as a base64 string (decoded as UTF-8 text).
    """
    with open(image_path, "rb") as handle:
        raw_bytes = handle.read()
    encoded = base64.b64encode(raw_bytes)
    return encoded.decode("utf-8")

def _image_data_url(path):
    """Builds a base64 data URL for the image at `path`, labelled with the MIME type guessed from its extension."""
    import mimetypes  # local import: the notebook's import cell does not bring it in

    mime_type, _ = mimetypes.guess_type(path)
    if not (mime_type or "").startswith("image/"):
        # Unknown or non-image extension: keep the label that used to be hard-coded
        mime_type = "image/jpeg"
    return f"data:{mime_type};base64,{encode_image(path)}"


def classify_video_with_images(text_input, audio_transcription, image_paths, timeout=120):
    """
    Sends text, audio transcription, and multiple images to the API for classification.

    Parameters:
    - text_input: str, the instruction text (sent as the first text part).
    - audio_transcription: str, transcription of the audio; sent as a second
      text part prefixed with 'Audio transcription: '.
    - image_paths: list of str, paths to the images. Each image is embedded as a
      base64 data URL whose MIME type follows the file extension (so the .png
      grids are no longer declared as JPEG).
    - timeout: seconds to wait for the API before `requests` raises a timeout
      error, so a stalled connection cannot block the caller forever.

    Returns:
    - response: The raw `requests` response. The HTTP status is NOT checked
      here; callers decide how to treat error responses.
    """
    # One image part per path, appended after the two text parts
    encoded_images = [
        {"type": "image_url", "image_url": {"url": _image_data_url(path)}}
        for path in image_paths
    ]

    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {apikey}"
    }

    payload = {
        "model": "gpt-4o-2024-08-06",
        "messages": [
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": text_input},
                    {"type": "text", "text": 'Audio transcription: ' + audio_transcription},
                ] + encoded_images
            }
        ],
        # Ask for a reply that follows the notebook-level output_schema
        "response_format": {
            "type": "json_schema",
            "json_schema": {
                "name": "output_schema",
                "schema": output_schema
            }
        },
        "max_tokens": 300
    }

    response = requests.post(
        "https://api.openai.com/v1/chat/completions",
        headers=headers,
        json=payload,
        timeout=timeout,
    )

    return response


In [None]:
def detect_scenes(video_path, threshold = 30):
    """Run content-based scene detection, loosening the threshold until 6 scenes are found.

    Each pass first halves the threshold (integer division) and then runs a
    ContentDetector with the halved value over a freshly opened video. Passes
    stop once at least 6 scenes were found or the threshold is no longer
    positive. Returns the scene list of the last pass, or [] when no pass ran.

    NOTE(review): because the halving happens before the first pass, the value
    given as `threshold` is never used as-is (the default 30 starts at 15), and
    the final pass may run with a threshold of 0 — confirm this is intended.
    """
    scenes = []
    current = threshold
    while current > 0 and len(scenes) < 6:
        current //= 2

        manager = SceneManager()
        manager.add_detector(ContentDetector(threshold=current))
        manager.detect_scenes(open_video(video_path))

        scenes = manager.get_scene_list()

    return scenes


def get_top_n_longest_scenes(scene_list, n):
    '''Pick the n longest (start, end) scenes, longest first.

    Scenes of equal length keep their original relative order. Each returned
    pair is (start, start + length), where length is end - start.
    '''
    by_length = sorted(
        ((begin, finish - begin) for begin, finish in scene_list),
        key=lambda pair: pair[1],
        reverse=True,
    )
    return [(begin, begin + length) for begin, length in by_length[:n]]

def sort_scenes_by_frame(scenes_list):
    '''Return a new list of the scenes ordered by the frame number of their start.'''
    def start_frame_of(scene):
        return scene[0].get_frames()

    ordered = list(scenes_list)
    ordered.sort(key=start_frame_of)
    return ordered


def get_num_grids(video_path):
    '''Get number of grids to be created for the video at `video_path`.

    One grid per started minute of video, capped at 5 (so < 1 minute gives 1
    and anything from 4 minutes up gives 5).

    Raises:
        ValueError: if OpenCV reports a non-positive frame rate, so no duration
            can be computed (previously an opaque ZeroDivisionError).
    '''
    cap = cv2.VideoCapture(video_path)
    try:
        fps = cap.get(cv2.CAP_PROP_FPS)
        total_frames = cap.get(cv2.CAP_PROP_FRAME_COUNT)
    finally:
        # Always give the decoder handle back; this is called once per video
        cap.release()

    if fps <= 0:
        raise ValueError(f'Could not read a frame rate for {video_path}')

    # Duration in seconds, rounded to 2 decimals before bucketing into minutes
    duration = round(total_frames / fps, 2)
    return min(int(duration // 60) + 1, 5)

def extract_k_frames_from_scene(video_path, scene, k):
    '''Read k evenly spaced frames from inside one (start, end) scene.

    The sampled range runs from one frame after the scene start to one frame
    before the scene end. Frames that cannot be read are skipped, so fewer
    than k frames may be returned.
    '''
    first = scene[0].get_frames() + 1
    last = scene[1].get_frames() - 1

    # k integer positions spread evenly over [first, last]
    positions = np.linspace(first, last, k, dtype=int)

    capture = cv2.VideoCapture(video_path)
    grabbed = []
    for position in positions:
        # Seek to the wanted frame, then decode it
        capture.set(cv2.CAP_PROP_POS_FRAMES, position)
        ok, image = capture.read()
        if not ok:
            continue
        grabbed.append(image)
    capture.release()

    return grabbed


def create_image_grid(frames, grid_size=(1000, 1000), bgr_input=True):
    '''Arrange 6 frames into a 3x2 grid (3 rows, 2 columns) and resize it to `grid_size`.

    Parameters:
    - frames: sequence of at least 6 image arrays; only the first 6 are used.
    - grid_size: size handed to PIL's `Image.resize` for the finished grid.
    - bgr_input: True (default) when the frames come from OpenCV, whose decoded
      frames are in BGR channel order. The grid is then converted to RGB, which
      is what PIL and matplotlib assume; without this, red and blue are swapped
      in the saved and displayed grids. Pass False for frames already in RGB.

    Returns:
    - The resized grid as a NumPy array.

    Raises:
    - ValueError: if fewer than 6 frames are given (this used to fail inside
      np.concatenate with a less helpful message).
    '''
    if len(frames) < 6:
        raise ValueError(f'create_image_grid needs 6 frames, got {len(frames)}')

    # Bring the 6 tiles to a common size so they can be concatenated
    tiles = [cv2.resize(frame, (640, 360)) for frame in frames[:6]]
    rows = [np.concatenate(tiles[i:i+2], axis=1) for i in range(0, 6, 2)]
    image_grid = np.concatenate(rows, axis=0)

    # Only 3-channel images have a channel order to swap
    if bgr_input and image_grid.ndim == 3 and image_grid.shape[2] == 3:
        image_grid = cv2.cvtColor(image_grid, cv2.COLOR_BGR2RGB)

    return np.array(Image.fromarray(image_grid).resize(grid_size))

In [None]:
def get_images(video_path, n=6):
    ''' Build the image grids that represent one video.

        1. Detect scenes
        2. Get k, the number of grids for this video
        3. Keep the n * k longest scenes
        4. Put those scenes back into frame order
        5. Take one frame from each kept scene
        6. Cut the frames into k consecutive groups of n and make one grid per group
     '''
    detected = detect_scenes(video_path)
    k = get_num_grids(video_path)
    #k = 1 # For Single Grid of Major Scene Frames
    chosen = sort_scenes_by_frame(get_top_n_longest_scenes(detected, n*k))

    frames = [
        frame
        for scene in chosen
        for frame in extract_k_frames_from_scene(video_path, scene, 1)
    ]

    return [
        create_image_grid(frames[grid_no * n:grid_no * n + n], grid_size=(1000, 1000))
        for grid_no in range(k)
    ]

**Running Model** 

In [None]:
# Result accumulators; the first five lists stay index-aligned with each other.
ids = []
labels = []
responses = []
predicted_labels = []
languages = []
# Video ids for which image extraction or classification failed
remaining = []

img_dir = '/kaggle/working/Images'
os.makedirs(img_dir, exist_ok=True)

for i in range(len(video_ids)):

    if video_ids[i] in available_ad_ids:

        ad_dir = '/kaggle/input/youtube-data/Ads/Ads/' + video_ids[i]
        # Everything except the audio track; filtering (rather than list.remove)
        # cannot abort the whole run when a folder has no audio.mp3.
        contents_of_ad = [name for name in os.listdir(ad_dir) if name != 'audio.mp3']

        try:
            video_path = ad_dir + '/' + contents_of_ad[0]

            print('\n', i, video_path)

            # Extract the image grids that represent the video
            images = get_images(video_path)

            # Save each grid returned by get_images
            image_paths = []
            for idx, img in enumerate(images):
                # Convert NumPy array to PIL image
                image = Image.fromarray(img)

                # Save the image to a file
                image_name = f"{video_ids[i]}_{idx + 1}.png"
                image_path = os.path.join(img_dir, image_name)
                image.save(image_path)
                image_paths.append(image_path)

            # Display the grids side by side
            fig, axes = plt.subplots(1, len(images), figsize=(10, 3))
            if len(images) == 1:
                # With a single grid, plt.subplots returns one Axes, not an array
                axes.imshow(images[0])
                axes.axis('off')
            else:
                for ax, img in zip(axes, images):
                    ax.imshow(img)
                    ax.axis('off')

            plt.tight_layout()
            plt.show()

            audio_transcription = transcriptions[i]

            try:

                classification_response = classify_video_with_images(prompt, audio_transcription, image_paths)

                # Report API errors with their status and body instead of letting
                # them surface as an opaque KeyError on 'choices' below.
                if classification_response.status_code != 200:
                    raise RuntimeError(
                        f'HTTP {classification_response.status_code}: {classification_response.text[:300]}'
                    )

                # Parse the HTTP body and the model's JSON reply once each
                temp_response = classification_response.json()['choices'][0]['message']['content']
                parsed_response = json.loads(temp_response)

                temp_id = video_ids[i]
                temp_label = primary_labels[i]
                temp_predicted_label = parsed_response.get('label')
                temp_languages = parsed_response.get('languages')

                ids.append(temp_id)
                labels.append(temp_label)
                predicted_labels.append(temp_predicted_label)
                responses.append(temp_response)
                languages.append(temp_languages)

                print('\nPrimary Label:', primary_labels[i], '. Response:', temp_response)

            except Exception as e:
                print('\nClassification failed for ', i, 'Error:', str(e))
                remaining.append(video_ids[i])

        except Exception as e:
            print('\nImage extraction failed for ', i, 'Error:', str(e))
            remaining.append(video_ids[i])

        # Pause between ads; runs after every processed ad, whether it succeeded or not
        time.sleep(20)

In [None]:
# Print the failed ids as quoted, comma-terminated lines (ready to paste into a list)
for x in remaining:
    print(f"'{x}',")

In [None]:
# Video ids whose image extraction or classification failed in the run loop
remaining 

In [None]:
# Side-by-side view of the dataset label and the model's label for every classified ad
for true_label, predicted_label in zip(labels, predicted_labels):
    print('True Label: ', true_label, '\tPrediction: ', predicted_label)

In [None]:
# One row per successfully classified ad, built from the index-aligned result lists of the run loop
new_df = pd.DataFrame({
    'Video Id': ids,
    'Primary Label': labels,
    'Predicted Label': predicted_labels,
    'Languages': languages, 
    'Response': responses
})

new_df.head() 

In [None]:
# Output folder for the result sheet; exist_ok makes the cell safe to re-run
# and avoids the check-then-create race of a separate os.path.exists test.
results_dir = '/kaggle/working/results'
os.makedirs(results_dir, exist_ok=True)

In [None]:
# Write the result sheet into the results folder, without the index column
results_csv_path = os.path.join(results_dir, 'DAVSP_GPT-4o.csv')
new_df.to_csv(results_csv_path, index=False)

In [None]:
# Binarise both label lists: 1 for 'inappropriate', 0 for every other value

predictions = [int(pred == 'inappropriate') for pred in predicted_labels]
ground_truths = [int(label == 'inappropriate') for label in labels]

In [None]:
# Obtaining classification report 
# Computed on the binarised lists, where 1 = 'inappropriate' and 0 = every other label
from sklearn.metrics import classification_report 

report = classification_report(ground_truths, predictions) 
print(report) 

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix

# Tick labels for class 0 (everything that is not 'inappropriate') and class 1
class_names = ['Appropriate', 'Inappropriate']

# labels=[0, 1] pins the matrix to 2x2 in that order, so the tick labels still
# line up when one of the classes is absent from the results.
cm = confusion_matrix(ground_truths, predictions, labels=[0, 1])

plt.figure(figsize=(6,4))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=class_names, yticklabels=class_names)
plt.xlabel('Predicted Labels')
plt.ylabel('True Labels')
plt.title('Confusion Matrix')
plt.show()