Imports

In [None]:
!pip install ffmpeg-python
!pip install av
!pip install -q -U google-generativeai
!pip install --upgrade pip
!pip install --upgrade transformers datasets[audio] accelerate

In [None]:
import os
import cv2
import time
import torch
import random
import ffmpeg
import warnings
import numpy as np
import pandas as pd
from PIL import Image
from glob import glob
import soundfile as sf
from json import loads,dumps
#from pydub import AudioSegment
import matplotlib.pyplot as plt
from scipy.signal import resample
import typing_extensions as typing
from google.generativeai.types import HarmCategory, HarmBlockThreshold

In [None]:
# Load the label sheet and the transcription sheet, then join them per video.

labels_df = pd.read_csv('/kaggle/input/youtube-data/all_unique_codes3.csv')

# Replace with the path to the translated transcriptions to evaluate on
# English audio transcriptions instead.
transcriptions_df = pd.read_csv('/kaggle/input/youtube-data/native-language-transcriptions-complete.csv')

# Inner join of the labels' 'Video link' against the transcriptions' 'Video Id';
# 'Primary Tag' is exposed as 'Primary Label' for the cells below.
df = labels_df.merge(
    transcriptions_df, left_on='Video link', right_on='Video Id'
).rename(columns={'Primary Tag': 'Primary Label'})
df

In [None]:
# Keep only the rows whose primary label is 'inappropriate', 'child directed'
# or 'irrelevant'; every other label is dropped.

df = df[df['Primary Label'].isin(['inappropriate', 'child directed', 'irrelevant'])]

In [None]:
# Pull the three columns used downstream into plain Python lists.
# All three share the row order of `df`, so index i refers to the same video in each.

video_ids, primary_labels, all_transcriptions = (
    list(df[column]) for column in ('Video link', 'Primary Label', 'Transcription')
)

In [None]:
# Number of videos that survived the label filter.
len(video_ids)

In [None]:
# Extracting data from transcripts

def _transcript_text(raw):
    """Return the part of a raw transcription that precedes the first "chunks" marker.

    Non-string values (for example the NaN pandas produces for an empty CSV
    cell) yield an empty string instead of raising, so every video keeps its
    slot and the result stays index-aligned with `video_ids`.
    """
    if not isinstance(raw, str):
        return ""
    # presumably each cell is a serialized ASR result whose text comes before
    # a "chunks" field - TODO confirm against the transcription CSV
    return raw.split("chunks", 1)[0]


# One entry per video, in the same order as `video_ids`.
transcriptions = [_transcript_text(raw) for raw in all_transcriptions]
# Character count of each extracted transcript.
lengths = [len(text) for text in transcriptions]


In [None]:
import os

# Names of the entries under the Ads directory; the inference loop only
# processes video ids that appear in this listing.
available_ids = os.listdir('/kaggle/input/youtube-data/Ads/Ads')
len(available_ids)

In [None]:
# Gemini API key. Read from the API_KEY environment variable so the secret is
# never typed into (and saved with) the notebook; falls back to an empty
# string, exactly like the previous placeholder, when the variable is unset.
key_ = os.environ.get("API_KEY", "")

In [None]:
import google.generativeai as genai

# Publish the key through the API_KEY environment variable and hand the same
# value to the SDK.
os.environ["API_KEY"] = key_
genai.configure(api_key=key_)

In [None]:
import torch
# Report whether PyTorch can see a CUDA device in this session.
print(torch.cuda.is_available())

Define Model 

In [None]:
# Gemini 1.5 Flash client with a fixed system instruction; the inference loop
# sends every request through this object.
model = genai.GenerativeModel("gemini-1.5-flash", system_instruction="You are an expert content moderator.") 

In [None]:
# Structure passed as `response_schema` in the inference cell, so the model is
# asked to answer with a JSON object holding these three keys.
class LLM_Output(typing.TypedDict):
    # The prompt asks for "inappropriate" or "appropriate".
    label: str
    # NOTE(review): the prompt never mentions this field - confirm what the model is expected to put here.
    language: list[str]
    # NOTE(review): presumably the model's free-text justification; the prompt does not request it - confirm.
    response: str

Prompt 

In [None]:
# Instruction text sent together with each transcript. It lists the criteria
# for "inappropriate for children" and asks for one of two labels.
# The text is kept verbatim because any edit changes what the model sees.
# NOTE(review): the question says "Based on both the audio transcription" although
# only the transcription is supplied, and "inapproprite" is misspelled - fix
# deliberately (and re-run the evaluation) rather than silently.
prompt = """  A video can be considered inappropriate for children if it contains violent or graphic content (cartoonish, realistic, inclusion of gore  and blood),
  self-harm or suicide,
  elements of horror or creepy themes,
  loud and violent sounds,
  zombies, skeletons, scary masks, scary clowns, fire in dangerous settings,
  extremely distressing scenes (such as car crashes),
  extreme stunts (depictions of life endangering or high-risk physical activities),
  fight sports (such as boxing, wrestling, martial arts),
  first-person shooter and battle games (games intended for children over 13 should be considered inapproprite),
  weapons (such as firearms, explosives, swords, axes etc.),
  sexual content (innuendos, overt sexual behavior, nudity, sexual themes, sexual suggestiveness etc.),
  vulgar dancing (such as twerking etc.),
  discrimination (based on race/ethnicity/gender/ability),
  profane or explicit language (from mild words such as "idiot" and "stupid" to severe swearing, hate speech or signs),
  drinking, drugs, smoking, gambling, crude humour,
  inappropriate clothing (with significant skin exposure),
  significant romantic intimacy.

  Remember that this criteria is not exhaustive.

  You have been given an audio transcription. Based on both the audio transcription, is the video inappropriate?

  If it is, label it as "inappropriate".
  If it is not, label it as "appropriate".

  """


Run inference

In [None]:
# Number of candidate videos the inference loop will iterate over.
len(video_ids)

In [None]:
# Transcript-only inference: one Gemini verdict per available ad.
# The five result lists grow in lockstep, one entry per successfully scored video.
ids = []
predicted_labels = []
languages = []
responses = []
ground_truths = []
remaining = []  # ids that ended without a usable verdict (each listed once)

img_dir = '/kaggle/working/Images'
os.makedirs(img_dir, exist_ok=True)

ADS_DIR = '/kaggle/input/youtube-data/Ads/Ads/'
MAX_ATTEMPTS = 5         # requests per video before giving up
RETRY_DELAY_S = 7        # pause between failed attempts
RATE_LIMIT_DELAY_S = 20  # pause after every video that reached the API

# Request settings are identical for every video, so build them once.
GENERATION_CONFIG = genai.GenerationConfig(
    response_mime_type="application/json",
    response_schema=LLM_Output)
SAFETY_SETTINGS = {
    HarmCategory.HARM_CATEGORY_HATE_SPEECH: HarmBlockThreshold.BLOCK_NONE,
    HarmCategory.HARM_CATEGORY_HARASSMENT: HarmBlockThreshold.BLOCK_NONE,
    HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT: HarmBlockThreshold.BLOCK_NONE,
    HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT: HarmBlockThreshold.BLOCK_NONE
}


def _classify_transcript(transcript):
    """Send one transcript plus the moderation prompt to the model.

    Returns the tuple (label, language, response) read from the model's JSON
    reply. Any exception raised by the request, by reading `response.text`,
    by JSON parsing or by a missing key propagates so the caller can retry.
    """
    response = model.generate_content([transcript, prompt],
                                      generation_config=GENERATION_CONFIG,
                                      safety_settings=SAFETY_SETTINGS)
    dictionary = loads(response.text)
    print(dictionary)
    return dictionary['label'], dictionary['language'], dictionary['response']


for i, video_id in enumerate(video_ids):

    if video_id not in available_ids:
        continue

    try:
        # Same eligibility rule as before: the ad folder must hold 'audio.mp3'
        # plus at least one other file, even though only the transcript is sent.
        contents_of_ad = os.listdir(ADS_DIR + video_id)
        if 'audio.mp3' not in contents_of_ad or len(contents_of_ad) < 2:
            raise FileNotFoundError(f"'audio.mp3' or the video file is missing for {video_id}")
    except Exception as e:
        print(f"Unexpected error: {e}")
        remaining.append(video_id)
        continue

    # Every attempt issues a fresh request; re-reading a response that already
    # failed to parse can never succeed.
    verdict = None
    for attempt in range(1, MAX_ATTEMPTS + 1):
        try:
            verdict = _classify_transcript(transcriptions[i])
            break
        except Exception as e:
            print(f"Attempt {attempt}/{MAX_ATTEMPTS} failed for {video_id}: {e}")
            if attempt < MAX_ATTEMPTS:
                time.sleep(RETRY_DELAY_S)

    if verdict is None:
        remaining.append(video_id)
    else:
        print("\nCompleted for video number:", i, "\t", video_id)
        predicted_label, language, explanation = verdict
        ids.append(video_id)
        predicted_labels.append(predicted_label)
        languages.append(language)
        responses.append(explanation)
        ground_truths.append(primary_labels[i])

    # Wait a bit to avoid exceeding rate limits (also after a failed video,
    # since it consumed requests too).
    time.sleep(RATE_LIMIT_DELAY_S)

# At the end, print remaining videos
print("Remaining videos with errors:", remaining)

In [None]:
# Video ids the inference loop recorded as failed.
remaining 

In [None]:
# Side-by-side listing of ground truth and model verdict for every scored video.
# The result lists are appended to together, so zipping them walks the same rows.
for _, truth, predicted in zip(responses, ground_truths, predicted_labels):
    print('True Label: ', truth, '\tPrediction: ', predicted)

In [None]:
# Raw label strings returned by the model, in scoring order.
predicted_labels 

In [None]:
# Collect the per-video results into one table, one row per scored video.
column_names = ('Video Id', 'Primary Label', 'Predicted Label', 'Response', 'Languages')
column_values = (ids, ground_truths, predicted_labels, responses, languages)
new_df = pd.DataFrame(dict(zip(column_names, column_values)))

new_df.head()

In [None]:
# Output folder for the result CSV. `exist_ok=True` makes the cell safe to
# re-run and removes the separate check-then-create step.
results_dir = '/kaggle/working/results'
os.makedirs(results_dir, exist_ok=True)

In [None]:
# Persist the results table without the DataFrame index column.
new_df.to_csv('/kaggle/working/results/audio-transcriptions-only.csv', index=False)

In [None]:
# Changing to binary lists

def _is_inappropriate(label):
    """Map a label to 1 ("inappropriate") or 0 (anything else).

    Strings are compared ignoring case and surrounding whitespace, so a model
    answer such as "Inappropriate" still counts. Values that are already
    binary pass through unchanged, which keeps this cell idempotent:
    `ground_truths` is overwritten below, and re-running the cell must not
    turn every label into 0.
    """
    if isinstance(label, str):
        return int(label.strip().lower() == 'inappropriate')
    return int(label == 1)


predictions = [_is_inappropriate(pred) for pred in predicted_labels]
ground_truths = [_is_inappropriate(label) for label in ground_truths]

In [None]:
# Text summary of the per-class metrics for the binary labels
from sklearn.metrics import classification_report

report = classification_report(y_true=ground_truths, y_pred=predictions)
print(report)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix

# Tick labels in the order of the binary encoding: 0 = appropriate, 1 = inappropriate.
class_names = ['Appropriate', 'Inappropriate']

# Pin both classes so the matrix is always 2x2 and lines up with the tick
# labels, even when one class never occurs in the scored videos.
cm = confusion_matrix(ground_truths, predictions, labels=[0, 1])

fig, ax = plt.subplots(figsize=(6, 4))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=class_names, yticklabels=class_names, ax=ax)
ax.set_xlabel('Predicted Labels')
ax.set_ylabel('True Labels')
ax.set_title('Confusion Matrix')
plt.show()