In [1]:
import pandas as pd
import torch
import matplotlib.pyplot as plt
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

  from .autonotebook import tqdm as notebook_tqdm


In [203]:
# Read the annotated emotions dataset from disk
df = pd.read_csv(filepath_or_buffer="annotated_emotions_cleaned.csv")

In [204]:
# Pull each raw CSV column out as a plain Python list. Column-wise tolist()
# keeps the row order and avoids building one Series per row with iterrows().
video_file_name = df['video_file_name'].tolist()
texts = df['Texts'].tolist()
emotions = df['emotion'].tolist()
timestamps = df['Timestamps'].tolist()
labels = df['label'].tolist()
hate_snippets = df['hate_snippet'].tolist()
speech_ranges = df['speech_ranges'].tolist()

In [205]:
# Reassemble the extracted lists into a frame with normalised column names.
# The two sequences below are paired positionally: name -> values.
df = pd.DataFrame(
    dict(
        zip(
            ['video_file_name', 'speech_ranges', 'timestamp', 'text', 'emotion', 'hate_snippet', 'label'],
            [video_file_name, speech_ranges, timestamps, texts, emotions, hate_snippets, labels],
        )
    )
)

df.head(10)

Unnamed: 0,video_file_name,speech_ranges,timestamp,text,emotion,hate_snippet,label
0,hate_video_1.mp4,"[['00:00:12', '00:00:16']]","[['00:00:00', '00:00:02'], ['00:00:30', '00:00...","[' You', ' You', ' You', ' You']","['SAD', 'SAD', 'SAD', 'SAD']","[['00:00:34', '00:01:34']]",Hate
1,hate_video_2.mp4,"[['00:00:00', '00:00:55'], ['00:00:57', '00:01...","[['00:00:00', '00:00:11'], ['00:00:11', '00:00...","["" I like children, I like tables, I like nigg...","['HAPPY', 'HAPPY', 'HAPPY', 'HAPPY', 'HAPPY', ...","[['00:00:06', '00:02:06']]",Hate
2,non_hate_video_1.mp4,"[['00:00:08', '00:00:29'], ['00:00:30', '00:00...","[['00:00:00', '00:00:07'], ['00:00:07', '00:00...","[' I think we need to move.', ' What are you t...","['SAD', 'NEUTRAL', 'SAD', 'HAPPY', 'SAD', 'NEU...",,Non Hate
3,hate_video_3.mp4,"[['00:00:00', '00:02:38'], ['00:02:42', '00:03...","[['00:00:00', '00:00:02'], ['00:00:02', '00:00...","[' I should be must get through.', ' No, not t...","['SURPRISED', 'NEUTRAL', 'NEUTRAL', 'NEUTRAL',...","[['00:00:03', '00:01:40'], ['00:01:41', '00:03...",Hate
4,hate_video_4.mp4,"[['00:00:00', '00:00:10']]","[['00:00:00', '00:00:09']]","[' Fuck jeans, we lose money, we lose the real...",['NEUTRAL'],"[['00:00:00', '00:00:13']]",Hate
5,non_hate_video_3.mp4,"[['00:00:00', '00:01:55']]","[['00:00:00', '00:00:05'], ['00:00:05', '00:00...",[' In Lehigh County authorities released the d...,"['NEUTRAL', 'ANGRY', 'NEUTRAL', 'ANGRY', 'ANGR...",,Non Hate
6,non_hate_video_4.mp4,"[['00:00:00', '00:00:02']]",[],[],[],,Non Hate
7,hate_video_5.mp4,"[['00:00:00', '00:00:34']]","[['00:00:00', '00:00:05'], ['00:00:05', '00:00...","[' Hello, nigger, my old friend.', "" You've co...","['SAD', 'NEUTRAL', 'SAD', 'SAD', 'NEUTRAL', 'S...","[['00:00:00', '00:00:39']]",Hate
8,non_hate_video_5.mp4,"[['00:00:09', '00:00:19']]","[['00:00:00', '00:00:12'], ['00:00:12', '00:00...","[' Nice, real stylish.', ' Your sense before y...","['NEUTRAL', 'NEUTRAL', 'NEUTRAL']",,Non Hate
9,hate_video_6.mp4,"[['00:00:02', '00:00:32'], ['00:00:32', '00:00...","[['00:00:00', '00:00:11'], ['00:00:11', '00:00...",[' No riding today? No looting on the foot loc...,"['ANGRY', 'ANGRY', 'NEUTRAL', 'NEUTRAL', 'NEUT...","[['00:00:00', '00:01:13']]",Hate


In [206]:
# Remove rows whose timestamp list is empty (stored as the string '[]').
# .copy() detaches the result from the unfiltered frame so the column
# assignments made later act on an independent frame rather than a slice.
df = df[df['timestamp'] != '[]'].copy()

In [207]:
# Preview the frame after removing the rows with empty timestamps
df.head(10)

Unnamed: 0,video_file_name,speech_ranges,timestamp,text,emotion,hate_snippet,label
0,hate_video_1.mp4,"[['00:00:12', '00:00:16']]","[['00:00:00', '00:00:02'], ['00:00:30', '00:00...","[' You', ' You', ' You', ' You']","['SAD', 'SAD', 'SAD', 'SAD']","[['00:00:34', '00:01:34']]",Hate
1,hate_video_2.mp4,"[['00:00:00', '00:00:55'], ['00:00:57', '00:01...","[['00:00:00', '00:00:11'], ['00:00:11', '00:00...","["" I like children, I like tables, I like nigg...","['HAPPY', 'HAPPY', 'HAPPY', 'HAPPY', 'HAPPY', ...","[['00:00:06', '00:02:06']]",Hate
2,non_hate_video_1.mp4,"[['00:00:08', '00:00:29'], ['00:00:30', '00:00...","[['00:00:00', '00:00:07'], ['00:00:07', '00:00...","[' I think we need to move.', ' What are you t...","['SAD', 'NEUTRAL', 'SAD', 'HAPPY', 'SAD', 'NEU...",,Non Hate
3,hate_video_3.mp4,"[['00:00:00', '00:02:38'], ['00:02:42', '00:03...","[['00:00:00', '00:00:02'], ['00:00:02', '00:00...","[' I should be must get through.', ' No, not t...","['SURPRISED', 'NEUTRAL', 'NEUTRAL', 'NEUTRAL',...","[['00:00:03', '00:01:40'], ['00:01:41', '00:03...",Hate
4,hate_video_4.mp4,"[['00:00:00', '00:00:10']]","[['00:00:00', '00:00:09']]","[' Fuck jeans, we lose money, we lose the real...",['NEUTRAL'],"[['00:00:00', '00:00:13']]",Hate
5,non_hate_video_3.mp4,"[['00:00:00', '00:01:55']]","[['00:00:00', '00:00:05'], ['00:00:05', '00:00...",[' In Lehigh County authorities released the d...,"['NEUTRAL', 'ANGRY', 'NEUTRAL', 'ANGRY', 'ANGR...",,Non Hate
7,hate_video_5.mp4,"[['00:00:00', '00:00:34']]","[['00:00:00', '00:00:05'], ['00:00:05', '00:00...","[' Hello, nigger, my old friend.', "" You've co...","['SAD', 'NEUTRAL', 'SAD', 'SAD', 'NEUTRAL', 'S...","[['00:00:00', '00:00:39']]",Hate
8,non_hate_video_5.mp4,"[['00:00:09', '00:00:19']]","[['00:00:00', '00:00:12'], ['00:00:12', '00:00...","[' Nice, real stylish.', ' Your sense before y...","['NEUTRAL', 'NEUTRAL', 'NEUTRAL']",,Non Hate
9,hate_video_6.mp4,"[['00:00:02', '00:00:32'], ['00:00:32', '00:00...","[['00:00:00', '00:00:11'], ['00:00:11', '00:00...",[' No riding today? No looting on the foot loc...,"['ANGRY', 'ANGRY', 'NEUTRAL', 'NEUTRAL', 'NEUT...","[['00:00:00', '00:01:13']]",Hate
10,hate_video_7.mp4,"[['00:00:00', '00:00:07'], ['00:00:08', '00:00...","[['00:00:00', '00:00:06'], ['00:00:06', '00:00...","['� Breathe deep in!', ' Oh God!', ' Oh, give ...","['HAPPY', 'SAD', 'SAD', 'NEUTRAL']","[['00:00:00', '00:00:26']]",Hate


In [208]:
import ast

# These columns hold Python literals serialised as strings: parse every string
# cell back into a Python object and pass non-string cells through untouched.
for column in ('speech_ranges', 'hate_snippet', 'timestamp', 'text', 'emotion'):
    df[column] = df[column].apply(
        lambda cell: ast.literal_eval(cell) if isinstance(cell, str) else cell
    )


In [209]:
# Preview the frame after the string columns have been parsed with literal_eval
df.head(10)

Unnamed: 0,video_file_name,speech_ranges,timestamp,text,emotion,hate_snippet,label
0,hate_video_1.mp4,"[[00:00:12, 00:00:16]]","[[00:00:00, 00:00:02], [00:00:30, 00:00:32], [...","[ You, You, You, You]","[SAD, SAD, SAD, SAD]","[[00:00:34, 00:01:34]]",Hate
1,hate_video_2.mp4,"[[00:00:00, 00:00:55], [00:00:57, 00:01:11], [...","[[00:00:00, 00:00:11], [00:00:11, 00:00:16], [...","[ I like children, I like tables, I like nigge...","[HAPPY, HAPPY, HAPPY, HAPPY, HAPPY, HAPPY, HAP...","[[00:00:06, 00:02:06]]",Hate
2,non_hate_video_1.mp4,"[[00:00:08, 00:00:29], [00:00:30, 00:00:34], [...","[[00:00:00, 00:00:07], [00:00:07, 00:00:10], [...","[ I think we need to move., What are you talk...","[SAD, NEUTRAL, SAD, HAPPY, SAD, NEUTRAL, NEUTR...",,Non Hate
3,hate_video_3.mp4,"[[00:00:00, 00:02:38], [00:02:42, 00:03:31]]","[[00:00:00, 00:00:02], [00:00:02, 00:00:04], [...","[ I should be must get through., No, not too ...","[SURPRISED, NEUTRAL, NEUTRAL, NEUTRAL, NEUTRAL...","[[00:00:03, 00:01:40], [00:01:41, 00:03:27]]",Hate
4,hate_video_4.mp4,"[[00:00:00, 00:00:10]]","[[00:00:00, 00:00:09]]","[ Fuck jeans, we lose money, we lose the real ...",[NEUTRAL],"[[00:00:00, 00:00:13]]",Hate
5,non_hate_video_3.mp4,"[[00:00:00, 00:01:55]]","[[00:00:00, 00:00:05], [00:00:05, 00:00:12], [...",[ In Lehigh County authorities released the di...,"[NEUTRAL, ANGRY, NEUTRAL, ANGRY, ANGRY, ANGRY,...",,Non Hate
7,hate_video_5.mp4,"[[00:00:00, 00:00:34]]","[[00:00:00, 00:00:05], [00:00:05, 00:00:10], [...","[ Hello, nigger, my old friend., You've come ...","[SAD, NEUTRAL, SAD, SAD, NEUTRAL, SAD, NEUTRAL]","[[00:00:00, 00:00:39]]",Hate
8,non_hate_video_5.mp4,"[[00:00:09, 00:00:19]]","[[00:00:00, 00:00:12], [00:00:12, 00:00:17], [...","[ Nice, real stylish., Your sense before you ...","[NEUTRAL, NEUTRAL, NEUTRAL]",,Non Hate
9,hate_video_6.mp4,"[[00:00:02, 00:00:32], [00:00:32, 00:00:38], [...","[[00:00:00, 00:00:11], [00:00:11, 00:00:21], [...",[ No riding today? No looting on the foot lock...,"[ANGRY, ANGRY, NEUTRAL, NEUTRAL, NEUTRAL, NEUT...","[[00:00:00, 00:01:13]]",Hate
10,hate_video_7.mp4,"[[00:00:00, 00:00:07], [00:00:08, 00:00:19], [...","[[00:00:00, 00:00:06], [00:00:06, 00:00:09], [...","[� Breathe deep in!, Oh God!, Oh, give me so...","[HAPPY, SAD, SAD, NEUTRAL]","[[00:00:00, 00:00:26]]",Hate


In [210]:
def to_seconds(t):
    """Convert a colon-separated time string to a total number of seconds.

    Components are read right to left as seconds, minutes, hours, so
    ``"SS"``, ``"MM:SS"`` and ``"HH:MM:SS"`` are all accepted.

    Args:
        t: Time string such as ``"00:01:05"``.

    Returns:
        int: Total seconds, or ``-1`` when ``t`` is not a string of
        colon-separated integers (e.g. ``None``, NaN, ``"aa:bb"``).
    """
    try:
        return sum(
            int(part) * 60 ** power
            for power, part in enumerate(reversed(t.split(":")))
        )
    # Only parsing failures map to the -1 sentinel; a bare ``except`` would
    # also swallow KeyboardInterrupt and SystemExit.
    except (AttributeError, TypeError, ValueError):
        return -1

def are_ranges_nested(ranges1, ranges2):
    """Report whether any range in one list fully contains a range in the other.

    Args:
        ranges1: Iterable of ``[start, end]`` time-string pairs.
        ranges2: Iterable of ``[start, end]`` time-string pairs.

    Returns:
        bool: ``True`` as soon as some range of ``ranges1`` lies inside a range
        of ``ranges2`` or the other way round; ``False`` otherwise, including
        when an argument is not an iterable of sized pairs (e.g. NaN).

    Entries that are not pairs, or whose bounds ``to_seconds`` cannot parse
    (it returns ``-1`` for those), are skipped.
    """
    try:
        for r1 in ranges1:
            if len(r1) != 2:
                continue
            r1_start, r1_end = to_seconds(r1[0]), to_seconds(r1[1])
            # Skip unparseable bounds: two -1 sentinels would otherwise
            # satisfy the containment test and count as "nested".
            if r1_start < 0 or r1_end < 0:
                continue
            for r2 in ranges2:
                if len(r2) != 2:
                    continue
                r2_start, r2_end = to_seconds(r2[0]), to_seconds(r2[1])
                if r2_start < 0 or r2_end < 0:
                    continue
                # r1 inside r2, or r2 inside r1
                r1_in_r2 = r2_start <= r1_start <= r1_end <= r2_end
                r2_in_r1 = r1_start <= r2_start <= r2_end <= r1_end
                if r1_in_r2 or r2_in_r1:
                    return True
        return False
    # Malformed inputs (non-iterable, unsized or unindexable entries) mean
    # "not nested"; anything else, e.g. KeyboardInterrupt, propagates.
    except (TypeError, IndexError, KeyError):
        return False

In [211]:
# Keep rows that either carry no hate snippet or whose hate snippet and
# speech ranges are nested in one another.
has_no_snippet = df['hate_snippet'].isna()
snippet_matches_speech = df.apply(
    lambda record: are_ranges_nested(record['speech_ranges'], record['hate_snippet']),
    axis=1,
)
df_clean = df[has_no_snippet | snippet_matches_speech]

In [212]:
# Display the filtered frame (the notebook shows a truncated view of it)
df_clean

Unnamed: 0,video_file_name,speech_ranges,timestamp,text,emotion,hate_snippet,label
1,hate_video_2.mp4,"[[00:00:00, 00:00:55], [00:00:57, 00:01:11], [...","[[00:00:00, 00:00:11], [00:00:11, 00:00:16], [...","[ I like children, I like tables, I like nigge...","[HAPPY, HAPPY, HAPPY, HAPPY, HAPPY, HAPPY, HAP...","[[00:00:06, 00:02:06]]",Hate
2,non_hate_video_1.mp4,"[[00:00:08, 00:00:29], [00:00:30, 00:00:34], [...","[[00:00:00, 00:00:07], [00:00:07, 00:00:10], [...","[ I think we need to move., What are you talk...","[SAD, NEUTRAL, SAD, HAPPY, SAD, NEUTRAL, NEUTR...",,Non Hate
3,hate_video_3.mp4,"[[00:00:00, 00:02:38], [00:02:42, 00:03:31]]","[[00:00:00, 00:00:02], [00:00:02, 00:00:04], [...","[ I should be must get through., No, not too ...","[SURPRISED, NEUTRAL, NEUTRAL, NEUTRAL, NEUTRAL...","[[00:00:03, 00:01:40], [00:01:41, 00:03:27]]",Hate
4,hate_video_4.mp4,"[[00:00:00, 00:00:10]]","[[00:00:00, 00:00:09]]","[ Fuck jeans, we lose money, we lose the real ...",[NEUTRAL],"[[00:00:00, 00:00:13]]",Hate
5,non_hate_video_3.mp4,"[[00:00:00, 00:01:55]]","[[00:00:00, 00:00:05], [00:00:05, 00:00:12], [...",[ In Lehigh County authorities released the di...,"[NEUTRAL, ANGRY, NEUTRAL, ANGRY, ANGRY, ANGRY,...",,Non Hate
...,...,...,...,...,...,...,...
1062,non_hate_video_649.mp4,"[[00:00:00, 00:00:44]]","[[00:00:00, 00:00:06], [00:00:06, 00:00:10], [...",[ so appalling because the implication of wher...,"[ANGRY, ANGRY, NEUTRAL, NEUTRAL, ANGRY, NEUTRA...",,Non Hate
1063,non_hate_video_650.mp4,"[[00:00:12, 00:01:14], [00:01:24, 00:02:26], [...","[[00:00:00, 00:00:24], [00:00:24, 00:00:34], [...",[ If tomorrow tribulations came knowing I woul...,"[SAD, SAD, SAD, NEUTRAL, SAD, SAD, SAD, HAPPY,...",,Non Hate
1065,non_hate_video_651.mp4,"[[00:00:00, 00:00:13], [00:00:14, 00:01:11], [...","[[00:01:00, 00:01:04], [00:01:04, 00:01:07], [...","[ at the National Church., It wasn't the 3rd ...","[SAD, SAD, SAD, NEUTRAL, SAD, SAD, NEUTRAL, SA...",,Non Hate
1066,non_hate_video_652.mp4,"[[00:00:00, 00:01:02]]","[[00:00:00, 00:00:08], [00:00:08, 00:00:12], [...",[ Israelis are here on YouTube busy manipulati...,"[NEUTRAL, NEUTRAL, NEUTRAL, NEUTRAL, SAD, NEUT...",,Non Hate


In [221]:
# Drop the speech_ranges column. errors='ignore' keeps this cell re-runnable:
# without it a second execution raises KeyError because the column is gone.
df_clean = df_clean.drop(columns=['speech_ranges'], errors='ignore')

In [246]:
from ast import literal_eval

# Utility helpers
def timestamp_to_seconds(t):
    """Convert an ``HH:MM:SS`` string into a total number of seconds.

    Raises ``ValueError`` when ``t`` does not split into exactly three
    integer components.
    """
    hours, minutes, seconds = (int(part) for part in t.split(":"))
    return (hours * 60 + minutes) * 60 + seconds

def is_hateful_segment(seg_start, seg_end, hate_snippets, threshold=2):
    """Flag a segment that overlaps any annotated hate snippet.

    Args:
        seg_start: Segment start, in seconds.
        seg_end: Segment end, in seconds.
        hate_snippets: List of ``[start, end]`` ``HH:MM:SS`` pairs; a value
            that is empty or not a list counts as "no snippets".
        threshold: Tolerance in seconds by which the segment is widened on
            both sides before testing for overlap.

    Returns:
        int: ``1`` if the widened segment overlaps at least one snippet,
        otherwise ``0``.
    """
    if hate_snippets and isinstance(hate_snippets, list):
        for snippet in hate_snippets:
            snippet_start, snippet_end = (
                timestamp_to_seconds(bound) for bound in snippet
            )
            # tolerance threshold, in seconds
            if snippet_start < seg_end + threshold and seg_start - threshold < snippet_end:
                return 1
    return 0

def safe_eval(val):
    """Parse ``val`` as a Python literal when it is a string; otherwise return it unchanged."""
    return literal_eval(val) if isinstance(val, str) else val
# Build the segment-level table: one output row per transcript segment
processed_rows = []

# Loop temporaries get their own names so the notebook-level lists
# (texts, emotions, timestamps, hate_snippets, video_file_name) are not overwritten.
video_rows = zip(
    df_clean['video_file_name'],
    df_clean['timestamp'],
    df_clean['text'],
    df_clean['emotion'],
    df_clean['hate_snippet'],
)

for video_name, raw_timestamps, raw_texts, raw_emotions, raw_snippets in video_rows:
    # Cells may still be serialised strings; safe_eval leaves parsed values as they are
    seg_timestamps = safe_eval(raw_timestamps)
    seg_texts = safe_eval(raw_texts)
    seg_emotions = safe_eval(raw_emotions)
    video_snippets = safe_eval(raw_snippets)

    # zip stops at the shortest list, so a segment is emitted only when it has
    # a timestamp, a text and an emotion
    for (start, end), text, emotion in zip(seg_timestamps, seg_texts, seg_emotions):
        seg_start = timestamp_to_seconds(start)
        seg_end = timestamp_to_seconds(end)
        # 1 when the segment overlaps a hate snippet (2-second tolerance), else 0
        label = is_hateful_segment(seg_start, seg_end, video_snippets, threshold=2)

        processed_rows.append({
            "video_file_name": video_name,
            "text": text,
            "emotion": emotion,
            "start": start,
            "end": end,
            "label": label
        })

df_processed = pd.DataFrame(processed_rows)

In [247]:
# Build a new dataframe with CONTEXT: previous + current + next sentence.
# A neighbour is used only when it belongs to the same video, so the first and
# last segment of a video never borrow text from an adjacent, unrelated video.
segment_records = df_processed.to_dict("records")
last_index = len(segment_records) - 1
contextual_rows = []

for i, segment in enumerate(segment_records):
    video = segment['video_file_name']

    # Previous text (if available within the same video)
    prev_text = ""
    if i > 0 and segment_records[i - 1]['video_file_name'] == video:
        prev_text = segment_records[i - 1]['text']
    # Current text
    curr_text = segment['text']
    # Next text (if available within the same video)
    next_text = ""
    if i < last_index and segment_records[i + 1]['video_file_name'] == video:
        next_text = segment_records[i + 1]['text']

    # Concatenate with tags so the model can tell the three parts apart
    full_text = f"[PREV] {prev_text} [CURR] {curr_text} [NEXT] {next_text}"

    contextual_rows.append({
        "video_file_name": video,
        "text_with_context": full_text,
        "emotion": segment['emotion'],
        "label": segment['label'],
        "start": segment['start'],
        "end": segment['end'],
    })

df_contextual = pd.DataFrame(contextual_rows)


In [253]:
# Write the contextual segment table to disk without the index column
df_contextual.to_csv(path_or_buf="processed_contextual_data.csv", index=False)