# unifying the Datasets

In [2]:
import pandas as pd
import numpy as np
import os
from pathlib import Path

class HASOCDataLoader:
    """Load and unify HASOC English and Hindi datasets"""

    def __init__(self, data_dir='.'):
        self.data_dir = Path(data_dir)
        self.english_df = None
        self.hindi_df = None

    def load_csv_file(self, filename):
        """Load CSV with error handling"""
        filepath = self.data_dir / filename
        try:
            df = pd.read_csv(filepath, encoding='utf-8')
            print(f"✓ Loaded {filename}: {len(df)} rows, {len(df.columns)} columns")
            return df
        except Exception as e:
            print(f"✗ Error loading {filename}: {e}")
            return None

    def load_tsv_file(self, filename):
        """Load TSV with error handling"""
        filepath = self.data_dir / filename
        try:
            df = pd.read_csv(filepath, sep='\t', encoding='utf-8')
            print(f"✓ Loaded {filename}: {len(df)} rows, {len(df.columns)} columns")
            return df
        except Exception as e:
            print(f"✗ Error loading {filename}: {e}")
            return None

    def load_xlsx_file(self, filename):
        """Load Excel file with error handling"""
        filepath = self.data_dir / filename
        try:
            df = pd.read_excel(filepath)
            print(f"✓ Loaded {filename}: {len(df)} rows, {len(df.columns)} columns")
            return df
        except Exception as e:
            print(f"✗ Error loading {filename}: {e}")
            return None

    def standardize_columns(self, df, year, split, language):
        """Standardize column names across different file formats"""

        # Common column name mappings
        text_cols = ['text', 'Text', 'tweet', 'Tweet', 'comment', 'post']
        label_cols = ['task_1', 'Task_1', 'task1', 'label', 'Label', 'class']

        # Find the text column
        text_col = None
        for col in text_cols:
            if col in df.columns:
                text_col = col
                break

        # Find the label column
        label_col = None
        for col in label_cols:
            if col in df.columns:
                label_col = col
                break

        # Create standardized dataframe
        standardized = pd.DataFrame()

        if text_col:
            standardized['text'] = df[text_col]
        else:
            print(f"  ⚠ Warning: No text column found in this file")
            standardized['text'] = None

        if label_col:
            standardized['task_1'] = df[label_col]
        else:
            print(f"  ⚠ Warning: No label column found")
            standardized['task_1'] = None

        # Add metadata
        standardized['year'] = year
        standardized['split'] = split
        standardized['language'] = language
        standardized['source_file'] = f"{language}_{year}_{split}"

        return standardized

    def load_english_datasets(self):
        """Load all English HASOC datasets"""
        print("\n" + "="*60)
        print("LOADING ENGLISH DATASETS")
        print("="*60)

        dfs = []

        # English 2019 - Task 1
        print("\n[English 2019]")
        en_2019_1 = self.load_tsv_file('english_2019_1.tsv')
        if en_2019_1 is not None:
            en_2019_1_std = self.standardize_columns(en_2019_1, 2019, 'train_1', 'en')
            dfs.append(en_2019_1_std)

        en_2019_2 = self.load_tsv_file('english_2019_2.tsv')
        if en_2019_2 is not None:
            en_2019_2_std = self.standardize_columns(en_2019_2, 2019, 'train_2', 'en')
            dfs.append(en_2019_2_std)

        # English 2020
        print("\n[English 2020]")
        en_2020 = self.load_xlsx_file('english_2020.xlsx')
        if en_2020 is not None:
            en_2020_std = self.standardize_columns(en_2020, 2020, 'train', 'en')
            dfs.append(en_2020_std)

        # English 2021
        print("\n[English 2021]")
        en_2021 = self.load_csv_file('english_2021.csv')
        if en_2021 is not None:
            en_2021_std = self.standardize_columns(en_2021, 2021, 'train', 'en')
            dfs.append(en_2021_std)

        # English 2021 Test
        en_2021_test = self.load_csv_file('en_Hasoc2021_test_task1.csv')
        if en_2021_test is not None:
            en_2021_test_std = self.standardize_columns(en_2021_test, 2021, 'test', 'en')
            dfs.append(en_2021_test_std)

        # Combine all English datasets
        if dfs:
            self.english_df = pd.concat(dfs, ignore_index=True)
            print(f"\n{'='*60}")
            print(f"✓ TOTAL ENGLISH SAMPLES: {len(self.english_df)}")
            print(f"{'='*60}")
        else:
            print("\n✗ No English datasets loaded")
            self.english_df = pd.DataFrame()

        return self.english_df

    def load_hindi_datasets(self):
        """Load all Hindi HASOC datasets"""
        print("\n" + "="*60)
        print("LOADING HINDI DATASETS")
        print("="*60)

        dfs = []

        # Hindi 2019 - Task 1
        print("\n[Hindi 2019]")
        hi_2019_1 = self.load_tsv_file('hindi_2019_1.tsv')
        if hi_2019_1 is not None:
            hi_2019_1_std = self.standardize_columns(hi_2019_1, 2019, 'train_1', 'hi')
            dfs.append(hi_2019_1_std)

        hi_2019_2 = self.load_tsv_file('hindi_2019_2.tsv')
        if hi_2019_2 is not None:
            hi_2019_2_std = self.standardize_columns(hi_2019_2, 2019, 'train_2', 'hi')
            dfs.append(hi_2019_2_std)

        # Hindi 2020
        print("\n[Hindi 2020]")
        hi_2020 = self.load_xlsx_file('hindi_2020.xlsx')
        if hi_2020 is not None:
            hi_2020_std = self.standardize_columns(hi_2020, 2020, 'train', 'hi')
            dfs.append(hi_2020_std)

        # Hindi 2021
        print("\n[Hindi 2021]")
        hi_2021 = self.load_csv_file('hindi_2021.csv')
        if hi_2021 is not None:
            hi_2021_std = self.standardize_columns(hi_2021, 2021, 'train', 'hi')
            dfs.append(hi_2021_std)

        # Hindi 2021 Test
        hi_2021_test = self.load_csv_file('hi_Hasoc2021_test_task1.csv')
        if hi_2021_test is not None:
            hi_2021_test_std = self.standardize_columns(hi_2021_test, 2021, 'test', 'hi')
            dfs.append(hi_2021_test_std)

        # Combine all Hindi datasets
        if dfs:
            self.hindi_df = pd.concat(dfs, ignore_index=True)
            print(f"\n{'='*60}")
            print(f"✓ TOTAL HINDI SAMPLES: {len(self.hindi_df)}")
            print(f"{'='*60}")
        else:
            print("\n✗ No Hindi datasets loaded")
            self.hindi_df = pd.DataFrame()

        return self.hindi_df

    def clean_data(self, df):
        """Clean and validate data"""
        print(f"\nCleaning data...")
        initial_count = len(df)

        # Remove duplicates
        df = df.drop_duplicates(subset=['text'], keep='first')
        print(f"  • Removed {initial_count - len(df)} duplicates")

        # Remove null texts
        df = df.dropna(subset=['text'])
        print(f"  • Removed samples with null text")

        # Strip whitespace
        df['text'] = df['text'].astype(str).str.strip()

        # Remove empty strings
        df = df[df['text'] != '']
        print(f"  • Final count: {len(df)} samples")

        return df

    def get_dataset_summary(self, df, name):
        """Print summary statistics"""
        print(f"\n{'='*60}")
        print(f"{name.upper()} DATASET SUMMARY")
        print(f"{'='*60}")
        print(f"Total samples: {len(df)}")

        if 'task_1' in df.columns and df['task_1'].notna().any():
            print(f"\nLabel distribution:")
            print(df['task_1'].value_counts())
            print(f"\nLabel percentages:")
            print(df['task_1'].value_counts(normalize=True).mul(100).round(2))

        if 'year' in df.columns:
            print(f"\nSamples by year:")
            print(df['year'].value_counts().sort_index())

        if 'split' in df.columns:
            print(f"\nSamples by split:")
            print(df['split'].value_counts())

        # Text length statistics
        df['text_length'] = df['text'].astype(str).str.len()
        print(f"\nText length statistics:")
        print(f"  Mean: {df['text_length'].mean():.1f} characters")
        print(f"  Median: {df['text_length'].median():.1f} characters")
        print(f"  Min: {df['text_length'].min()} characters")
        print(f"  Max: {df['text_length'].max()} characters")

        return df

    def load_all(self, clean=True):
        """Load all datasets and return both dataframes"""

        # Load English
        english_df = self.load_english_datasets()

        # Load Hindi
        hindi_df = self.load_hindi_datasets()

        # Clean if requested
        if clean and not english_df.empty:
            english_df = self.clean_data(english_df)

        if clean and not hindi_df.empty:
            hindi_df = self.clean_data(hindi_df)

        # Print summaries
        if not english_df.empty:
            english_df = self.get_dataset_summary(english_df, "English")

        if not hindi_df.empty:
            hindi_df = self.get_dataset_summary(hindi_df, "Hindi")

        return english_df, hindi_df

    def save_unified_datasets(self, output_dir='./unified_datasets'):
        """Save the unified datasets"""
        output_path = Path(output_dir)
        output_path.mkdir(exist_ok=True)

        if self.english_df is not None and not self.english_df.empty:
            en_path = output_path / 'hasoc_english_unified.csv'
            self.english_df.to_csv(en_path, index=False)
            print(f"\n✓ Saved English dataset to: {en_path}")

        if self.hindi_df is not None and not self.hindi_df.empty:
            hi_path = output_path / 'hasoc_hindi_unified.csv'
            self.hindi_df.to_csv(hi_path, index=False)
            print(f"✓ Saved Hindi dataset to: {hi_path}")


if __name__ == "__main__":
    # Initialize loader
    loader = HASOCDataLoader(data_dir='.')

    # Load all datasets
    english_df, hindi_df = loader.load_all(clean=True)

    # Save unified datasets
    loader.save_unified_datasets()

    # Access the dataframes
    print("\n" + "="*60)
    print("DATASETS READY FOR USE")
    print("="*60)
    print(f"english_df: {len(english_df)} samples")
    print(f"hindi_df: {len(hindi_df)} samples")
    print("\nColumns in datasets:")
    print(english_df.columns.tolist() if not english_df.empty else "No English data")


LOADING ENGLISH DATASETS

[English 2019]
✓ Loaded english_2019_1.tsv: 5852 rows, 5 columns
✓ Loaded english_2019_2.tsv: 1153 rows, 5 columns

[English 2020]
✓ Loaded english_2020.xlsx: 3708 rows, 5 columns

[English 2021]
✓ Loaded english_2021.csv: 3843 rows, 5 columns
✓ Loaded en_Hasoc2021_test_task1.csv: 1281 rows, 2 columns

✓ TOTAL ENGLISH SAMPLES: 15837

LOADING HINDI DATASETS

[Hindi 2019]
✓ Loaded hindi_2019_1.tsv: 1318 rows, 5 columns
✓ Loaded hindi_2019_2.tsv: 4665 rows, 5 columns

[Hindi 2020]
✓ Loaded hindi_2020.xlsx: 2963 rows, 5 columns

[Hindi 2021]
✓ Loaded hindi_2021.csv: 4594 rows, 6 columns
✓ Loaded hi_Hasoc2021_test_task1.csv: 1532 rows, 3 columns

✓ TOTAL HINDI SAMPLES: 15072

Cleaning data...
  • Removed 43 duplicates
  • Removed samples with null text
  • Final count: 15794 samples

Cleaning data...
  • Removed 316 duplicates
  • Removed samples with null text
  • Final count: 14756 samples

ENGLISH DATASET SUMMARY
Total samples: 15794

Label distribution:
task_1

In [3]:
hn = pd.read_csv('unified_datasets/hasoc_hindi_unified.csv')
en = pd.read_csv('unified_datasets/hasoc_english_unified.csv')

# Preprocessing the Datasets

In [5]:
hn

Unnamed: 0,text,task_1,year,split,language,source_file
0,"वक्त, इन्सान और इंग्लैंड का मौसम आपको कभी भी ध...",NOT,2019,train_1,hi,hi_2019_train_1
1,#कांग्रेस के इस #कमीने की #करतूत को देखिए देश ...,HOF,2019,train_1,hi,hi_2019_train_1
2,पाकिस्तान को फेकना था फेका गया। जो हार कर भी द...,HOF,2019,train_1,hi,hi_2019_train_1
3,जो शब्द तूम आज किसी और औरत के लिए यूज कर रहे व...,NOT,2019,train_1,hi,hi_2019_train_1
4,नेता जी हम समाजवादी सिपाही हमेशा आपके साथ है आ...,NOT,2019,train_1,hi,hi_2019_train_1
...,...,...,...,...,...,...
15067,@AcharyaPramodk @yadavakhilesh अंध भक्तो का वो...,,2021,test,hi,hi_2021_test
15068,बंगाल में पिछले 3 दिनों में हुई कुछ हत्याओं पर...,,2021,test,hi,hi_2021_test
15069,@Sohel__AK @ali_manihaar मुंशी प्रेमचंद ने खड्...,,2021,test,hi,hi_2021_test
15070,"लोगों को पेट्रोल सस्ता दो, चाहें फिर देश को ...",,2021,test,hi,hi_2021_test


In [4]:
en

Unnamed: 0,text,task_1,year,split,language,source_file
0,#DhoniKeepsTheGlove | WATCH: Sports Minister K...,NOT,2019,train_1,en,en_2019_train_1
1,@politico No. We should remember very clearly ...,HOF,2019,train_1,en,en_2019_train_1
2,@cricketworldcup Guess who would be the winner...,NOT,2019,train_1,en,en_2019_train_1
3,Corbyn is too politically intellectual for #Bo...,NOT,2019,train_1,en,en_2019_train_1
4,All the best to #TeamIndia for another swimmin...,NOT,2019,train_1,en,en_2019_train_1
...,...,...,...,...,...,...
15832,@AJEnglish Shameless #Hinduphobia and #bigotry...,,2021,test,en,en_2021_test
15833,@DonVocero Real Motherfucker,,2021,test,en,en_2021_test
15834,Gunna was in cunt mode on DS3,,2021,test,en,en_2021_test
15835,"@roomorgue I adore all my hooker friends, incl...",,2021,test,en,en_2021_test


In [14]:
import re
import html

def clean_hasoc_text_v2(text):
    """
    Clean HASOC text while preserving expressive cues:
    - Keep username text
    - Keep hashtags text
    - Keep emojis and punctuation
    - Replace URLs with 'URL'
    """
    if not isinstance(text, str):
        return ""

    # Decode HTML entities
    text = html.unescape(text)

    # Replace URLs with a token
    text = re.sub(r"http\S+|www\.\S+", "URL", text)

    # Keep username text, remove only '@'
    text = re.sub(r"@(\w+)", r"\1", text)

    # Keep hashtags text, remove only '#'
    text = re.sub(r"#(\w+)", r"\1", text)

    # Remove HTML tags
    text = re.sub(r"<.*?>", " ", text)

    # Remove zero-width / invisible characters
    text = re.sub(r"[\u200B-\u200D\uFEFF]", "", text)

    # Normalize multiple spaces
    text = re.sub(r"\s+", " ", text).strip()

    return text


In [15]:
# Assuming your HASOC DataFrames are `hasoc_en` and `hasoc_hi`
en['clean_text'] = en['text'].apply(clean_hasoc_text)
hn['clean_text'] = hn['text'].apply(clean_hasoc_text)


In [18]:
hn

Unnamed: 0,text,task_1,year,split,language,source_file,clean_text
0,"वक्त, इन्सान और इंग्लैंड का मौसम आपको कभी भी ध...",NOT,2019,train_1,hi,hi_2019_train_1,"वक्त, इन्सान और इंग्लैंड का मौसम आपको कभी भी ध..."
1,#कांग्रेस के इस #कमीने की #करतूत को देखिए देश ...,HOF,2019,train_1,hi,hi_2019_train_1,कांग्रेस के इस कमीने की करतूत को देखिए देश की ...
2,पाकिस्तान को फेकना था फेका गया। जो हार कर भी द...,HOF,2019,train_1,hi,hi_2019_train_1,पाकिस्तान को फेकना था फेका गया। जो हार कर भी द...
3,जो शब्द तूम आज किसी और औरत के लिए यूज कर रहे व...,NOT,2019,train_1,hi,hi_2019_train_1,जो शब्द तूम आज किसी और औरत के लिए यूज कर रहे व...
4,नेता जी हम समाजवादी सिपाही हमेशा आपके साथ है आ...,NOT,2019,train_1,hi,hi_2019_train_1,नेता जी हम समाजवादी सिपाही हमेशा आपके साथ है आ...
...,...,...,...,...,...,...,...
15067,@AcharyaPramodk @yadavakhilesh अंध भक्तो का वो...,,2021,test,hi,hi_2021_test,USER USER अंध भक्तो का वो अलम है हमारे देश में...
15068,बंगाल में पिछले 3 दिनों में हुई कुछ हत्याओं पर...,,2021,test,hi,hi_2021_test,बंगाल में पिछले 3 दिनों में हुई कुछ हत्याओं पर...
15069,@Sohel__AK @ali_manihaar मुंशी प्रेमचंद ने खड्...,,2021,test,hi,hi_2021_test,USER USER मुंशी प्रेमचंद ने खड्ग सिंह वाली कहा...
15070,"लोगों को पेट्रोल सस्ता दो, चाहें फिर देश को ...",,2021,test,hi,hi_2021_test,"लोगों को पेट्रोल सस्ता दो, चाहें फिर देश को बं..."


In [13]:
en

Unnamed: 0,text,task_1,year,split,language,source_file,clean_text
0,#DhoniKeepsTheGlove | WATCH: Sports Minister K...,NOT,2019,train_1,en,en_2019_train_1,DhoniKeepsTheGlove | WATCH: Sports Minister Ki...
1,@politico No. We should remember very clearly ...,HOF,2019,train_1,en,en_2019_train_1,USER No. We should remember very clearly that ...
2,@cricketworldcup Guess who would be the winner...,NOT,2019,train_1,en,en_2019_train_1,USER Guess who would be the winner of this CWC...
3,Corbyn is too politically intellectual for #Bo...,NOT,2019,train_1,en,en_2019_train_1,Corbyn is too politically intellectual for Bor...
4,All the best to #TeamIndia for another swimmin...,NOT,2019,train_1,en,en_2019_train_1,All the best to TeamIndia for another swimming...
...,...,...,...,...,...,...,...
15832,@AJEnglish Shameless #Hinduphobia and #bigotry...,,2021,test,en,en_2021_test,USER Shameless Hinduphobia and bigotry - calli...
15833,@DonVocero Real Motherfucker,,2021,test,en,en_2021_test,USER Real Motherfucker
15834,Gunna was in cunt mode on DS3,,2021,test,en,en_2021_test,Gunna was in cunt mode on DS3
15835,"@roomorgue I adore all my hooker friends, incl...",,2021,test,en,en_2021_test,"USER I adore all my hooker friends, including ..."


In [21]:
import pandas as pd

en[['clean_text']].rename(columns={'clean_text': 'comment'}).to_csv(
    "hasoc_en_preprocessed.csv", index=False
)

hn[['clean_text']].rename(columns={'clean_text': 'comment'}).to_csv(
    "hasoc_hi_preprocessed.csv", index=False
)


In [22]:
hasoc_en = pd.read_csv('hasoc_en_preprocessed.csv')
hasoc_hi = pd.read_csv('hasoc_hi_preprocessed.csv')

In [23]:
hasoc_en

Unnamed: 0,comment
0,DhoniKeepsTheGlove | WATCH: Sports Minister Ki...
1,USER No. We should remember very clearly that ...
2,USER Guess who would be the winner of this CWC...
3,Corbyn is too politically intellectual for Bor...
4,All the best to TeamIndia for another swimming...
...,...
15832,USER Shameless Hinduphobia and bigotry - calli...
15833,USER Real Motherfucker
15834,Gunna was in cunt mode on DS3
15835,"USER I adore all my hooker friends, including ..."


In [24]:
hasoc_hi

Unnamed: 0,comment
0,"वक्त, इन्सान और इंग्लैंड का मौसम आपको कभी भी ध..."
1,कांग्रेस के इस कमीने की करतूत को देखिए देश की ...
2,पाकिस्तान को फेकना था फेका गया। जो हार कर भी द...
3,जो शब्द तूम आज किसी और औरत के लिए यूज कर रहे व...
4,नेता जी हम समाजवादी सिपाही हमेशा आपके साथ है आ...
...,...
15067,USER USER अंध भक्तो का वो अलम है हमारे देश में...
15068,बंगाल में पिछले 3 दिनों में हुई कुछ हत्याओं पर...
15069,USER USER मुंशी प्रेमचंद ने खड्ग सिंह वाली कहा...
15070,"लोगों को पेट्रोल सस्ता दो, चाहें फिर देश को बं..."


In [46]:
import pandas as pd
import google.generativeai as genai
import json
import time
from sklearn.metrics import accuracy_score, f1_score

In [44]:
from google.colab import userdata
genai.configure(api_key='api ')  # Replace with your actual key
model = genai.GenerativeModel('gemini-2.5-flash')  # Or 'gemini-1.5-pro' for higher quality (slower/more expensive)

# Load datasets
hasoc_en = pd.read_csv('hasoc_en_preprocessed.csv')
hasoc_hi = pd.read_csv('hasoc_hi_preprocessed.csv')

# Add language column if not present (for tracking in combined dataset)
if 'language' not in hasoc_en.columns:
    hasoc_en['language'] = 'en'
if 'language' not in hasoc_hi.columns:
    hasoc_hi['language'] = 'hi'


In [47]:



prompt_template = """
You are a toxicity classifier expert. The text may be in English, Hindi, or code-mixed (Hinglish). Analyze it based on its semantic meaning, overall intent, and impact, considering cultural nuances. Classify into these categories (multi-label: a text can have zero or more labels):

- toxic: rude, disrespectful, or unreasonable behavior that could make someone leave a discussion.
- severe_toxic: extremely hateful, aggressive, or vicious toxicity.
- obscene: contains foul language, swear words, or explicit content.
- threat: expresses intent to inflict pain, injury, damage, or harm.
- insult: demeans, disparages, or mocks a person or group.
- identity_hate: attacks or hates based on identity like race, religion, ethnicity, gender, orientation, etc.

Here are examples:

Example 1:
Text: "Hello, how are you?"
Output: {{"toxic": 0, "severe_toxic": 0, "obscene": 0, "threat": 0, "insult": 0, "identity_hate": 0}}

Example 2:
Text: "You fucking idiot, go die."
Output: {{"toxic": 1, "severe_toxic": 1, "obscene": 1, "threat": 1, "insult": 1, "identity_hate": 0}}

Example 3:
Text: "All Jews are greedy."
Output: {{"toxic": 1, "severe_toxic": 0, "obscene": 0, "threat": 0, "insult": 1, "identity_hate": 1}}

Example 4:
Text: "saari ladkiyan pagal hoti hai. Bilkul useless."
Output: {{"toxic": 1, "severe_toxic": 0, "obscene": 0, "threat": 0, "insult": 1, "identity_hate": 1}}

Example 5:
Text: "Bangaldeshi apne desh vapas jaa."
Output: {{"toxic": 1, "severe_toxic": 0, "obscene": 0, "threat": 0, "insult": 0, "identity_hate": 1}}

Example 6:
Text: "Tum bahut ache ho."
Output: {{"toxic": 0, "severe_toxic": 0, "obscene": 0, "threat": 0, "insult": 0, "identity_hate": 0}}

Output ONLY a JSON object with keys as the categories and values as 1 (yes) or 0 (no). No explanations or additional text.

Text: "{comment}"
"""

# Function to classify one comment
def classify_text(comment):
    prompt = prompt_template.format(comment=comment)
    try:
        response = model.generate_content(prompt)
        labels = json.loads(response.text.strip())
        return labels
    except Exception as e:
        print(f"Error classifying comment: {e}")
        return {cat: 0 for cat in ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']}

# Function to process a DataFrame
def process_df(df):
    new_labels = []
    for i, row in df.iterrows():
        labels = classify_text(row['comment'])
        new_labels.append(labels)
        if (i + 1) % 10 == 0:
            print(f"Processed {i + 1} samples...")
            time.sleep(5)  # Adjust for rate limits
    labels_df = pd.DataFrame(new_labels)
    return pd.concat([df, labels_df], axis=1)

# Process each
hasoc_en_aug = process_df(hasoc_en)
hasoc_hi_aug = process_df(hasoc_hi)

# Save individually
hasoc_en_aug.to_csv('hasoc_en_augmented.csv', index=False)
hasoc_hi_aug.to_csv('hasoc_hi_augmented.csv', index=False)

# Combine
hasoc_combined = pd.concat([hasoc_en_aug, hasoc_hi_aug], ignore_index=True)
hasoc_combined.to_csv('hasoc_combined_augmented.csv', index=False)

# Validation: Compare derived binary label to original HASOC label (assume 'task_1' column exists with 'HOF'/'NOT')
if 'task_1' in hasoc_combined.columns:
    # Map original to binary: 'HOF' -> 1, 'NOT' -> 0 (adjust if your encoding differs)
    hasoc_combined['original_hof'] = hasoc_combined['task_1'].apply(lambda x: 1 if x == 'HOF' else 0)

    # Derive binary from new labels: 1 if any toxicity category is 1
    toxicity_cols = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
    hasoc_combined['derived_hof'] = hasoc_combined[toxicity_cols].max(axis=1)

    # Compute metrics
    acc = accuracy_score(hasoc_combined['original_hof'], hasoc_combined['derived_hof'])
    f1 = f1_score(hasoc_combined['original_hof'], hasoc_combined['derived_hof'])
    print(f"Validation Agreement: Accuracy = {acc:.2f}, F1-Score = {f1:.2f}")
    # Save with validation columns
    hasoc_combined.to_csv('hasoc_combined_augmented_with_validation.csv', index=False)
else:
    print("No 'task_1' column found for validation. Manually review a sample instead.")

print("Processing complete. Augmented files saved.")

KeyboardInterrupt: 