In [1]:
import os
import re
import glob
import json
import itertools
from typing import List
import pandas as pd
from lingua import LanguageDetectorBuilder, Language 
import instructor
from vertexai.generative_models import GenerativeModel 
from pydantic import BaseModel
from tqdm import tqdm

# Google Cloud settings exported to the process environment before any client is built.
# NOTE(review): presumably read by the Vertex AI / GenAI client libraries — confirm.
os.environ.update({
    'GOOGLE_CLOUD_PROJECT': "terraform-training-387507",
    'GOOGLE_CLOUD_LOCATION': "global",
    'GOOGLE_GENAI_USE_VERTEXAI': "true",
})

In [None]:
data = pd.read_csv("../raw/lith_dataset_multi.csv")
ignored_cols = ['text']


def _active_labels(row):
    """Comma-join the names of the non-ignored columns whose value in `row` equals 1."""
    return ','.join(
        column for column, value in row.items()
        if column not in ignored_cols and value == 1
    )


# One comma-separated label string per row; an empty string means no label column was set.
data['labels'] = data.apply(_active_labels, axis=1)
selected = data['labels'].ne('')  # Select only hate comments
data = data[selected]

# Output directory for the generated batch files and the final CSV.
os.makedirs('lthate', exist_ok=True)

In [3]:
# Prompt template sent to the model once per batch; `{inputs}` is filled through
# `prompt.format(inputs=...)` with the batch's "Comment / Index / Labels" entries.
# NOTE(review): the text contains typos ("seperated", "into to"). They are left as-is
# because the string is sent verbatim and editing it changes what the model receives.
prompt = """
You are an expert in hate speech detection and generation for Lithuanian language.
Below there are instances of hate speech comments. Each of them can have one or more labels assigned to it, seperated by comma. This corresponds to multilabel classification task.
```
{inputs}
```
For each comment, generate at least 5 comments which have the same semantic meaning and would have the same labels assigned to them. Use rude, foul and explicit language where applicable. 
Very long comments should be summarized or paraphrased into to shorter ones. The generated comments should vary in tone, severity, or lexical variation. 
Return them in JSON format
"""

In [4]:
# Structured-output schema for one source comment: the generation loop requests a
# List[GenerateExamples] as `response_model` and dumps each item to a JSON file.
# NOTE(review): documented with `#` comments on purpose — a class docstring would
# presumably end up in the schema sent to the model; confirm before adding one.
class GenerateExamples(BaseModel):
    comment: str  # source comment text as returned by the model (presumably echoed from the prompt — verify)
    index: int  # presumably the "Index" value shown next to the comment in the prompt — verify
    label: List[str]  # labels of the comment; read later by extract_instances_df
    generated_comments: List[str]  # model-written variants; become the 'text' column downstream


# Wrap the Gemini model with instructor so that `client.create(...)` accepts a
# `response_model` and returns parsed objects (see the generation loop).
gemini_model = GenerativeModel("gemini-2.5-flash")
client = instructor.from_vertexai(
    client=gemini_model,
    mode=instructor.Mode.VERTEXAI_TOOLS,
)

In [5]:
# Generate paraphrases batch by batch. Each successful batch is written to its own
# file, lthate/index-<from>-<to>.json, so one failed request does not discard the rest.
batch_size = 100
start = 0
end = data.shape[0]  # lower this (e.g. 1500) for a partial run
failed_ranges = []  # (from_, to_) row ranges whose request or file write raised

# The bar tracks absolute row positions: it starts at `start` and is full at `end`.
# (`total=end - start` together with `initial=start` is only consistent when start == 0.)
with tqdm(total=end, initial=start) as pbar:
    for val in range(start, end, batch_size):
        from_ = val
        to_ = min(val + batch_size, end)
        pbar.set_description(f"Processing {from_} - {to_}")
        # `i` is the DataFrame index label of the row, not its position in the batch.
        inputs = "".join([f"""
    Comment: {x['text']}
    Index: {i} 
    Labels: {x["labels"]}
    """ for i, x in data[from_:to_].iterrows()])
        try:
            resp = client.create(
                messages=[{"role": "user", "content": prompt.format(inputs=inputs)}],
                response_model=List[GenerateExamples],
            )
            with open(f"lthate/index-{from_}-{to_}.json", "w", encoding='utf-8') as f:
                json.dump([json.loads(res.model_dump_json()) for res in resp], f, ensure_ascii=False)
        except Exception as e:
            # Best effort: report the failure, remember the range, keep going.
            print(f"Error while processing range {from_} - {to_}:", e.__str__())
            failed_ranges.append((from_, to_))
        # Advance by the rows actually covered; the final batch may be shorter than
        # batch_size, and updating by batch_size pushed the bar past its total.
        pbar.update(to_ - from_)

if failed_ranges:
    print(f"{len(failed_ranges)} batch(es) failed and have no output file:", failed_ranges)
    

Processing 0 - 100:   0%|          | 0/1663 [00:00<?, ?it/s]

Processing 1600 - 1663: : 1700it [1:33:55,  3.31s/it]                        


In [6]:
# Batch files written by the generation loop. glob returns paths in arbitrary order,
# so sort them to keep the row order of the combined output reproducible across runs.
files = sorted(glob.glob("lthate/*.json"))

def extract_instances_df(instance_data):
    """Expand one generated record into rows of (text, target, value).

    `instance_data` is a mapping with a 'label' list and a 'generated_comments'
    list. Every generated comment is paired with every label; when there are no
    labels each comment is emitted once with a `None` target. `value` is 1 when
    the record has at least one label and 0 otherwise.
    """
    labels = instance_data['label']
    targets = labels if labels else [None]
    rows = [
        (comment, target)
        for comment in instance_data['generated_comments']
        for target in targets
    ]
    instance_df = pd.DataFrame(data=rows, columns=['text', 'target'])
    instance_df['value'] = int(len(labels) > 0)
    return instance_df

def process_file(filename):
    """Load one batch JSON file and return its generated comments as a DataFrame.

    The file is expected to hold a JSON list of records; each record is expanded
    with `extract_instances_df` and the per-record frames are concatenated.

    Returns a DataFrame with columns 'text', 'target' and 'value'. A file that
    holds an empty list yields an empty frame with those columns instead of
    raising, because `pd.concat` rejects an empty sequence with ValueError.
    """
    with open(filename, "r", encoding='utf-8') as f:
        retrieved = json.load(f)
    if not retrieved:
        # Nothing was generated for this batch; keep the schema so callers can still concat.
        return pd.DataFrame(columns=['text', 'target', 'value'])
    return pd.concat([extract_instances_df(instance) for instance in retrieved])

In [7]:
# Detector built over every language lingua supports; used to drop generated
# comments that are not detected as Lithuanian.
detector = LanguageDetectorBuilder.from_all_languages().build()

# Combine the per-file frames, tag each row with its detected language, keep the
# Lithuanian rows and drop the helper column again.
generated_frames = [process_file(path) for path in files]
retrieved_df = pd.concat(generated_frames)
retrieved_df['language'] = retrieved_df.apply(lambda row: detector.detect_language_of(row['text']), axis=1)
is_lithuanian = retrieved_df['language'] == Language.LITHUANIAN
retrieved_df = retrieved_df[is_lithuanian].drop(columns='language')

In [8]:
def decensore(text):
    """Replace asterisk-masked words in `text` with their spelled-out forms.

    Each masked spelling is matched literally and case-insensitively, and every
    match is replaced by the fixed replacement string. The substitutions are
    applied in the order listed below. Returns the resulting string.
    """
    masked_to_plain = (
        ('n***ui', 'nachui'),
        ('n**ui', 'nahui'),
        ('b**t', 'blet'),
        ('p**ti', 'pisti'),
        ('py*****', 'pyderai'),
        ('n*ger', 'niger'),
        ('š*d', 'šūd'),
    )
    for masked, plain in masked_to_plain:
        # The asterisks are literal characters of the mask, not regex quantifiers.
        pattern = re.compile(re.escape(masked), re.IGNORECASE)
        text = pattern.sub(plain, text)
    return text

# Spell out the masked words in every generated comment, then write the result
# to CSV without the index column.
uncensored_text = retrieved_df['text'].map(decensore)
retrieved_df['text'] = uncensored_text
retrieved_df.to_csv("lthate/generated.csv", index=False)