In [1]:
%load_ext autoreload
%autoreload 2

In [3]:
#libraries
from sklearn.metrics import classification_report, confusion_matrix, precision_recall_fscore_support
import re
import logging
import time
import polars as pl

#setup
logging.basicConfig(level=logging.INFO)

#group library
from label_legends.preprocess import holdout, transform, load_data
from label_legends.util import CONLL_DIR, ROOT, RESOURCE

In [4]:
def evaluate_baseline(true_labels, predictions, baseline_name):
    """Print macro-averaged precision/recall/F1 and the confusion matrix.

    Args:
        true_labels: Ground-truth labels.
        predictions: Predicted labels, aligned with ``true_labels``.
        baseline_name: Display name used in the log line and report header.

    Returns:
        None. The report is written to stdout.
    """
    logging.info(f"Evaluating {baseline_name}...")

    # The fourth element returned by sklearn (support) is not needed here.
    macro_scores = precision_recall_fscore_support(
        true_labels, predictions, average="macro"
    )
    macro_precision, macro_recall, macro_f1 = macro_scores[:3]

    matrix = confusion_matrix(true_labels, predictions)

    # Report
    print(f"\n{baseline_name} Evaluation Metrics:")
    print(f"Precision (Macro Avg): {macro_precision:.4f}")
    print(f"Recall (Macro Avg): {macro_recall:.4f}")
    print(f"F1 Score (Macro Avg): {macro_f1:.4f}")
    print("\nConfusion Matrix:")
    print(matrix)

In [5]:
# Load the dataset.
logging.info("Loading data...")
# NOTE(review): the collected result is discarded here - presumably the call
# only verifies/warms up the data source; confirm it is still needed.
load_data().collect()

# Split, then transform each split.
# NOTE(review): unpacked as (val, tra) - confirm this matches holdout()'s
# return order.
val, tra = holdout()
logging.info("Transforming data...")
tra = transform(tra)
val = transform(val)

# Cast the 'label' column of both splits to 32-bit integers.
logging.info("Converting labels to integers...")
label_as_int = pl.col("label").cast(pl.Int32)
tra = tra.with_columns(label_as_int)
val = val.with_columns(label_as_int)

INFO:root:Loading data...
INFO:root:Transforming data...
INFO:root:Converting labels to integers...


In [6]:
# Display the transformed training split to inspect its columns and dtypes.
tra

id,text,tokens,token_ids,label
i64,str,list[str],list[i64],i32
26016,"""#VoteDemOut seditionist wanna …","[""#"", ""votedemout"", … ""ven""]","[7, 3000, … 3000]",0
41060,"""Cerno hops from wave to wave: …","[""cerno"", ""hop"", … "".""]","[3000, 3000, … 26]",1
35766,"""think about it.. he´s called J…","[""think"", "".."", … ""elect""]","[2688, 27, … 911]",0
23678,"""Hmm, you could rewrite this wi…","[""hmm"", "","", … ""justsaying""]","[1284, 22, … 3000]",0
52396,"""Every girl in a game, or on si…","[""girl"", ""game"", … "".""]","[1164, 1135, … 26]",0
…,…,…,…,…
6785,"""I just think my dick couldn't …","[""just"", ""think"", … ""..😂😂😂""]","[1482, 2688, … 3000]",0
31355,"""no he deadasss hit the nail on…","[""deadass"", ""hit"", … "".""]","[3000, 1282, … 26]",0
49834,"""Aren't these goblins always wh…","[""goblins"", ""whine"", … ""hastagsarestillcool""]","[3000, 2914, … 3000]",0
26158,"""1. Stop calling girls, women a…","[""1."", ""stop"", … "".""]","[39, 2567, … 26]",0


In [12]:
# Materialise the text and label columns of each split as plain Python lists.
train_texts, train_labels = (tra[column].to_list() for column in ("text", "label"))

val_texts, val_labels = (val[column].to_list() for column in ("text", "label"))

In [13]:
#starting the baseline
# Wall-clock start of the "training" phase; the matching stop
# (train_time_regex) is computed in a later cell.
# NOTE(review): because start and stop live in different cells, the measured
# time also includes any delay between executing those cells.
start_train_time = time.time()

def load_bad_words(file_path):
    """Load bad words from a .txt file, one word per line.

    Args:
        file_path: Path (str or path-like) of a UTF-8 encoded text file.

    Returns:
        list: The stripped lines in file order. Blank and whitespace-only
        lines are skipped, because an empty entry joined into a regex
        alternation would match everywhere.
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        # Iterate the handle directly instead of readlines() so the whole
        # file is not buffered as a second list first.
        stripped_lines = (line.strip() for line in file)
        return [word for word in stripped_lines if word]

In [14]:
#loading the bad words - their presence indicates whether a comment is classified as sexist or not
# Empty entries are dropped: an empty alternative would let the pattern match
# at every word boundary and flag practically every comment.
bad_words = [word for word in load_bad_words(RESOURCE / "compiled_bad_words.txt") if word]
if not bad_words:
    raise ValueError("compiled_bad_words.txt contains no usable entries")

#creating a regex pattern to match any bad word
# Lookarounds are used instead of \b: \b needs a word character on one side,
# so entries that start or end with punctuation (e.g. "a$$") would otherwise
# fail to match as standalone tokens. For entries that start and end with a
# word character the two forms are equivalent.
bad_words_pattern = r'(?<!\w)(' + '|'.join(re.escape(word) for word in bad_words) + r')(?!\w)'

#end of training
# NOTE(review): start_train_time is set in an earlier cell, so this value also
# includes any delay between executing the two cells.
train_time_regex = time.time() - start_train_time

In [15]:
def classify_with_bad_words(text, pattern):
    """Flag a text that matches the bad-word pattern.

    Args:
        text: String to search.
        pattern: Regular-expression source string; searched case-insensitively.

    Returns:
        int: 1 when the pattern is found anywhere in ``text``, otherwise 0.
    """
    hit = re.search(pattern, text, re.IGNORECASE)
    if hit is None:
        return 0
    return 1

In [16]:
# Time the prediction pass over the validation texts.
start_test_time = time.time()

# One 0/1 prediction per validation text, in input order.
val_pred_regex = list(
    classify_with_bad_words(comment, bad_words_pattern) for comment in val_texts
)

test_time_regex = time.time() - start_test_time

In [17]:
# Report metrics for the regex baseline, followed by the measured timings.
evaluate_baseline(val_labels, val_pred_regex, "Regex Baseline")
print(
    f"Training Time: {train_time_regex:.4f} seconds",
    f"Test Time: {test_time_regex:.4f} seconds",
    sep="\n",
)

INFO:root:Evaluating Regex Baseline...



Regex Baseline Evaluation Metrics:
Precision (Macro Avg): 0.5782
Recall (Macro Avg): 0.6022
F1 Score (Macro Avg): 0.5606

Confusion Matrix:
[[5503 3873]
 [1233 1991]]
Training Time: 3.4549 seconds
Test Time: 6.8328 seconds
