In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
# Download the two NLTK resources requested by name below. The stop-word corpus
# and the WordNet lemmatizer imported above are used by the text-cleaning cell.
nltk.download('stopwords')
nltk.download('wordnet')

In [None]:
# --- Split configuration ---
PATH = './data/ruddit.csv'
TRAIN_RATIO = 0.75
# NOTE(review): this is the integer 1 and it is forwarded as `test_size` below.
# scikit-learn documents an int `test_size` as an absolute number of rows, so
# `valid_raw` would hold a single example - confirm that this is intended.
TEST_VAL_RATIO = 1

dataset = pd.read_csv(PATH)

# First cut: the training portion versus everything else.
x_train, x_rest, y_train, y_rest = train_test_split(
    dataset["comment_text"],
    dataset['offensiveness_score'],
    train_size=TRAIN_RATIO,
    random_state=0,
)
# Second cut: divide the remainder into the test and validation portions.
x_test, x_valid, y_test, y_valid = train_test_split(
    x_rest,
    y_rest,
    test_size=TEST_VAL_RATIO,
    random_state=0,
)

# One frame per split, each with a fresh 0..n-1 index and float32 scores.
train_raw, test_raw, valid_raw = (
    pd.DataFrame({
        'text': texts.reset_index(drop=True),
        'score': scores.reset_index(drop=True).astype('float32'),
    })
    for texts, scores in ((x_train, y_train), (x_test, y_test), (x_valid, y_valid))
)

# Drop the intermediate series; only the three *_raw frames are used from here on.
del x_train, x_test, x_valid, y_train, y_test, y_valid, x_rest, y_rest
len(train_raw), len(test_raw), len(valid_raw)

In [None]:
# Shared NLTK helpers read by preprocess_text.
stop_words = set(stopwords.words('english'))
wl = WordNetLemmatizer()


def preprocess_text(text):
    """Normalise one comment into a space-separated string of lemmas.

    Every character outside a-z / A-Z is replaced by a space, the result is
    lower-cased and split on whitespace. Tokens present in ``stop_words`` are
    dropped and the remaining tokens are lemmatized with ``wl``.

    Args:
        text: the comment as a string.

    Returns:
        The kept, lemmatized tokens joined by single spaces.
    """
    letters_only = re.sub('[^a-zA-Z]', ' ', text)
    kept = []
    for token in letters_only.lower().split():
        if token in stop_words:
            continue
        kept.append(wl.lemmatize(token))
    return ' '.join(kept)

def clean_dataset(ds):
    """Return a new frame whose 'text' column has been run through preprocess_text.

    The input frame is not modified: ``DataFrame.assign`` builds a new frame,
    so callers do not need to pass a defensive copy.

    Args:
        ds: DataFrame with a 'text' column of strings.

    Returns:
        A DataFrame with the same columns as ``ds`` and cleaned 'text'.
    """
    return ds.assign(text=ds['text'].apply(preprocess_text))


# The *_raw frames keep the original comments; these hold the cleaned text.
train = clean_dataset(train_raw)
test = clean_dataset(test_raw)
valid = clean_dataset(valid_raw)

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, DataCollatorWithPadding
from torch.utils.data import DataLoader

# Checkpoint to fine-tune and the hyper-parameters read by the training cells below.
BASE_MODEL = "bert-base-cased"
LEARNING_RATE = 2e-5
MAX_LENGTH = 256
BATCH_SIZE = 8
EPOCHS = 5

# Tokenizer and model are both loaded from the same checkpoint name.
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)
# num_labels=1 requests a single output value; RegressionTrainer trains it with an MSE loss.
model = AutoModelForSequenceClassification.from_pretrained(BASE_MODEL, num_labels=1)

In [None]:

import torch

class RudditDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer output with one score per example.

    Args:
        encodings: mapping of feature name -> per-example sequences (anything
            exposing ``.items()``, such as the object returned by the tokenizer).
        labels: one target per example; a list, array or pandas Series.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        # ``series[idx]`` looks rows up by index *label*, so a Series whose index
        # is not 0..n-1 would raise KeyError or return the wrong row. Converting
        # to an array makes every lookup positional and keeps the dtype.
        self.labels = labels.to_numpy() if hasattr(labels, "to_numpy") else labels

    def __getitem__(self, idx):
        """Return the features of example ``idx`` plus its target under 'labels'."""
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        """Number of examples, taken from the label container."""
        return len(self.labels)


In [None]:
# Tokenize the cleaned text so that inputs and labels come from the same frames
# (`train` / `valid` / `test`), matching the "bert-cleaned-dataset-regression"
# output directory used by the training arguments for this run.
# max_length=MAX_LENGTH makes truncation honour the configured limit.
train_encodings = tokenizer(list(train['text']), truncation=True, padding=True, max_length=MAX_LENGTH)
val_encodings = tokenizer(list(valid['text']), truncation=True, padding=True, max_length=MAX_LENGTH)
test_encodings = tokenizer(list(test['text']), truncation=True, padding=True, max_length=MAX_LENGTH)

# Wrap each split's encodings together with its scores.
train_dataset = RudditDataset(train_encodings, train['score'])
val_dataset = RudditDataset(val_encodings, valid['score'])
test_dataset = RudditDataset(test_encodings, test['score'])

In [None]:
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

def compute_metrics_for_regression(eval_pred):
    """Compute regression metrics for a ``(predictions, labels)`` pair.

    Args:
        eval_pred: two-item iterable of arrays, predictions first, labels second.
            Assumes predictions are shaped (n, 1) - TODO confirm against the caller.

    Returns:
        dict with "mse", "mae", "r2" and "accuracy". Accuracy is the fraction of
        examples whose squared error is below 0.25, i.e. whose absolute error
        is below 0.5.
    """
    predictions, targets = eval_pred
    # Turn the labels into a single column before comparing with the predictions.
    targets = targets.reshape(-1, 1)

    mse = mean_squared_error(targets, predictions)
    mae = mean_absolute_error(targets, predictions)
    r2 = r2_score(targets, predictions)

    # Per-example squared errors as plain Python floats.
    squared_errors = ((predictions - targets).flatten() ** 2).tolist()
    hits = 0
    for err in squared_errors:
        if err < 0.25:
            hits += 1
    accuracy = hits / len(squared_errors)

    return {"mse": mse, "mae": mae, "r2": r2, "accuracy": accuracy}

In [None]:
from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(
    # Where checkpoints for this run are written.
    output_dir="./models/bert-cleaned-dataset-regression",
    # Optimisation settings, taken from the configuration constants.
    num_train_epochs=EPOCHS,
    learning_rate=LEARNING_RATE,
    weight_decay=0.01,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    # Evaluate and save once per epoch, then reload the checkpoint that scored
    # best on the "accuracy" metric.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=10,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
)

In [None]:
from transformers import TrainingArguments, Trainer

import torch

class RegressionTrainer(Trainer):
    """Trainer whose loss is the mean-squared error on the model's first output column."""

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Return the MSE between the first output column and the batch labels.

        Args:
            model: the model being trained; called as ``model(**inputs)``.
            inputs: batch mapping. Its "labels" entry is removed before the
                forward pass so the model does not receive it.
            return_outputs: when true, return ``(loss, outputs)`` instead of the loss.
            num_items_in_batch: accepted for compatibility with Trainer versions
                that pass this keyword; not used by the MSE computation.

        Returns:
            The loss tensor, or a ``(loss, outputs)`` tuple.
        """
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        # First element of the output, first column: one value per example.
        logits = outputs[0][:, 0]
        # Align the label dtype with the logits so a float64 or half-precision
        # mismatch does not break the loss or its backward pass.
        loss = torch.nn.functional.mse_loss(logits, labels.to(logits.dtype))
        return (loss, outputs) if return_outputs else loss

In [None]:

# Fine-tune on the training split. The validation split is the trainer's
# evaluation set and is scored with compute_metrics_for_regression.
trainer = RegressionTrainer(
    compute_metrics=compute_metrics_for_regression,
    args=training_args,
    model=model,
    eval_dataset=val_dataset,
    train_dataset=train_dataset,
)
trainer.train()

In [None]:
# Score the fine-tuned model on the test split. The dataset is passed to
# evaluate() rather than assigned to trainer.eval_dataset, so the trainer keeps
# its validation set and a later train() call cannot select checkpoints on test data.
trainer.evaluate(eval_dataset=test_dataset)

In [None]:
# NOTE(review): this cell repeats the earlier trainer construction and train() call
# with the same `model`, `training_args` and datasets. When the cells are run in
# order, `model` has already been fine-tuned by that point, so this continues
# training it and writes to the same output directory. Confirm whether this cell
# was meant to be a separate experiment or is a leftover.
trainer = RegressionTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics_for_regression,
)
trainer.train()

In [None]:
# Checkpoint names fine-tuned one after another by the comparison loop below.
LLM_Models = [
    'GroNLP/hateBERT',
    'unitary/unbiased-toxic-roberta',
    'martin-ha/toxic-comment-model',
    'unitary/toxic-bert',
]

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, DataCollatorWithPadding
from transformers import TrainingArguments, Trainer
from torch.utils.data import DataLoader
import torch

# Hyper-parameters shared by every run in this cell. They are set once instead
# of on every loop iteration and replace the values used for the earlier run.
LEARNING_RATE = 2e-5
MAX_LENGTH = 256
BATCH_SIZE = 8
EPOCHS = 7

# Test-split metrics of every run, keyed by (model name, output-dir suffix).
test_metrics = {}

for model_name in LLM_Models:
    # BASE_MODEL = "bert-base-cased"
    print(f"Training model {model_name}")

    tokenizer = AutoTokenizer.from_pretrained(model_name)

    # Cleaned text, truncated to the configured MAX_LENGTH.
    train_encodings = tokenizer(list(train['text']), truncation=True, padding=True, max_length=MAX_LENGTH)
    val_encodings = tokenizer(list(valid['text']), truncation=True, padding=True, max_length=MAX_LENGTH)
    test_encodings = tokenizer(list(test['text']), truncation=True, padding=True, max_length=MAX_LENGTH)

    # Raw (uncleaned) text, same settings.
    train_raw_encodings = tokenizer(list(train_raw['text']), truncation=True, padding=True, max_length=MAX_LENGTH)
    val_raw_encodings = tokenizer(list(valid_raw['text']), truncation=True, padding=True, max_length=MAX_LENGTH)
    test_raw_encodings = tokenizer(list(test_raw['text']), truncation=True, padding=True, max_length=MAX_LENGTH)

    train_dataset = RudditDataset(train_encodings, train['score'])
    val_dataset = RudditDataset(val_encodings, valid['score'])
    test_dataset = RudditDataset(test_encodings, test['score'])

    # Each raw dataset takes its labels from the frame its text came from.
    train_raw_dataset = RudditDataset(train_raw_encodings, train_raw['score'])
    val_raw_dataset = RudditDataset(val_raw_encodings, valid_raw['score'])
    test_raw_dataset = RudditDataset(test_raw_encodings, test_raw['score'])

    # One run on cleaned text and one on raw text; the suffix is the last
    # component of the run's output directory.
    runs = [
        ("-regression", train_dataset, val_dataset, test_dataset),
        ("-raw-regression", train_raw_dataset, val_raw_dataset, test_raw_dataset),
    ]

    for suffix, run_train, run_valid, run_test in runs:
        # Reload the checkpoint for every run so the raw-text run does not start
        # from weights already fine-tuned on the cleaned text.
        # ignore_mismatched_sizes lets a checkpoint whose classification head has
        # a different number of outputs load with a freshly initialised 1-output head.
        model = AutoModelForSequenceClassification.from_pretrained(
            model_name,
            num_labels=1,
            ignore_mismatched_sizes=True,
        )

        training_args = TrainingArguments(
            output_dir=f"./models/{model_name}/{suffix}",
            learning_rate=LEARNING_RATE,
            per_device_train_batch_size=BATCH_SIZE,
            per_device_eval_batch_size=BATCH_SIZE,
            num_train_epochs=EPOCHS,
            evaluation_strategy="epoch",
            save_strategy="epoch",
            save_total_limit=10,
            metric_for_best_model="accuracy",
            load_best_model_at_end=True,
            weight_decay=0.01,
        )

        trainer = RegressionTrainer(
            model=model,
            args=training_args,
            train_dataset=run_train,
            eval_dataset=run_valid,
            compute_metrics=compute_metrics_for_regression,
        )
        trainer.train()

        # Keep the test metrics instead of discarding evaluate()'s return value.
        test_metrics[(model_name, suffix)] = trainer.evaluate(eval_dataset=run_test)

        torch.cuda.empty_cache()

test_metrics


In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, DataCollatorWithPadding
from torch.utils.data import DataLoader

# Reload the tokenizer and model from a local directory rather than a checkpoint name.
# NOTE(review): no cell visible in this part of the notebook writes to this
# directory (the BERT training arguments above use
# "./models/bert-cleaned-dataset-regression") - confirm the path exists and
# contains both tokenizer and model files before running this cell.
BASE_MODEL = "./models/bert-raw-dataset-regression"

tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)
model = AutoModelForSequenceClassification.from_pretrained(BASE_MODEL, num_labels=1)

Loading the checkpoints: