# Algorithmic analysis of moderation

In this notebook we train a model to predict whether a song will be flagged as violating OpenAI's policies. The input is the song's lyrics and the target is the moderation flag. We fine-tune a pretrained multilingual BERT model (`bert-base-multilingual-cased`) on our data and evaluate it with 10-fold cross-validation.

We evaluate the model using precision, recall, and F1 score. We also create a random baseline to compare the model against: for each fold, it labels every validation song as flagged with probability equal to the proportion of flagged songs in that fold's training set.

We are following the [Hugging Face tutorial](https://huggingface.co/docs/transformers/training) for training a model on a custom dataset.

In [1]:
import sys

# Make the project's ``src`` directory importable. The path is relative, so it
# resolves against the kernel's working directory.
# NOTE(review): presumably this is what makes ``utils.config`` (imported below)
# resolvable - confirm the notebook is run from its own directory.
sys.path.append("../src")

In [2]:
import gc
import json
import zipfile
from datetime import datetime

import evaluate
import numpy as np
import pandas as pd
import torch
from datasets import Dataset
from sklearn.model_selection import KFold
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    Trainer,
    TrainingArguments,
)
from utils.config import PROJECT_ROOT, DATA_DIR

In [3]:
# Training artefacts: Trainer output goes to EXP_DIR / "results" and the saved
# model of each fold to EXP_DIR / "models".
EXP_DIR = PROJECT_ROOT / "experiments"
# Result files written by this notebook (per-fold JSONL records, aggregated CSV).
EXP_DATA_DIR = DATA_DIR / "experiments"

In [4]:
# Hugging Face model identifier. It is used to load both the classifier and the
# tokenizer, and is recorded in the "model" field of every result record.
MODEL_NAME = "bert-base-multilingual-cased"
# One TrainingArguments instance shared by every cross-validation fold.
TRAIN_ARGS = TrainingArguments(
    output_dir=EXP_DIR / "results",
    num_train_epochs=2,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    warmup_ratio=0.1,
    weight_decay=0.01,
    evaluation_strategy="epoch",  # evaluate on the validation set after each epoch
)

In [5]:
# Metric objects from the ``evaluate`` library, loaded once and reused both by
# ``compute_metrics`` (model evaluation) and by the random baseline in ``main``.
f1_score = evaluate.load("f1")
precision = evaluate.load("precision")
recall = evaluate.load("recall")


def compute_metrics(eval_pred):
    """
    Score one evaluation pass with F1, precision and recall.

    ``eval_pred`` unpacks into ``(predictions, labels)``. The predicted class of
    each example is the argmax over axis 1 of ``predictions``. The outputs of the
    three metrics are merged into a single dict, which is returned.
    """
    scores, labels = eval_pred
    predicted_classes = scores.argmax(axis=1)

    return {
        name: value
        for metric in (f1_score, precision, recall)
        for name, value in metric.compute(
            predictions=predicted_classes, references=labels
        ).items()
    }

In [6]:
def get_gen(df: pd.DataFrame, zip_path=None):
    """
    Build a generator function that yields the lyrics and label of each song.

    Parameters
    ----------
    df:
        Dataframe with a ``song_id`` and a ``flagged`` column. Other columns are
        ignored.
    zip_path:
        Path of the archive that holds one ``<song_id>.txt`` file per song.
        Defaults to ``DATA_DIR / "lyrics.zip"``.

    Returns
    -------
    A zero-argument callable. Each call opens the archive and yields one
    ``{"text": <lyrics>, "labels": <0 or 1>}`` dict per row of ``df``, in row
    order.

    Raises
    ------
    KeyError
        While iterating, if the archive has no ``<song_id>.txt`` entry for a row.
    """
    if zip_path is None:
        zip_path = DATA_DIR / "lyrics.zip"

    def gen():
        with zipfile.ZipFile(zip_path) as zf:
            # Iterate over the two columns directly instead of df.iterrows():
            # iterrows() coerces every row to a single dtype, so an integer
            # song_id sitting next to a float column would be rendered as
            # "123.0" and the archive lookup would fail.
            for song_id, flagged in zip(df["song_id"], df["flagged"]):
                lyrics = zf.read(f"{song_id}.txt").decode("utf-8")
                yield {"text": lyrics, "labels": int(flagged)}

    return gen

In [7]:
def get_ds(df: pd.DataFrame, tokenizer):
    """
    Build a tokenized, torch-formatted dataset from a dataframe of songs.

    The lyrics and labels are produced by ``get_gen(df)``. Each text is
    tokenized with ``tokenizer``, truncated and padded to the maximum length,
    and the raw ``text`` column is dropped, so the result keeps ``labels`` plus
    whatever fields the tokenizer returns.

    Parameters
    ----------
    df:
        Dataframe passed on to ``get_gen``.
    tokenizer:
        Callable tokenizer applied to a list of texts.

    Returns
    -------
    The dataset, with its format set to ``"torch"``.
    """

    def tokenize(batch):
        return tokenizer(batch["text"], padding="max_length", truncation=True)

    ds = Dataset.from_generator(get_gen(df))
    # batched=True hands lists of texts to the tokenizer instead of one example
    # per call. The resulting columns are the same, but tokenization is much
    # faster. The text column is dropped in the same pass.
    ds = ds.map(tokenize, batched=True, remove_columns=["text"])
    ds.set_format("torch")
    return ds

In [8]:
def load_model_and_tokenizer():
    """
    Instantiate a fresh pretrained classifier and its tokenizer.

    Both are loaded from ``MODEL_NAME``; the classifier is created with two
    labels. Returns the pair ``(model, tokenizer)``.
    """
    model = AutoModelForSequenceClassification.from_pretrained(
        MODEL_NAME, num_labels=2
    )
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
    return model, tokenizer

In [9]:
def run_training(train_df: pd.DataFrame, valid_df: pd.DataFrame) -> dict:
    """
    Fine-tune a fresh model on one fold and evaluate it on the held-out part.

    Parameters
    ----------
    train_df:
        Songs used for training.
    valid_df:
        Songs used for the per-epoch evaluation and for the final evaluation.

    Returns
    -------
    The dict returned by ``Trainer.evaluate`` on the validation dataset.

    Side effects
    ------------
    Saves the trained model to a new timestamped directory under
    ``EXP_DIR / "models"``.
    """
    # Release whatever is left over from an earlier call before loading a new
    # model.
    gc.collect()
    torch.cuda.empty_cache()

    model, tokenizer = load_model_and_tokenizer()
    train_dataset = get_ds(train_df, tokenizer)
    valid_dataset = get_ds(valid_df, tokenizer)

    trainer = Trainer(
        model=model,
        args=TRAIN_ARGS,
        train_dataset=train_dataset,
        eval_dataset=valid_dataset,
        compute_metrics=compute_metrics,
    )
    trainer.train()

    # datetime.isoformat() contains ':' characters, which are not allowed in
    # Windows file names. Use a filesystem-safe timestamp instead; microseconds
    # keep directory names unique across folds.
    timestamp = datetime.now().strftime("%Y-%m-%dT%H-%M-%S-%f")
    trainer.save_model(str(EXP_DIR / "models" / timestamp))

    return trainer.evaluate(valid_dataset)

In [13]:
def _append_jsonl(path, record: dict) -> None:
    """Append ``record`` to the file at ``path`` as a single JSON line."""
    with open(path, "a") as f:
        f.write(json.dumps(record) + "\n")


def _random_baseline(train_df: pd.DataFrame, valid_df: pd.DataFrame, rng) -> dict:
    """Score random predictions that flag with the training set's flagged rate."""
    p_flagged = train_df.flagged.mean()
    predictions = rng.binomial(1, p_flagged, size=len(valid_df))
    references = valid_df.flagged.to_numpy().astype(int)

    results = {}
    for metric_fct in (f1_score, precision, recall):
        metric = metric_fct.compute(predictions=predictions, references=references)
        # Prefix with "eval_" so the baseline uses the same column names as the
        # model results when both are aggregated.
        results.update({f"eval_{key}": val for key, val in metric.items()})
    results["model"] = "random_baseline"
    return results


def main(seed: int = 0):
    """
    Run the training and evaluation for all folds.

    For each of the 10 folds, a model is trained and evaluated with
    ``run_training`` and a random baseline is scored on the same validation
    split. Both result records are appended to
    ``EXP_DATA_DIR / "exp_PD_001_AlgorithmicAnalysis.jsonl"`` as soon as they
    are available, so completed folds survive a later crash.

    Parameters
    ----------
    seed:
        Seed for the fold split and for the random baseline. The default keeps
        the original split (``random_state=0``).

    Notes
    -----
    The results file is opened in append mode. Running ``main`` again adds a
    second set of records; remove the file first if a clean aggregation is
    wanted.
    """
    df = pd.read_csv(DATA_DIR / "csv" / "moderation.csv")
    assert not df.song_id.duplicated().any(), "Duplicate song IDs found"

    results_path = EXP_DATA_DIR / "exp_PD_001_AlgorithmicAnalysis.jsonl"
    # Create the output directory up front so a missing directory fails here
    # rather than after the first fold has finished training.
    results_path.parent.mkdir(parents=True, exist_ok=True)

    # One seeded generator for all folds: the baseline is reproducible, yet each
    # fold still draws different predictions.
    rng = np.random.default_rng(seed)

    kf = KFold(n_splits=10, shuffle=True, random_state=seed)
    for fold, (train_index, valid_index) in enumerate(kf.split(df), 1):
        train_df = df.iloc[train_index]
        valid_df = df.iloc[valid_index]

        # bert training
        results = run_training(train_df, valid_df)
        results["model"] = MODEL_NAME
        results["fold"] = fold
        _append_jsonl(results_path, results)

        # random baseline
        results_baseline = _random_baseline(train_df, valid_df, rng)
        results_baseline["fold"] = fold
        _append_jsonl(results_path, results_baseline)

In [14]:
%%capture

# Run training and create the baseline. Results are saved to a jsonl file.
# NOTE: main() opens that file in append mode, so running this cell again adds a
# second set of records and the aggregation below will mix both runs. Delete the
# file first when a clean re-run is intended.
main()

In [31]:
# Aggregate the per-fold records: mean and standard deviation of each metric,
# one column per model.
results_df = (
    pd.read_json(EXP_DATA_DIR / "exp_PD_001_AlgorithmicAnalysis.jsonl", lines=True)
    .loc[:, ["model", "eval_f1", "eval_precision", "eval_recall"]]
    .groupby("model")
    .agg(["mean", "std"])
    .transpose()
)
results_df

Unnamed: 0,model,bert-base-multilingual-cased,random_baseline
eval_f1,mean,0.662586,0.289544
eval_f1,std,0.016042,0.023108
eval_precision,mean,0.66456,0.29023
eval_precision,std,0.027976,0.02465
eval_recall,mean,0.663373,0.289267
eval_recall,std,0.041551,0.024524


In [32]:
# Persist the aggregated table. reset_index() turns the (metric, statistic) row
# index into ordinary columns.
# NOTE(review): to_csv also writes the new default integer index as an unnamed
# first column; pass index=False if that column is not wanted.
results_df.reset_index().to_csv(EXP_DATA_DIR / "exp_PD_001_AlgorithmicAnalysis.csv")