In [15]:
import sys
from pathlib import Path
ROOT_DIR = Path().resolve().parents[0]
sys.path.append(str(ROOT_DIR))
import config as cfg

import pandas as pd
from skmultilearn.model_selection import IterativeStratification
from transformers import AutoTokenizer
import numpy as np
from datasets import Dataset, load_from_disk
import os

# Load data

In [2]:
df_train = pd.read_csv(cfg.PATH_DF_TRAIN)
df_train

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0
...,...,...,...,...,...,...,...,...
159566,ffe987279560d7ff,""":::::And for the second time of asking, when ...",0,0,0,0,0,0
159567,ffea4adeee384e90,You should be ashamed of yourself \n\nThat is ...,0,0,0,0,0,0
159568,ffee36eab5c267c9,"Spitzer \n\nUmm, theres no actual article for ...",0,0,0,0,0,0
159569,fff125370e4aaaf3,And it looks like it was actually you who put ...,0,0,0,0,0,0


In [3]:
df_test = pd.read_csv(cfg.PATH_DF_TEST)
df_test

Unnamed: 0,id,comment_text
0,00001cee341fdb12,Yo bitch Ja Rule is more succesful then you'll...
1,0000247867823ef7,== From RfC == \n\n The title is fine as it is...
2,00013b17ad220c46,""" \n\n == Sources == \n\n * Zawe Ashton on Lap..."
3,00017563c3f7919a,":If you have a look back at the source, the in..."
4,00017695ad8997eb,I don't anonymously edit articles at all.
...,...,...
153159,fffcd0960ee309b5,". \n i totally agree, this stuff is nothing bu..."
153160,fffd7a9a6eb32c16,== Throw from out field to home plate. == \n\n...
153161,fffda9e8d6fafa9e,""" \n\n == Okinotorishima categories == \n\n I ..."
153162,fffe8f1340a79fc2,""" \n\n == """"One of the founding nations of the..."


# Preprocessing

## Stratification

In [4]:
for col in df_train.columns[2:-1]:
    print(df_train[col].value_counts(normalize=True))

toxic
0    0.904156
1    0.095844
Name: proportion, dtype: float64
severe_toxic
0    0.990004
1    0.009996
Name: proportion, dtype: float64
obscene
0    0.947052
1    0.052948
Name: proportion, dtype: float64
threat
0    0.997004
1    0.002996
Name: proportion, dtype: float64
insult
0    0.950636
1    0.049364
Name: proportion, dtype: float64


As showed above, the distribution of classes is highly unbalanced (many more 0s than 1s), but also very distant from one to another (toxic showing 10% positives but identity_hate less than 1%).

This makes mandatory to use a multi-label stratified KFold, as if we'd use a standard one, there is high chance than some folds wouldn't include any positive class from the less frequent labels.

In [5]:
X_idx = df_train.index.values
X_idx

array([     0,      1,      2, ..., 159568, 159569, 159570])

In [6]:
y = df_train.iloc[:, 2:-1].values
y

array([[0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0],
       ...,
       [0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0]])

In [7]:
# Instantiate the stratified k-fold object
kfold = IterativeStratification(n_splits=cfg.N_FOLDS, order=1)     # 5 folds

In [8]:
# Initialize the fold column into -1
df_train["fold"] = -1
# Assign folds to the training set
for fold, (train_idx, val_idx) in enumerate(kfold.split(X_idx, y)):
    df_train.loc[val_idx, "fold"] = fold
# Reorder the columns to have 'fold' as the second column
df_train.insert(1, "fold", df_train.pop("fold"))

In [9]:
# Check that the folds are assigned correctly
df_train.sample(10)

Unnamed: 0,id,fold,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
80522,d77187da7d19cb20,0,"""\n\nOrphaned fair use image (Image:Gord.jpg)\...",0,0,0,0,0,0
106930,3bcbf6d107b18cfa,0,""" don't pay any attention to other users. I ad...",0,0,0,0,0,0
64973,addd1b4531c59b87,4,"""\n\nYou know what, I appologize to you for be...",0,0,0,0,0,0
53411,8ecd9defaf39172d,0,Why shouldn't people discuss problems with a g...,0,0,0,0,0,0
151530,7eb81ceaf1cc4239,0,i think proffesor frink should but not the mafia,0,0,0,0,0,0
97616,0a375f5916bd81ab,4,"""This is completely absurd. Maybe this is the...",0,0,0,0,0,0
51600,8a05ac3212838c8f,1,If you can not organize your thoughts...just p...,0,0,0,0,0,0
50136,85fe0d38e8b63dd1,4,"""\n\nThanking you in anticipation, I added som...",0,0,0,0,0,0
27215,47fb7dba9d82817e,0,"""\n\n Popular Culture \n\n""""Read Wikipedia:Wik...",0,0,0,0,0,0
2533,06c4ba1c70839c27,3,"""\n\nI like the way you forget about all the t...",1,0,1,0,1,0


In [10]:
# Check that the folds were distributed evenly
df_train["fold"].value_counts(normalize=True)

fold
0    0.200005
4    0.200005
3    0.200005
1    0.200005
2    0.199980
Name: proportion, dtype: float64

In [11]:
"Check that within each fold, the proportions of labels are the same as in the full dataset"

# Create a DataFrame with one row containing the mean proportions of labels in the full dataset
check_proportions = pd.DataFrame([df_train[cfg.LABEL_COLS].mean().round(4)], index=["full dataset"])
check_proportions.index.name = "fold"
# Concatenate the proportions of labels for each fold
check_proportions = pd.concat([check_proportions, df_train.groupby("fold")[cfg.LABEL_COLS].mean().round(4)])
check_proportions.rename(columns=lambda x: f"proportion_{x}", inplace=True)
check_proportions

Unnamed: 0_level_0,proportion_toxic,proportion_severe_toxic,proportion_obscene,proportion_threat,proportion_insult,proportion_identity_hate
fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
full dataset,0.0958,0.01,0.0529,0.003,0.0494,0.0088
0,0.0958,0.01,0.053,0.0029,0.0493,0.0094
1,0.0958,0.0095,0.0529,0.003,0.0493,0.0081
2,0.0959,0.0101,0.053,0.0029,0.0494,0.0086
3,0.0958,0.0102,0.053,0.0029,0.0494,0.0092
4,0.0958,0.0102,0.053,0.0032,0.0493,0.0086


Checked that all the folds have the same number of samples, and the proportion of classes for each label is approximately the same than in the full dataset.

## Tokenization

In [12]:
# Instantiate the tokenizer from the base model
tokenizer = AutoTokenizer.from_pretrained(cfg.MODEL_BASE)



In [13]:
# Define a function to tokenize the comments
def tokenize(batch: dict) -> dict:
    """Tokenizes the comments in the batch.
    Args:
		- batch (dict): A dictionary containing the comments to be tokenized.
	Returns:
		- dict: A dictionary containing the tokenized comments.
    Key parameters:
		- batch["comment_text"] (list): A list of comments to be tokenized.
		- padding (str): Padding strategy, set to "max_length" to fill to the maximum length.
		- truncation (bool): Whether to truncate the comments to the maximum length.
		- max_length (int): The maximum length to which the comments will be truncated or padded.
    """
    return tokenizer(
        batch["comment_text"],
        padding="max_length",
        truncation=True,
        max_length=cfg.MAX_LENGTH_TOKENIZER
    )

In [14]:
def add_label_vector(batch: dict) -> dict:
    """
    Convert the six one-hot label columns (one per toxicity type)
    into a single **vector column** called ``"labels"`` so that
    Hugging-Face’s multi-label trainer can consume the dataset.

    Parameters:
    	- batch (dict): A mini-batch coming from Datasets while using `.map(batched=True)`.
        	Besides the tokenised fields, it still contains the six original
            binary columns listed in ``cfg.LABEL_COLS``:
            	toxic, severe_toxic, obscene, threat, insult, identity_hate

    Returns:
	    - dict: The same batch, but now with an extra key ``"labels"`` whose
        	value is a NumPy array of shape ``(batch_size, 6)`` and dtype ``float32``.
        	This is exactly the format expected by ``AutoModelForSequenceClassification``
            when ``problem_type="multi_label_classification"``.
    """
    
	# Gather the six label columns and transpose them:
    # - `batch[col]` is a list with length = batch_size
    # - `zip(*)` turns the list-of-columns into list-of-rows so each `row` holds the 6 labels for **one** example.
    batch["labels"] = [list(row) for row in zip(*[batch[col] for col in cfg.LABEL_COLS])]
    
	# Convert the list of lists into a NumPy array with dtype float32
    batch["labels"] = np.array(batch["labels"], dtype=np.float32)
    
    return batch

In [17]:
if not os.path.exists(cfg.PATH_DS_TRAIN_TOKENIZED):
    print("Tokenizing training dataset...")
    # Load the training dataset into a Dataset object from the pandas DataFrame
    ds_train_raw = Dataset.from_pandas(df_train)
    # Add the label vector to the dataset
    ds_train_raw = ds_train_raw.map(add_label_vector, batched=True)
    # Tokenize train dataset and remove the original comment text and label columns
    ds_train_tokenized = ds_train_raw.map(tokenize, batched=True, remove_columns=["comment_text"]+cfg.LABEL_COLS)
    # Save the tokenized dataset to disk
    ds_train_tokenized.save_to_disk(cfg.PATH_DS_TRAIN_TOKENIZED)
else:
    print("Loading tokenized training dataset from disk...")
    ds_train_tokenized = load_from_disk(cfg.PATH_DS_TRAIN_TOKENIZED)

Loading tokenized training dataset from disk...


In [18]:
if not os.path.exists(cfg.PATH_DS_TEST_TOKENIZED):
    print("Tokenizing test dataset...")
    # Do the same for the test dataset except without the label vector
    ds_test_raw = Dataset.from_pandas(df_test)
    ds_test_tokenized = ds_test_raw.map(tokenize, batched=True, remove_columns=["comment_text"])
    ds_test_tokenized.save_to_disk(cfg.PATH_DS_TEST_TOKENIZED)
else:
    print("Loading tokenized test dataset from disk...")
    ds_test_tokenized = load_from_disk(cfg.PATH_DS_TEST_TOKENIZED)

Loading tokenized test dataset from disk...
