In [1]:
import getpass
import os
import re
import torch
import torch.nn as nn
from datasets import load_dataset, load_from_disk
from torch.utils.data import DataLoader
from transformers import DistilBertModel, PreTrainedModel, AutoTokenizer
import numpy as np
from transformers import TrainingArguments, Trainer
#import pandas as pd
from datasets import DatasetDict
import preprocessor as p
import evaluate
from transformers.modeling_outputs import SequenceClassifierOutput
from transformers import DistilBertConfig

In [2]:
# For efficient usage of the hardware resources when running on JupyterHub EPFL,
# we will limit the number of threads. If you are running this code on your local
# machine or on colab, the following code will not do anything.
if getpass.getuser() == "jovyan":
    num_threads_limit = 4
elif re.search('^https://.*noto.*\.epfl\.ch$', os.environ.get("EXTERNAL_URL", "")) != None:
    num_threads_limit = 2
else:
    num_threads_limit = torch.get_num_threads()
print(f"Limiting the number of threads to {num_threads_limit}")
torch.set_num_threads(num_threads_limit)
print(f"PyTorch is using {torch.get_num_threads()} threads")

_ = torch.set_flush_denormal(True) # To avoid long training time on CPU

Limiting the number of threads to 10
PyTorch is using 10 threads


### Preprocessing and Tokenization

In [3]:
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

def tokenize_function(examples, max_length=None):
    #  padding: 'max_length': pad to a length specified by the max_length argument or the
    #  maximum length accepted by the model if no max_length is provided (max_length=None).
    #  Padding will still be applied if you only provide a single sequence. [from documentation]


    #  truncation: True or 'longest_first': truncate to a maximum length specified
    #  by the max_length argument or the maximum length accepted by the model if
    #  no max_length is provided (max_length=None). This will truncate token by
    #  token, removing a token from the longest sequence in the pair until the
    #  proper length is reached. [from documentation]
    return tokenizer(examples["text"], padding="max_length", truncation=True, max_length=max_length)

def preprocess_text(text):
    
    text = text.strip()

    #text = p.clean(text)
    
    # Remove quotation marks at the beginning of the string
    text= re.sub(r'^!{1,}', '', text)

    # Remove @names
    text = re.sub(r'@[^ ]+:', "user: ", text)

    # Remove @names
    text = re.sub(r'@[^ ]+\s:', "user: ", text)
    
    # Remove "RT" elements
    text = re.sub(r'\bRT\b', '', text)

    # Remove multiple whitespaces
    text = re.sub(r'\s+', ' ', text)

    # Remove "&#number" elements
    text = re.sub(r'&#[0-9]+', '', text)
    
    #Remove hashtags
    text = re.sub(r'#', '', text)
    
    # Remove multiple exclamation marks
    text = re.sub(r'!{2,}', '!', text)

    #Normalize
    text= re.sub(r'([A-Za-z])\1{2,}', r'\1', text)

    # Remove &amp;
    text = re.sub(r'&amp;', 'and', text)

    text=re.sub(r'https?://[^ ]+', '', text)

    text=re.sub(r'&;', '', text)

    text = re.sub(r'&lt;', ' ', text)

    text = re.sub(r'^:\s', '', text)

    text = re.sub(r';', '', text)

    text = re.sub(r'\s{2,}', ' ', text)

    text = p.clean(text)

    text = text.replace("\\", "")
    
    text = text.strip()
    
    return text

def split_dataset(data):
    dataset = data.train_test_split(test_size=0.2, shuffle=True, stratify_by_column="class")
    dataset_train = dataset["train"]
    test_vali = dataset["test"].train_test_split(test_size=0.5, shuffle=True, stratify_by_column="class")
    dataset_test = test_vali["test"]
    dataset_validation = test_vali["train"]

    dataset_dict = DatasetDict({"train": dataset_train,"validation": dataset_validation,"test": dataset_test})

    return dataset_dict

def get_dataloader(dataset=None,batch_size=16, max_length=512):
    if dataset is None:
        dataset=load_from_disk("path/to/dataset")
    
    tokenized_dataset = dataset.rename_column("tweet", "text")
    tokenized_dataset=tokenized_dataset.rename_column("class", "labels")
    tokenized_dataset= tokenized_dataset.remove_columns(['count', 'hate_speech_count', 'offensive_language_count', 'neither_count'])
    tokenized_dataset=tokenized_dataset.map(
        lambda examples: tokenize_function(examples, max_length=max_length),
        batched=True
    )
    columns_to_keep = ['labels', 'input_ids', 'attention_mask']
    tokenized_dataset = tokenized_dataset.select_columns(columns_to_keep)
    tokenized_dataset.set_format("torch")

    train_dataloader = DataLoader(tokenized_dataset["train"], shuffle=True, batch_size=batch_size)
    eval_dataloader = DataLoader(tokenized_dataset["validation"], batch_size=batch_size)
    test_dataloader = DataLoader(tokenized_dataset["test"], batch_size=batch_size)
    
    return train_dataloader, eval_dataloader, test_dataloader

In [4]:
data = load_dataset("tdavidson/hate_speech_offensive")
data = data["train"]
data = data.map(
    lambda examples: {'tweet': preprocess_text(examples['tweet'])}
)
dataset_dict = split_dataset(data)

In [5]:
batch_size = 64
train_dataloader, eval_dataloader, test_dataloader=get_dataloader(dataset_dict, batch_size=batch_size)

Map:   0%|          | 0/19826 [00:00<?, ? examples/s]

Map:   0%|          | 0/2478 [00:00<?, ? examples/s]

Map:   0%|          | 0/2479 [00:00<?, ? examples/s]

### Model

In [6]:
class DistilBERTClassifierDavidsonDataset(PreTrainedModel):
    def __init__(self, config, weights=None):
        super(DistilBERTClassifierDavidsonDataset, self).__init__(config)

        self.config = config
        
        self.distilbert = DistilBertModel.from_pretrained('distilbert-base-uncased')
        
        self.lin_class_1 = nn.Linear(config.hidden_size, config.dim)
        self.lin_class_2 = nn.Linear(config.dim, config.num_labels)
        
        for param in self.distilbert.parameters():
            param.requires_grad = False

        # for param in self.distilbert.transformer.layer[-3:].parameters():
        #     param.requires_grad = True

        self.dropout = nn.Dropout(config.dropout)
        self.num_labels = config.num_labels
        
        if weights is None:
            weights = torch.ones(self.num_labels)
        self.weights = weights

        self.init_weights()

    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        head_mask=None,
        inputs_embeds=None,
        labels=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
    ):
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        
        distilbert_output = self.distilbert(
            input_ids=input_ids,
            attention_mask=attention_mask,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        device = distilbert_output.last_hidden_state.device
        
        pooled_output = distilbert_output.last_hidden_state[:, 0, :]
        classifier_output = self.lin_class_1(pooled_output)
        classifier_output = nn.ReLU()(classifier_output)#+classifier_output
        classifier_output = self.dropout(classifier_output)
        logits = self.lin_class_2(classifier_output)
        
        loss = None
        if labels is not None:
            loss_fct = nn.CrossEntropyLoss(weight=self.weights.to(device))
            loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
        
        # if not return_dict:
        #     output = (logits,) + distilbert_output[1:]
        #     return ((loss,) + output) if loss is not None else output
        
        return SequenceClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=distilbert_output.hidden_states,
            attentions=distilbert_output.attentions,
        )


In [7]:
def compute_class_weights(dataloader, num_classes):
    class_counts = torch.zeros(num_classes)
    
    # Iterate through the dataset and count occurrences of each class
    for batch in dataloader:
        labels = batch["labels"]
        class_counts += torch.bincount(labels, minlength=num_classes)
    
    # Compute class frequencies
    class_frequencies = class_counts / class_counts.sum()

    # Compute class weights as the inverse of class frequencies
    class_weights = 1.0 / class_frequencies

    # Normalize weights to sum to 1.0
    class_weights = class_weights / class_weights.sum()
    
    return class_weights

weights = compute_class_weights(train_dataloader, 3)
config = DistilBertConfig(
    num_labels=3,  # Assuming you have three classes
    hidden_size=768,
    dim=768,  # Adjust dimensionality as needed
    dropout=0.3,  # Adjust dropout rate as needed
)

model = DistilBERTClassifierDavidsonDataset(config, weights=weights)

In [8]:
def compute_class_weights(dataloader, num_classes):
    class_counts = torch.zeros(num_classes)
    
    # Iterate through the dataset and count occurrences of each class
    for batch in dataloader:
        labels = batch["labels"]
        class_counts += torch.bincount(labels, minlength=num_classes)
    
    # Compute class frequencies
    class_frequencies = class_counts / class_counts.sum()

    # Compute class weights as the inverse of class frequencies
    class_weights = 1.0 / class_frequencies

    # Normalize weights to sum to 1.0
    class_weights = class_weights / class_weights.sum()
    
    return class_weights, class_counts, class_frequencies

### Train

In [9]:
from sklearn.metrics import precision_recall_fscore_support

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    
    precision, recall, f1, _ = precision_recall_fscore_support(labels, predictions, average='weighted')
    load_accuracy = evaluate.load("accuracy")
    accuracy = load_accuracy.compute(predictions=predictions, references=labels)["accuracy"]

    return {"accuracy": accuracy, "precision": precision, "recall": recall, "f1": f1}

In [14]:
repo_name = "test_graphs"

training_args = TrainingArguments(
   output_dir=repo_name,
   learning_rate=1e-5,
   per_device_train_batch_size=batch_size,
   per_device_eval_batch_size=batch_size,
   num_train_epochs=4,
   weight_decay=0.05,
   save_strategy="epoch",
   push_to_hub=False,
   evaluation_strategy="epoch",
   warmup_steps=1000,
   #lr_scheduler_type="constant",
   #gradient_accumulation_steps=4,  # Accumulate gradients for every 4 steps
   fp16=True,  # Enable mixed precision training
)

trainer = Trainer(
   model=model,
   args=training_args,
   train_dataset=train_dataloader.dataset,
   eval_dataset=test_dataloader.dataset,
   data_collator=None,
   compute_metrics=compute_metrics,
)

In [22]:
trainer.train()

  0%|          | 0/1240 [00:00<?, ?it/s]

  0%|          | 0/39 [00:00<?, ?it/s]

{'eval_loss': 0.2645208239555359, 'eval_accuracy': 0.903954802259887, 'eval_precision': 0.8919929550296073, 'eval_recall': 0.903954802259887, 'eval_f1': 0.8966148915159169, 'eval_runtime': 14.1044, 'eval_samples_per_second': 175.689, 'eval_steps_per_second': 2.765, 'epoch': 1.0}
{'loss': 0.2218, 'grad_norm': 2.832308292388916, 'learning_rate': 5e-06, 'epoch': 1.61}


  0%|          | 0/39 [00:00<?, ?it/s]

{'eval_loss': 0.2663116157054901, 'eval_accuracy': 0.9003228410008071, 'eval_precision': 0.8852230794742849, 'eval_recall': 0.9003228410008071, 'eval_f1': 0.8903384606474465, 'eval_runtime': 14.2527, 'eval_samples_per_second': 173.861, 'eval_steps_per_second': 2.736, 'epoch': 2.0}


  0%|          | 0/39 [00:00<?, ?it/s]

{'eval_loss': 0.2833469808101654, 'eval_accuracy': 0.8962873284907183, 'eval_precision': 0.8859859017080626, 'eval_recall': 0.8962873284907183, 'eval_f1': 0.8897405564856194, 'eval_runtime': 14.1259, 'eval_samples_per_second': 175.422, 'eval_steps_per_second': 2.761, 'epoch': 3.0}


KeyboardInterrupt: 

In [15]:
trainer.evaluate()

  0%|          | 0/39 [00:00<?, ?it/s]

{'eval_loss': 0.23490408062934875,
 'eval_accuracy': 0.9169019766034692,
 'eval_precision': 0.907462164601783,
 'eval_recall': 0.9169019766034692,
 'eval_f1': 0.9100991719540924,
 'eval_runtime': 20.8073,
 'eval_samples_per_second': 119.141,
 'eval_steps_per_second': 1.874}

In [29]:
trainer.eval_dataset=test_dataloader.dataset
trainer.evaluate()

  0%|          | 0/39 [00:00<?, ?it/s]

{'eval_loss': 0.24347041547298431,
 'eval_accuracy': 0.9120613150463897,
 'eval_precision': 0.9040360649517097,
 'eval_recall': 0.9120613150463897,
 'eval_f1': 0.9048665982550499,
 'eval_runtime': 13.3986,
 'eval_samples_per_second': 185.02,
 'eval_steps_per_second': 2.911,
 'epoch': 7.0}

In [13]:
model_path = 'checkpoint-1240'
model = DistilBERTClassifierDavidsonDataset.from_pretrained(model_path, config=config, weights=None)