<a href="https://colab.research.google.com/github/mrinaltak/HateSpeechDetection/blob/main/HatEval.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

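## Baseline: HatEval 2019, subtask A

Fine-tunes `bert-base-uncased` as a two-class classifier on the English HatEval 2019 data (label column `HS`) and reports accuracy on the dev and test splits.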

In [None]:
# Use Google Colab
use_colab = True
try:
    from google.colab import drive
    colab_available = True
except:
    colab_available = False

if use_colab and colab_available:
    drive.mount('/content/drive')

    # cd to the appropriate working directory under my Google Drive
    %cd '/content/drive/My Drive/685'

    !pip install -r requirements.txt

!pip install sentencepiece

from datasets import load_dataset

import sys
import os
import random
import shutil
import copy
import inspect


import numpy as np
import torch
import transformers
import datasets
import sklearn.metrics
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sn
import tqdm

from transformers import BertForSequenceClassification, AdamW, BertConfig, BertTokenizer


In [None]:
# The rest of the notebook moves the model and every batch to CUDA, so fail
# fast when no GPU is attached.
assert torch.cuda.is_available()

# Seed every RNG in play so that "Restart & Run All" reproduces the same
# classifier-head initialisation and the same training-batch order.
seed = 42
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed_all(seed)

# Load the three HatEval 2019 (English) CSV splits from the working directory.
dataset = load_dataset('csv', data_files={'train': 'hateval2019_en_train.csv','test': 'hateval2019_en_test.csv','valid': 'hateval2019_en_dev.csv'})

# Get the GPU device name.
device_name = torch.cuda.get_device_name()
n_gpu = torch.cuda.device_count()
print(f"Found device: {device_name}, n_gpu: {n_gpu}")
device = torch.device("cuda")

# One batch size for all three loaders. Only the training loader is shuffled:
# shuffling the evaluation splits has no benefit and makes the evaluation
# order non-deterministic.
loader_batch_size = 8
train_dataloader = torch.utils.data.DataLoader(dataset['train'], shuffle=True, batch_size=loader_batch_size)
val_dataloader = torch.utils.data.DataLoader(dataset['valid'], shuffle=False, batch_size=loader_batch_size)
test_dataloader = torch.utils.data.DataLoader(dataset['test'], shuffle=False, batch_size=loader_batch_size)

model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased", # Use the 12-layer BERT model, with an uncased vocab.
    num_labels = 2, # The number of output labels.
    output_attentions = False, # Whether the model returns attentions weights.
    output_hidden_states = False, # Whether the model returns all hidden-states.
    cache_dir='./bert_cache'
)

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Tell pytorch to run this model on the GPU.
model.to(device)

# NOTE(review): `batch_size` is not referenced by any cell visible here; the
# dataloaders above use `loader_batch_size` (8). Kept unchanged in case a
# cell outside this view reads it -- confirm and remove if unused.
batch_size = 99
optimizer = AdamW(model.parameters(),
                  lr = 5e-5, # args.learning_rate - default is 5e-5
                  eps = 1e-8 # args.adam_epsilon  - default is 1e-8
                )
epochs = 5


In [None]:
import numpy as np
# function to get validation accuracy
def get_validation_performance(val_dataloader):
    """Return the accuracy of the notebook-level ``model`` on ``val_dataloader``.

    Relies on the notebook globals ``model``, ``tokenizer`` and ``device``.
    The model is switched to evaluation mode and is left in that mode when the
    function returns.

    Args:
        val_dataloader: iterable of batches; each batch is a mapping with a
            ``'text'`` entry (sequence of strings) and an ``'HS'`` entry
            (tensor of integer class labels).

    Returns:
        float: correctly classified examples divided by the number of examples
        actually seen. Returns ``0.0`` when the loader yields no examples.
    """
    # Put the model in evaluation mode
    model.eval()

    total_correct = 0
    total_examples = 0

    for batch in val_dataloader:
        texts = batch['text']
        if len(texts) == 0:
            continue

        # truncation=True keeps over-long texts within the model's maximum
        # input length instead of failing inside the forward pass.
        inputs = tokenizer(texts, padding='longest', truncation=True, return_tensors="pt").to(device)
        labels = batch['HS'].to(device)

        # No compute graph is needed: this is inference only.
        with torch.no_grad():
            logits = model(**inputs).logits

        predictions = torch.argmax(logits, dim=1).flatten()
        labels_flat = labels.flatten()
        total_correct += (predictions == labels_flat).sum().item()
        # Count real examples rather than assuming every batch is full; the
        # last batch is smaller whenever the dataset size is not a multiple
        # of the batch size.
        total_examples += labels_flat.numel()

    if total_examples == 0:
        return 0.0
    return total_correct / total_examples


In [8]:
import random

# A checkpoint is written after every epoch. Create the target directory up
# front so that a missing folder cannot abort the run after a full epoch of
# training has already been spent.
checkpoint_dir = './trained_model_hateval_baseline_subtask_a/'
os.makedirs(checkpoint_dir, exist_ok=True)

# A bare `import tqdm` does not guarantee that the progress-bar submodules
# are loaded; import the one we use explicitly. `tqdm.auto` picks the
# notebook widget when it is available and a text bar otherwise.
import tqdm.auto

for epoch_i in range(0, epochs):
    # Perform one full pass over the training set.
    print("")
    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
    print('Training...')
    total_train_loss = 0
    # The validation helper leaves the model in eval mode, so training mode
    # has to be re-enabled at the start of every epoch.
    model.train()
    for batch in tqdm.auto.tqdm(train_dataloader):
        if len(batch) == 0: continue
        # truncation=True keeps over-long texts within the model's maximum
        # input length instead of failing inside the forward pass.
        inputs = tokenizer(batch['text'], padding='longest', truncation=True, return_tensors="pt").to(device)
        labels = batch['HS'].to(device)
        # Clear the previously calculated gradient
        model.zero_grad()
        # Perform a forward pass (evaluate the model on this training batch).
        # Passing `labels` makes the model return the loss as well.
        outputs = model(**inputs, labels=labels)
        loss = outputs.loss
        logits = outputs.logits
        total_train_loss += loss.item()
        # Perform a backward pass to calculate the gradients.
        loss.backward()
        # Update parameters and take a step using the computed gradient.
        optimizer.step()
    # ========================================
    #               Validation
    # ========================================
    # Note: this is the sum of the per-batch losses, not their mean.
    print(f"Total loss: {total_train_loss}")
    val_acc = get_validation_performance(val_dataloader)
    print(f"Validation accuracy: {val_acc}")
    # Keep one checkpoint per epoch (file names are 0-indexed by epoch).
    torch.save(model.state_dict(),os.path.join(checkpoint_dir, 'subtask_a_baseline_{}.pt'.format(epoch_i)))

print("")
print("Training complete!")

#get_validation_performance(test_dataloader)


Training...


  0%|          | 0/1125 [00:00<?, ?it/s]

Total loss: 518.9509659670293
Validation accuracy: 0.744

Training...


  0%|          | 0/1125 [00:00<?, ?it/s]

Total loss: 357.00403171777725
Validation accuracy: 0.773

Training...


  0%|          | 0/1125 [00:00<?, ?it/s]

Total loss: 206.33683990407735
Validation accuracy: 0.741

Training...


  0%|          | 0/1125 [00:00<?, ?it/s]

Total loss: 111.18521190120373
Validation accuracy: 0.759

Training...


  0%|          | 0/1125 [00:00<?, ?it/s]

Total loss: 77.48885970376432
Validation accuracy: 0.768

Training complete!


In [9]:
# Accuracy on the held-out test split, using the model as left by the training loop.
# NOTE(review): the recorded output below is well under the validation accuracies
# printed during training -- worth investigating before reporting this number.
get_validation_performance(test_dataloader)

0.549