In [None]:
# Use Google Colab
use_colab = True
try:
    from google.colab import drive
    colab_available = True
except:
    colab_available = False

if use_colab and colab_available:
    drive._mount('/content/drive')

    # cd to the appropriate working directory under my Google Drive
    %cd '/content/drive/My Drive/685'

    !pip install -r requirements.txt

!pip install sentencepiece

In [5]:
from datasets import load_dataset

import sys
import os
import random
import shutil
import copy
import inspect


import numpy as np
import torch
import transformers
import datasets
import sklearn.metrics
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sn
import tqdm
# Load the three HatEval 2019 English CSV files, keyed by split name.
dataset = load_dataset(
    'csv',
    data_files={
        'train': 'hateval2019_en_train.csv',
        'test': 'hateval2019_en_test.csv',
        'valid': 'hateval2019_en_dev.csv',
    },
)
from sklearn.metrics import classification_report

import random
import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import (
    AdamW,
    T5ForConditionalGeneration,
    T5Tokenizer,
    get_linear_schedule_with_warmup
)
def set_seed(seed):
  """Seed Python's `random`, NumPy and PyTorch with the same value."""
  for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
    seed_fn(seed)


# Seed every random number generator once, before any data is shuffled.
set_seed(685)

# Prefer the GPU, but fall back to the CPU instead of crashing:
# torch.cuda.get_device_name() raises when no CUDA device is available.
if torch.cuda.is_available():
    device_name = torch.cuda.get_device_name()
    n_gpu = torch.cuda.device_count()
    device = torch.device("cuda")
else:
    device_name = "cpu"
    n_gpu = 0
    device = torch.device("cpu")
print(f"Found device: {device_name}, n_gpu: {n_gpu}")
print(device)

# Pretrained T5-small tokenizer and model; downloads are cached in ./t5_cache.
tokenizer = T5Tokenizer.from_pretrained('t5-small', cache_dir='./t5_cache')
model = T5ForConditionalGeneration.from_pretrained('t5-small', cache_dir='./t5_cache')

# optimizer
# Parameters are split into two groups by name, but both groups use
# weight_decay=0.0, so the split currently has no effect on the updates.
# NOTE(review): T5 appears to name its layer norms "layer_norm" rather than
# "LayerNorm" - confirm the no_decay patterns before enabling weight decay.
no_decay = ["bias", "LayerNorm.weight"]
optimizer_grouped_parameters = [
    {
        "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
        "weight_decay": 0.0,
    },
    {
        "params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
        "weight_decay": 0.0,
    },
]
optimizer = AdamW(optimizer_grouped_parameters, lr=3e-4, eps=1e-8)

# Only the training split is shuffled. Evaluation metrics do not depend on
# the order, and a fixed order keeps the misclassified-text dumps stable.
train_dataloader = torch.utils.data.DataLoader(dataset['train'], shuffle=True, batch_size=20)
val_dataloader = torch.utils.data.DataLoader(dataset['valid'], shuffle=False, batch_size=20)
test_dataloader = torch.utils.data.DataLoader(dataset['test'], shuffle=False, batch_size=20)

Using custom data configuration default-24d0d3aaf0edc963


Downloading and preparing dataset csv/default to /root/.cache/huggingface/datasets/csv/default-24d0d3aaf0edc963/0.0.0/bf68a4c4aefa545d0712b2fcbb1b327f905bbe2f6425fbc5e8c25234acb9e14a...


  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

Dataset csv downloaded and prepared to /root/.cache/huggingface/datasets/csv/default-24d0d3aaf0edc963/0.0.0/bf68a4c4aefa545d0712b2fcbb1b327f905bbe2f6425fbc5e8c25234acb9e14a. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

In [22]:
def eval(test_dataloader, subtask = 'a'):
  """Evaluate the global T5 model on a dataloader and print a classification report.

  Each text is suffixed with a separator and a subtask prompt, the model
  generates a short answer, and the answer is mapped to label 1 when it is
  exactly "present." and to label 0 otherwise.

  NOTE: the name shadows the built-in ``eval``; it is kept because other
  cells call this function by that name.

  Args:
    test_dataloader: iterable of batches. Each batch must provide 'text' and
      the label column of the chosen subtask ('HS', 'TR' or 'AG').
    subtask: 'a' uses the hate-speech prompt and column 'HS'; 'b' uses the
      targeted-hate-speech prompt and column 'TR'; any other value uses the
      offensive-speech prompt and column 'AG'.

  Returns:
    list: the texts whose predicted label disagrees with the gold label.
    Texts are only collected when subtask is 'a', 'b' or 'c'.
  """
  # Local import: a bare `import tqdm` does not guarantee that the
  # `tqdm.notebook` submodule has been loaded.
  from tqdm.auto import tqdm as progress_bar

  # (prompt, label column) per subtask; unknown subtasks fall back to 'c'.
  task_table = {
      'a': ('Hate speech is ', 'HS'),
      'b': ('Targeted hate speech is ', 'TR'),
  }
  prompt, label_column = task_table.get(subtask, ('Offensive speech is ', 'AG'))
  collect_errors = subtask in ('a', 'b', 'c')

  model.to(device)
  model.eval()
  y_pred = []
  y_true = []
  texts  = []
  with torch.no_grad():
    for batch in progress_bar(test_dataloader):
      # '<\\s> ' is the literal separator (with a backslash) that the prompts
      # have always been built with; it is written with an escaped backslash
      # only to avoid Python's invalid-escape warning. Do not "correct" it to
      # '</s>' without retraining, or the prompts stop matching the checkpoints.
      sentences = [sentence + '<\\s> ' + prompt for sentence in batch['text']]
      inputs = tokenizer(sentences, return_tensors="pt", padding=True)
      output_sequences = model.generate(
          input_ids=inputs['input_ids'].to(device),
          attention_mask=inputs['attention_mask'].to(device),
          max_length=12,
          early_stopping=True,
          num_beams=10,
          num_return_sequences=1,
          no_repeat_ngram_size=2
      )
      batch_preds = tokenizer.batch_decode(
          output_sequences,
          skip_special_tokens=True,
          clean_up_tokenization_spaces=True,
      )
      # Plain Python ints instead of 0-dim tensors for the metric computation.
      gold_labels = batch[label_column].tolist()
      for text, pred, gold in zip(batch['text'], batch_preds, gold_labels):
        label = 1 if pred == "present." else 0
        y_pred.append(label)
        # Record false positives (1 vs 0) and false negatives (0 vs 1).
        if collect_errors and gold == 1 - label:
          texts.append(text)
      y_true.extend(gold_labels)

  report = classification_report(y_true=y_true, y_pred=y_pred)
  print ("val accuracy: ", report)
  return texts

In [None]:
# Settings read by the training loop.
training_epochs = 5        # number of passes over the training dataloader
max_target_length = 12     # truncation limit when tokenizing the target strings
max_source_length = 512    # truncation limit when tokenizing the input prompts

In [None]:
# Fine-tune the model for `training_epochs` epochs. Every text yields a
# hate-speech (HS) example; texts labelled as hate speech additionally yield
# a targeted (TR) and an offensive (AG) example. After each epoch the model is
# evaluated on the validation split and a checkpoint is written.
from tqdm.auto import tqdm as progress_bar  # bare `import tqdm` may not load tqdm.notebook

checkpoint_dir = './hatEval_trained_models/'
# torch.save does not create missing directories.
os.makedirs(checkpoint_dir, exist_ok=True)

model.to(device)

for epoch in range(training_epochs):
  model.train()
  print ("epoch ",epoch)
  for batch in progress_bar(train_dataloader):
    input_sequences = []
    output_sequences = []
    for idx, text in enumerate(batch['text']):
      is_hate = bool(batch['HS'][idx] != 0)
      examples = [('Hate speech is ', is_hate)]
      if is_hate:
        examples.append(('Targeted hate speech is ', bool(batch['TR'][idx] != 0)))
        examples.append(('Offensive speech is ', bool(batch['AG'][idx] != 0)))
      for prompt, present in examples:
        # '<\\s> ' is the literal separator (with a backslash) used by the
        # evaluation prompts too; the escaped backslash only silences
        # Python's invalid-escape warning and leaves the string unchanged.
        input_sequences.append(text + '<\\s> ' + prompt)
        output_sequences.append("present." if present else "not present.")

    # encode the inputs
    encoding = tokenizer(input_sequences,
                        padding='longest',
                        max_length=max_source_length,
                        truncation=True,
                        return_tensors="pt")
    input_ids, attention_mask = encoding.input_ids, encoding.attention_mask

    # encode the targets
    target_encoding = tokenizer(output_sequences,
                                padding='longest',
                                max_length=max_target_length,
                                truncation=True)

    # replace padding token id's of the labels by -100
    labels = torch.as_tensor(target_encoding.input_ids)
    labels[labels == tokenizer.pad_token_id] = -100

    input_ids = input_ids.to(device)
    attention_mask = attention_mask.to(device)
    labels = labels.to(device)

    # forward pass, backward pass and parameter update
    loss = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels).loss
    loss.backward()
    optimizer.step()
    optimizer.zero_grad()

  eval(val_dataloader)
  torch.save(model.state_dict(), os.path.join(checkpoint_dir, 't5_full_shot_epoch_{}.pt'.format(epoch)))

epoch  0


  0%|          | 0/450 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

val accuracy:                precision    recall  f1-score   support

           0       0.68      0.86      0.76       573
           1       0.70      0.45      0.55       427

    accuracy                           0.68      1000
   macro avg       0.69      0.65      0.65      1000
weighted avg       0.69      0.68      0.67      1000

epoch  1


  0%|          | 0/450 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

val accuracy:                precision    recall  f1-score   support

           0       0.77      0.70      0.73       573
           1       0.64      0.71      0.67       427

    accuracy                           0.71      1000
   macro avg       0.70      0.71      0.70      1000
weighted avg       0.71      0.71      0.71      1000

epoch  2


  0%|          | 0/450 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

val accuracy:                precision    recall  f1-score   support

           0       0.70      0.82      0.76       573
           1       0.69      0.54      0.60       427

    accuracy                           0.70      1000
   macro avg       0.70      0.68      0.68      1000
weighted avg       0.70      0.70      0.69      1000

epoch  3


  0%|          | 0/450 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

val accuracy:                precision    recall  f1-score   support

           0       0.76      0.78      0.77       573
           1       0.69      0.67      0.68       427

    accuracy                           0.73      1000
   macro avg       0.73      0.72      0.73      1000
weighted avg       0.73      0.73      0.73      1000

epoch  4


  0%|          | 0/450 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

val accuracy:                precision    recall  f1-score   support

           0       0.71      0.88      0.79       573
           1       0.76      0.52      0.61       427

    accuracy                           0.72      1000
   macro avg       0.74      0.70      0.70      1000
weighted avg       0.73      0.72      0.71      1000



In [23]:
# Restore the weights stored in the epoch-4 checkpoint.
checkpoint_path = os.path.join('./hatEval_trained_models/', 't5_full_shot_epoch_4.pt')
model.load_state_dict(torch.load(checkpoint_path))

# Subtask 'a' on the test split; eval() returns the misclassified texts.
result = eval(test_dataloader, subtask='a')

# Write the misclassified texts, one per line.
with open('finetune_hateval_a_t5_discrete.txt', 'w') as f:
    f.writelines("%s\n" % item for item in result)

  0%|          | 0/150 [00:00<?, ?it/s]

val accuracy:                precision    recall  f1-score   support

           0       0.79      0.21      0.33      1740
           1       0.46      0.93      0.61      1260

    accuracy                           0.51      3000
   macro avg       0.63      0.57      0.47      3000
weighted avg       0.65      0.51      0.45      3000



In [27]:
# Subtask 'b' on the test split; eval() returns the misclassified texts.
result = eval(test_dataloader, subtask='b')

# Write the misclassified texts, one per line.
with open('finetune_hateval_b_t5_discrete.txt', 'w') as f:
    f.writelines("%s\n" % item for item in result)

  0%|          | 0/150 [00:00<?, ?it/s]

val accuracy:                precision    recall  f1-score   support

           0       0.98      0.65      0.78      2471
           1       0.36      0.95      0.53       529

    accuracy                           0.70      3000
   macro avg       0.67      0.80      0.65      3000
weighted avg       0.87      0.70      0.73      3000



In [28]:
# Subtask 'c' on the test split; eval() returns the misclassified texts.
result = eval(test_dataloader, subtask='c')

# Write the misclassified texts, one per line.
with open('finetune_hateval_c_t5_discrete.txt', 'w') as f:
    f.writelines("%s\n" % item for item in result)

  0%|          | 0/150 [00:00<?, ?it/s]

val accuracy:                precision    recall  f1-score   support

           0       0.85      0.86      0.85      2406
           1       0.39      0.37      0.38       594

    accuracy                           0.76      3000
   macro avg       0.62      0.61      0.61      3000
weighted avg       0.76      0.76      0.76      3000



In [32]:
def _read_lines(path):
    """Return the content of the text file at `path` split into lines."""
    with open(path) as handle:
        return handle.read().splitlines()


# Reload the per-subtask files of misclassified texts.
a_lines = _read_lines('finetune_hateval_a_t5_discrete.txt')
b_lines = _read_lines('finetune_hateval_b_t5_discrete.txt')
c_lines = _read_lines('finetune_hateval_c_t5_discrete.txt')


EMR Score = 0.2983

In [34]:
# Number of distinct lines across the three subtask files.
len(set(a_lines) | set(b_lines) | set(c_lines))

2105

Zero-shot analysis


In [37]:
# Zero-shot evaluation on the HatEval test split using the checkpoint stored
# under ./olid_trained_models_v2/ (presumably trained on OLID - confirm).
# The load and eval() calls below were previously commented out, so
# result_a / result_b / result_c were undefined on a fresh kernel and this
# cell raised NameError under Restart & Run All.
model.load_state_dict(torch.load(os.path.join('./olid_trained_models_v2/', 't5_full_shot_epoch_4.pt')))

# Subtask 'a': evaluate and write the misclassified texts, one per line.
result_a = eval(test_dataloader,subtask='a')

with open('zeroshot_hateval_a_t5_discrete.txt', 'w') as f:
    for item in result_a:
        f.write("%s\n" % item)

# Subtask 'b'
result_b = eval(test_dataloader,subtask='b')

with open('zeroshot_hateval_b_t5_discrete.txt', 'w') as f:
    for item in result_b:
        f.write("%s\n" % item)

# Subtask 'c'
result_c = eval(test_dataloader,subtask='c')

with open('zeroshot_hateval_c_t5_discrete.txt', 'w') as f:
    for item in result_c:
        f.write("%s\n" % item)

In [36]:
# Number of distinct misclassified texts across the three zero-shot runs.
len(set(result_a) | set(result_b) | set(result_c))

2789