<a href="https://colab.research.google.com/github/ollema/nlp_offenseeval/blob/master/NLP_offense_eval.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
%%capture
%tensorflow_version 1.x
!pip install pytorch-crf
!pip install transformers
!pip install wordsegment
!pip install emoji
!pip install tdqm

!wget -c https://raw.githubusercontent.com/ollema/nlp_offenseeval/master/OLIDv1.0/olid-training-v1.0.tsv

from collections import defaultdict
import random
import sys
import time

import torch
from torch import nn
import torchtext
import numpy as np
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, AdamW
from transformers import AdamW

import emoji
from wordsegment import load, segment
load()

from tqdm import tqdm


## 1. Reading the data

<!-- The following function reads a file with OLID instances.

Each instance contains up to 3 labels.

Here are some example instances:

```
id	tweet	subtask_a	subtask_b	subtask_c
86426	@USER She should ask a few native Americans what their take on this is.	OFF	UNT	NULL
90194	@USER @USER Go home you’re drunk!!! @USER #MAGA #Trump2020 👊🇺🇸👊 URL	OFF	TIN	IND
16820	Amazon is investigating Chinese employees who are selling internal data to third-party sellers looking for an edge in the competitive marketplace. URL #Amazon #MAGA #KAG #CHINA #TCOT	NOT	NULL	NULL
62688	"@USER Someone should'veTaken"" this piece of shit to a volcano. 😂"""	OFF	UNT	NULL
43605	@USER @USER Obama wanted liberals &amp; illegals to move into red states	NOT	NULL	NULL
```
Instances are included in TSV format as follows:

`ID	INSTANCE	SUBA	SUBB	SUBC `

Whenever a label is not given, a value `NULL` is inserted (e.g. `INSTANCE	NOT	NULL	NULL`) -->

In [0]:
def desegmentize_hashtags_in_tweet(tweet):
    """Expand every hashtag in ``tweet`` into separate words.

    The tweet is split on whitespace. A token that starts with ``#`` has the
    marker removed and the remainder handed to ``segment``; the pieces it
    returns are joined with single spaces. All other tokens are kept as they
    are. The processed tokens are joined back together with single spaces.
    """
    def expand(token):
        # str.split() never yields empty tokens, so indexing [0] is safe.
        if token[0] != '#':
            return token
        return " ".join(segment(token[1:]))

    return " ".join(expand(token) for token in tweet.split())

def limit_users_in_tweet(tweet):
    """Cap every run of consecutive ``@USER`` tokens at three.

    The tweet is split on whitespace. Within an uninterrupted run of tokens
    equal to ``@USER`` only the first three are kept; any other token ends the
    run and is always kept. The surviving tokens are joined with single spaces.
    """
    kept = []
    run_length = 0
    for token in tweet.split():
        # Extend the current run of mentions, or reset it on any other token.
        run_length = run_length + 1 if token == "@USER" else 0
        if run_length > 3:
            continue
        kept.append(token)
    return " ".join(kept)

def read_data(corpus_file, datafields, tokenizer, max_len):
    """Read an OLID TSV file and build a torchtext dataset of (tweet, label) examples.

    Every non-blank data row must hold five tab-separated columns; only the
    second (tweet text) and third (subtask A label) are used. Each tweet is
    normalised (hashtags expanded, emojis replaced by their textual names,
    ``URL`` replaced by ``http``, long runs of ``@USER`` shortened), tokenized
    with ``tokenizer`` and truncated to ``max_len - 2`` tokens.

    Args:
        corpus_file: Path of the TSV file. Its first line is skipped as a header.
        datafields: List of ``(name, field)`` pairs for the tweet and the label,
            forwarded to ``torchtext.data.Example.fromlist`` and
            ``torchtext.data.Dataset``.
        tokenizer: Object exposing ``tokenize(text)`` that returns a list of tokens.
        max_len: Maximum sequence length. Two positions are held back
            (presumably for the init/eos tokens added by the field -- confirm
            against the field definition).

    Returns:
        A ``torchtext.data.Dataset`` with one example per non-blank data row.

    Raises:
        ValueError: If a non-blank row does not have exactly five columns.
    """
    print(f'Reading sentences from {corpus_file}...')
    sys.stdout.flush()

    n_truncated = 0
    examples = []
    with open(corpus_file, encoding='utf-8') as f:
        next(f, None)  # skip header line; the default keeps an empty file from raising

        for line in f:
            line = line.strip()
            if not line:
                # A blank (e.g. trailing) line carries no instance.
                continue
            _, tweet, label, _, _ = line.split("\t")

            # desegmentize hashtags in tweet
            tweet = desegmentize_hashtags_in_tweet(tweet)

            # demojize tweet
            tweet = emoji.demojize(tweet).replace(":", " ").replace("_", " ")

            # replace URL with http
            tweet = tweet.replace("URL", "http")

            # limit the amount of consecutive @USERs in a tweet
            if tweet.count("@USER") > 3:
                tweet = limit_users_in_tweet(tweet)

            tokens = tokenizer.tokenize(tweet)

            # we need to truncate the sentences
            if len(tokens) > max_len - 2:
                tokens = tokens[:max_len - 2]
                n_truncated += 1

            # Hand over the token list itself. A sequential field tokenizes string
            # input, so re-joining the tokens into a string would run the tokenizer
            # a second time, breaking up continuation pieces such as "##ing" and
            # growing the sequence past the truncation applied above.
            examples.append(torchtext.data.Example.fromlist([tokens, label], datafields))

    print(f'Read {len(examples)} sentences, truncated {n_truncated}.')
    return torchtext.data.Dataset(examples, datafields)

## 2. Training the classifier
<!-- Note that the `train` method returns the validation accuracy measured after the final epoch.

The `classify` method will be used in the interactive demo. -->

In [0]:
def evaluate_validation(scores, gold):
    """Return how many rows of ``scores`` have their arg-max equal to ``gold``.

    ``scores`` is reduced along dimension 1 to one predicted index per row;
    the predictions are compared element-wise with ``gold`` and the number of
    matches is returned as a plain Python number.
    """
    predicted = scores.argmax(dim=1)
    hits = predicted.eq(gold)
    return hits.sum().item()


class Classifier:
    """Fine-tunes a BERT or DistilBERT sequence classifier with two output labels.

    The constructor loads the pretrained tokenizer and model and defines the
    torchtext fields; ``train`` reads the OLID training file, fine-tunes the
    model and reports loss and accuracy on a held-out validation split.
    """

    def __init__(self, bert_model_name, distil, batch_size, epochs):
        """Load tokenizer and model and set up the torchtext fields.

        Args:
            bert_model_name: Name of the pretrained BERT checkpoint. When
                ``distil`` is true, ``"distil"`` is prepended to it.
            distil: If true, use the DistilBERT tokenizer/model classes.
            batch_size: Batch size used by both the training and validation iterators.
            epochs: Number of passes over the training split.
        """
        # Fall back to the CPU when no GPU is visible instead of failing in .to('cuda').
        self.device = 'cuda' if torch.cuda.is_available() else 'cpu'
        self.distil = distil

        if distil:
            self.bert_model_name = "distil" + bert_model_name
            tokenizer_cls = DistilBertTokenizer
            model_cls = DistilBertForSequenceClassification
        else:
            self.bert_model_name = bert_model_name
            tokenizer_cls = BertTokenizer
            model_cls = BertForSequenceClassification

        # Tokenizer and model are loaded from the SAME checkpoint name so that the
        # vocabulary always belongs to the model being fine-tuned.
        self.tokenizer = tokenizer_cls.from_pretrained(self.bert_model_name)
        self.model = model_cls.from_pretrained(self.bert_model_name, num_labels=2)
        self.model.to(self.device)

        print(f"Using {self.bert_model_name}")

        self.batch_size = batch_size
        self.epochs = epochs

        # The field wraps every sequence in the tokenizer's CLS/SEP tokens and pads
        # with the tokenizer's own pad token.
        self.WORD = torchtext.data.Field(
            sequential=True,
            tokenize=self.tokenizer.tokenize,
            pad_token=self.tokenizer.pad_token,
            init_token=self.tokenizer.cls_token,
            eos_token=self.tokenizer.sep_token
        )
        self.LABEL = torchtext.data.LabelField(is_target=True)
        self.fields = [('tweet', self.WORD), ('label', self.LABEL)]

    def train(self):
        """Fine-tune the model on ``olid-training-v1.0.tsv`` and validate after each epoch.

        The file is read with a maximum length of 128, then split 80/20 into
        training and validation parts. Per epoch the mean training loss, mean
        validation loss and validation accuracy are recorded and printed.

        Returns:
            The validation accuracy measured after the final epoch.
        """
        print('Reading data...')
        dataset = read_data("olid-training-v1.0.tsv", self.fields, self.tokenizer, 128)
        train, valid = dataset.split([0.8, 0.2])

        self.LABEL.build_vocab(train)
        self.WORD.build_vocab(train)
        # Replace the vocabulary built from the data with the tokenizer's own
        # token -> id mapping, so ids match the pretrained embeddings.
        self.WORD.vocab.stoi = self.tokenizer.vocab
        self.WORD.vocab.itos = list(self.tokenizer.vocab)

        # Id the field pads with; needed to mask padding out of attention.
        pad_id = self.WORD.vocab.stoi[self.tokenizer.pad_token]

        # NOTE(review): sort=True presumably serves batches in length order without
        # shuffling -- confirm this is intended for the training iterator.
        train_iterator = torchtext.data.BucketIterator(
            train,
            device=self.device,
            batch_size=self.batch_size,
            sort_key=lambda x: len(x.tweet),
            repeat=False,
            train=True,
            sort=True)

        valid_iterator = torchtext.data.BucketIterator(
            valid,
            device=self.device,
            batch_size=self.batch_size,
            sort_key=lambda x: len(x.tweet),
            repeat=False,
            train=False,
            sort=True)

        # Biases and LayerNorm weights are excluded from weight decay.
        no_decay = ['bias', 'LayerNorm.weight']
        decay = 0.01
        optimizer_grouped_parameters = [
            {'params': [p for n, p in self.model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': decay},
            {'params': [p for n, p in self.model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
        ]

        optimizer = AdamW(optimizer_grouped_parameters, lr=5e-5, eps=1e-8)

        history = defaultdict(list)

        for epoch in range(1, self.epochs + 1):

            t0 = time.time()

            loss_sum = 0
            n_batches = 0

            self.model.train()
            train_fmt = "{desc}:   {percentage:3.0f}%|{bar}| {n_fmt}/{total_fmt} [{elapsed}{postfix}]"
            for batch in tqdm(train_iterator, desc="Training", bar_format=train_fmt):

                # The field yields (seq_len, batch); the model is fed (batch, seq_len).
                tweets = batch.tweet.t()
                # 1 for real tokens, 0 for padding, so padded positions are not attended to.
                attention_mask = (tweets != pad_id).long()

                optimizer.zero_grad()

                outputs = self.model(tweets, attention_mask=attention_mask, labels=batch.label)

                # With labels given, the first output is the loss.
                loss = outputs[0]

                loss.backward()
                optimizer.step()

                loss_sum += loss.item()
                n_batches += 1

            train_loss = loss_sum / n_batches
            history['train_loss'].append(train_loss)

            n_correct = 0
            n_valid = len(valid)
            loss_sum = 0
            n_batches = 0

            self.model.eval()
            valid_fmt = "{desc}: {percentage:3.0f}%|{bar}| {n_fmt}/{total_fmt} [{elapsed}{postfix}]"
            for batch in tqdm(valid_iterator, desc="Validating", bar_format=valid_fmt):
                tweets = batch.tweet.t()
                attention_mask = (tweets != pad_id).long()

                with torch.no_grad():
                    outputs = self.model(tweets, attention_mask=attention_mask, labels=batch.label)

                # Index rather than tuple-unpack, matching the training loop and
                # tolerating models that return more than two outputs.
                loss_batch, scores = outputs[0], outputs[1]

                loss_sum += loss_batch.item()
                n_correct += evaluate_validation(scores, batch.label)
                n_batches += 1

            val_acc = n_correct / n_valid
            val_loss = loss_sum / n_batches

            history['val_loss'].append(val_loss)
            history['val_acc'].append(val_acc)

            t1 = time.time()
            print()
            print(f'Epoch {epoch}: train loss = {train_loss:.4f}, val loss = {val_loss:.4f}, val acc: {val_acc:.4f}, time = {t1-t0:.4f}')

        return history["val_acc"][-1]
        

In [7]:
# One entry per run. The name `f_scores` is kept for any later cell that reads it,
# but the values are validation accuracies: that is what `Classifier.train` returns.
f_scores = []

for i in range(1):
    # Seed both generators with separate statements; chaining them with `and`
    # only reached random.seed() because torch.manual_seed() returns a truthy object.
    torch.manual_seed(i * 1000)
    random.seed(i * 1000)

    classifier = Classifier(
        bert_model_name="bert-base-uncased",
        distil=True,
        batch_size=32,
        epochs=2)

    f_scores.append(classifier.train())

# Label the number as what it is (accuracy), not as an F-score.
print(f"mean validation accuracy: {np.mean(f_scores)}")


Using distilbert-base-uncased
Reading data...
Reading sentences from olid-training-v1.0.tsv...
Read 13240 sentences, truncated 1.


KeyError: ignored