<a href="https://colab.research.google.com/github/ollema/nlp_offenseeval/blob/master/NLP_offense_eval.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install pytorch-crf
!pip install transformers
!pip install wordsegment
!pip install emoji
!pip install tdqm

!wget -c https://raw.githubusercontent.com/ollema/nlp_offenseeval/master/OLIDv1.0/olid-training-v1.0.tsv

--2019-12-17 13:44:23--  https://raw.githubusercontent.com/ollema/nlp_offenseeval/master/OLIDv1.0/olid-training-v1.0.tsv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 151.101.0.133, 151.101.64.133, 151.101.128.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|151.101.0.133|:443... connected.
HTTP request sent, awaiting response... 416 Range Not Satisfiable

    The file is already fully retrieved; nothing to do.



In [0]:
%tensorflow_version 1.x
import torch
from torch import nn
import time
import torchtext
import numpy as np
import sys
from tqdm import tqdm

import random

import emoji
from wordsegment import load, segment
load()

from transformers import DistilBertTokenizer as BertTokenizer
from transformers import DistilBertForSequenceClassification as BertForSequenceClassification
from transformers import AdamW

from collections import defaultdict, Counter

import matplotlib.pyplot as plt

%config InlineBackend.figure_format = 'retina' 
plt.style.use('seaborn')

## 1. Reading the data

The following function reads a file with OLID instances.

Each instance contains up to 3 labels each.

Here is some example instances:

```
id	tweet	subtask_a	subtask_b	subtask_c
86426	@USER She should ask a few native Americans what their take on this is.	OFF	UNT	NULL
90194	@USER @USER Go home youâ€™re drunk!!! @USER #MAGA #Trump2020 ðŸ‘ŠðŸ‡ºðŸ‡¸ðŸ‘Š URL	OFF	TIN	IND
16820	Amazon is investigating Chinese employees who are selling internal data to third-party sellers looking for an edge in the competitive marketplace. URL #Amazon #MAGA #KAG #CHINA #TCOT	NOT	NULL	NULL
62688	"@USER Someone should'veTaken"" this piece of shit to a volcano. ðŸ˜‚"""	OFF	UNT	NULL
43605	@USER @USER Obama wanted liberals &amp; illegals to move into red states	NOT	NULL	NULL
```
Instances are included in TSV format as follows:

`ID	INSTANCE	SUBA	SUBB	SUBC `

Whenever a label is not given, a value `NULL` is inserted (e.g. `INSTANCE	NOT	NULL	NULL`)

In [0]:
def desegmentize_hashtags_in_tweet(tweet):
    new_tweet = []
    for word in tweet.split():
        new_word = word
        if word[0] == '#':
            new_word = " ".join(segment(word[1:]))
        new_tweet.append(new_word)
    return " ".join(new_tweet)

def limit_users_in_tweet(tweet):
    new_tweet = []
    user_count = 0
    for word in tweet.split():
        if word == "@USER":
            user_count += 1
        else:
            user_count = 0
        if user_count <= 3:
            new_tweet.append(word)
    return " ".join(new_tweet)

def read_data(corpus_file, datafields, tokenizer, max_len):
    print(f'Reading sentences from {corpus_file}...')
    sys.stdout.flush()
    
    with open(corpus_file, encoding='utf-8') as f:
        next(f) # skip header line
        
        n_truncated = 0
        examples = []
        for line in f:
            line = line.strip()
            _, tweet, label, _, _ = line.split("\t")

            # desegmentize hashtags in tweet
            tweet = desegmentize_hashtags_in_tweet(tweet)

            # demojize tweet
            tweet = emoji.demojize(tweet).replace(":", " ").replace("_", " ")

            # replace URL with http
            tweet = tweet.replace("URL", "http")

            # limit the amount of consecutive @USERs in a tweet
            if tweet.count("@USER") > 3:
                tweet = limit_users_in_tweet(tweet)

            tokens = tokenizer.tokenize(tweet)
            
            # we need to truncate the sentences
            if len(tokens) > max_len-2:
                tokens = tokens[:max_len-2]
                n_truncated += 1

            tweet = " ".join(tokens)
            examples.append(torchtext.data.Example.fromlist([tweet, label], datafields))
        
        print(f'Read {len(examples)} sentences, truncated {n_truncated}.')
        return torchtext.data.Dataset(examples, datafields)

## 2. Training the classifier
Note that the `train` method returns the best F1-score seen when evaluating on the validation set.

The `classify` method will be used in the interactive demo.

In [0]:
def evaluate_validation(scores, gold):
    guesses = scores.argmax(dim=1)
    return (guesses == gold).sum().item()


class Classifier:
    def __init__(self, config, gensim_model=None, bert_model_name=None):
        self.config = config
        self.bert_model_name = bert_model_name
        lowercase = 'uncased' in bert_model_name
        print('Lowercased BERT model?', lowercase)
        
        self.tokenizer = BertTokenizer.from_pretrained(bert_model_name, do_lower_case=lowercase)
        pad = self.tokenizer.pad_token           
        # self.WORD = torchtext.data.Field(init_token=self.tokenizer.cls_token, eos_token=self.tokenizer.sep_token, sequential=True, lower=lowercase, pad_token=pad, batch_first=True)
        # self.LABEL = torchtext.data.Field(is_target=True, init_token='O', eos_token=pad, pad_token=pad, sequential=True, unk_token=None, batch_first=True)

        self.WORD = torchtext.data.Field(sequential=True, tokenize=self.tokenizer.tokenize, pad_token=self.tokenizer.pad_token, init_token=self.tokenizer.cls_token, eos_token=self.tokenizer.sep_token)
        self.LABEL = torchtext.data.LabelField(is_target=True)

        self.fields = [('tweet', self.WORD), ('label', self.LABEL)]     
        self.device = 'cuda'
                
    def train(self):
        print('Reading and tokenizing...')
        dataset = read_data(self.config.dataset, self.fields, self.tokenizer, 128) 
        train, valid = dataset.split([0.8, 0.2])

        self.LABEL.build_vocab(train)
        self.WORD.build_vocab(train)
        # Here, we tell torchtext to use the vocabulary of BERT's tokenizer.
        # .stoi is the map from strings to integers, and itos from integers to strings.
        self.WORD.vocab.stoi = self.tokenizer.vocab
        self.WORD.vocab.itos = list(self.tokenizer.vocab)

        print(f"Using BertForSequenceClassification")
        self.model = BertForSequenceClassification.from_pretrained(self.bert_model_name, num_labels=2)
        self.model.to(self.device)
            
        train_iterator = torchtext.data.BucketIterator(
            train,
            device=self.device,
            batch_size=self.config.train_batch_size,
            sort_key=lambda x: len(x.tweet),
            repeat=False,
            train=True,
            sort=True)

        valid_iterator = torchtext.data.BucketIterator(
            valid,
            device=self.device,
            batch_size=self.config.valid_batch_size,
            sort_key=lambda x: len(x.tweet),
            repeat=False,
            train=False,
            sort=True)
        
        # train_iterator = torchtext.data.Iterator(
        #     train,
        #     device=device,
        #     batch_size=32,
        #     repeat=False,
        #     train=True,
        #     sort=False)

        # valid_iterator = torchtext.data.Iterator(
        #     valid,
        #     device=device,
        #     batch_size=32,
        #     repeat=False,
        #     train=False,
        #     sort=False)
        
        train_batches = list(train_iterator)
        valid_batches = list(valid_iterator)
        
        no_decay = ['bias', 'LayerNorm.weight']
        decay = 0.01
        optimizer_grouped_parameters = [
            {'params': [p for n, p in self.model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': decay},
            {'params': [p for n, p in self.model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
        ]
            
        # As discussed above, we use the AdamW optimizer from the transformers library. It seems to
        # give slightly better results than the standard Adam.
        optimizer = AdamW(optimizer_grouped_parameters, lr=5e-5, eps=1e-8)
        
        history = defaultdict(list)    
        best_f1 = -1
        
        for epoch in range(1, self.config.n_epochs + 1):

            t0 = time.time()
        
            loss_sum = 0
            n_batches = 0

            self.model.train()
            
            print('Training')
            sys.stdout.flush()

            for i, batch in enumerate(tqdm(train_iterator)):

                tweets = batch.tweet.t()

                optimizer.zero_grad()

                outputs = self.model(tweets, labels=batch.label)
                            
                loss = outputs[0]
                
                loss.backward()
                optimizer.step()
                            
                loss_sum += loss.item()
                n_batches += 1
            
            train_loss = loss_sum / n_batches
            history['train_loss'].append(train_loss)
            
            n_correct = 0
            n_valid = len(valid)
            loss_sum = 0
            n_batches = 0

            self.model.eval()

            print('\nValidating', end='')
            sys.stdout.flush()
            
            for i, batch in enumerate(tqdm(valid_iterator)):
                tweets = batch.tweet.t()
                
                with torch.no_grad():
                    outputs = self.model(tweets, labels=batch.label)
                    loss_batch, scores = outputs
                    
                loss_sum += loss_batch.item()
                n_correct += evaluate_validation(scores, batch.label)
                n_batches += 1

            val_acc = n_correct / n_valid
            val_loss = loss_sum / n_batches

            history['val_loss'].append(val_loss)
            history['val_acc'].append(val_acc)   
                    
            t1 = time.time()
            print()
            print(f'Epoch {epoch}: train loss = {train_loss:.4f}, val loss = {val_loss:.4f}, val acc: {val_acc:.4f}, time = {t1-t0:.4f}')
        
        return history["val_acc"][-1]
        
    # def tag(self, sentences):
    #     
        

In [0]:
class ClassifierConfig(object):
    
    # Location of training and validation data.
    dataset = 'olid-training-v1.0.tsv'
    
    # Batch size for the training and validation set.
    train_batch_size = 32
    valid_batch_size = 32
    
    # Number of training epochs.
    n_epochs=1
    
    # Word dropout probability.
    word_dropout_prob = 0.2


In [18]:
f_scores = []

for i in range(1):
    torch.manual_seed(i * 1000) and random.seed(i * 1000)

    classifier = Classifier(config=ClassifierConfig(), bert_model_name="distilbert-base-uncased")

    f_scores.append(classifier.train())

print(f"mean f-score: {np.mean(f_scores)}")


Lowercased BERT model? True
Reading and tokenizing...
Reading sentences from olid-training-v1.0.tsv...
Read 13240 sentences, truncated 1.
Using BertForSequenceClassification
Training


100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 331/331 [01:25<00:00,  1.41it/s]


Validating


100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 83/83 [00:06<00:00,  4.86it/s]


Epoch 1: train loss = 0.4771, val loss = 0.4447, val acc: 0.7893, time = 91.8168
mean f-score: 0.7892749244712991



