<a href="https://colab.research.google.com/github/ollema/nlp_offenseeval/blob/master/semeval_2019.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
%%capture
!wget https://raw.githubusercontent.com/ollema/nlp_offenseeval/master/OLIDv1.0/olid-training-v1.0.tsv -O training.tsv
# !wget https://raw.githubusercontent.com/ollema/nlp_offenseeval/master/OLIDv1.0/offenseval-trial.txt -O trial.tsv
!wget https://raw.githubusercontent.com/ollema/nlp_offenseeval/master/OLIDv1.0/test_set_stiched.tsv -O test.tsv

# spellcorrected versions
!wget https://raw.githubusercontent.com/ollema/nlp_offenseeval/master/OLIDv1.0/olid-training_SPELLCORRECTED.tsv -O training_spellcorr.tsv
!wget https://raw.githubusercontent.com/ollema/nlp_offenseeval/master/OLIDv1.0/test_set_stiched_SPELLCORRECTED.tsv -O test_spellcorr.tsv

In [0]:
%%capture
!pip install pytorch-pretrained-bert
!pip install transformers
!pip install emoji
!pip install wordsegment

# Pre-processing

In [0]:
import csv
import emoji
from wordsegment import load, segment
load()

files_to_preprocess = ["training", "test"]
# preprocessing options
demojize_tweet = False
desegmentize_hashtags_in_tweet = True


def demojize(tweet):
    return emoji.demojize(tweet).replace(":", " ").replace("_", " ")


def desegmentize_hashtags(tweet):
    new_tweet = []
    for word in tweet.split():
        new_word = word
        if word[0] == '#':
            new_word = " ".join(segment(word[1:]))
        new_tweet.append(new_word)
    return " ".join(new_tweet)


for file_to_preprocess in files_to_preprocess:
    lines = []
    preprocessed_lines = []

    with open(file_to_preprocess + ".tsv", "r", encoding="utf-8") as f:
        reader = csv.reader(f, delimiter="\t", quotechar=None)    
        for line in reader:
            lines.append(line)

    with open(file_to_preprocess + "_preprocessed.tsv", "w", encoding="utf-8") as f:
        for line in lines:
            preprocessed_line = line
            
            if demojize_tweet:
                preprocessed_line[1] = demojize(preprocessed_line[1])
            
            if desegmentize_hashtags_in_tweet:
                preprocessed_line[1] = desegmentize_hashtags(preprocessed_line[1])

            preprocessed_lines.append(preprocessed_line)
        
        
        writer = csv.writer(f, delimiter="\t", quotechar=None)
        for line in preprocessed_lines:
            writer.writerow(line)

# Model definition and data processing

In [0]:
%tensorflow_version 1.x

import csv
import sys
import os
import argparse
import random

import pandas as pd
import numpy as np
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from torch.utils.data.distributed import DistributedSampler

from pytorch_pretrained_bert.optimization import BertAdam
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, AdamW


class InputExample(object):
    def __init__(self, guid, text, label=None):
        self.guid = guid
        self.text = text
        self.label = label


class InputFeatures(object):
    def __init__(self, input_ids, attention_mask, label):
        self.input_ids = input_ids
        self.attention_mask = attention_mask
        self.label = label


class DataProcessor(object):
    def get_train_examples(self, training_data):
        return self._create_trn_examples(self._read_tsv(training_data))

    def get_eval_examples(self, eval_data):
        return self._create_eval_examples(self._read_tsv(eval_data))

    def get_labels(self):
        return ["0", "1"]

    def _create_trn_examples(self, lines):
        examples = []
        for (i, line) in enumerate(lines):
            if i != 0:
                guid = line[0]
                text = line[1]
                if line[2] == "OFF":
                    label = "1"
                else:
                    label = "0"
                examples.append(InputExample(guid=guid, text=text, label=label))
        return examples

    def _create_eval_examples(self, lines):
        examples = []
        for (i, line) in enumerate(lines):
            if i != 0:
                guid = line[0]
                text = line[1]
                if line[2] == "OFF":
                    label = "1"
                else:
                    label = "0"
                examples.append(InputExample(guid=guid, text=text, label=label))
        return examples

    def _read_tsv(cls, input_file, quotechar=None):
        with open(input_file, "r", encoding="utf-8") as f:
            reader = csv.reader(f, delimiter="\t", quotechar=quotechar)
            lines = []
            for line in reader:
                lines.append(line)
            return lines


def convert_examples_to_features(examples, label_list, max_seq_length, tokenizer):
    label_map = {label: i for i, label in enumerate(label_list)}

    features = []
    for (ex_index, example) in enumerate(examples):
        tokens = tokenizer.tokenize(example.text)

        # Account for [CLS] and [SEP] with "- 2"
        if len(tokens) > max_seq_length - 2:
            tokens = tokens[: (max_seq_length - 2)]

        tokens = ["[CLS]"] + tokens + ["[SEP]"]
        input_ids = tokenizer.convert_tokens_to_ids(tokens)

        # The mask has 1 for real tokens and 0 for padding tokens. Only real tokens are attended to.
        attention_mask = [1] * len(input_ids)

        # Zero-pad up to the sequence length.
        padding = [0] * (max_seq_length - len(input_ids))
        input_ids += padding
        attention_mask += padding

        assert len(input_ids) == max_seq_length
        assert len(attention_mask) == max_seq_length

        label = label_map[example.label]
        if ex_index < 0:
            print("\n*** Example ***")
            print("guid: %s" % (example.guid))
            print("tokens: %s" % " ".join([str(x) for x in tokens]))
            print("input_ids: %s" % " ".join([str(x) for x in input_ids]))
            print("attention_mask: %s" % " ".join([str(x) for x in attention_mask]))
            print("label: %s (id = %d)" % (example.label, label))

        features.append(InputFeatures(input_ids=input_ids, attention_mask=attention_mask, label=label))
    return features


def accuracy(out, gold):
    guess = np.argmax(out, axis=1)
    return (guess == gold).sum()


def stats(out, gold):
    guess = np.argmax(out, axis=1)

    tp = (gold * guess).sum()
    tn = ((1 - gold) * (1 - guess)).sum()
    fp = ((1 - gold) * guess).sum()
    fn = (gold * (1 - guess)).sum()

    return tp, tn, fp, fn


def warmup_linear(x, warmup=0.002):
    if x < warmup:
        return x / warmup
    return 1.0 - x


def main(
    bert_model="bert-base-uncased",
    distil=False,
    max_seq_length=80,
    do_train=True,
    training_data="training.tsv",
    do_eval=True,
    eval_data="test.tsv",
    lower_case=True,
    train_batch_size=32,
    eval_batch_size=8,
    learning_rate=2e-5,
    num_train_epochs=2.0,
    warmup_proportion=0.1,
    seed=42,
    use_extra_tokens=False,
):

    output_dir = "./output/"
    extra_tokens = ['MAGA', 'antifa', 'lol', 'Kavanaugh', 'GOP','tweet', 'NRA', 'Dems', 'WWG1WGA', 'nigga', 'WalkAway']

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    n_gpu = torch.cuda.device_count()
    print(f"device: {device}")

    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(seed)

    os.makedirs(output_dir, exist_ok=True)

    processor = DataProcessor()
    num_labels = 2
    label_list = processor.get_labels()

    train_examples = None
    num_train_steps = None
    if do_train:
        train_examples = processor.get_train_examples(training_data)
        num_train_steps = int(len(train_examples) / train_batch_size * num_train_epochs)

    # Prepare model
    if distil:
        bert_model = "distil" + bert_model
        tokenizer = DistilBertTokenizer.from_pretrained(bert_model, do_lower_case=lower_case)
        model = DistilBertForSequenceClassification.from_pretrained(bert_model, num_labels=num_labels)
    else:
        tokenizer = BertTokenizer.from_pretrained(bert_model, do_lower_case=lower_case)
        model = BertForSequenceClassification.from_pretrained(bert_model, num_labels=num_labels)

    if use_extra_tokens:
        tokenizer.add_tokens(extra_tokens)
    model.resize_token_embeddings(len(tokenizer)) 
    model.to(device)

    global_step = 0
    nb_tr_steps = 0
    tr_loss = 0
    if do_train:
        # Prepare optimizer
        param_optimizer = list(model.named_parameters())
        no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
        optimizer_grouped_parameters = [
            {"params": [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], "weight_decay": 0.01},
            {"params": [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], "weight_decay": 0.0},
        ]
        t_total = num_train_steps
        optimizer = BertAdam(optimizer_grouped_parameters, lr=learning_rate, warmup=warmup_proportion, t_total=t_total)

        train_features = convert_examples_to_features(train_examples, label_list, max_seq_length, tokenizer)
        print("\n\n***** Running training *****")
        print(f"  Num examples = {len(train_examples)}")
        print(f"  Batch size = {train_batch_size}")
        print(f"  Num steps = {num_train_steps}")
        all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long)
        all_attention_mask = torch.tensor([f.attention_mask for f in train_features], dtype=torch.long)
        all_labels = torch.tensor([f.label for f in train_features], dtype=torch.long)
        train_data = TensorDataset(all_input_ids, all_attention_mask, all_labels)
        train_sampler = RandomSampler(train_data)
        train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=train_batch_size)

        model.train()
        for epoch in range(int(num_train_epochs)):
            print(f"\nepoch {epoch + 1} of {int(num_train_epochs)}:", flush=True)
            tr_loss = 0
            nb_tr_examples, nb_tr_steps = 0, 0
            for step, batch in enumerate(train_dataloader):
                sys.stdout.write("\r")
                sys.stdout.write(f"iteration {step:03} of {int(t_total / num_train_epochs):03}")
                sys.stdout.flush()
                batch = tuple(t.to(device) for t in batch)
                input_ids, attention_mask, labels = batch

                outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
                loss = outputs[0]
                loss.backward()

                tr_loss += loss.item()
                nb_tr_examples += input_ids.size(0)
                nb_tr_steps += 1

                # modify learning rate with special warm up BERT uses
                lr_this_step = learning_rate * warmup_linear(global_step / t_total, warmup_proportion)
                for param_group in optimizer.param_groups:
                    param_group["lr"] = lr_this_step
                optimizer.step()
                optimizer.zero_grad()
                global_step += 1

    # Save a trained model
    model_to_save = model.module if hasattr(model, "module") else model  # Only save the model it-self
    output_model_file = os.path.join(output_dir, "pytorch_model.bin")
    if do_train:
        torch.save(model_to_save.state_dict(), output_model_file)

    # Load a trained model
    model_state_dict = torch.load(output_model_file)
    if distil:
        model = DistilBertForSequenceClassification.from_pretrained(bert_model, num_labels=num_labels)
    else:
        model = BertForSequenceClassification.from_pretrained(bert_model, num_labels=num_labels)
    model.resize_token_embeddings(len(tokenizer)) 
    model.load_state_dict(model_state_dict)
    model.to(device)

    if do_eval:
        eval_examples = processor.get_eval_examples(eval_data)
        eval_features = convert_examples_to_features(eval_examples, label_list, max_seq_length, tokenizer)
        print("\n\n\n***** Running evaluation *****")
        print(f"  Num examples = {len(eval_examples)}")
        print(f"  Batch size = {eval_batch_size}\n")
        all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long)
        all_attention_mask = torch.tensor([f.attention_mask for f in eval_features], dtype=torch.long)
        all_labels = torch.tensor([f.label for f in eval_features], dtype=torch.long)
        eval_data = TensorDataset(all_input_ids, all_attention_mask, all_labels)
        eval_sampler = SequentialSampler(eval_data)
        eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=eval_batch_size)

        model.eval()
        eval_loss, eval_accuracy, = 0, 0
        nb_eval_steps, nb_eval_examples = 0, 0
        nb_total_eval_steps = int(np.ceil(len(eval_examples) / eval_batch_size))
        eval_tp, eval_tn, eval_fp, eval_fn = 0, 0, 0, 0

        for input_ids, attention_mask, labels in eval_dataloader:
            input_ids = input_ids.to(device)
            attention_mask = attention_mask.to(device)
            labels = labels.to(device)

            with torch.no_grad():
                outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
                tmp_eval_loss, logits = outputs[:2]

            logits = logits.detach().cpu().numpy()
            labels = labels.to("cpu").numpy()
            tmp_eval_accuracy = accuracy(logits, labels)
            tmp_eval_tp, tmp_eval_tn, tmp_eval_fp, tmp_eval_fn = stats(logits, labels)
            eval_tp += tmp_eval_tp
            eval_tn += tmp_eval_tn
            eval_fp += tmp_eval_fp
            eval_fn += tmp_eval_fn

            eval_loss += tmp_eval_loss.mean().item()
            eval_accuracy += tmp_eval_accuracy

            nb_eval_examples += input_ids.size(0)
            nb_eval_steps += 1

            sys.stdout.write("\r")
            sys.stdout.write(f"iteration {nb_eval_steps:03} of {nb_total_eval_steps:03}")
            sys.stdout.flush()

        eval_loss = eval_loss / nb_eval_steps
        eval_accuracy = eval_accuracy / nb_eval_examples
        loss = tr_loss / nb_tr_steps if do_train else None

        epsilon = 1e-7

        offensive_precision = eval_tp / (eval_tp + eval_fp + epsilon)
        offensive_recall = eval_tp / (eval_tp + eval_fn + epsilon)

        not_offensive_precision = eval_tn / (eval_tn + eval_fn + epsilon)
        not_offensive_recall = eval_tn / (eval_tn + eval_fp + epsilon)

        offensive_f1 = 2 * (offensive_precision * offensive_recall) / (offensive_precision + offensive_recall + epsilon)
        not_offensive_f1 = 2 * (not_offensive_precision * not_offensive_recall) / (not_offensive_precision + not_offensive_recall + epsilon)
        f1 = (offensive_f1 + not_offensive_f1) / 2

        result = {"eval_loss": eval_loss, "eval_accuracy": eval_accuracy, "global_step": global_step, "loss": loss, "f1_score": f1}

        output_eval_file = os.path.join(output_dir, "eval_results.txt")
        with open(output_eval_file, "w") as writer:
            print("\n\n***** Results *****")
            for key in sorted(result.keys()):
                print(f"  {key} = {result[key]}")
                writer.write("%s = %s\n" % (key, str(result[key])))
        
        return f1, eval_tp, eval_tn, eval_fp, eval_fn


# Running the classifier

In [0]:
!rm -rf output

In [0]:
runs = 5
f1s, tps, tns, fps, fns = 0, 0, 0, 0, 0

for i in range(runs):
    f1, tp, tn, fp, fn = main(distil=True, seed=666 * i, training_data="training_preprocessed.tsv", eval_data="test_preprocessed.tsv", use_extra_tokens=True)
    f1s += f1
    tps += tp
    tns += tn
    fps += fp
    fns += fn

print(f"average f1-score: {f1s/runs}")
print(f"average true pos: {tps/runs}")
print(f"average true neg: {tns/runs}")
print(f"average false pos: {fps/runs}")
print(f"average false neg: {fns/runs}")

device: cuda


***** Running training *****
  Num examples = 13240
  Batch size = 32
  Num steps = 827

epoch 1 of 2:
iteration 413 of 413
epoch 2 of 2:
iteration 413 of 413


***** Running evaluation *****
  Num examples = 860
  Batch size = 8

iteration 108 of 108

***** Results *****
  eval_accuracy = 0.8593023255813953
  eval_loss = 0.3575202508105172
  f1_score = 0.8192068299288635
  global_step = 828
  loss = 0.3561617328924833
device: cuda


***** Running training *****
  Num examples = 13240
  Batch size = 32
  Num steps = 827

epoch 1 of 2:
iteration 413 of 413
epoch 2 of 2:
iteration 413 of 413


***** Running evaluation *****
  Num examples = 860
  Batch size = 8

iteration 108 of 108

***** Results *****
  eval_accuracy = 0.8569767441860465
  eval_loss = 0.35229637301354494
  f1_score = 0.8197092143575966
  global_step = 828
  loss = 0.3523307452203283
device: cuda


***** Running training *****
  Num examples = 13240
  Batch size = 32
  Num steps = 827

epoch 1 of 2:
itera