<a href="https://colab.research.google.com/github/morozowdmitry/semeval21/blob/master/sem21_bert_tokens.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive so that the '/content/drive/My Drive/...' paths used by
# the data-loading and pickle cells resolve.
from google.colab import drive
drive.mount('/content/drive')

# Import libraries

In [None]:
import ast
import csv
import random
import statistics
import sys
import string
import sklearn
import itertools

In [None]:
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
!pip install transformers
import transformers
from transformers import BertTokenizer, BertConfig
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split

# Use the GPU when one is visible, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
n_gpu = torch.cuda.device_count()
# get_device_name(0) raises on a CPU-only runtime, so only query it when CUDA
# is available; the cell still displays a description of the device in use.
torch.cuda.get_device_name(0) if torch.cuda.is_available() else "cpu"

# Span clean-up helpers

In [None]:
# Characters that fix_spans trims from both edges of a span (default argument).
SPECIAL_CHARACTERS = string.whitespace

def _contiguous_ranges(span_list):
    """Extracts continguous runs [1, 2, 3, 5, 6, 7] -> [(1,3), (5,7)]."""
    output = []
    for _, span in itertools.groupby(
        enumerate(span_list), lambda p: p[1] - p[0]):
        span = list(span)
        output.append((span[0][1], span[-1][1]))
    return output


def fix_spans(spans, text, special_characters=SPECIAL_CHARACTERS):
    """Trims special characters off each contiguous span and drops short runs.

    :param spans: list of character offsets into `text`
    :param text: the string the offsets refer to
    :param special_characters: characters stripped from both edges of a run
    :return: flat list of offsets; a run is kept only when, after trimming,
        its last offset is more than one past its first (three or more chars)
    """
    kept_offsets = []
    for first, last in _contiguous_ranges(spans):
        # Move the left edge inwards past special characters.
        while text[first] in special_characters and first < last:
            first += 1
        # Then do the same for the right edge.
        while text[last] in special_characters and first < last:
            last -= 1
        if last - first <= 1:
            continue
        kept_offsets += range(first, last + 1)
    return kept_offsets

# F1 metric

In [None]:
def f1(predictions, gold):
    """
    F1 (a.k.a. DICE) between two lists of offsets (e.g., character offsets).
    >>> assert f1([0, 1, 4, 5], [0, 1, 6]) == 0.5714285714285714
    :param predictions: a list of predicted offsets
    :param gold: a list of offsets serving as the ground truth
    :return: a score between 0 and 1; two empty lists score 1, exactly one
        empty list scores 0
    """
    if not len(gold):
        return 0. if len(predictions) else 1.
    if not len(predictions):
        return 0.
    predicted, expected = set(predictions), set(gold)
    overlap = len(predicted & expected)
    return float(2 * overlap) / float(len(predicted) + len(expected))


# Load and prepare the data

In [None]:
def spans_to_ents(doc, spans, label):
  """Turns a set of character offsets into (start, end, label) entity tuples.

  Consecutive non-SPACE tokens that overlap `spans` are merged into a single
  entity running from the first token's start to the last token's end
  (end exclusive).

  :param doc: iterable of tokens exposing `.idx`, `.text` and `.pos_`
  :param spans: set of character offsets (must support `.intersection`)
  :param label: label attached to every returned entity
  :return: list of (start, end, label) tuples in document order
  """
  ents = []
  current = None  # [start, end] of the entity being grown, None when idle
  for token in doc:
    if token.pos_ == 'SPACE':
      continue
    token_end = token.idx + len(token.text)
    if spans.intersection(set(range(token.idx, token_end))):
      if current is None:
        current = [token.idx, token_end]
      else:
        current[1] = token_end
    elif current is not None:
      # First non-overlapping token closes the running entity.
      ents.append((current[0], current[1], label))
      current = None
  if current is not None:
    ents.append((current[0], current[1], label))
  return ents


def read_datafile(filename):
  """Reads a CSV file with a `spans` column (Python list literal) and a `text` column.

  :param filename: path of the CSV file
  :return: list of (fixed_spans, text) tuples, where fixed_spans is the
      result of passing the parsed span list through fix_spans
  """
  data = []
  # The csv module expects files opened with newline=''; otherwise universal
  # newline translation can rewrite '\r\n' inside quoted text fields and so
  # move the text out from under its character offsets.
  with open(filename, newline='', encoding='utf-8') as csvfile:
    for row in csv.DictReader(csvfile):
      fixed = fix_spans(ast.literal_eval(row['spans']), row['text'])
      data.append((fixed, row['text']))
  return data

In [None]:
# Read both CSV files from the mounted Drive folder; each becomes a list of
# (fixed_spans, text) tuples. The trial file is what this notebook calls `test`.
print('loading training data')
train = read_datafile('/content/drive/My Drive/semeval21/tsd_train.csv')
print('loading test data')
test = read_datafile('/content/drive/My Drive/semeval21/tsd_trial.csv')

In [None]:
train_labels = []
train_texts = []

# One label per whitespace-separated word: 'TOXIC' when the word's first
# character is a gold toxic offset, 'norm' otherwise.
for gold_offsets, text in train:
    toxic_offsets = set(gold_offsets)  # built once per text, not once per word
    words = text.split()
    labels = []
    cursor = 0
    for word in words:
        # split() collapses runs of whitespace, so a running "len(word) + 1"
        # counter drifts away from the real offsets. Locate each word in the
        # original text instead; it is always found because the words come
        # from this text, in order.
        start = text.find(word, cursor)
        labels.append('TOXIC' if start in toxic_offsets else 'norm')
        cursor = start + len(word)
    train_labels.append(labels)
    train_texts.append(words)
print(len(train_labels))

test_labels = []
test_texts = []
test_spans = []
for gold_offsets, text in test:
    toxic_offsets = set(gold_offsets)  # built once per text, not once per word
    words = text.split()
    labels = []
    word_spans = []
    cursor = 0
    for word in words:
        # Search from the end of the previous word so repeated words resolve
        # to successive occurrences and the cursor can never move backwards
        # (the previous arithmetic did so for punctuation-only words).
        start = text.find(word, cursor)
        cursor = start + len(word)
        core = word.strip(string.punctuation)
        if core:
            # Inclusive span of the word without leading/trailing punctuation.
            leading = len(word) - len(word.lstrip(string.punctuation))
            first = start + leading
            last = first + len(core) - 1
        else:
            # Punctuation-only word: keep its full extent. It contains no
            # letter or digit, so the cleaning step that only keeps such
            # words discards it afterwards.
            first, last = start, cursor - 1
        word_spans.append([first, last])
        # As before, the label is decided by the last character of the span.
        labels.append('TOXIC' if last in toxic_offsets else 'norm')
    test_labels.append(labels)
    test_texts.append(words)
    test_spans.append(word_spans)

In [None]:
# Spot-check one test text: every word with its character span and gold label.
for word, word_span, word_label in zip(test_texts[40], test_spans[40], test_labels[40]):
  print(word, word_span, word_label)

In [None]:
# Cleaned word lists, one entry per text; filled by the loops at the end of this cell.
new_train_texts = []
new_test_texts = []
# Character spans of the test words that are kept by the cleaning loop.
new_test_spans = []

import string

def letter_digit(text):
  """Returns True when `text` contains at least one letter or digit."""
  return any(ch.isdigit() or ch.isalpha() for ch in text)

def clean(text):
  """Returns `text` with every character that is not a letter or digit removed."""
  return ''.join(ch for ch in text if ch.isalpha() or ch.isdigit())

def _normalize_word(word):
  """Returns the lower-cased alphanumeric core of a raw word ('' if none is left).

  The word is cut at its first apostrophe unless the apostrophe is the first
  character, in which case apostrophes are simply removed. The same rule is
  applied to training and test words so both splits are preprocessed alike
  (previously training words starting with an apostrophe were dropped).
  """
  word = word.replace('\r\n', '').replace('\n', '')
  pos = word.find('\'')
  if pos > 0:
    word = word[:pos]
  else:
    word = word.replace('\'', '')
  return clean(word).lower()


for text in train_texts:
  # Keep exactly one entry per raw word. train_labels holds one label per raw
  # word and the tokenization step pairs words and labels by position, so
  # dropping a word here would shift every following label. A word with no
  # letters or digits becomes '', which yields no word pieces and therefore
  # contributes no tokens and no labels.
  new_train_texts.append([_normalize_word(word) for word in text])

for j, text in enumerate(test_texts):
  words = []
  span = []
  for i, word in enumerate(text):
    word = _normalize_word(word)
    # Same positional placeholder as for training, keeping test_labels aligned.
    words.append(word)
    if word:
      # Spans are stored only for words that produce tokens, because they are
      # later indexed by predicted-word position.
      span.append(test_spans[j][i])
  new_test_texts.append(words)
  new_test_spans.append(span)

In [None]:
# Display the kept word spans of one test text as a sanity check.
new_test_spans[2]

# Model

In [None]:
# Sequence length that token ids and tags are padded/truncated to.
MAX_LEN = 100
# Batch size used by the training and validation DataLoaders.
bs = 8
# Seed for the train/validation split and for the RNGs seeded before training.
RANDOM_STATE = 42

In [None]:
# Tokenizer for the same 'bert-large-uncased' checkpoint that the model cell loads.
tokenizer = BertTokenizer.from_pretrained('bert-large-uncased', do_lower_case=True)
# Disabled alternative: RoBERTa tokenizer.
#tokenizer = RobertaTokenizer.from_pretrained('roberta-large', do_lower_case = True)

In [None]:
def tokenize_and_preserve_labels(texts, labels):
    """Word-piece tokenizes a list of words and repeats each word's label per piece.

    Uses the module-level `tokenizer`. The result is wrapped in '[CLS]' and
    '[SEP]', both labelled 'norm'.

    :param texts: sequence of words
    :param labels: sequence of labels indexed in parallel with `texts`
    :return: (tokens, token_labels), two lists of equal length
    """
    pieces = ['[CLS]']
    piece_labels = ['norm']

    for position, word in enumerate(texts):
        word_pieces = tokenizer.tokenize(word)
        pieces += word_pieces
        # A word split into k pieces contributes its label k times.
        piece_labels += [labels[position]] * len(word_pieces)

    return pieces + ['[SEP]'], piece_labels + ['norm']

In [None]:
# Word-piece tokens and per-token labels for every training text.
tokenized_train_texts, tokenized_train_labels = [], []
for idx in range(len(train_texts)):
  pieces, piece_labels = tokenize_and_preserve_labels(new_train_texts[idx], train_labels[idx])
  tokenized_train_texts.append(pieces)
  tokenized_train_labels.append(piece_labels)

# Same for the test texts.
tokenized_test_texts, tokenized_test_labels = [], []
for idx in range(len(test_texts)):
  pieces, piece_labels = tokenize_and_preserve_labels(new_test_texts[idx], test_labels[idx])
  tokenized_test_texts.append(pieces)
  tokenized_test_labels.append(piece_labels)

In [None]:
# Vocabulary ids of every training sequence, padded/truncated at the end to MAX_LEN.
train_token_ids = [tokenizer.convert_tokens_to_ids(txt) for txt in tokenized_train_texts]
input_ids = pad_sequences(train_token_ids, maxlen=MAX_LEN, dtype="long",
                          value=0.0, truncating="post", padding="post")

# Tag vocabulary; "PAD" is the filler tag used for padded positions.
tag_values = ["TOXIC", "norm", "PAD"]
tag2idx = {tag: idx for idx, tag in enumerate(tag_values)}

# Tag ids, padded the same way as the token ids.
train_tag_ids = [[tag2idx.get(tag) for tag in sentence_tags]
                 for sentence_tags in tokenized_train_labels]
tags = pad_sequences(train_tag_ids, maxlen=MAX_LEN, value=tag2idx["PAD"],
                     padding="post", dtype="long", truncating="post")

In [None]:
 # Spot-check: map one padded id sequence back to its tokens.
 tokens = tokenizer.convert_ids_to_tokens(input_ids[3])
 print(tokens)

In [None]:
# 1.0 for real tokens, 0.0 for padding (id 0), as nested Python lists of floats.
attention_masks = (input_ids != 0.0).astype(float).tolist()

In [None]:
# Split ids, tags and masks in a single call so that all three are guaranteed
# to use the same row permutation (previously two separate calls only stayed
# aligned because they were given the same random_state).
tr_inputs, val_inputs, tr_tags, val_tags, tr_masks, val_masks = train_test_split(
    input_ids, tags, attention_masks,
    random_state=RANDOM_STATE, test_size=0.1)

In [None]:
# Convert every split to a torch tensor, rebinding the same names.
tr_inputs, val_inputs = torch.tensor(tr_inputs), torch.tensor(val_inputs)
tr_tags, val_tags = torch.tensor(tr_tags), torch.tensor(val_tags)
tr_masks, val_masks = torch.tensor(tr_masks), torch.tensor(val_masks)

In [None]:
# Training batches are drawn in random order ...
train_data = TensorDataset(tr_inputs, tr_masks, tr_tags)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=bs)

# ... validation batches in their stored order. Each batch is an
# (input ids, attention mask, tags) triple.
valid_data = TensorDataset(val_inputs, val_masks, val_tags)
valid_sampler = SequentialSampler(valid_data)
valid_dataloader = DataLoader(valid_data, sampler=valid_sampler, batch_size=bs)

In [None]:
import transformers
# NOTE(review): AdamW is imported from transformers here; whether the installed
# version still provides it depends on the (unpinned) release — confirm.
from transformers import BertForTokenClassification, AdamW

# Display the installed transformers version.
transformers.__version__

In [None]:
# Load 'bert-large-uncased' with a token-classification head that has one
# output per entry of tag2idx (TOXIC, norm, PAD).
model = BertForTokenClassification.from_pretrained(
    "bert-large-uncased",
    num_labels=len(tag2idx),
    output_attentions = False,
    output_hidden_states = False
)
# NOTE(review): the string literal below holds a disabled RoBERTa variant; as
# the last expression of the cell it is also what the cell displays.
'''
model = RobertaForTokenClassification.from_pretrained(
    "roberta-large",
    num_labels=len(tag2idx),
    output_attentions = False,
    output_hidden_states = False
)
'''

In [None]:
# Move the model to the device chosen in the setup cell (GPU when available,
# CPU otherwise) instead of calling .cuda() unconditionally; the training
# loop moves its batches to the same `device`.
model.to(device);

In [None]:
# True: fine-tune every parameter; False: train only the classification head.
FULL_FINETUNING = True
if FULL_FINETUNING:
    param_optimizer = list(model.named_parameters())
    # Parameters excluded from weight decay: biases and LayerNorm weights.
    # 'gamma'/'beta' are kept for checkpoints that still use those names.
    no_decay = ['bias', 'gamma', 'beta', 'LayerNorm.weight']
    # The per-group option the optimizer reads is 'weight_decay'. The previous
    # 'weight_decay_rate' key was not consulted, so the optimizer's default
    # decay was applied to every parameter regardless of the value given.
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
         'weight_decay': 0.001},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
         'weight_decay': 0.0}
    ]
else:
    param_optimizer = list(model.classifier.named_parameters())
    optimizer_grouped_parameters = [{"params": [p for n, p in param_optimizer]}]

optimizer = AdamW(
    optimizer_grouped_parameters,
    lr=3e-5,
    eps=1e-8
)

In [None]:
from transformers import get_linear_schedule_with_warmup

# Number of passes over the training set.
epochs = 2
# Gradient-norm clipping threshold used in the training loop.
max_grad_norm = 1.0

# Total number of training steps is number of batches * number of epochs.
total_steps = len(train_dataloader) * epochs

# Create the learning rate scheduler (no warm-up steps); it is stepped once
# per batch by the training loop.
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps
)

In [None]:
!pip install seqeval
import seqeval
from seqeval.metrics import accuracy_score

In [None]:
## Store the average loss after each epoch so we can plot them.
loss_values, validation_loss_values = [], []
!pip install tqdm
import tqdm
import numpy as np

# Seed Python, NumPy and torch (CPU and all GPUs) before training.
# NOTE(review): the model and the train/validation split were created in
# earlier cells, so these seeds do not cover them — confirm this is intended.
seed_val = RANDOM_STATE

random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

for _ in tqdm.trange(epochs, desc="Epoch"):
    # ========================================
    #               Training
    # ========================================
    # Perform one full pass over the training set.

    # Put the model into training mode.
    model.train()
    # Reset the total loss for this epoch.
    total_loss = 0

    # Training loop
    for step, batch in enumerate(train_dataloader):
        # Move the batch to the selected device.
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch
        # Always clear any previously calculated gradients before performing a backward pass.
        model.zero_grad()
        # Forward pass. `labels` is passed, and the first element of the
        # output is used as the loss below.
        outputs = model(b_input_ids, token_type_ids=None,
                        attention_mask=b_input_mask, labels=b_labels)
        # get the loss
        loss = outputs[0]
        # Perform a backward pass to calculate the gradients.
        loss.backward()
        # track train loss
        total_loss += loss.item()
        # Clip the norm of the gradient
        # This is to help prevent the "exploding gradients" problem.
        torch.nn.utils.clip_grad_norm_(parameters=model.parameters(), max_norm=max_grad_norm)
        # update parameters
        optimizer.step()
        # Update the learning rate.
        scheduler.step()

    # Calculate the average loss over the training data.
    avg_train_loss = total_loss / len(train_dataloader)
    print("Average train loss: {}".format(avg_train_loss))

    # Store the loss value for plotting the learning curve.
    loss_values.append(avg_train_loss)


    # ========================================
    #               Validation
    # ========================================
    # After the completion of each training epoch, measure our performance on
    # our validation set.

    # Put the model into evaluation mode
    model.eval()
    # Reset the validation loss for this epoch.
    # NOTE: eval_accuracy, nb_eval_steps and nb_eval_examples are not used
    # anywhere else in this cell.
    eval_loss, eval_accuracy = 0, 0
    nb_eval_steps, nb_eval_examples = 0, 0
    predictions , true_labels = [], []
    for batch in valid_dataloader:
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch

        # Telling the model not to compute or store gradients,
        # saving memory and speeding up validation
        with torch.no_grad():
            # Forward pass with labels: outputs[0] is used as the loss and
            # outputs[1] as the logits below.
            outputs = model(b_input_ids, token_type_ids=None,
                            attention_mask=b_input_mask, labels=b_labels)
        # Move logits and labels to CPU
        logits = outputs[1].detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()

        # Accumulate the batch loss and collect the arg-max predictions and
        # the gold label ids.
        eval_loss += outputs[0].mean().item()
        predictions.extend([list(p) for p in np.argmax(logits, axis=2)])
        true_labels.extend(label_ids)

    eval_loss = eval_loss / len(valid_dataloader)
    validation_loss_values.append(eval_loss)
    print("Validation loss: {}".format(eval_loss))
    # Flatten predictions and gold tags, skipping positions whose gold tag is "PAD".
    pred_tags = [tag_values[p_i] for p, l in zip(predictions, true_labels) for p_i, l_i in zip(p, l) if tag_values[l_i] != "PAD"]
    valid_tags = [tag_values[l_i] for l in true_labels for l_i in l if tag_values[l_i] != "PAD"]
    print("Validation Accuracy: {}".format(accuracy_score(pred_tags, valid_tags)))
    #print("Validation F1-Score: {}".format(f1_score(pred_tags, valid_tags)))
    print()

# Predict on the test set

In [None]:
answers = []
sentences = []

# Inference must run with dropout disabled, independently of which cell was
# executed last.
model.eval()

for i in range(len(new_test_texts)):
  # tokenized_test_texts[i] already starts with [CLS] and ends with [SEP], so
  # its tokens are mapped to ids directly (as for the training data) instead
  # of going through encode() and stripping the duplicated special tokens.
  token_ids = tokenizer.convert_tokens_to_ids(tokenized_test_texts[i])
  # A separate name keeps the training `input_ids` array intact, and
  # .to(device) follows the device chosen in the setup cell rather than
  # requiring CUDA.
  test_input_ids = torch.tensor([token_ids]).to(device)
  with torch.no_grad():
    output = model(test_input_ids)
  label_indices = np.argmax(output[0].to('cpu').numpy(), axis=2)
  tokens = tokenizer.convert_ids_to_tokens(token_ids)

  # Join '##' continuation pieces back into words; a word is TOXIC as soon as
  # any of its pieces is predicted TOXIC.
  new_tokens, new_labels = [], []
  for token, label_idx in zip(tokens, label_indices[0]):
    predicted_tag = tag_values[label_idx]
    if token.startswith("##"):
      new_tokens[-1] = new_tokens[-1] + token[2:]
      if predicted_tag == 'TOXIC':
        new_labels[-1] = 'TOXIC'
    else:
      new_labels.append(predicted_tag)
      new_tokens.append(token)

  sentences.append(new_tokens)
  answers.append(new_labels)

In [None]:
import pickle

# Persist the per-word predictions, the merged tokens and the gold word
# labels to Drive so that scoring can be run without re-running the model.
with open('/content/drive/My Drive/semeval21/answers.pickle', 'wb') as f:
     pickle.dump(answers, f)
with open('/content/drive/My Drive/semeval21/sentences.pickle', 'wb') as f:
     pickle.dump(sentences, f)
with open('/content/drive/My Drive/semeval21/true.pickle', 'wb') as f:
     pickle.dump(test_labels, f)

# Compute the F1 score

In [None]:
import pickle

# Reload the objects written by the previous cell.
# NOTE(review): pickle.load can execute arbitrary code contained in the file;
# only load pickles that this notebook wrote itself.
with open('/content/drive/My Drive/semeval21/answers.pickle', 'rb') as f:
     answers = pickle.load(f)
with open('/content/drive/My Drive/semeval21/sentences.pickle', 'rb') as f:
     sentences = pickle.load(f)
with open('/content/drive/My Drive/semeval21/true.pickle', 'rb') as f:
     test_labels = pickle.load(f)

In [None]:
spans_prediction = []

for i, ans in enumerate(answers):
  # Drop the first and last entries ([CLS] / [SEP] positions) on a local copy.
  # Trimming `answers` in place made this cell non-idempotent: running it a
  # second time would strip real words as well.
  word_answers = ans[1:len(ans)-1]
  span = []
  for j, a in enumerate(word_answers):
    if a == 'TOXIC':
      # Expand the inclusive [first, last] word span into character offsets.
      first, last = new_test_spans[i][j]
      span.extend(range(first, last + 1))
  spans_prediction.append(span)

In [None]:
# Close one-character gaps between predicted offsets (e.g. the single space
# between two adjacent toxic words) so that they form one contiguous span.
for idx, offsets in enumerate(spans_prediction):
  if not offsets:
    continue
  filled = [offsets[0]]
  for offset in offsets[1:]:
    if offset == filled[-1] + 2:
      # Exactly one offset is missing in between: add it.
      filled.append(offset - 1)
    filled.append(offset)
  spans_prediction[idx] = filled

In [None]:
# Gold character offsets of every test text (first element of each test tuple).
spans_true = [example[0] for example in test]

In [None]:
import numpy as np

# f1 is declared as f1(predictions, gold), so pass the arguments in that
# order. The score itself does not change because the measure is symmetric.
f1_scores = [f1(spans_prediction[i], spans_true[i]) for i in range(len(spans_true))]

# Mean F1 over all test texts (displayed as the cell output).
np.mean(np.array(f1_scores))

In [None]:
# Save this run's predicted offsets; the file name carries the seed
# (RANDOM_STATE = 42) and is one of the files read by the ensemble section.
with open('/content/drive/My Drive/semeval21/rs42.pickle', 'wb') as f:
     pickle.dump(spans_prediction, f)

# Ensemble of several runs

In [None]:
# NOTE(review): this cell opens a triple-quoted string on its first line and
# never closes it, so it does not run as written. It looks like an attempt to
# disable the loading of the three per-seed prediction files — TODO confirm
# and either restore the code or delete the cell.
'''
import pickle
with open('/content/drive/My Drive/semeval21/rs23.pickle', 'rb') as f:
     rs23 = pickle.load(f)
with open('/content/drive/My Drive/semeval21/rs30.pickle', 'rb') as f:
     rs30 = pickle.load(f)
with open('/content/drive/My Drive/semeval21/rs42.pickle', 'rb') as f:
     rs42 = pickle.load(f)

In [None]:
# NOTE(review): the triple-quote delimiters in this cell are unbalanced (five
# in total), so it does not run as written. The text contains two alternative
# ways of combining rs23/rs30/rs42: keeping offsets predicted by at least two
# runs (count>=2) and keeping the union of all runs (span.extend(a)) — TODO
# confirm which one is intended before re-enabling.
'''
final = []
for i in range(len(rs23)):
  a = set()
  a.update(rs23[i])
  a.update(rs30[i])
  a.update(rs42[i])
  a = list(a)
  a.sort()
  span = []
  '''
  '''
  for el in a:
    count = 0
    if el in rs23[i]:
      count+=1
    if el in rs30[i]:
      count+=1
    if el in rs42[i]:
      count+=1
    if count>=2:
      span.append(el)
  '''
  '''
  span.extend(a)
  final.append(span)

In [None]:
# NOTE(review): unterminated triple-quoted string; the cell does not run as
# written. The enclosed text prints the ensemble output for texts whose gold
# span list is empty — TODO confirm whether it should be re-enabled.
'''
for i, el in enumerate(spans_true):
  if el==[]:
    print(i, len(list(final[i])), final[i])

In [None]:
# NOTE(review): unterminated triple-quoted string; the cell does not run as
# written. The enclosed text computes the mean F1 of the ensemble output
# `final` against `spans_true` — TODO confirm whether it should be re-enabled.
'''
import numpy as np
f1_scores = []

for i in range(len(spans_true)):
  f1_scores.append(f1(spans_true[i], final[i]))

np.mean(np.array(f1_scores))