In [1]:
from google.colab import drive
drive.mount('/content/gdrive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/gdrive


In [2]:
!pip install transformers

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/a3/78/92cedda05552398352ed9784908b834ee32a0bd071a9b32de287327370b7/transformers-2.8.0-py3-none-any.whl (563kB)
[K     |▋                               | 10kB 28.4MB/s eta 0:00:01[K     |█▏                              | 20kB 5.7MB/s eta 0:00:01[K     |█▊                              | 30kB 8.2MB/s eta 0:00:01[K     |██▎                             | 40kB 5.3MB/s eta 0:00:01[K     |███                             | 51kB 6.5MB/s eta 0:00:01[K     |███▌                            | 61kB 7.7MB/s eta 0:00:01[K     |████                            | 71kB 8.8MB/s eta 0:00:01[K     |████▋                           | 81kB 7.0MB/s eta 0:00:01[K     |█████▎                          | 92kB 7.7MB/s eta 0:00:01[K     |█████▉                          | 102kB 8.5MB/s eta 0:00:01[K     |██████▍                         | 112kB 8.5MB/s eta 0:00:01[K     |███████                         | 122kB 8.5M

In [0]:
import os
import sys
import torch
import csv
import numpy as np

In [0]:
home_dir = "gdrive/My Drive/propaganda_detection"
data_dir = os.path.join(home_dir, "datasets")
model_dir = os.path.join(home_dir, "model_dir")
if not os.path.isdir(model_dir):
  os.mkdir(model_dir)

In [0]:
# Read training articles
def read_articles(article_dir):
  articles = []
  train_dir = os.path.join(data_dir, article_dir)
  for filename in sorted(os.listdir(train_dir)):
    myfile = open(os.path.join(train_dir, filename))
    article = myfile.read()
    articles.append(article)
    myfile.close()
  article_ids = []
  for filename in sorted(os.listdir(train_dir)):
    article_ids.append(filename[7:-4])
  return articles, article_ids

In [0]:
# Read training span labels 
def read_spans():
  spans = []
  label_dir = os.path.join(data_dir, "train-labels-task1-span-identification")
  for filename in sorted(os.listdir(label_dir)):
    myfile = open(os.path.join(label_dir, filename))
    tsvreader = csv.reader(myfile, delimiter="\t")
    span = []
    for row in tsvreader:
      span.append((int(row[1]), int(row[2])))
    myfile.close()
    spans.append(span)
  return spans

In [0]:
def print_spans(article, span):
  for sp in span:
    print (article[sp[0]: sp[1]])
  print()

In [0]:
class example_sentence:
  def __init__(self):
    self.tokens = []
    self.labels = []
    self.article_index = -1 # index of the article to which the sentence is associated
    self.index = -1 # index of the sentence in that article 
    self.word_to_start_char_offset = []
    self.word_to_end_char_offset = []
  
  def __str__(self):
    print("tokens -", self.tokens)
    print("labels -", self.labels)
    print("article_index -", self.article_index)
    print("index -", self.index)
    print("start_offset -", self.word_to_start_char_offset)
    print("end_offset -", self.word_to_end_char_offset)   
    return ""    

In [0]:
def is_whitespace(c):
  if c == " " or c == "\t" or c == "\r" or c == "\n" or ord(c) == 0x202F:
    return True
  return False

def get_sentence_tokens_labels(article, span=None, article_index=None):
  doc_tokens = []
  char_to_word_offset = []
  current_sentence_tokens = [] # actually all sentence tokens for particular article. #TODO rename
  word_to_start_char_offset = {}
  word_to_end_char_offset = {}
  prev_is_whitespace = True
  prev_is_newline = True
  current_word_position = None
  for index, c in enumerate(article):
    if c == "\n":
      prev_is_newline = True
      # check for empty lists
      if doc_tokens:
        current_sentence_tokens.append(doc_tokens)
      doc_tokens = []
    if is_whitespace(c):
      prev_is_whitespace = True
      if current_word_position is not None:
        word_to_end_char_offset[current_word_position] = index
        current_word_position = None
    else:
      if prev_is_whitespace:
        doc_tokens.append(c)
        current_word_position = (len(current_sentence_tokens), len(doc_tokens) - 1)
        word_to_start_char_offset[current_word_position] = index # start offset of word
      else:
        doc_tokens[-1] += c
      prev_is_whitespace = False
    char_to_word_offset.append((len(current_sentence_tokens), len(doc_tokens) - 1))
  if doc_tokens:
    current_sentence_tokens.append(doc_tokens)
  if current_word_position is not None:
    word_to_end_char_offset[current_word_position] = index
    current_word_position = None
  if span is None:
    return current_sentence_tokens, (word_to_start_char_offset, word_to_end_char_offset)

  current_propaganda_labels = []
  for doc_tokens in current_sentence_tokens:
    current_propaganda_labels.append([0] * len(doc_tokens))

  start_positions = []
  end_positions = []

  for sp in span:
    if (char_to_word_offset[sp[0]][0] != char_to_word_offset[sp[1]-1][0]):
      l1 = char_to_word_offset[sp[0]][0]
      l2 = char_to_word_offset[sp[1] - 1][0]
      start_positions.append(char_to_word_offset[sp[0]])
      end_positions.append((l1, len(current_sentence_tokens[l1])-1))
      l1 += 1
      while(l1 < l2):
        start_positions.append((l1, 0))
        end_positions.append((l1, len(current_sentence_tokens[l1])-1))
        l1 += 1
      start_positions.append((l2, 0))
      end_positions.append(char_to_word_offset[sp[1]-1])  
      continue
    start_positions.append(char_to_word_offset[sp[0]])
    end_positions.append(char_to_word_offset[sp[1]-1])

  for i, s in enumerate(start_positions):
    assert start_positions[i][0] == end_positions[i][0]
    if TAGGING_SCHEME == "BIO":
      current_propaganda_labels[start_positions[i][0]][start_positions[i][1]] = 2 # Begin label
      if start_positions[i][1] < end_positions[i][1]:
        current_propaganda_labels[start_positions[i][0]][start_positions[i][1] + 1 : end_positions[i][1] + 1] = [1] * (end_positions[i][1] - start_positions[i][1])
    if TAGGING_SCHEME == "BIOE":
      current_propaganda_labels[start_positions[i][0]][start_positions[i][1]] = 2 # Begin label
      if start_positions[i][1] < end_positions[i][1]:
        current_propaganda_labels[start_positions[i][0]][start_positions[i][1] + 1 : end_positions[i][1]] = [1] * (end_positions[i][1] - start_positions[i][1] - 1)
        current_propaganda_labels[start_positions[i][0]][end_positions[i][1]] = 3 # End label
    else:
      current_propaganda_labels[start_positions[i][0]][start_positions[i][1] : end_positions[i][1] + 1] = [1] * (end_positions[i][1] + 1 - start_positions[i][1])
  
  num_sentences = len(current_sentence_tokens)

  start_offset_list = get_list_from_dict(num_sentences, word_to_start_char_offset)
  end_offset_list = get_list_from_dict(num_sentences, word_to_end_char_offset)
  sentences = []
  for i in range(num_sentences):
    sentence = example_sentence()
    sentence.tokens = current_sentence_tokens[i]
    sentence.labels = current_propaganda_labels[i]
    sentence.article_index =  article_index
    sentence.index = i
    sentence.word_to_start_char_offset = start_offset_list[i]
    sentence.word_to_end_char_offset = end_offset_list[i]
    num_words = len(sentence.tokens)
    assert len(sentence.labels) == num_words
    assert len(sentence.word_to_start_char_offset) == num_words
    assert len(sentence.word_to_end_char_offset) == num_words
    sentences.append(sentence)

  return current_sentence_tokens, current_propaganda_labels, (word_to_start_char_offset, word_to_end_char_offset), sentences

In [0]:
def get_list_from_dict(num_sentences, word_offsets):
  li = []
  for _ in range(num_sentences):
    li.append([])
  for key in word_offsets:
    si = key[0]
    li[si].append(word_offsets[key])

  return li

In [0]:
class BertExample:
  def __init__(self):
    self.add_cls_sep = True
    self.sentence_id = -1
    self.orig_to_tok_index = []
    self.tok_to_orig_index = []
    self.labels = None
    self.tokens_ids = []
    self.input_mask = []
  def __str__(self):
    print("sentence_id", self.sentence_id)
    return ""

In [0]:
def convert_sentence_to_input_feature(sentence, sentence_id, tokenizer, add_cls_sep=True, max_seq_len=256):
  bert_example = BertExample()
  bert_example.sentence_id = sentence_id
  bert_example.add_cls_sep = add_cls_sep

  sentence_tokens = sentence.tokens
  sentence_labels = sentence.labels 

  tok_to_orig_index = []
  orig_to_tok_index = []
  all_doc_tokens = [] 
  for (i, token) in enumerate(sentence_tokens):
    orig_to_tok_index.append(len(all_doc_tokens))
    sub_tokens = tokenizer.tokenize(token)
    for sub_token in sub_tokens:
      tok_to_orig_index.append(i)
      all_doc_tokens.append(sub_token)
  bert_example.tok_to_orig_index = tok_to_orig_index
  bert_example.orig_to_tok_index = orig_to_tok_index

  bert_tokens = all_doc_tokens
  if add_cls_sep:
    bert_tokens = ["[CLS]"] + bert_tokens
    bert_tokens = bert_tokens + ["[SEP]"]
  
  tokens_ids = tokenizer.convert_tokens_to_ids(bert_tokens)
  input_mask = [1] * len(tokens_ids)
  while len(tokens_ids) < max_seq_len:
    tokens_ids.append(0)
    input_mask.append(0)
  # tokens_ids = pad_sequences(tokens_ids, maxlen=max_seq_len, truncating="post", padding="post", dtype="int")
  bert_example.tokens_ids = tokens_ids
  bert_example.input_mask = input_mask
  # bert_example.input_mask = [float(i>0) for i in token_ids]

  if sentence_labels is None:
    return bert_example
  

  labels = [0] * len(all_doc_tokens)
  for index, token in enumerate(all_doc_tokens):
    labels[index] = sentence_labels[tok_to_orig_index[index]]
  if add_cls_sep:
    labels = [0] + labels
    labels = labels + [0]
  # labels = pad_sequences(labels, maxlen=max_seq_len, truncating="post", padding="post", dtype="int")
  while len(labels) < max_seq_len:
    labels.append(0)
  bert_example.labels = labels

  return bert_example 


In [0]:
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

def get_dataloader(examples, batch_size=8):
  inputs = torch.tensor([d.tokens_ids for d in examples])
  labels = torch.tensor([d.labels for d in examples])
  masks = torch.tensor([d.input_mask for d in examples])
  sentence_ids = torch.tensor([d.sentence_id for d in examples])
  tensor_data = TensorDataset(inputs, labels, masks, sentence_ids)
  dataloader = DataLoader(tensor_data, batch_size=BATCH_SIZE)
  return dataloader

def get_data(articles, spans, indices):
  assert len(articles) == len(spans)    
  sentences = []
  for index in indices:
    article = articles[index]
    span = spans[index]
    _, _, _, cur_sentences = get_sentence_tokens_labels(article, span, index)
    sentences += cur_sentences
  print(len(sentences))
  print(max([len(s.tokens) for s in sentences]))
  bert_examples = []
  for i, sentence in enumerate(sentences):
    input_feature = convert_sentence_to_input_feature(sentence, i, tokenizer)
    bert_examples.append(input_feature)
  dataloader = get_dataloader(bert_examples, BATCH_SIZE)
  return dataloader, sentences, bert_examples

In [0]:
def flat_accuracy(preds, labels):
  pred_flat = np.argmax(preds, axis=2).flatten()
  labels_flat = labels.flatten()
  return np.sum(pred_flat == labels_flat) / len(labels_flat)

In [0]:
# class RobertaModel(BertModel):
#     config_class = RobertaConfig
#     pretrained_model_archive_map = ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP
#     base_model_prefix = "roberta"

#     def __init__(self, config):
#         super().__init__(config)

#         self.embeddings = RobertaEmbeddings(config)
#         self.init_weights()

#     def get_input_embeddings(self):
#         return self.embeddings.word_embeddings


#     def set_input_embeddings(self, value):
#         self.embeddings.word_embeddings = value

In [0]:
# from transformers import BertPreTrainedModel
# import torch.nn as nn
# from torch.nn import CrossEntropyLoss

# class RobertaForTokenClassification(BertPreTrainedModel):
#     config_class = RobertaConfig
#     pretrained_model_archive_map = ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP
#     base_model_prefix = "roberta"

#     def __init__(self, config):
#         super().__init__(config)
#         self.num_labels = config.num_labels

#         self.roberta = RobertaModel(config)
#         self.dropout = nn.Dropout(config.hidden_dropout_prob)
#         self.classifier = nn.Linear(config.hidden_size, config.num_labels)

#         self.init_weights()

#     def forward(
#         self,
#         input_ids=None,
#         attention_mask=None,
#         token_type_ids=None,
#         position_ids=None,
#         head_mask=None,
#         inputs_embeds=None,
#         labels=None,
#     ):

#         outputs = self.roberta(
#             input_ids,
#             attention_mask=attention_mask,
#             token_type_ids=token_type_ids,
#             position_ids=position_ids,
#             head_mask=head_mask,
#             inputs_embeds=inputs_embeds,
#         )

#         sequence_output = outputs[0]

#         sequence_output = self.dropout(sequence_output)
#         logits = self.classifier(sequence_output)

#         outputs = (logits,) + outputs[2:]  # add hidden states and attention if they are here

#         if labels is not None:
#             loss_fct = CrossEntropyLoss()
#             # Only keep active parts of the loss
#             if attention_mask is not None:
#                 active_loss = attention_mask.view(-1) == 1
#                 active_logits = logits.view(-1, self.num_labels)
#                 active_labels = torch.where(
#                     active_loss, labels.view(-1), torch.tensor(loss_fct.ignore_index).type_as(labels)
#                 )
#                 loss = loss_fct(active_logits, active_labels)
#             else:
#                 loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
#             outputs = (loss,) + outputs

#         return outputs  # (loss), scores, (hidden_states), (attentions)



# class RobertaClassificationHead(nn.Module):
#     """Head for sentence-level classification tasks."""

#     def __init__(self, config):
#         super().__init__()
#         self.dense = nn.Linear(config.hidden_size, config.hidden_size)
#         self.dropout = nn.Dropout(config.hidden_dropout_prob)
#         self.out_proj = nn.Linear(config.hidden_size, config.num_labels)

#     def forward(self, features, **kwargs):
#         x = features[:, 0, :]  # take <s> token (equiv. to [CLS])
#         x = self.dropout(x)
#         x = self.dense(x)
#         x = torch.tanh(x)
#         x = self.dropout(x)
#         x = self.out_proj(x)
#         return x

In [0]:
from tqdm import tqdm, trange
import time
import datetime

def train(model, train_dataloader, eval_dataloader, epochs=5, save_model=False):
  max_grad_norm = 1.0

  for _ in trange(epochs, desc="Epoch"):
    # TRAIN loop
    model.train()
    tr_loss = 0
    nb_tr_examples, nb_tr_steps = 0, 0
    for step, batch in enumerate(train_dataloader):
      # add batch to gpu
      batch = tuple(t.to(device) for t in batch)
      b_input_ids, b_labels, b_input_mask, b_ids = batch
      loss, _ = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask, labels=b_labels)
      loss.backward()
      tr_loss += loss.item()
      nb_tr_examples += b_input_ids.size(0)
      nb_tr_steps += 1
      torch.nn.utils.clip_grad_norm_(parameters=model.parameters(), max_norm=max_grad_norm)
      optimizer.step()
      model.zero_grad()
    print("Train loss: {}".format(tr_loss/nb_tr_steps))

    get_score(model, mode="train")

    model.eval()
    eval_loss, eval_accuracy = 0, 0
    nb_eval_steps, nb_eval_examples = 0, 0
    predictions , true_labels = [], []
    for batch in eval_dataloader:
      batch = tuple(t.to(device) for t in batch)
      b_input_ids, b_labels, b_input_mask, b_ids = batch
      with torch.no_grad():
        tmp_eval_loss, _ = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask, labels=b_labels)
        logits = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)
      # logits = logits[0]
      # logits = logits.detach().cpu().numpy()
      # label_ids = b_labels.to('cpu').numpy()
      # predictions.extend([list(p) for p in np.argmax(logits, axis=2)])
      # true_labels.append(label_ids)
      
      # tmp_eval_accuracy = flat_accuracy(logits, label_ids)
      
      eval_loss += tmp_eval_loss.mean().item()
      # eval_accuracy += tmp_eval_accuracy
      
      nb_eval_examples += b_input_ids.size(0)
      nb_eval_steps += 1
    eval_loss = eval_loss/nb_eval_steps
    print("Validation loss: {}".format(eval_loss))

    get_score(model, mode="eval")
    if save_model:
      model_name = 'model_' + str(datetime.datetime.now()) + '.pt'
      torch.save(model, os.path.join(model_dir, model_name))
      print("Model saved:", model_name)
    print()
    time.sleep(1)
    # print("Validation Accuracy: {}".format(eval_accuracy/nb_eval_steps))
    # pred_tags = [p_i for p in predictions for p_i in p]
    # valid_tags = [l_ii for l in true_labels for l_i in l for l_ii in l_i]
    # print("F1-Score: {}".format(f1_score(pred_tags, valid_tags)))


In [0]:
def get_model_predictions(model, dataloader):
  model.eval()
  predictions , true_labels, sentence_ids = [], [], []
  nb_eval_steps = 0
  for batch in dataloader:
    batch = tuple(t.to(device) for t in batch)
    b_input_ids, b_labels, b_input_mask, b_ids = batch  
    with torch.no_grad():
      logits = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)
    logits = logits[0]
    logits = logits.detach().cpu().numpy()
    label_ids = b_labels.to('cpu').numpy()
    s_ids = b_ids.to('cpu').numpy()
    predictions.extend([list(p) for p in np.argmax(logits, axis=2)])
    # print(label_ids)
    true_labels.extend(label_ids)
    sentence_ids.extend(s_ids)
    nb_eval_steps += 1
  
  return predictions, true_labels, sentence_ids

In [0]:
def merge_spans(current_spans):
  if not current_spans:
    return [] 
  merged_spans = []
  li = current_spans[0][0]
  ri = current_spans[0][1]
  threshold = 2
  for i in range(len(current_spans) - 1):
    span = current_spans[i+1]
    if span[0] - ri < 2:
      ri = span[1]
      continue
    else:
      merged_spans.append((li, ri))
      li = span[0]
      ri = span[1]
  merged_spans.append((li, ri))
  return merged_spans

In [0]:
from shutil import copyfile
def get_score(model, mode=None):
  predicted_spans = [[] for i in range(400)] # TODO 400 hardcoded
  
  def get_span_prediction(prediction_labels, sentence_index, sentences, bert_examples):
    index = sentence_index 
    bert_example = bert_examples[index]
    mask = bert_example.input_mask
    pred_labels_masked = prediction_labels # need to change to predictions later
    pred_labels = []
    for i, m in enumerate(mask):
      if m > 0:
        pred_labels.append(pred_labels_masked[i])
    if bert_example.add_cls_sep:
      pred_labels.pop() # remove ['SEP'] label
      pred_labels.pop(0) # remove ['CLS'] label

    sentence = sentences[index]
    sent_len = len(sentence.tokens)
    final_pred_labels = [0] * sent_len
    cur_map = bert_example.tok_to_orig_index
    for i, label in enumerate(pred_labels):
      final_pred_labels[cur_map[i]] |= label
    # assert final_pred_labels == sentence.labels
    
    word_start_index_map = sentence.word_to_start_char_offset
    word_end_index_map = sentence.word_to_end_char_offset

    article_index = sentence.article_index
    for i, label in enumerate(final_pred_labels):
      if label:
        # print(word_start_index_map[i], word_end_index_map[i])
        predicted_spans[article_index].append((word_start_index_map[i], word_end_index_map[i]))
  
  if mode == "train":
    indices = train_indices
    predictions, true_labels, sentence_ids = get_model_predictions(model, train_dataloader)
    pred_sentences, pred_bert_examples = train_sentences, train_bert_examples
  elif mode == "test":
    predictions, true_labels , sentence_ids = get_model_predictions(model, test_dataloader)
    pred_sentences, pred_bert_examples = test_sentences, test_bert_examples
  else:
    indices = eval_indices
    predictions, true_labels, sentence_ids = get_model_predictions(model, eval_dataloader)
    pred_sentences, pred_bert_examples = eval_sentences, eval_bert_examples

  merged_predicted_spans = []
  # TODO sorting of spans???? may not be in order??
  for ii, _ in enumerate(predictions):
    get_span_prediction(predictions[ii], sentence_ids[ii], pred_sentences, pred_bert_examples)
  for span in predicted_spans:
    merged_predicted_spans.append(merge_spans(span))
  if mode == "test":
    return merged_predicted_spans 
  if not os.path.isdir("predictions"):
    os.mkdir("predictions")
  copyfile("gdrive/My Drive/propaganda_detection/tools/task-SI_scorer.py", "predictions/task-SI_scorer.py")
  with open("predictions/predictions.tsv", 'w') as fp:
    for index in indices:
      filename = "article" + article_ids[index] + ".task1-SI.labels"
      copyfile(os.path.join(data_dir, "train-labels-task1-span-identification/" + filename), "predictions/" + filename)
      for ii in merged_predicted_spans[index]:
        fp.write(article_ids[index] + "\t" + str(ii[0]) + "\t" + str(ii[1]) + "\n")

  !python3 predictions/task-SI_scorer.py -s predictions/predictions.tsv -r predictions/ -m

  for index in indices:
    filename = "article" + article_ids[index] + ".task1-SI.labels"
    os.remove("predictions/" + filename)

In [21]:
from transformers import RobertaTokenizer

articles, article_ids = read_articles('train-articles')
spans = read_spans()
TAGGING_SCHEME = "PN" # Positive Negative
# TAGGING_SCHEME = "BIOE"
NUM_ARTICLES = len(articles)
NUM_ARTICLES = 30
articles = articles[0:NUM_ARTICLES]
spans = spans[0:NUM_ARTICLES]
BATCH_SIZE=8
np.random.seed(245)
indices = np.arange(NUM_ARTICLES)
np.random.shuffle(indices)
train_indices = indices[:int(0.9 * NUM_ARTICLES)]
eval_indices = indices[int(0.9 * NUM_ARTICLES):]

bert_model_class = 'bert-base-uncased'
# bert_model_class = "bert-large-uncased-whole-word-masking"
# tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', lower_case=True)
# tokenizer = BertTokenizer.from_pretrained(bert_model_class, lower_case=True)
tokenizer = RobertaTokenizer.from_pretrained('roberta-base', lower_case=True)

bert_model_class
train_dataloader, train_sentences, train_bert_examples = get_data(articles, spans, train_indices)
eval_dataloader, eval_sentences, eval_bert_examples = get_data(articles, spans, eval_indices)


HBox(children=(IntProgress(value=0, description='Downloading', max=898823, style=ProgressStyle(description_wid…




HBox(children=(IntProgress(value=0, description='Downloading', max=456318, style=ProgressStyle(description_wid…


988
106
171
53


In [0]:
# from transformers import BertForTokenClassification
from transformers import RobertaForTokenClassification

num_labels = 2 + int(TAGGING_SCHEME == "BIO") + 2 * int(TAGGING_SCHEME == "BIOE")
model = RobertaForTokenClassification.from_pretrained('roberta-base', num_labels=num_labels)
model.cuda()

if TAGGING_SCHEME == "BIOE":
  WEIGHTS = torch.tensor([1.0, 5.0, 10.0, 5.0]).cuda()
else:
  WEIGHTS = torch.tensor([1.0, 1.0]).cuda()

from torch.optim import Adam
FULL_FINETUNING = False 
if FULL_FINETUNING:
  param_optimizer = list(model.named_parameters())
  # no_decay = ['bias', 'gamma', 'beta']
  no_decay = ["bias", "LayerNorm.weight"]
  optimizer_grouped_parameters = [
    {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
      'weight_decay_rate': 0.01},
    {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
      'weight_decay_rate': 0.0}
  ]
else:
  param_optimizer = list(model.classifier.named_parameters()) 
  optimizer_grouped_parameters = [{"params": [p for n, p in param_optimizer]}]
optimizer = Adam(optimizer_grouped_parameters, lr=3e-5) # lr 3e-5

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
train(model, train_dataloader, eval_dataloader, epochs=5, save_model=(NUM_ARTICLES >= 1150))

HBox(children=(IntProgress(value=0, description='Downloading', max=524, style=ProgressStyle(description_width=…




HBox(children=(IntProgress(value=0, description='Downloading', max=501200538, style=ProgressStyle(description_…




Epoch:   0%|          | 0/5 [00:00<?, ?it/s]

Train loss: 0.4164031496451747
2020-04-12 20:05:42,290 - INFO - Checking user submitted file
2020-04-12 20:05:42,294 - INFO - Scoring the submission with precision and recall method
2020-04-12 20:05:42,294 - INFO - Precision=0.000000/0=0.000000	Recall=0.000000/227=0.000000
2020-04-12 20:05:42,294 - INFO - F1=0.000000
Validation loss: 0.32442435351285065
2020-04-12 20:05:49,724 - INFO - Checking user submitted file
2020-04-12 20:05:49,725 - INFO - Scoring the submission with precision and recall method
2020-04-12 20:05:49,725 - INFO - Precision=0.000000/0=0.000000	Recall=0.000000/34=0.000000
2020-04-12 20:05:49,725 - INFO - F1=0.000000



Epoch:  20%|██        | 1/5 [00:46<03:06, 46.65s/it]

Train loss: 0.3762930757816761
2020-04-12 20:06:27,654 - INFO - Checking user submitted file
2020-04-12 20:06:27,658 - INFO - Scoring the submission with precision and recall method
2020-04-12 20:06:27,658 - INFO - Precision=0.000000/0=0.000000	Recall=0.000000/227=0.000000
2020-04-12 20:06:27,658 - INFO - F1=0.000000
Validation loss: 0.2823106064037843
2020-04-12 20:06:34,863 - INFO - Checking user submitted file
2020-04-12 20:06:34,864 - INFO - Scoring the submission with precision and recall method
2020-04-12 20:06:34,864 - INFO - Precision=0.000000/0=0.000000	Recall=0.000000/34=0.000000
2020-04-12 20:06:34,864 - INFO - F1=0.000000



Epoch:  40%|████      | 2/5 [01:32<02:19, 46.48s/it]

Train loss: 0.3555475573145574
2020-04-12 20:07:14,108 - INFO - Checking user submitted file
2020-04-12 20:07:14,112 - INFO - Scoring the submission with precision and recall method
2020-04-12 20:07:14,112 - INFO - Precision=0.000000/0=0.000000	Recall=0.000000/227=0.000000
2020-04-12 20:07:14,112 - INFO - F1=0.000000
Validation loss: 0.2568835717710582
2020-04-12 20:07:21,667 - INFO - Checking user submitted file
2020-04-12 20:07:21,668 - INFO - Scoring the submission with precision and recall method
2020-04-12 20:07:21,668 - INFO - Precision=0.000000/0=0.000000	Recall=0.000000/34=0.000000
2020-04-12 20:07:21,668 - INFO - F1=0.000000



Epoch:  60%|██████    | 3/5 [02:19<01:32, 46.49s/it]

Train loss: 0.34651527049080016
2020-04-12 20:08:00,766 - INFO - Checking user submitted file
2020-04-12 20:08:00,770 - INFO - Scoring the submission with precision and recall method
2020-04-12 20:08:00,770 - INFO - Precision=0.000000/0=0.000000	Recall=0.000000/227=0.000000
2020-04-12 20:08:00,771 - INFO - F1=0.000000
Validation loss: 0.2401843633164059
2020-04-12 20:08:08,865 - INFO - Checking user submitted file
2020-04-12 20:08:08,866 - INFO - Scoring the submission with precision and recall method
2020-04-12 20:08:08,866 - INFO - Precision=0.000000/0=0.000000	Recall=0.000000/34=0.000000
2020-04-12 20:08:08,866 - INFO - F1=0.000000



Epoch:  80%|████████  | 4/5 [03:05<00:46, 46.53s/it]

Train loss: 0.3384283809892593


In [0]:
train(model, train_dataloader, eval_dataloader, epochs=5, save_model=(NUM_ARTICLES >= 1150))

Epoch:   0%|          | 0/5 [00:00<?, ?it/s]

Train loss: 0.3408269793275864
2020-02-29 18:29:52,127 - INFO - Checking user submitted file
2020-02-29 18:29:52,131 - INFO - Scoring the submission with precision and recall method
2020-02-29 18:29:52,131 - INFO - Precision=0.000000/0=0.000000	Recall=0.000000/227=0.000000
2020-02-29 18:29:52,131 - INFO - F1=0.000000
Validation loss: 0.23104540597308765
2020-02-29 18:30:04,765 - INFO - Checking user submitted file
2020-02-29 18:30:04,765 - INFO - Scoring the submission with precision and recall method
2020-02-29 18:30:04,765 - INFO - Precision=0.000000/0=0.000000	Recall=0.000000/34=0.000000
2020-02-29 18:30:04,766 - INFO - F1=0.000000



Epoch:  20%|██        | 1/5 [01:22<05:28, 82.15s/it]

Train loss: 0.3383904413710679
2020-02-29 18:31:13,032 - INFO - Checking user submitted file
2020-02-29 18:31:13,036 - INFO - Scoring the submission with precision and recall method
2020-02-29 18:31:13,036 - INFO - Precision=0.000000/0=0.000000	Recall=0.000000/227=0.000000
2020-02-29 18:31:13,036 - INFO - F1=0.000000
Validation loss: 0.2233789214356379
2020-02-29 18:31:23,272 - INFO - Checking user submitted file
2020-02-29 18:31:23,272 - INFO - Scoring the submission with precision and recall method
2020-02-29 18:31:23,273 - INFO - Precision=0.000000/0=0.000000	Recall=0.000000/34=0.000000
2020-02-29 18:31:23,273 - INFO - F1=0.000000



Epoch:  40%|████      | 2/5 [02:36<03:59, 79.75s/it]

Train loss: 0.3321099298134927
2020-02-29 18:32:26,758 - INFO - Checking user submitted file
2020-02-29 18:32:26,762 - INFO - Scoring the submission with precision and recall method
2020-02-29 18:32:26,762 - INFO - Precision=0.000000/0=0.000000	Recall=0.000000/227=0.000000
2020-02-29 18:32:26,762 - INFO - F1=0.000000
Validation loss: 0.21720669249242003
2020-02-29 18:32:36,795 - INFO - Checking user submitted file
2020-02-29 18:32:36,796 - INFO - Scoring the submission with precision and recall method
2020-02-29 18:32:36,796 - INFO - Precision=0.000000/0=0.000000	Recall=0.000000/34=0.000000
2020-02-29 18:32:36,796 - INFO - F1=0.000000



Epoch:  60%|██████    | 3/5 [03:49<02:35, 77.73s/it]

Train loss: 0.3315514555020678
2020-02-29 18:33:39,491 - INFO - Checking user submitted file
2020-02-29 18:33:39,496 - INFO - Scoring the submission with precision and recall method
2020-02-29 18:33:39,496 - INFO - Precision=0.000000/0=0.000000	Recall=0.000000/227=0.000000
2020-02-29 18:33:39,496 - INFO - F1=0.000000
Validation loss: 0.21274282478473402
2020-02-29 18:33:51,389 - INFO - Checking user submitted file
2020-02-29 18:33:51,390 - INFO - Scoring the submission with precision and recall method
2020-02-29 18:33:51,390 - INFO - Precision=0.000000/0=0.000000	Recall=0.000000/34=0.000000
2020-02-29 18:33:51,390 - INFO - F1=0.000000



Epoch:  80%|████████  | 4/5 [05:06<01:17, 77.58s/it]

Train loss: 0.33211116943388214
2020-02-29 18:34:56,982 - INFO - Checking user submitted file
2020-02-29 18:34:56,986 - INFO - Scoring the submission with precision and recall method
2020-02-29 18:34:56,986 - INFO - Precision=0.000000/0=0.000000	Recall=0.000000/227=0.000000
2020-02-29 18:34:56,986 - INFO - F1=0.000000
Validation loss: 0.20956323261965404
2020-02-29 18:35:12,562 - INFO - Checking user submitted file
2020-02-29 18:35:12,563 - INFO - Scoring the submission with precision and recall method
2020-02-29 18:35:12,563 - INFO - Precision=0.000000/0=0.000000	Recall=0.000000/34=0.000000
2020-02-29 18:35:12,563 - INFO - F1=0.000000



Epoch: 100%|██████████| 5/5 [06:27<00:00, 78.67s/it]


In [0]:
tdevice = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = torch.load(os.path.join(model_dir, 'model_2020-02-25 20:05:27.163346.pt'))
# get_score(model, mode="train")
# print()
get_score(model, mode="eval")

2020-02-26 07:44:03,409 - INFO - Checking user submitted file
2020-02-26 07:44:03,425 - INFO - Scoring the submission with precision and recall method
2020-02-26 07:44:03,598 - INFO - Precision=228.062968/434=0.525491	Recall=245.284300/607=0.404093
2020-02-26 07:44:03,599 - INFO - F1=0.456865


In [0]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

ind = 1
model = torch.load(os.path.join(model_dir, 'model_370_44_bioe.pt'))

test_articles = [articles[ind]]
test_spans = [[]] * len(test_articles)

test_dataloader, test_sentences, test_bert_examples = get_data(test_articles, test_spans, indices=np.arange(len(test_articles)))
sps = get_score(model, mode="test")
print_spans(articles[ind], sps[0])
print('--' * 50)
print_spans(articles[ind], spans[ind])

28
38
Stop Islamization of America.
"We condemn all those whose behaviours and views run counter to our shared values
"Defeat Jihad"
"a striking blow against freedom"
the "the nation that gave the world the Magna Carta is dead".
The UK should never become a stage for inflammatory speakers who promote hate."
for this so-called land of democracy and freedom of speech,"
hate preachers
hatred'
"delighted"
most
in the world.

----------------------------------------------------------------------------------------------------
Stop Islamization of America
We condemn all those whose behaviours and views run counter to our shared values
Defeat Jihad"
the nation that gave the world the Magna Carta is dead"
The UK should never become a stage for inflammatory speakers who promote hate."
hate preachers
delighted" with the decision
a striking blow against freedom
so-called land of democracy and freedom of speech



In [0]:
ind = 32
model = torch.load(os.path.join(model_dir, 'model_bioe_weighted.pt'))

test_articles, test_article_ids = read_articles('dev-articles')
# test_articles = [articles[ind]]
test_spans = [[]] * len(test_articles)

test_dataloader, test_sentences, test_bert_examples = get_data(test_articles, test_spans, indices=np.arange(len(test_articles)))
sps = get_score(model, mode="test")
# print_spans(articles[ind], sps[0])
# print('--' * 50)
# print_spans(articles[ind], spans[ind])

3177
103


In [0]:
from google.colab import files
with open('predictions/dev_predictions.txt', 'w') as fp:
  for index in range(len(test_articles)):
    for ii in sps[index]:
      fp.write(test_article_ids[index] + "\t" + str(ii[0]) + "\t" + str(ii[1]) + "\n")
files.download('predictions/dev_predictions.txt')

In [0]:
# torch.save(model, os.path.join(model_dir, 'model_300_42.pt'))


In [0]:
for index in eval_indices:
  print("article index:",index)
  span = merged_predicted_spans[index]
  print_spans(articles[index], span)
  print()
  print_spans(articles[index], spans[index])
  print("--" * 50)

article index: 107
“sort of in a panic”
that someone might “break into his apartment” looking for it, like “that Watergate crap.”
it,”
black-owned businesses.
institutions
“spiritual advisor”
Reading Obama’s 1995 memoir, you might almost get the impression that after a prudent first term, during his second he might side with, I dunno, Black Lives Matter and encourage a wave of black rage and police retreat that drove up the death toll from murder by 20% in his last two years in office, an incremental death toll a little bigger than the U.S. combat death toll from the equally stupid Iraq War.


spiritual advisor
sort of in a panic
black
shaking down white institutions
Reading Obama’s 1995 memoir, you might almost get the impression that after a prudent first term, during his second he might side with, I dunno, Black Lives Matter and encourage a wave of black rage and police retreat that drove up the death toll from murder by 20% in his last two years in office, an incremental death toll

In [0]:
import spacy

In [0]:
nlp = spacy.load("en_core_web_sm")
doc = nlp("Apple is looking at buying U.K. startup for $1 billion")

for token in doc:
    print(token.text,  token.pos_)

Apple PROPN
is VERB
looking VERB
at ADP
buying VERB
U.K. PROPN
startup NOUN
for ADP
$ SYM
1 NUM
billion NUM


#GPT-2

In [0]:
from transformers import GPT2Tokenizer, GPT2Model, GPT2PreTrainedModel
import torch
import torch.nn as nn

In [0]:
# tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
# text = "What is the fastest runningggggggggg car in the"
# indexed_tokens = tokenizer.encode(text)
# print(indexed_tokens)
# print(tokenizer.decode([2491]))
# tokens_tensor = torch.tensor([indexed_tokens])
# print(tokens_tensor)

In [0]:
class GPT2ForTokenClassification(GPT2PreTrainedModel):
  def __init__(self,config):
    super().__init__(config)
    self.num_labels = config.num_labels

    self.gpt2 = GPT2Model(config)
    self.dropout = nn.Dropout(config.resid_pdrop)
    self.classifier = nn.Linear(config.n_embd, config.num_labels)
    self.init_weights()
  
  def forward(self,input_ids=None,
              attention_mask=None,
              token_type_ids = None,
              position_ids = None,
              head_mask = None,
              input_embeds = None,
              labels = None,
              ):
    output = self.gpt2(
        input_ids,
        attention_mask = attention_mask,
        token_type_ids = token_type_ids,
        position_ids = position_ids,
        head_mask = head_mask,
        inputs_embeds = inputs_embeds, 
    )

    sequence_output = outputs[0]
    sequence_output = self.dropout(sequence_output)
    logits = self.classifier(sequence_output)
    outputs = (logits,) + output[2:]
    if labels is not None:
      loss_fct =  CrossEntropyLoss()

      if attention_mask is not None:
        active_loss = attention_mask.view(-1)==1
        active_logits = logits.view(-1, self.num_labels)[active_loss]
        active_labels = labels.view(-1)[active_loss]
        loss = loss_fct(active_logits, active_labels)
      else:
        loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
      outputs = (loss,) + outputs
    
    return outputs
        
    


In [0]:
model = GPT2ForTokenClassification.from_pretrained('gpt2', num_labels=2)

In [0]:
model = model.cuda()
print(model)

GPT2ForTokenClassification(
  (gpt2): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0): Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
      (1): Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): 

In [0]:
gpt2tokenizer = GPT2Tokenizer.from_pretrained('gpt2')