In [1]:
# Mount Google Drive at /content/gdrive. Later cells use the relative path
# "gdrive/My Drive/...", which presumably resolves because the working
# directory is /content — TODO confirm.
from google.colab import drive
drive.mount('/content/gdrive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/gdrive


In [2]:
# Install the Hugging Face transformers package into the runtime. The version
# is not pinned; the recorded run below downloaded transformers 2.8.0.
!pip install transformers

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/a3/78/92cedda05552398352ed9784908b834ee32a0bd071a9b32de287327370b7/transformers-2.8.0-py3-none-any.whl (563kB)
[K     |▋                               | 10kB 19.3MB/s eta 0:00:01[K     |█▏                              | 20kB 4.3MB/s eta 0:00:01[K     |█▊                              | 30kB 6.1MB/s eta 0:00:01[K     |██▎                             | 40kB 7.8MB/s eta 0:00:01[K     |███                             | 51kB 5.0MB/s eta 0:00:01[K     |███▌                            | 61kB 5.8MB/s eta 0:00:01[K     |████                            | 71kB 6.6MB/s eta 0:00:01[K     |████▋                           | 81kB 7.3MB/s eta 0:00:01[K     |█████▎                          | 92kB 5.9MB/s eta 0:00:01[K     |█████▉                          | 102kB 6.4MB/s eta 0:00:01[K     |██████▍                         | 112kB 6.4MB/s eta 0:00:01[K     |███████                         | 122kB 6.4M

In [0]:
import os
import sys
import torch
import csv
import numpy as np

In [0]:
# Project locations. home_dir is a relative path, so it is resolved against the
# current working directory.
home_dir = "gdrive/My Drive/propaganda_detection"
data_dir = os.path.join(home_dir, "datasets")  # articles and span label files are read from here
model_dir = os.path.join(home_dir, "model_dir")  # models / state dicts are saved here
# Create the model directory on first use (os.mkdir does not create missing parents).
if not os.path.isdir(model_dir):
  os.mkdir(model_dir)

In [0]:
# Read the article texts of one dataset sub-directory
def read_articles(article_dir):
  """Load every article file found in a dataset sub-directory.

  Args:
    article_dir: directory name, relative to the module-level `data_dir`,
      holding one text file per article.

  Returns:
    A pair `(articles, article_ids)` of parallel lists ordered by sorted file
    name: the full text of each file, and `filename[7:-4]` for each file
    (the part between a 7-character prefix and a 4-character suffix;
    presumably "article" and ".txt" — TODO confirm against the dataset).
  """
  train_dir = os.path.join(data_dir, article_dir)
  # List the directory once so texts and ids cannot get out of step.
  filenames = sorted(os.listdir(train_dir))
  articles = []
  for filename in filenames:
    # `with` closes the handle even if read() raises.
    with open(os.path.join(train_dir, filename)) as myfile:
      articles.append(myfile.read())
  article_ids = [filename[7:-4] for filename in filenames]
  return articles, article_ids

In [0]:
# Read the gold span labels of the training articles
def read_spans():
  """Load the span annotations of every label file.

  Reads all files in `<data_dir>/train-labels-task1-span-identification`
  (sorted by file name) as tab-separated rows and takes columns 1 and 2 of
  each row as integer offsets.

  Returns:
    A list with one entry per label file; each entry is a list of
    `(int(row[1]), int(row[2]))` tuples in file order. A file without rows
    yields an empty list.
  """
  label_dir = os.path.join(data_dir, "train-labels-task1-span-identification")
  spans = []
  for filename in sorted(os.listdir(label_dir)):
    # `with` closes the handle even if parsing raises.
    with open(os.path.join(label_dir, filename)) as myfile:
      tsvreader = csv.reader(myfile, delimiter="\t")
      # csv yields [] for blank lines; skip them instead of failing on row[1].
      span = [(int(row[1]), int(row[2])) for row in tsvreader if row]
    spans.append(span)
  return spans

In [0]:
def print_spans(article, span):
  """Print the slice of `article` covered by each span, then one blank line.

  Each element of `span` is indexed as `[0]` (slice start) and `[1]`
  (slice end, exclusive).
  """
  for bounds in span:
    begin, end = bounds[0], bounds[1]
    print(article[begin:end])
  print()

In [0]:
class example_sentence:
  """One line of an article split into whitespace-delimited words, with labels and offsets."""

  def __init__(self):
    self.tokens = []  # words of the sentence
    self.labels = []  # one label per word
    self.article_index = -1 # index of the article to which the sentence is associated
    self.index = -1 # index of the sentence in that article
    self.word_to_start_char_offset = []  # per word: start character offset in the article
    self.word_to_end_char_offset = []  # per word: end character offset in the article

  def __str__(self):
    """Return a multi-line description of all fields.

    The text is returned rather than printed, so `str(obj)` has no side
    effects and `print(obj)` shows each field on its own line.
    """
    return "\n".join([
      "tokens - {}".format(self.tokens),
      "labels - {}".format(self.labels),
      "article_index - {}".format(self.article_index),
      "index - {}".format(self.index),
      "start_offset - {}".format(self.word_to_start_char_offset),
      "end_offset - {}".format(self.word_to_end_char_offset),
    ])

In [0]:
def is_whitespace(c):
  """Return True if `c` is a space, tab, CR, LF or U+202F; False otherwise."""
  return c in (" ", "\t", "\r", "\n") or ord(c) == 0x202F

def get_sentence_tokens_labels(article, span=None, article_index=None):
  """Split an article into lines of words and, optionally, label the words.

  The text is scanned character by character. Words are maximal runs of
  non-whitespace characters (see `is_whitespace`); every newline closes the
  current line, and lines without any word are dropped. Each kept line is
  one "sentence".

  Args:
    article: the article text.
    span: optional list of `(start, end)` character offsets; `end` is treated
      as exclusive (the last covered character is `end - 1`). A span that
      crosses line boundaries is split into one piece per sentence.
    article_index: value stored on every produced sentence object.

  Returns:
    If `span` is None:
      `(sentence_tokens, (word_to_start_char_offset, word_to_end_char_offset))`
    Otherwise:
      `(sentence_tokens, labels, (word_to_start_char_offset,
        word_to_end_char_offset), sentences)`
    where the offset dicts are keyed by `(sentence index, word index)`, end
    offsets are exclusive, and `sentences` is a list of `example_sentence`.

  Labels depend on the module-level `TAGGING_SCHEME`:
    "BIO"  -> 2 on the first word of a span piece, 1 on the remaining words;
    "BIOE" -> 2 on the first word, 3 on the last, 1 in between;
    anything else -> 1 on every word of the span piece.
  Words outside all spans keep label 0.
  """
  doc_tokens = []  # words of the line currently being scanned
  char_to_word_offset = []  # per character: (sentence index, word index)
  current_sentence_tokens = [] # actually all sentence tokens for particular article. #TODO rename
  word_to_start_char_offset = {}
  word_to_end_char_offset = {}
  prev_is_whitespace = True
  current_word_position = None  # key of the word being read, None between words
  for index, c in enumerate(article):
    if c == "\n":
      # A newline closes the current line; lines without words are dropped.
      if doc_tokens:
        current_sentence_tokens.append(doc_tokens)
      doc_tokens = []
    if is_whitespace(c):
      prev_is_whitespace = True
      if current_word_position is not None:
        # The word ended just before this whitespace character (exclusive end).
        word_to_end_char_offset[current_word_position] = index
        current_word_position = None
    else:
      if prev_is_whitespace:
        doc_tokens.append(c)
        current_word_position = (len(current_sentence_tokens), len(doc_tokens) - 1)
        word_to_start_char_offset[current_word_position] = index # start offset of word
      else:
        doc_tokens[-1] += c
      prev_is_whitespace = False
    char_to_word_offset.append((len(current_sentence_tokens), len(doc_tokens) - 1))
  if doc_tokens:
    current_sentence_tokens.append(doc_tokens)
  if current_word_position is not None:
    # The text ended inside a word. End offsets are exclusive everywhere else,
    # so the word ends at len(article), not at the index of its last character.
    word_to_end_char_offset[current_word_position] = len(article)
    current_word_position = None
  if span is None:
    return current_sentence_tokens, (word_to_start_char_offset, word_to_end_char_offset)

  current_propaganda_labels = [[0] * len(tokens) for tokens in current_sentence_tokens]

  # Translate every character span into (sentence, word) start/end pairs that
  # never cross a sentence boundary.
  start_positions = []
  end_positions = []
  for sp in span:
    first = char_to_word_offset[sp[0]]
    last = char_to_word_offset[sp[1] - 1]
    if first[0] == last[0]:
      start_positions.append(first)
      end_positions.append(last)
      continue
    l1, l2 = first[0], last[0]
    # Piece in the first sentence: from the start word to that sentence's end.
    start_positions.append(first)
    end_positions.append((l1, len(current_sentence_tokens[l1]) - 1))
    # Sentences strictly in between are covered completely.
    for line in range(l1 + 1, l2):
      start_positions.append((line, 0))
      end_positions.append((line, len(current_sentence_tokens[line]) - 1))
    # Piece in the last sentence: from its first word to the end word.
    start_positions.append((l2, 0))
    end_positions.append(last)

  for (sent, first_word), (end_sent, last_word) in zip(start_positions, end_positions):
    assert sent == end_sent
    labels = current_propaganda_labels[sent]
    # The three schemes are mutually exclusive: with independent `if`s the
    # final `else` would overwrite the BIO labels with plain 1s.
    if TAGGING_SCHEME == "BIO":
      labels[first_word] = 2 # Begin label
      if first_word < last_word:
        labels[first_word + 1 : last_word + 1] = [1] * (last_word - first_word)
    elif TAGGING_SCHEME == "BIOE":
      labels[first_word] = 2 # Begin label
      if first_word < last_word:
        labels[first_word + 1 : last_word] = [1] * (last_word - first_word - 1)
        labels[last_word] = 3 # End label
    else:
      labels[first_word : last_word + 1] = [1] * (last_word + 1 - first_word)

  num_sentences = len(current_sentence_tokens)

  # Regroup the (sentence, word)-keyed offset dicts into one list per sentence.
  start_offset_list = get_list_from_dict(num_sentences, word_to_start_char_offset)
  end_offset_list = get_list_from_dict(num_sentences, word_to_end_char_offset)
  sentences = []
  for i in range(num_sentences):
    sentence = example_sentence()
    sentence.tokens = current_sentence_tokens[i]
    sentence.labels = current_propaganda_labels[i]
    sentence.article_index = article_index
    sentence.index = i
    sentence.word_to_start_char_offset = start_offset_list[i]
    sentence.word_to_end_char_offset = end_offset_list[i]
    num_words = len(sentence.tokens)
    assert len(sentence.labels) == num_words
    assert len(sentence.word_to_start_char_offset) == num_words
    assert len(sentence.word_to_end_char_offset) == num_words
    sentences.append(sentence)

  return current_sentence_tokens, current_propaganda_labels, (word_to_start_char_offset, word_to_end_char_offset), sentences

In [0]:
def get_list_from_dict(num_sentences, word_offsets):
  """Group offset values by sentence.

  `word_offsets` maps keys whose first element is a sentence index to a
  value. Returns a list of `num_sentences` lists; list `i` holds the values
  of all keys with sentence index `i`, in the dict's iteration order.
  """
  per_sentence = [[] for _ in range(num_sentences)]
  for position, offset in word_offsets.items():
    per_sentence[position[0]].append(offset)
  return per_sentence

In [0]:
class BertExample:
  """Model-ready features of one sentence (filled in by convert_sentence_to_input_feature)."""

  def __init__(self):
    self.add_cls_sep = True  # whether [CLS]/[SEP] were added around the wordpieces
    self.sentence_id = -1  # position of the sentence in the list it was built from
    self.orig_to_tok_index = []  # per word: index of its first wordpiece
    self.tok_to_orig_index = []  # per wordpiece: index of the word it came from
    self.labels = None  # per-position labels, or None when the sentence has none
    self.tokens_ids = []  # padded vocabulary ids
    self.input_mask = []  # 1 for real positions, 0 for padding

  def __str__(self):
    """Return a short description; nothing is printed as a side effect."""
    return "sentence_id {}".format(self.sentence_id)

In [0]:
def convert_sentence_to_input_feature(sentence, sentence_id, tokenizer, add_cls_sep=True, max_seq_len=256):
  """Turn one sentence into fixed-length model inputs.

  Every word is split with `tokenizer.tokenize`; the pieces are optionally
  wrapped in "[CLS]" / "[SEP]", converted to ids, and right-padded with 0 up
  to `max_seq_len`. Sentences with more pieces than fit are truncated at the
  end so that every example has the same length and can be stacked into one
  tensor.

  Args:
    sentence: object with `tokens` (list of words) and `labels` (one label
      per word, or None).
    sentence_id: identifier stored on the result.
    tokenizer: object providing `tokenize(str)` and
      `convert_tokens_to_ids(list)`.
    add_cls_sep: whether to add "[CLS]" and "[SEP]" around the pieces.
    max_seq_len: length of `tokens_ids`, `input_mask` and `labels`
      (assumed to be at least 2 when `add_cls_sep` is True).

  Returns:
    A `BertExample`. `tok_to_orig_index` covers only the pieces that were
    kept; `orig_to_tok_index` still lists every word, so entries of truncated
    words point past the kept pieces. `labels` is left untouched when
    `sentence.labels` is None; otherwise each piece gets its word's label and
    the special/padding positions get 0.
  """
  bert_example = BertExample()
  bert_example.sentence_id = sentence_id
  bert_example.add_cls_sep = add_cls_sep

  sentence_tokens = sentence.tokens
  sentence_labels = sentence.labels

  tok_to_orig_index = []
  orig_to_tok_index = []
  all_doc_tokens = []
  for (i, token) in enumerate(sentence_tokens):
    orig_to_tok_index.append(len(all_doc_tokens))
    for sub_token in tokenizer.tokenize(token):
      tok_to_orig_index.append(i)
      all_doc_tokens.append(sub_token)

  # Keep room for [CLS]/[SEP]; without truncation a long sentence would give
  # a row longer than max_seq_len and the batch could not be tensorised.
  max_pieces = max(max_seq_len - (2 if add_cls_sep else 0), 0)
  if len(all_doc_tokens) > max_pieces:
    all_doc_tokens = all_doc_tokens[:max_pieces]
    tok_to_orig_index = tok_to_orig_index[:max_pieces]
  bert_example.tok_to_orig_index = tok_to_orig_index
  bert_example.orig_to_tok_index = orig_to_tok_index

  bert_tokens = all_doc_tokens
  if add_cls_sep:
    bert_tokens = ["[CLS]"] + bert_tokens + ["[SEP]"]

  tokens_ids = list(tokenizer.convert_tokens_to_ids(bert_tokens))
  input_mask = [1] * len(tokens_ids)
  # Right-pad ids and mask with 0 (a non-positive count adds nothing).
  padding = max_seq_len - len(tokens_ids)
  tokens_ids += [0] * padding
  input_mask += [0] * padding
  bert_example.tokens_ids = tokens_ids
  bert_example.input_mask = input_mask

  if sentence_labels is None:
    return bert_example

  # Every wordpiece inherits the label of the word it came from.
  labels = [sentence_labels[orig_index] for orig_index in tok_to_orig_index]
  if add_cls_sep:
    labels = [0] + labels + [0]
  labels += [0] * (max_seq_len - len(labels))
  bert_example.labels = labels

  return bert_example


In [0]:
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import BertTokenizer

def get_dataloader(examples, batch_size=8):
  """Wrap feature objects in a DataLoader.

  Args:
    examples: objects with `tokens_ids`, `labels`, `input_mask` (equal-length
      lists of ints) and `sentence_id` (int).
    batch_size: number of examples per batch. This argument is honoured; the
      module-level BATCH_SIZE is no longer read here.

  Returns:
    A DataLoader over a TensorDataset yielding
    `(input ids, labels, masks, sentence ids)` batches. No sampler or shuffle
    option is passed, so the DataLoader defaults apply.
  """
  inputs = torch.tensor([d.tokens_ids for d in examples])
  labels = torch.tensor([d.labels for d in examples])
  masks = torch.tensor([d.input_mask for d in examples])
  sentence_ids = torch.tensor([d.sentence_id for d in examples])
  tensor_data = TensorDataset(inputs, labels, masks, sentence_ids)
  dataloader = DataLoader(tensor_data, batch_size=batch_size)
  return dataloader

def get_data(articles, spans, indices):
  """Build sentences, model features and a dataloader for selected articles.

  For each index in `indices` (in the given order) the article and its spans
  are split into labelled sentences. The total number of sentences and the
  largest word count of any sentence are printed. Every sentence is then
  converted with the module-level `tokenizer`, using its position in the
  combined list as sentence id, and batched with the module-level
  `BATCH_SIZE`.

  Returns:
    `(dataloader, sentences, bert_examples)`.
  """
  assert len(articles) == len(spans)
  sentences = []
  for article_idx in indices:
    _, _, _, article_sentences = get_sentence_tokens_labels(
        articles[article_idx], spans[article_idx], article_idx)
    sentences += article_sentences
  print(len(sentences))
  print(max([len(sentence.tokens) for sentence in sentences]))
  bert_examples = [
      convert_sentence_to_input_feature(sentence, position, tokenizer)
      for position, sentence in enumerate(sentences)
  ]
  return get_dataloader(bert_examples, BATCH_SIZE), sentences, bert_examples

In [0]:
def flat_accuracy(preds, labels):
  """Fraction of positions whose arg-max class (over axis 2 of `preds`) equals the label.

  Both the arg-max result and `labels` are flattened before comparison.
  """
  predicted = np.argmax(preds, axis=2).flatten()
  expected = labels.flatten()
  n_correct = np.sum(predicted == expected)
  return n_correct / len(expected)

In [0]:
from transformers import DistilBertPreTrainedModel, DistilBertModel
import torch.nn as nn
from torch.nn import CrossEntropyLoss

class CustomBertForTokenClassification(DistilBertPreTrainedModel):
    """DistilBERT encoder followed by a per-token linear classification head."""

    def __init__(self, config):
        """Build the encoder and a `hidden_size -> num_labels` classifier from `config`."""
        super().__init__(config)
        self.num_labels = config.num_labels
        self.my_hidden_size = 128  # only referenced by the commented-out two-layer head
        self.bert = DistilBertModel(config)
        self.dropout = nn.Dropout(0)  # p=0, i.e. currently a no-op
        # self.my_hidden = nn.Linear(config.hidden_size, self.my_hidden_size)
        # self.classifier = nn.Linear(self.my_hidden_size, config.num_labels)
        self.classifier = nn.Linear(config.hidden_size, config.num_labels)

        self.init_weights()

    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        token_type_ids=None,
        position_ids=None,
        head_mask=None,
        inputs_embeds=None,
        labels=None,
    ):
        """Run the encoder and classify every position.

        `token_type_ids` and `position_ids` are accepted so callers written
        for BERT-style models keep working, but they are not passed to the
        encoder.

        Returns a tuple `(loss), logits, (extra encoder outputs)`; `loss` is
        present only when `labels` is given. When `attention_mask` is given,
        positions whose mask is not 1 are excluded from the loss by replacing
        their label with the loss function's `ignore_index`.
        """
        outputs = self.bert(
            input_ids,
            attention_mask=attention_mask,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
        )

        sequence_output = outputs[0]

        sequence_output = self.dropout(sequence_output)
        logits = self.classifier(sequence_output)
        # logits = self.my_hidden(sequence_output)
        # logits = self.classifier(logits)

        # DistilBertModel has no pooled output, so everything after element 0
        # is an optional extra (hidden states / attentions). Slicing from 2,
        # as BERT-based heads do, would silently drop the first of them.
        outputs = (logits,) + outputs[1:]
        if labels is not None:
            loss_fct = CrossEntropyLoss()
            # Only keep active parts of the loss
            if attention_mask is not None:
                active_loss = attention_mask.view(-1) == 1
                active_logits = logits.view(-1, self.num_labels)
                active_labels = torch.where(
                    active_loss, labels.view(-1), torch.tensor(loss_fct.ignore_index).type_as(labels)
                )
                loss = loss_fct(active_logits, active_labels)
            else:
                loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
            outputs = (loss,) + outputs

        return outputs  # (loss), scores, (hidden_states), (attentions)


In [0]:
from tqdm import tqdm, trange
import time
import datetime

def train(model, train_dataloader, eval_dataloader, epochs=5, save_model=False):
  """Fine-tune `model` and report loss and span scores after every epoch.

  Relies on module-level names: `optimizer` (steps the model parameters),
  `device` (where batches are moved), `get_score` (span-level scoring) and,
  when saving, `model_dir`.

  Args:
    model: called as `model(input_ids, token_type_ids=None,
      attention_mask=..., labels=...)`; element 0 of its output is the loss.
    train_dataloader: yields `(input ids, labels, mask, sentence ids)` batches.
    eval_dataloader: same batch layout, used for the validation loss.
    epochs: number of passes over `train_dataloader`.
    save_model: if True, the whole model is saved into `model_dir` under a
      timestamped name after every epoch.

  Per epoch it prints the mean training loss, calls `get_score(mode="train")`,
  prints the mean validation loss and calls `get_score(mode="eval")`.
  """
  max_grad_norm = 1.0

  for _ in trange(epochs, desc="Epoch"):
    # TRAIN loop
    model.train()
    tr_loss = 0
    nb_tr_steps = 0
    for batch in train_dataloader:
      # add batch to gpu
      batch = tuple(t.to(device) for t in batch)
      b_input_ids, b_labels, b_input_mask, _ = batch
      outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask, labels=b_labels)
      # Index instead of unpacking so extra model outputs do not break training.
      loss = outputs[0]
      loss.backward()
      tr_loss += loss.item()
      nb_tr_steps += 1
      torch.nn.utils.clip_grad_norm_(parameters=model.parameters(), max_norm=max_grad_norm)
      optimizer.step()
      model.zero_grad()
    # max(..., 1) avoids a ZeroDivisionError on an empty dataloader.
    print("Train loss: {}".format(tr_loss / max(nb_tr_steps, 1)))

    get_score(model, mode="train")

    # VALIDATION loop: one forward pass per batch is enough to get the loss.
    model.eval()
    eval_loss = 0
    nb_eval_steps = 0
    for batch in eval_dataloader:
      batch = tuple(t.to(device) for t in batch)
      b_input_ids, b_labels, b_input_mask, _ = batch
      with torch.no_grad():
        outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask, labels=b_labels)
      eval_loss += outputs[0].mean().item()
      nb_eval_steps += 1
    eval_loss = eval_loss / max(nb_eval_steps, 1)
    print("Validation loss: {}".format(eval_loss))

    get_score(model, mode="eval")
    if save_model:
      model_name = 'model_' + str(datetime.datetime.now()) + '.pt'
      torch.save(model, os.path.join(model_dir, model_name))
      print("Model saved:", model_name)
    print()
    time.sleep(1)


In [0]:
def get_model_predictions(model, dataloader):
  """Run `model` over `dataloader` in eval mode and collect per-position class predictions.

  Batches are `(input ids, labels, mask, sentence ids)` and are moved to the
  module-level `device`. Element 0 of the model output is taken as the scores
  and arg-maxed over axis 2.

  Returns:
    `(predictions, true_labels, sentence_ids)`: one entry per example, namely
    the list of predicted classes, the label row and the sentence id.
  """
  model.eval()
  predictions, true_labels, sentence_ids = [], [], []
  for batch in dataloader:
    b_input_ids, b_labels, b_input_mask, b_ids = tuple(t.to(device) for t in batch)
    with torch.no_grad():
      outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)
    scores = outputs[0].detach().cpu().numpy()
    predictions.extend(list(row) for row in np.argmax(scores, axis=2))
    true_labels.extend(b_labels.to('cpu').numpy())
    sentence_ids.extend(b_ids.to('cpu').numpy())

  return predictions, true_labels, sentence_ids

In [0]:
def merge_spans(current_spans, threshold=2):
  """Merge spans that overlap or are separated by a gap smaller than `threshold`.

  Args:
    current_spans: iterable of `(start, end)` pairs, in any order.
    threshold: two spans are merged when the next start minus the current
      end is smaller than this value (default 2, the previously hard-coded
      value).

  Returns:
    A list of `(start, end)` tuples sorted by start. An empty input gives an
    empty list.
  """
  if not current_spans:
    return []
  # Sort first so the single left-to-right pass is valid for any input order.
  ordered = sorted(current_spans)
  merged_spans = []
  li, ri = ordered[0][0], ordered[0][1]
  for span in ordered[1:]:
    if span[0] - ri < threshold:
      # max() keeps the merged span from shrinking when `span` is nested.
      ri = max(ri, span[1])
    else:
      merged_spans.append((li, ri))
      li, ri = span[0], span[1]
  merged_spans.append((li, ri))
  return merged_spans

In [0]:
from shutil import copyfile
def get_score(model, mode=None):
  """Predict spans with `model` and either return them or score them.

  `mode` selects which module-level data is used:
    "train" -> train_indices / train_dataloader / train_sentences / train_bert_examples
    "test"  -> test_dataloader / test_sentences / test_bert_examples
    other   -> eval_indices / eval_dataloader / eval_sentences / eval_bert_examples

  In "test" mode the merged predicted spans are returned: a list with one
  list of (start, end) character spans per article slot. In every other mode
  nothing is returned; the predictions are written to
  predictions/predictions.tsv, the scorer script and the gold label files of
  the selected articles are copied into predictions/, the scorer is run
  through the shell, and the copied label files are removed again.
  """
  # One list of predicted word spans per article index; the size is fixed, so
  # article indices must stay below 400.
  predicted_spans = [[] for i in range(400)] # TODO 400 hardcoded
  
  def get_span_prediction(prediction_labels, sentence_index, sentences, bert_examples):
    """Map one sentence's per-position predictions back to word character spans.

    Appends the (start, end) offsets of every word with a non-zero predicted
    label to `predicted_spans[article_index]` of the enclosing function.
    """
    index = sentence_index 
    bert_example = bert_examples[index]
    mask = bert_example.input_mask
    pred_labels_masked = prediction_labels # need to change to predictions later
    # Keep only the positions that are not padding.
    pred_labels = []
    for i, m in enumerate(mask):
      if m > 0:
        pred_labels.append(pred_labels_masked[i])
    if bert_example.add_cls_sep:
      pred_labels.pop() # remove ['SEP'] label
      pred_labels.pop(0) # remove ['CLS'] label

    sentence = sentences[index]
    sent_len = len(sentence.tokens)
    final_pred_labels = [0] * sent_len
    cur_map = bert_example.tok_to_orig_index
    # A word's label is the bitwise OR of the labels of all its wordpieces.
    for i, label in enumerate(pred_labels):
      final_pred_labels[cur_map[i]] |= label
    # assert final_pred_labels == sentence.labels
    
    word_start_index_map = sentence.word_to_start_char_offset
    word_end_index_map = sentence.word_to_end_char_offset

    article_index = sentence.article_index
    for i, label in enumerate(final_pred_labels):
      if label:
        # print(word_start_index_map[i], word_end_index_map[i])
        predicted_spans[article_index].append((word_start_index_map[i], word_end_index_map[i]))
  
  if mode == "train":
    indices = train_indices
    predictions, true_labels, sentence_ids = get_model_predictions(model, train_dataloader)
    pred_sentences, pred_bert_examples = train_sentences, train_bert_examples
  elif mode == "test":
    # `indices` is not set in this branch; it is not needed because test mode
    # returns before the scoring section below.
    predictions, true_labels , sentence_ids = get_model_predictions(model, test_dataloader)
    pred_sentences, pred_bert_examples = test_sentences, test_bert_examples
  else:
    indices = eval_indices
    predictions, true_labels, sentence_ids = get_model_predictions(model, eval_dataloader)
    pred_sentences, pred_bert_examples = eval_sentences, eval_bert_examples

  merged_predicted_spans = []
  # TODO sorting of spans???? may not be in order??
  for ii, _ in enumerate(predictions):
    get_span_prediction(predictions[ii], sentence_ids[ii], pred_sentences, pred_bert_examples)
  # Join the per-word spans of each article into longer spans.
  for span in predicted_spans:
    merged_predicted_spans.append(merge_spans(span))
  if mode == "test":
    return merged_predicted_spans 
  if not os.path.isdir("predictions"):
    os.mkdir("predictions")
  copyfile("gdrive/My Drive/propaganda_detection/tools/task-SI_scorer.py", "predictions/task-SI_scorer.py")
  # One "<article id>\t<start>\t<end>" line per predicted span; the gold label
  # file of each selected article is copied next to it for the scorer.
  with open("predictions/predictions.tsv", 'w') as fp:
    for index in indices:
      filename = "article" + article_ids[index] + ".task1-SI.labels"
      copyfile(os.path.join(data_dir, "train-labels-task1-span-identification/" + filename), "predictions/" + filename)
      for ii in merged_predicted_spans[index]:
        fp.write(article_ids[index] + "\t" + str(ii[0]) + "\t" + str(ii[1]) + "\n")

  # IPython shell escape: this line only works inside a notebook/IPython session.
  !python3 predictions/task-SI_scorer.py -s predictions/predictions.tsv -r predictions/ -m

  # Remove the copied gold files so the next call starts from a clean directory.
  for index in indices:
    filename = "article" + article_ids[index] + ".task1-SI.labels"
    os.remove("predictions/" + filename)

In [26]:
from transformers import DistilBertTokenizer

# Load all training articles and their gold spans.
articles, article_ids = read_articles('train-articles')
spans = read_spans()
# Any value other than "BIO"/"BIOE" selects plain 0/1 word labels.
TAGGING_SCHEME = "PN" # Positive Negative
# TAGGING_SCHEME = "BIOE"
NUM_ARTICLES = len(articles)
# Overrides the line above: only the first 30 articles are used in this run.
NUM_ARTICLES = 30
articles = articles[0:NUM_ARTICLES]
spans = spans[0:NUM_ARTICLES]
BATCH_SIZE=8
# Fixed seed so the shuffled train/eval split is the same on every run.
np.random.seed(245)
indices = np.arange(NUM_ARTICLES)
np.random.shuffle(indices)
# 90% of the shuffled article indices for training, the rest for evaluation.
train_indices = indices[:int(0.9 * NUM_ARTICLES)]
eval_indices = indices[int(0.9 * NUM_ARTICLES):]

# NOTE(review): bert_model_class is only referenced by the commented-out
# tokenizer lines below; the active tokenizer is DistilBERT's.
bert_model_class = 'bert-base-uncased'
# bert_model_class = "bert-large-uncased-whole-word-masking"
# tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', lower_case=True)
# tokenizer = BertTokenizer.from_pretrained(bert_model_class, lower_case=True)
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

# get_data prints the sentence count and the longest sentence (in words) of each split.
train_dataloader, train_sentences, train_bert_examples = get_data(articles, spans, train_indices)
eval_dataloader, eval_sentences, eval_bert_examples = get_data(articles, spans, eval_indices)


988
106
171
53


In [27]:
# from transformers import BertForTokenClassification
from torch.optim import Adam

# Choose the device before building the model so the model and the batches
# moved by train()/get_model_predictions() end up on the same device, and so
# the cell also runs on a CPU-only runtime.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# 2 labels for plain 0/1 tagging, 3 for "BIO", 4 for "BIOE".
num_labels = 2 + int(TAGGING_SCHEME == "BIO") + 2 * int(TAGGING_SCHEME == "BIOE")
model = CustomBertForTokenClassification.from_pretrained('distilbert-base-uncased', num_labels=num_labels)
model.to(device)

FULL_FINETUNING = True
if FULL_FINETUNING:
  # Fine-tune every parameter; parameters whose name contains one of the
  # `no_decay` fragments are excluded from weight decay.
  param_optimizer = list(model.named_parameters())
  no_decay = ['bias', 'gamma', 'beta']
  # torch.optim.Adam reads the per-group option `weight_decay`; the former
  # key `weight_decay_rate` was stored but ignored, so no decay was applied.
  optimizer_grouped_parameters = [
    {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
      'weight_decay': 0.01},
    {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
      'weight_decay': 0.0}
  ]
else:
  # Train only the classification head.
  param_optimizer = list(model.classifier.named_parameters())
  optimizer_grouped_parameters = [{"params": [p for n, p in param_optimizer]}]
optimizer = Adam(optimizer_grouped_parameters, lr=3e-6) # lr 3e-5

# Checkpoints are written only for runs with at least 150 articles.
train(model, train_dataloader, eval_dataloader, epochs=6, save_model=(NUM_ARTICLES >= 150))

Epoch:   0%|          | 0/6 [00:00<?, ?it/s]

Train loss: 0.351732685249449
2020-04-12 20:27:09,262 - INFO - Checking user submitted file
2020-04-12 20:27:09,267 - INFO - Scoring the submission with precision and recall method
2020-04-12 20:27:09,267 - INFO - Precision=0.000000/0=0.000000	Recall=0.000000/227=0.000000
2020-04-12 20:27:09,267 - INFO - F1=0.000000
Validation loss: 0.21459426006979562
2020-04-12 20:27:14,840 - INFO - Checking user submitted file
2020-04-12 20:27:14,841 - INFO - Scoring the submission with precision and recall method
2020-04-12 20:27:14,841 - INFO - Precision=0.000000/0=0.000000	Recall=0.000000/34=0.000000
2020-04-12 20:27:14,842 - INFO - F1=0.000000



Epoch:  17%|█▋        | 1/6 [00:30<02:31, 30.36s/it]

Train loss: 0.30729312141756376
2020-04-12 20:27:39,784 - INFO - Checking user submitted file
2020-04-12 20:27:39,790 - INFO - Scoring the submission with precision and recall method
2020-04-12 20:27:39,791 - INFO - Precision=3.000000/3=1.000000	Recall=0.570238/227=0.002512
2020-04-12 20:27:39,791 - INFO - F1=0.005012
Validation loss: 0.2032794359732758
2020-04-12 20:27:45,472 - INFO - Checking user submitted file
2020-04-12 20:27:45,473 - INFO - Scoring the submission with precision and recall method
2020-04-12 20:27:45,473 - INFO - Precision=0.000000/0=0.000000	Recall=0.000000/34=0.000000
2020-04-12 20:27:45,473 - INFO - F1=0.000000



Epoch:  33%|███▎      | 2/6 [01:01<02:01, 30.49s/it]

Train loss: 0.26182244942455946
2020-04-12 20:28:10,401 - INFO - Checking user submitted file
2020-04-12 20:28:10,407 - INFO - Scoring the submission with precision and recall method
2020-04-12 20:28:10,424 - INFO - Precision=79.084017/95=0.832463	Recall=18.997473/227=0.083689
2020-04-12 20:28:10,424 - INFO - F1=0.152089
Validation loss: 0.2415089719632471
2020-04-12 20:28:15,855 - INFO - Checking user submitted file
2020-04-12 20:28:15,856 - INFO - Scoring the submission with precision and recall method
2020-04-12 20:28:15,856 - INFO - Precision=0.000000/0=0.000000	Recall=0.000000/34=0.000000
2020-04-12 20:28:15,856 - INFO - F1=0.000000



Epoch:  50%|█████     | 3/6 [01:31<01:31, 30.42s/it]

Train loss: 0.2061470644647113
2020-04-12 20:28:40,968 - INFO - Checking user submitted file
2020-04-12 20:28:40,974 - INFO - Scoring the submission with precision and recall method
2020-04-12 20:28:41,020 - INFO - Precision=242.730880/275=0.882658	Recall=66.707874/227=0.293867
2020-04-12 20:28:41,020 - INFO - F1=0.440933
Validation loss: 0.24836194788275118
2020-04-12 20:28:47,278 - INFO - Checking user submitted file
2020-04-12 20:28:47,279 - INFO - Scoring the submission with precision and recall method
2020-04-12 20:28:47,280 - INFO - Precision=0.000000/7=0.000000	Recall=0.000000/34=0.000000
2020-04-12 20:28:47,280 - INFO - F1=0.000000



Epoch:  67%|██████▋   | 4/6 [02:03<01:01, 30.86s/it]

Train loss: 0.16836300136369134
2020-04-12 20:29:12,533 - INFO - Checking user submitted file
2020-04-12 20:29:12,539 - INFO - Scoring the submission with precision and recall method
2020-04-12 20:29:12,550 - INFO - Precision=38.708602/41=0.944112	Recall=9.285557/227=0.040906
2020-04-12 20:29:12,550 - INFO - F1=0.078414
Validation loss: 0.31536673920610075
2020-04-12 20:29:18,189 - INFO - Checking user submitted file
2020-04-12 20:29:18,190 - INFO - Scoring the submission with precision and recall method
2020-04-12 20:29:18,190 - INFO - Precision=0.000000/0=0.000000	Recall=0.000000/34=0.000000
2020-04-12 20:29:18,191 - INFO - F1=0.000000



Epoch:  83%|████████▎ | 5/6 [02:33<00:30, 30.61s/it]

Train loss: 0.14685817019788724
2020-04-12 20:29:44,006 - INFO - Checking user submitted file
2020-04-12 20:29:44,012 - INFO - Scoring the submission with precision and recall method
2020-04-12 20:29:44,038 - INFO - Precision=153.943406/163=0.944438	Recall=27.854728/227=0.122708
2020-04-12 20:29:44,039 - INFO - F1=0.217196
Validation loss: 0.37780955646568065
2020-04-12 20:29:50,441 - INFO - Checking user submitted file
2020-04-12 20:29:50,442 - INFO - Scoring the submission with precision and recall method
2020-04-12 20:29:50,443 - INFO - Precision=0.000000/1=0.000000	Recall=0.000000/34=0.000000
2020-04-12 20:29:50,443 - INFO - F1=0.000000



Epoch: 100%|██████████| 6/6 [03:06<00:00, 31.06s/it]


In [24]:
# Run 2 more epochs on the existing `model`; a checkpoint is saved per epoch only when NUM_ARTICLES >= 150.
train(model, train_dataloader, eval_dataloader, epochs=2, save_model=(NUM_ARTICLES >= 150))

Epoch:   0%|          | 0/2 [00:00<?, ?it/s]

Train loss: 0.17941840960812636
2020-04-12 20:22:31,153 - INFO - Checking user submitted file
2020-04-12 20:22:31,156 - INFO - Scoring the submission with precision and recall method
2020-04-12 20:22:31,178 - INFO - Precision=77.250554/181=0.426799	Recall=52.633456/66=0.797477
2020-04-12 20:22:31,178 - INFO - F1=0.556022
Validation loss: 0.3921118378639221
2020-04-12 20:22:34,975 - INFO - Checking user submitted file
2020-04-12 20:22:34,975 - INFO - Scoring the submission with precision and recall method
2020-04-12 20:22:34,975 - INFO - Precision=0.000000/3=0.000000	Recall=0.000000/1=0.000000
2020-04-12 20:22:34,975 - INFO - F1=0.000000



Epoch:  50%|█████     | 1/2 [00:13<00:13, 13.79s/it]

Train loss: 0.1594885269532273
2020-04-12 20:22:44,883 - INFO - Checking user submitted file
2020-04-12 20:22:44,888 - INFO - Scoring the submission with precision and recall method
2020-04-12 20:22:44,927 - INFO - Precision=43.971136/274=0.160479	Recall=64.984852/66=0.984619
2020-04-12 20:22:44,927 - INFO - F1=0.275977
Validation loss: 0.44500982761383057
2020-04-12 20:22:48,810 - INFO - Checking user submitted file
2020-04-12 20:22:48,811 - INFO - Scoring the submission with precision and recall method
2020-04-12 20:22:48,811 - INFO - Precision=0.000000/7=0.000000	Recall=0.000000/1=0.000000
2020-04-12 20:22:48,811 - INFO - F1=0.000000



Epoch: 100%|██████████| 2/2 [00:27<00:00, 13.94s/it]


In [0]:
# Save only the model's state_dict (not the model object) into model_dir.
torch.save(model.state_dict(), os.path.join(model_dir, 'span_state_dict.pt'))

In [0]:
 # Save the whole model object into model_dir (a stray "N" before `torch`
 # previously made this line raise NameError).
 torch.save(model, os.path.join(model_dir, 'model_bioe_weighted_deep2.pt'))
# model = torch.load(os.path.join(model_dir, 'model_2020-02-28 22:05:59.981128.pt'))
# model.cuda()

# WEIGHTS = torch.tensor([1.0, 10.0, 10.0, 10.0]).cuda()
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# from torch.optim import Adam
# FULL_FINETUNING = True 
# if FULL_FINETUNING:
#   param_optimizer = list(model.named_parameters())
#   no_decay = ['bias', 'gamma', 'beta']
#   optimizer_grouped_parameters = [
#     {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
#       'weight_decay_rate': 0.01},
#     {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
#       'weight_decay_rate': 0.0}
#   ]
# else:
#   param_optimizer = list(model.classifier.named_parameters()) 
#   optimizer_grouped_parameters = [{"params": [p for n, p in param_optimizer]}]
# optimizer = Adam(optimizer_grouped_parameters, lr=3e-5)
# get_score(model, mode='eval')

In [0]:
# Run 10 more epochs on the existing `model`; a checkpoint is saved per epoch only when NUM_ARTICLES >= 150.
train(model, train_dataloader, eval_dataloader, epochs=10, save_model=(NUM_ARTICLES >= 150))



Epoch:   0%|          | 0/10 [00:00<?, ?it/s][A[A

KeyboardInterrupt: ignored

In [0]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# NOTE(review): this file was written with torch.save(model, ...), so
# torch.load unpickles a full model object — only load files you trust.
model = torch.load(os.path.join(model_dir, 'model_2020-02-25 20:05:27.163346.pt'))
# get_score(model, mode="train")
# print()
# Any mode other than "train"/"test" scores the eval split.
get_score(model, mode="eval")

2020-02-26 07:44:03,409 - INFO - Checking user submitted file
2020-02-26 07:44:03,425 - INFO - Scoring the submission with precision and recall method
2020-02-26 07:44:03,598 - INFO - Precision=228.062968/434=0.525491	Recall=245.284300/607=0.404093
2020-02-26 07:44:03,599 - INFO - F1=0.456865


In [0]:
import re  # needed here: this cell runs before the later cell that imports re

# Quick check of the URL-stripping pattern: every URL-like token is replaced
# by a single space, so the surrounding spaces are kept.
text = "hello https://google.co.in bye bye"
url_pattern = r'((http|ftp|https):\/\/)?[\w\-_]+(\.[\w\-_]+)+([\w\-\.,@?^=%&amp;:/~\+#]*[\w\-\@?^=%&amp;/~\+#])?'
text = re.sub(url_pattern, ' ', text)
print(text)

hello   bye bye


In [0]:
import json
import re
# Read the tweet dump and collect the text of every tweet that is longer than
# 50 characters and does not start with "RT", with URL-like substrings replaced
# by a single space.
tweets_path = os.path.join(home_dir, 'trump_tweets.json')
with open(tweets_path) as f:
  tweet_json = json.load(f)

url_pattern = r'((http|ftp|https):\/\/)?[\w\-_]+(\.[\w\-_]+)+([\w\-\.,@?^=%&amp;:/~\+#]*[\w\-\@?^=%&amp;/~\+#])?'
trump_tweets = []
for tweet in tweet_json:
  tweet_text = tweet['text']
  # Skip short tweets and anything whose first two characters are "RT".
  if len(tweet_text) <= 50 or tweet_text[0:2] == "RT":
    continue
  tweet_text = re.sub(url_pattern, ' ', tweet_text)
  trump_tweets.append(tweet_text)

In [0]:
# Run the span detector over the first `num_tweets` cleaned tweets and print
# each tweet followed by the spans returned for it.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Single-sentence alternative input (was assigned and immediately overwritten):
# test_articles = ["Mini Mike, don’t lick your dirty fingers. Both unsanitary and dangerous to others and yourself!"]
num_tweets = 100
# Slicing clamps to what is available, so this may hold fewer than num_tweets items.
test_articles = trump_tweets[0:num_tweets]
ind = 0
# model = torch.load(os.path.join(model_dir, 'model_370_44_bioe.pt'))

# test_articles = [articles[ind]]
# One independent empty span list per article. `[[]] * n` would repeat the
# same list object n times, so a mutation of one entry would show up in all.
test_spans = [[] for _ in test_articles]

test_dataloader, test_sentences, test_bert_examples = get_data(test_articles, test_spans, indices=np.arange(len(test_articles)))
sps = get_score(model, mode="test")
# Iterate over the articles actually present rather than num_tweets, which
# raised IndexError whenever fewer than num_tweets tweets were loaded.
for i in range(len(test_articles)):
  print(test_articles[i])
  print('Detected span: ')
  print_spans(test_articles[i], sps[i])
  print('--' * 50)
# print_spans(articles[ind], spans[ind])

101
50
The only people in favor of Mini Mike continuing with his hapless campaign are me and his political consultants, who are getting richer and richer by the day!
Detected span: 

----------------------------------------------------------------------------------------------------
.@FoxNews is working hard pushing the Radical Left, Do Nothing Democrats. They want to be, unlike their competitors, @CNN &amp; MSDNC (Comcast), Fair &amp; Balanced. When will they ever learn. The Radical Left never even gave @FoxNews permission to partake in their low rated debates!
Detected span: 
Democrats. They want to be, unlike their competitors, @CNN &amp; MSDNC (Comcast), Fair &amp; Balanced. When will they ever learn. The Radical Left never even gave @FoxNews permission to partake in their low rated debates

----------------------------------------------------------------------------------------------------
....competitive disadvantage. We should be leading, not following!
Detected span: 

--------

In [0]:
# Load a saved model and run it in "test" mode over the articles read from
# 'test-articles'; the resulting spans are kept in `sps`.
ind = 32
# NOTE(review): torch.load unpickles the file, which can execute arbitrary
# code; only load checkpoints from a trusted source.
model = torch.load(os.path.join(model_dir, 'model_370_44_bioe.pt'))
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


test_articles, test_article_ids = read_articles('test-articles')
# test_articles = [articles[ind]]
# One independent empty span list per article. `[[]] * n` would repeat the
# same list object n times, so a mutation of one entry would show up in all.
test_spans = [[] for _ in test_articles]

test_dataloader, test_sentences, test_bert_examples = get_data(test_articles, test_spans, indices=np.arange(len(test_articles)))
sps = get_score(model, mode="test")
# print_spans(articles[ind], sps[0])
# print('--' * 50)
# print_spans(articles[ind], spans[ind])

3185
124


In [0]:
from google.colab import files
# Write one tab-separated line per predicted span (article id, ii[0], ii[1])
# and hand the file to the Colab download helper.
predictions_path = 'predictions/dev_predictions.txt'
# open(..., 'w') does not create missing parent directories, so make sure the
# output folder exists before writing.
os.makedirs(os.path.dirname(predictions_path), exist_ok=True)
with open(predictions_path, 'w') as fp:
  for index in range(len(test_articles)):
    for ii in sps[index]:
      fp.write(test_article_ids[index] + "\t" + str(ii[0]) + "\t" + str(ii[1]) + "\n")
files.download(predictions_path)

In [0]:
# torch.save(model, os.path.join(model_dir, 'model_300_42.pt'))


In [0]:
# For every eval article, print the merged predicted spans, a blank line, and
# then the entry from `spans` (presumably the reference annotation; verify),
# followed by a 100-dash divider.
eval_divider = "-" * 100
for index in eval_indices:
  print("article index:", index)
  span = merged_predicted_spans[index]
  eval_article_text = articles[index]
  print_spans(eval_article_text, span)
  print()
  print_spans(eval_article_text, spans[index])
  print(eval_divider)

article index: 107
“sort of in a panic”
that someone might “break into his apartment” looking for it, like “that Watergate crap.”
it,”
black-owned businesses.
institutions
“spiritual advisor”
Reading Obama’s 1995 memoir, you might almost get the impression that after a prudent first term, during his second he might side with, I dunno, Black Lives Matter and encourage a wave of black rage and police retreat that drove up the death toll from murder by 20% in his last two years in office, an incremental death toll a little bigger than the U.S. combat death toll from the equally stupid Iraq War.


spiritual advisor
sort of in a panic
black
shaking down white institutions
Reading Obama’s 1995 memoir, you might almost get the impression that after a prudent first term, during his second he might side with, I dunno, Black Lives Matter and encourage a wave of black rage and police retreat that drove up the death toll from murder by 20% in his last two years in office, an incremental death toll

In [0]:
import spacy

In [0]:
# spaCy smoke test: run the small English pipeline on a sample sentence and
# print each token's text next to its `pos_` tag.
nlp = spacy.load("en_core_web_sm")
sample_sentence = "Apple is looking at buying U.K. startup for $1 billion"
doc = nlp(sample_sentence)

for token in doc:
  print(token.text, token.pos_)

Apple PROPN
is VERB
looking VERB
at ADP
buying VERB
U.K. PROPN
startup NOUN
for ADP
$ SYM
1 NUM
billion NUM


# GPT-2

In [0]:
from transformers import GPT2Tokenizer, GPT2Model, GPT2PreTrainedModel
import torch
import torch.nn as nn

In [0]:
# tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
# text = "What is the fastest runningggggggggg car in the"
# indexed_tokens = tokenizer.encode(text)
# print(indexed_tokens)
# print(tokenizer.decode([2491]))
# tokens_tensor = torch.tensor([indexed_tokens])
# print(tokens_tensor)

In [0]:
class GPT2ForTokenClassification(GPT2PreTrainedModel):
  """GPT-2 backbone followed by dropout and a per-token linear classifier.

  The head maps each hidden state of size ``config.n_embd`` to
  ``config.num_labels`` logits.
  """

  # The backbone is stored under the attribute `gpt2`, so the prefix used to
  # locate the base model must name that attribute. The inherited prefix does
  # not, which would leave the backbone without its pretrained weights.
  # NOTE(review): confirm against the installed transformers version.
  base_model_prefix = "gpt2"

  def __init__(self, config):
    super().__init__(config)
    self.num_labels = config.num_labels

    self.gpt2 = GPT2Model(config)
    self.dropout = nn.Dropout(config.resid_pdrop)
    self.classifier = nn.Linear(config.n_embd, config.num_labels)
    self.init_weights()

  def forward(self, input_ids=None,
              attention_mask=None,
              token_type_ids=None,
              position_ids=None,
              head_mask=None,
              input_embeds=None,
              labels=None,
              ):
    """Compute per-token logits and, when ``labels`` is given, the loss.

    Args:
      input_ids, attention_mask, token_type_ids, position_ids, head_mask:
        forwarded unchanged to the GPT-2 backbone.
      input_embeds: forwarded to the backbone as its ``inputs_embeds``
        argument.
      labels: optional target class ids, flattened with ``view(-1)`` before
        the loss is computed.

    Returns:
      A tuple ``(logits,) + output[2:]`` where ``output`` is the backbone's
      return value; when ``labels`` is not None the cross-entropy loss is
      prepended as the first element.
    """
    output = self.gpt2(
        input_ids,
        attention_mask=attention_mask,
        token_type_ids=token_type_ids,
        position_ids=position_ids,
        head_mask=head_mask,
        # The parameter is spelled `input_embeds` here; the original passed
        # the undefined name `inputs_embeds`.
        inputs_embeds=input_embeds,
    )

    # The original indexed `outputs` here, before that name was assigned.
    sequence_output = output[0]
    sequence_output = self.dropout(sequence_output)
    logits = self.classifier(sequence_output)
    outputs = (logits,) + output[2:]
    if labels is not None:
      # Qualified through `nn`: a bare CrossEntropyLoss is never imported.
      loss_fct = nn.CrossEntropyLoss()

      if attention_mask is not None:
        # Score only the positions whose attention mask equals 1.
        active_loss = attention_mask.view(-1) == 1
        active_logits = logits.view(-1, self.num_labels)[active_loss]
        active_labels = labels.view(-1)[active_loss]
        loss = loss_fct(active_logits, active_labels)
      else:
        loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
      outputs = (loss,) + outputs

    return outputs
        
    


NameError: ignored

In [0]:
# Build the token classifier from the 'gpt2' checkpoint with a two-label head.
model = GPT2ForTokenClassification.from_pretrained(
    'gpt2',
    num_labels=2,
)

In [0]:
# Move the model to the GPU when one is available and fall back to the CPU
# otherwise, using the same device selection as the other cells. The
# unconditional .cuda() call failed on CPU-only runtimes.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)
print(model)

GPT2ForTokenClassification(
  (gpt2): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0): Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
      (1): Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): 

In [0]:
gpt2tokenizer = GPT2Tokenizer.from_pretrained('gpt2')