In [0]:
from google.colab import drive
drive.mount('/content/gdrive')

In [0]:
!pip install transformers

In [0]:
import os
import sys
import torch
import csv
import numpy as np
import random
import time
import datetime
import pprint


In [0]:
home_dir = "gdrive/My Drive/propaganda_detection"
data_dir = os.path.join(home_dir, "datasets")
model_dir = os.path.join(home_dir, "model_dir")
if not os.path.isdir(model_dir):
  os.mkdir(model_dir)

In [0]:
# Read training articles
def read_articles(dir_name):
  articles = []
  train_dir = os.path.join(data_dir, dir_name)
  for filename in sorted(os.listdir(train_dir)):
    myfile = open(os.path.join(train_dir, filename))
    article = myfile.read()
    articles.append(article)
    myfile.close()
  article_ids = []
  for filename in sorted(os.listdir(train_dir)):
    article_ids.append(filename[7:-4])
  
  return articles, article_ids

In [0]:
def get_titles(articles):
  titles = []
  for article in articles:
    title = article.split('\n')[0]
    titles.append(title)
  return titles

In [0]:
# Read training span labels 
def read_spans(mode=None):
  spans = []
  techniques = []
  if mode == "test":
    label_dir = os.path.join(data_dir, "dev-task-TC-template.out")
  else:
    label_dir = os.path.join(data_dir, "train-labels-task2-technique-classification")
  for filename in sorted(os.listdir(label_dir)):
    myfile = open(os.path.join(label_dir, filename))
    tsvreader = csv.reader(myfile, delimiter="\t")
    span = []
    technique = []
    for row in tsvreader:
      span.append((int(row[2]), int(row[3])))
      if mode == "test":
        technique.append("Slogans") # DUMMY
      else:
        technique.append(row[1])
    myfile.close()
    spans.append(span)
    techniques.append(technique)
  return spans, techniques

In [0]:
# Read training span labels 
def read_test_spans():
  spans = []
  techniques = []
  label_file = os.path.join(data_dir, "dev-task-TC-template.out")
  myfile = open(label_file)
  prev_index = -1
  tsvreader = csv.reader(myfile, delimiter="\t")

  span = []
  technique = []
  for row in tsvreader:
    article_index = int(row[0])
    if article_index != prev_index:
      if prev_index != -1:
        spans.append(span)
        techniques.append(technique)
      span = []
      technique = []
      span.append((int(row[2]), int(row[3])))
      technique.append("Slogans")
      prev_index = article_index
    else:
      span.append((int(row[2]), int(row[3])))
      technique.append("Slogans")
  spans.append(span)
  techniques.append(technique)
  return spans, techniques, 

In [0]:
def print_spans(article, span, technique):
  for index, sp in enumerate(span):
    print(technique[index], tag2idx[technique[index]], end=' - ')
    print (article[sp[0]: sp[1]])
  print()

In [0]:
def get_context(article, span, mode=None):
  def get_num_words(sentence):
    return len(sentence.split(' '))
  if mode == "title":
    return article.split('\n')[0]
  if mode == "sentence":
    WORD_LEN_LIMIT = 120
    li = span[0]
    ri = span[1]
    span_text = article[li: ri]
    num_words = get_num_words(span_text)
    if num_words >= WORD_LEN_LIMIT:
      return span_text
    remaining_len = WORD_LEN_LIMIT - num_words
    lhs_words = remaining_len // 2
    rhs_words = remaining_len - lhs_words
    li -= 1
    lcount = 0
    while li >= 0 and article[li] != '\n' and lcount < lhs_words:
      if article[li] == ' ':
        lcount += 1
      li -= 1
    ri += 1
    rcount = 0
    while ri < len(article) and article[ri] != '\n' and rcount < rhs_words:
      if article[ri] == ' ':
        rcount += 1
      ri += 1
    return article[li+1: ri - 1] 

  return ""

In [0]:
# Change max_seq_len as well in convert_sentence_to_input_feature function
# Set max seq length as 150 when using Title Context
def get_examples(articles, spans, techniques, context_mode="sentence"):
  assert len(articles) == len(spans) and len(spans) == len(techniques)
  sentences = []
  labels = []
  sent_contexts = []
  for index, article in enumerate(articles):
    span = spans[index]
    technique = techniques[index]
    assert len(technique) == len(span)
    for i, sp in enumerate(span):
      pt = tag2idx[technique[i]]
      sentence = article[sp[0]: sp[1]]
      sentences.append(sentence)
      labels.append(pt)
      context = get_context(article, sp, context_mode)
      sent_contexts.append(context)
  return sentences, labels, sent_contexts

In [0]:
from transformers import BertForTokenClassification
from keras.preprocessing.sequence import pad_sequences
def convert_sentence_to_input_feature(sentence, tokenizer, add_cls_sep=True, max_seq_len=256, title=None):
  tokenized_sentence = tokenizer.encode_plus(sentence,
                                             text_pair=title,
                                             add_special_tokens=add_cls_sep,
                                             max_length=max_seq_len,
                                             pad_to_max_length=True,
                                             return_attention_mask=True)
  attention_mask = tokenized_sentence['attention_mask']
  input_ids = tokenized_sentence['input_ids']
  token_type_ids = [0] * max_seq_len
  li = np.sum(attention_mask)
  si = 0
  for i in range(max_seq_len):
    if input_ids[i] == tokenizer.sep_token_id:
      si = i+1
      break
  for i in range(li - si):
    token_type_ids[i+si] = 1 

  return input_ids, attention_mask, token_type_ids  

In [0]:
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

def get_data(articles, spans, techniques):
  sentences, labels, titles = get_examples(articles, spans, techniques)
  attention_masks = []
  inputs = []
  token_type_ids = []
  for i, sentence in enumerate(sentences):
    input_ids, mask, segment_ids = convert_sentence_to_input_feature(sentence, tokenizer, title=titles[i])
    inputs.append(input_ids)
    attention_masks.append(mask)
    token_type_ids.append(segment_ids)
  inputs = torch.tensor(inputs)
  labels = torch.tensor(labels)
  masks = torch.tensor(attention_masks)
  token_type_ids = torch.tensor(token_type_ids)
  tensor_data = TensorDataset(inputs, labels, masks, token_type_ids)
  dataloader = DataLoader(tensor_data, batch_size=BATCH_SIZE)
  return dataloader

In [0]:
from sklearn import metrics

def compute_metrics(preds, labels):
  pred_flat = np.argmax(preds, axis=1).flatten()
  labels_flat = labels.flatten()
  print(metrics.confusion_matrix(labels_flat, pred_flat))
  print(metrics.classification_report(labels_flat, pred_flat))

def flat_accuracy(preds, labels):
  pred_flat = np.argmax(preds, axis=1).flatten()
  labels_flat = labels.flatten()
  return np.sum(pred_flat == labels_flat) / len(labels_flat)

In [0]:
def format_time(elapsed):
    elapsed_rounded = int(round((elapsed)))
    return str(datetime.timedelta(seconds=elapsed_rounded))

In [0]:
def train(model, epochs=5):
  loss_values = []
  for epoch_i in range(0, epochs):
    print("")
    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
    print('Training...')
    t0 = time.time()
    total_loss = 0
    model.train()
    for step, batch in enumerate(train_dataloader):
      if step % 100 == 0 and not step == 0:
        elapsed = format_time(time.time() - t0)
        print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(step, len(train_dataloader), elapsed))
      b_input_ids = batch[0].to(device)
      b_labels = batch[1].to(device)
      b_input_mask = batch[2].to(device)
      b_token_type_ids = batch[3].to(device)
      model.zero_grad()        
      outputs = model(b_input_ids, 
                      # token_type_ids=b_token_type_ids, 
                      attention_mask=b_input_mask, 
                      labels=b_labels)
      loss = outputs[0]

      total_loss += loss.item()

      loss.backward()

      torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

      optimizer.step()

      scheduler.step() # TODO
    avg_train_loss = total_loss / len(train_dataloader)            
    loss_values.append(avg_train_loss)
    print("")
    print("  Average training loss: {0:.2f}".format(avg_train_loss))
    print("  Training epcoh took: {:}".format(format_time(time.time() - t0)))
    print("")
    print("Running Validation...")
    t0 = time.time()
    model.eval()
    eval_loss, eval_accuracy = 0, 0
    nb_eval_steps, nb_eval_examples = 0, 0
    for batch in eval_dataloader:
      batch = tuple(t.to(device) for t in batch)
      b_input_ids, b_labels, b_input_mask, b_token_type_ids = batch
      with torch.no_grad():        
        outputs = model(b_input_ids, 
                        # token_type_ids=b_token_type_ids, 
                        attention_mask=b_input_mask)
      
      logits = outputs[0]

      logits = logits.detach().cpu().numpy()
      label_ids = b_labels.to('cpu').numpy()
      
      tmp_eval_accuracy = flat_accuracy(logits, label_ids)
      
      eval_accuracy += tmp_eval_accuracy

      nb_eval_steps += 1

    print("  Accuracy: {0:.2f}".format(eval_accuracy/nb_eval_steps))
    print("  Validation took: {:}".format(format_time(time.time() - t0)))
  print("")
  print("Training complete!")

In [0]:
def get_model_predictions(model, dataloader):
  model.eval()
  predictions , true_labels = [], []
  nb_eval_steps = 0
  for batch in dataloader:
    batch = tuple(t.to(device) for t in batch)
    b_input_ids, b_labels, b_input_mask, b_token_type_ids = batch  
    with torch.no_grad():
      logits = model(b_input_ids, attention_mask=b_input_mask)
      # logits = model(b_input_ids, token_type_ids=b_token_type_ids, attention_mask=b_input_mask)
    logits = logits[0]
    logits = logits.detach().cpu().numpy()
    label_ids = b_labels.to('cpu').numpy()
    pred_label = np.argmax(logits, axis=1)
    predictions.extend(pred_label)
    true_labels.extend(label_ids)
  return predictions, true_labels

In [0]:
from google.colab import files

def get_dev_predictions(model):
  test_articles, _ = read_articles("dev-articles")
  test_spans, test_techniques = read_test_spans()

  test_articles = test_articles[1:]
  test_dataloader = get_data(test_articles, test_spans, test_techniques)
  pred, _ = get_model_predictions(model, test_dataloader)

  with open('predictions.txt', 'w') as fp:
    label_file = os.path.join(data_dir, "dev-task-TC-template.out")
    myfile = open(label_file)
    prev_index = -1
    tsvreader = csv.reader(myfile, delimiter="\t")
    for i, row in enumerate(tsvreader):
      fp.write(row[0] + '\t' + distinct_techniques[pred[i]] + '\t' + row[2] + '\t' + row[3] + '\n')

In [0]:
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer
from transformers import RobertaTokenizer

articles, article_ids = read_articles("train-articles")
spans, techniques = read_spans()
distinct_techniques = list(set([y for x in techniques for y in x])) # idx to tag
tag2idx = {t: i for i, t in enumerate(distinct_techniques)}
pprint.pprint(tag2idx)
NUM_ARTICLES = len(articles)

articles = articles[0:NUM_ARTICLES]
spans = spans[0:NUM_ARTICLES]
techniques = techniques[0:NUM_ARTICLES]
BATCH_SIZE=8

# seed_val = 32
seed_val = 1328
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

indices = np.arange(NUM_ARTICLES)

train_articles, eval_articles, train_spans, eval_spans, train_techniques, eval_techniques, train_indices, eval_indices = train_test_split(articles, spans, techniques, indices, test_size=0.1)

#### Change context mode in get_examples function ####
tokenizer = RobertaTokenizer.from_pretrained('roberta-base', lower_case=True)


train_dataloader = get_data(train_articles, train_spans, train_techniques)
eval_dataloader = get_data(eval_articles, eval_spans, eval_techniques)

{'Appeal_to_Authority': 10,
 'Appeal_to_fear-prejudice': 13,
 'Bandwagon,Reductio_ad_hitlerum': 5,
 'Black-and-White_Fallacy': 2,
 'Causal_Oversimplification': 8,
 'Doubt': 7,
 'Exaggeration,Minimisation': 4,
 'Flag-Waving': 3,
 'Loaded_Language': 6,
 'Name_Calling,Labeling': 0,
 'Repetition': 11,
 'Slogans': 1,
 'Thought-terminating_Cliches': 9,
 'Whataboutism,Straw_Men,Red_Herring': 12}


In [0]:
from transformers import BertForSequenceClassification, AdamW, BertConfig
from transformers import get_linear_schedule_with_warmup
from transformers import RobertaForSequenceClassification

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = RobertaForSequenceClassification.from_pretrained(
    "roberta-base",
    num_labels = len(distinct_techniques),
    output_attentions = False, 
    output_hidden_states = False,
)
model.cuda()

optimizer = AdamW(model.parameters(),lr = 3e-5,eps = 1e-8) # ler = 5e-5
epochs = 3
total_steps = len(train_dataloader) * epochs
scheduler = get_linear_schedule_with_warmup(optimizer, 
                                            num_warmup_steps = 0, # Default value in run_glue.py
                                            num_training_steps = total_steps)
train(model, epochs=epochs)


Training...
  Batch   100  of    691.    Elapsed: 0:01:12.
  Batch   200  of    691.    Elapsed: 0:02:25.
  Batch   300  of    691.    Elapsed: 0:03:37.
  Batch   400  of    691.    Elapsed: 0:04:50.
  Batch   500  of    691.    Elapsed: 0:06:02.
  Batch   600  of    691.    Elapsed: 0:07:15.

  Average training loss: 1.95
  Training epcoh took: 0:08:20

Running Validation...
  Accuracy: 0.46
  Validation took: 0:00:17

Training...
  Batch   100  of    691.    Elapsed: 0:01:13.
  Batch   200  of    691.    Elapsed: 0:02:25.
  Batch   400  of    691.    Elapsed: 0:04:50.
  Batch   500  of    691.    Elapsed: 0:06:03.
  Batch   600  of    691.    Elapsed: 0:07:16.

  Average training loss: 1.63
  Training epcoh took: 0:08:21

Running Validation...
  Accuracy: 0.64
  Validation took: 0:00:17

Training...
  Batch   100  of    691.    Elapsed: 0:01:13.
  Batch   200  of    691.    Elapsed: 0:02:25.
  Batch   300  of    691.    Elapsed: 0:03:38.
  Batch   500  of    691.    Elapsed: 0:06:03

In [0]:
get_dev_predictions(model)

In [0]:
files.download('predictions.txt')