In [0]:
from google.colab import drive
drive.mount('/content/gdrive')

In [0]:
!pip install transformers

In [0]:
import os
import sys
import torch
import csv
import numpy as np
import random
import time
import datetime
import pprint


In [0]:
home_dir = "gdrive/My Drive/propaganda_detection"
data_dir = os.path.join(home_dir, "datasets")
model_dir = os.path.join(home_dir, "model_dir")
if not os.path.isdir(model_dir):
  os.mkdir(model_dir)

In [0]:
# Read training articles
def read_articles(dir_name):
  articles = []
  train_dir = os.path.join(data_dir, dir_name)
  for filename in sorted(os.listdir(train_dir)):
    myfile = open(os.path.join(train_dir, filename))
    article = myfile.read()
    articles.append(article)
    myfile.close()
  article_ids = []
  for filename in sorted(os.listdir(train_dir)):
    article_ids.append(int(filename[7:-4]))
  return articles, article_ids

In [0]:
# Read training span labels 
def read_spans(mode=None):
  spans = []
  techniques = []
  if mode == "test":
    label_dir = os.path.join(data_dir, "dev-task-TC-template.out")
  else:
    label_dir = os.path.join(data_dir, "train-labels-task2-technique-classification")
  for filename in sorted(os.listdir(label_dir)):
    myfile = open(os.path.join(label_dir, filename))
    tsvreader = csv.reader(myfile, delimiter="\t")
    span = []
    technique = []
    for row in tsvreader:
      span.append((int(row[2]), int(row[3])))
      if mode == "test":
        technique.append("Slogans") # DUMMY
      else:
        technique.append(row[1])
    myfile.close()
    spans.append(span)
    techniques.append(technique)
  return spans, techniques

In [0]:
# Read training span labels 
def read_test_spans(mode=None):
  spans = []
  techniques = []
  indices = []
  if mode == 'test':
    label_file = os.path.join(data_dir, "test-TC/test-task-TC-template.out")
  else:  
    label_file = os.path.join(data_dir, "dev-task-TC-template.out")
  myfile = open(label_file)
  prev_index = -1
  tsvreader = csv.reader(myfile, delimiter="\t")

  span = []
  technique = []
  for row in tsvreader:
    article_index = int(row[0])
    if article_index != prev_index:
      if prev_index != -1:
        spans.append(span)
        techniques.append(technique)
      span = []
      technique = []
      span.append((int(row[2]), int(row[3])))
      technique.append("Slogans")
      indices.append(article_index)
      prev_index = article_index
    else:
      span.append((int(row[2]), int(row[3])))
      technique.append("Slogans")
  spans.append(span)
  techniques.append(technique)
  indices.append(article_index)
  if mode == 'test':
    return spans, techniques, indices
  return spans, techniques

In [0]:
def print_spans(article, span, technique):
  for index, sp in enumerate(span):
    print(technique[index], tag2idx[technique[index]], end=' - ')
    print (article[sp[0]: sp[1]])
  print()

In [0]:
def get_examples(articles, spans, techniques):
  assert len(articles) == len(spans) and len(spans) == len(techniques)
  sentences = []
  labels = []
  for index, article in enumerate(articles):
    span = spans[index]
    technique = techniques[index]
    assert len(technique) == len(span)
    for i, sp in enumerate(span):
      pt = tag2idx[technique[i]]
      sentence = article[sp[0]: sp[1]]
      sentences.append(sentence)
      labels.append(pt)
  return sentences, labels

In [0]:
from transformers import BertForTokenClassification
from keras.preprocessing.sequence import pad_sequences
def convert_sentence_to_input_feature(sentence, tokenizer, add_cls_sep=True, max_seq_len=150):
  tokenized_sentence = tokenizer.encode_plus(sentence,
                                             add_special_tokens=add_cls_sep,
                                             max_length=max_seq_len,
                                             pad_to_max_length=True,
                                             return_attention_mask=True)
  return tokenized_sentence['input_ids'], tokenized_sentence['attention_mask']

Using TensorFlow backend.


In [0]:
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from sklearn.preprocessing import normalize

def get_data(articles, spans, techniques):
  sentences, labels = get_examples(articles, spans, techniques)
  attention_masks = []
  inputs = []
  lengths = []
  for i, sentence in enumerate(sentences):
    lengths.append(len(sentence) / 100) # divide by 100 for normalization
    input_ids, mask = convert_sentence_to_input_feature(sentence, tokenizer)
    inputs.append(input_ids)
    attention_masks.append(mask)
  
  inputs = torch.tensor(inputs)
  labels = torch.tensor(labels)
  masks = torch.tensor(attention_masks)
  lengths = torch.tensor(lengths).float()
  tensor_data = TensorDataset(inputs, labels, masks, lengths)
  dataloader = DataLoader(tensor_data, batch_size=BATCH_SIZE)
  return dataloader

In [0]:
from sklearn import metrics

def compute_metrics(preds, labels):
  pred_flat = np.argmax(preds, axis=1).flatten()
  labels_flat = labels.flatten()
  print(metrics.confusion_matrix(labels_flat, pred_flat))
  print(metrics.classification_report(labels_flat, pred_flat))

def flat_accuracy(preds, labels):
  pred_flat = np.argmax(preds, axis=1).flatten()
  labels_flat = labels.flatten()
  return np.sum(pred_flat == labels_flat) / len(labels_flat)

In [0]:
def format_time(elapsed):
    elapsed_rounded = int(round((elapsed)))
    return str(datetime.timedelta(seconds=elapsed_rounded))

In [0]:
from transformers import RobertaConfig, RobertaModel, BertPreTrainedModel
import torch.nn as nn
from torch.nn import CrossEntropyLoss, MSELoss

ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP = {
    "roberta-base": "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-pytorch_model.bin",
    "roberta-large": "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-pytorch_model.bin",
    "roberta-large-mnli": "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-mnli-pytorch_model.bin",
    "distilroberta-base": "https://s3.amazonaws.com/models.huggingface.co/bert/distilroberta-base-pytorch_model.bin",
    "roberta-base-openai-detector": "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-openai-detector-pytorch_model.bin",
    "roberta-large-openai-detector": "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-openai-detector-pytorch_model.bin",
}

class RobertaClassificationHead(nn.Module):
    """Head for sentence-level classification tasks."""

    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.out_proj = nn.Linear(config.hidden_size, config.num_labels)

    def forward(self, features, **kwargs):
        x = features[:, 0, :]  # take <s> token (equiv. to [CLS])
        x = self.dropout(x)
        x = self.dense(x)
        x = torch.tanh(x)
        x = self.dropout(x)
        x = self.out_proj(x)
        return x

class CustomRobertaForSequenceClassification(BertPreTrainedModel):
    config_class = RobertaConfig
    pretrained_model_archive_map = ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP
    base_model_prefix = "roberta"

    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels

        self.roberta = RobertaModel(config)
        self.classifier = RobertaClassificationHead(config)
        self.length_classifier = nn.Linear(config.num_labels+1, config.num_labels)
    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        token_type_ids=None,
        position_ids=None,
        head_mask=None,
        inputs_embeds=None,
        labels=None,
        lengths=None
    ):
        outputs = self.roberta(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
        )
        lengths = lengths.unsqueeze(1)
        sequence_output = outputs[0]
        logits = self.classifier(sequence_output)

        if INCLUDE_LENGTH_FEATURE:
          logits = torch.cat((logits, lengths), axis=1)
          logits = self.length_classifier(logits)
        outputs = (logits,) + outputs[2:]
        if labels is not None:
            if self.num_labels == 1:
                #  We are doing regression
                loss_fct = MSELoss()
                loss = loss_fct(logits.view(-1), labels.view(-1))
            else:
                loss_fct = CrossEntropyLoss()
                loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
            outputs = (loss,) + outputs

        return outputs  # (loss), logits, (hidden_states), (attentions)



In [0]:
def train(model, epochs=5):
  loss_values = []
  for epoch_i in range(0, epochs):
    print("")
    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
    print('Training...')
    t0 = time.time()
    total_loss = 0
    model.train()
    for step, batch in enumerate(train_dataloader):
      if step % 100 == 0 and not step == 0:
        elapsed = format_time(time.time() - t0)
        print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(step, len(train_dataloader), elapsed))
      b_input_ids = batch[0].to(device)
      b_labels = batch[1].to(device)
      b_input_mask = batch[2].to(device)
      b_lengths = batch[3].to(device)
      model.zero_grad()        
      outputs = model(b_input_ids, 
                      token_type_ids=None, 
                      attention_mask=b_input_mask,
                      lengths=b_lengths,
                      labels=b_labels)
      loss = outputs[0]
      total_loss += loss.item()
      loss.backward()
      torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
      optimizer.step()
      scheduler.step() # TODO
    avg_train_loss = total_loss / len(train_dataloader)            
    # Store the loss value for plotting the learning curve.
    loss_values.append(avg_train_loss)
    print("")
    print("  Average training loss: {0:.2f}".format(avg_train_loss))
    print("  Training epcoh took: {:}".format(format_time(time.time() - t0)))

    print("")
    print("Running Validation...")

    t0 = time.time()
    model.eval()
    eval_loss, eval_accuracy = 0, 0
    nb_eval_steps, nb_eval_examples = 0, 0

    for batch in eval_dataloader:
      batch = tuple(t.to(device) for t in batch)
      b_input_ids, b_labels, b_input_mask, b_lengths = batch
      with torch.no_grad():        
        outputs = model(b_input_ids, 
                        token_type_ids=None, 
                        attention_mask=b_input_mask,
                        lengths=b_lengths)
      
      logits = outputs[0]
      logits = logits.detach().cpu().numpy()
      label_ids = b_labels.to('cpu').numpy()
      tmp_eval_accuracy = flat_accuracy(logits, label_ids)
      eval_accuracy += tmp_eval_accuracy
      nb_eval_steps += 1

    print("  Accuracy: {0:.2f}".format(eval_accuracy/nb_eval_steps))
    print("  Validation took: {:}".format(format_time(time.time() - t0)))
  print("")
  print("Training complete!")

In [0]:
def get_model_predictions(model, dataloader):
  model.eval()
  predictions , true_labels = [], []
  nb_eval_steps = 0
  for batch in dataloader:
    batch = tuple(t.to(device) for t in batch)
    b_input_ids, b_labels, b_input_mask, b_lengths = batch  
    with torch.no_grad():
      logits = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask, lengths=b_lengths)
    logits = logits[0]
    logits = logits.detach().cpu().numpy()
    label_ids = b_labels.to('cpu').numpy()
    pred_label = np.argmax(logits, axis=1)
    predictions.extend(pred_label)
    true_labels.extend(label_ids)
  return predictions, true_labels

In [0]:
from google.colab import files

def get_dev_predictions(model):
  test_articles, _ = read_articles("dev-articles")
  test_spans, test_techniques = read_test_spans()

  test_articles = test_articles[1:]
  test_dataloader = get_data(test_articles, test_spans, test_techniques)
  pred, _ = get_model_predictions(model, test_dataloader)

  with open('predictions.txt', 'w') as fp:
    label_file = os.path.join(data_dir, "dev-task-TC-template.out")
    myfile = open(label_file)
    prev_index = -1
    tsvreader = csv.reader(myfile, delimiter="\t")
    for i, row in enumerate(tsvreader):
      fp.write(row[0] + '\t' + distinct_techniques[pred[i]] + '\t' + row[2] + '\t' + row[3] + '\n')
  files.download('predictions.txt')


In [0]:
from google.colab import files

def get_test_predictions():
  temp_test_articles, test_indices = read_articles("test-TC/test-articles")
  test_spans, test_techniques, span_indices = read_test_spans(mode="test")
  test_articles = []
  span_indices = set(span_indices)
  for index, article in enumerate(temp_test_articles):
    if test_indices[index] in span_indices:
      test_articles.append(article)
  print(len(test_articles))
  print(len(test_spans))
  test_dataloader = get_data(test_articles, test_spans, test_techniques)
  pred, _ = get_model_predictions(model, test_dataloader)

  with open('predictions.txt', 'w') as fp:
    label_file = os.path.join(data_dir, "test-TC/test-task-TC-template.out")
    myfile = open(label_file)
    prev_index = -1
    tsvreader = csv.reader(myfile, delimiter="\t")
    for i, row in enumerate(tsvreader):
      fp.write(row[0] + '\t' + distinct_techniques[pred[i]] + '\t' + row[2] + '\t' + row[3] + '\n')
  files.download('predictions.txt')

# get_test_predictions()


In [0]:
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer
from transformers import RobertaTokenizer

articles, article_ids = read_articles("train-articles")
spans, techniques = read_spans()
distinct_techniques = list(set([y for x in techniques for y in x])) # idx to tag
tag2idx = {t: i for i, t in enumerate(distinct_techniques)}
pprint.pprint(tag2idx)

NUM_ARTICLES = len(articles)

articles = articles[0:NUM_ARTICLES]
spans = spans[0:NUM_ARTICLES]
techniques = techniques[0:NUM_ARTICLES]
BATCH_SIZE=8

seed_val = 1328 # 32
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)
indices = np.arange(NUM_ARTICLES)
train_articles, eval_articles, train_spans, eval_spans, train_techniques, eval_techniques, train_indices, eval_indices = train_test_split(articles, spans, techniques, indices, test_size=0.1)

tokenizer = RobertaTokenizer.from_pretrained('roberta-base', lower_case=True)

train_dataloader = get_data(train_articles, train_spans, train_techniques)
eval_dataloader = get_data(eval_articles, eval_spans, eval_techniques)

{'Appeal_to_Authority': 1,
 'Appeal_to_fear-prejudice': 5,
 'Bandwagon,Reductio_ad_hitlerum': 4,
 'Black-and-White_Fallacy': 2,
 'Causal_Oversimplification': 13,
 'Doubt': 9,
 'Exaggeration,Minimisation': 12,
 'Flag-Waving': 10,
 'Loaded_Language': 0,
 'Name_Calling,Labeling': 11,
 'Repetition': 8,
 'Slogans': 6,
 'Thought-terminating_Cliches': 7,
 'Whataboutism,Straw_Men,Red_Herring': 3}


In [0]:
# from transformers import BertForSequenceClassification, AdamW, BertConfig
from transformers import get_linear_schedule_with_warmup
# from transformers import RobertaForSequenceClassification

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

INCLUDE_LENGTH_FEATURE = False # Whether to include length as feature

model = CustomRobertaForSequenceClassification.from_pretrained(
    "roberta-base",
    num_labels = len(distinct_techniques),
    output_attentions = False, 
    output_hidden_states = False,
)
model.cuda()

optimizer = AdamW(model.parameters(),lr = 3e-5,eps = 1e-8) # ler = 5e-5
epochs = 4
total_steps = len(train_dataloader) * epochs
scheduler = get_linear_schedule_with_warmup(optimizer, 
                                            num_warmup_steps = 0, # Default value in run_glue.py
                                            num_training_steps = total_steps)
train(model, epochs=epochs)


Training...
  Batch   100  of    691.    Elapsed: 0:00:17.
  Batch   200  of    691.    Elapsed: 0:00:34.
  Batch   300  of    691.    Elapsed: 0:00:51.
  Batch   400  of    691.    Elapsed: 0:01:08.
  Batch   500  of    691.    Elapsed: 0:01:25.
  Batch   600  of    691.    Elapsed: 0:01:42.

  Average training loss: 1.69
  Training epcoh took: 0:01:58

Running Validation...
  Accuracy: 0.65
  Validation took: 0:00:03

Training...
  Batch   100  of    691.    Elapsed: 0:00:17.
  Batch   200  of    691.    Elapsed: 0:00:34.
  Batch   300  of    691.    Elapsed: 0:00:51.
  Batch   400  of    691.    Elapsed: 0:01:08.
  Batch   500  of    691.    Elapsed: 0:01:25.
  Batch   600  of    691.    Elapsed: 0:01:42.

  Average training loss: 1.16
  Training epcoh took: 0:01:58

Running Validation...
  Accuracy: 0.70
  Validation took: 0:00:03

Training...
  Batch   100  of    691.    Elapsed: 0:00:17.
  Batch   200  of    691.    Elapsed: 0:00:34.
  Batch   300  of    691.    Elapsed: 0:00:51

In [0]:
get_dev_predictions(model)

In [0]:
files.download('predictions.txt')