In [None]:
from sklearn.linear_model import LogisticRegression
import glob
import os.path
import numpy as np
import sys
import codecs
import functools
from collections import Counter

In [None]:
!pip install transformers
!pip install tensorboardx
!pip install simpletransformers

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from simpletransformers.classification import ClassificationModel

In [None]:
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Project root on the mounted Google Drive.
# Check that the paths below are correct; if not, adjust these variables accordingly.
source = '/content/drive/My Drive/Colab Notebooks/Propaganda challenge'

# Folders with the article texts (read as *.txt files).
train_folder = "%s/datasets/train-articles" % source
dev_folder = "%s/datasets/dev-articles" % source
test_folder = "%s/datasets/test-articles" % source

# Gold labels for training and the label templates for dev/test.
train_labels_file = "%s/datasets/train-task2-TC.labels" % source
dev_template_labels_file = "%s/datasets/dev-task-TC-template.out" % source
test_template_labels_file = "%s/datasets/test-task-TC-template.out" % source

# Output file for predictions (relative path).
task_TC_output_file = "result.txt"

In [None]:
def read_articles_from_file_list(folder_name, file_pattern="*.txt"):
    """
    Read every file matching <file_pattern> in the directory <folder_name>.

    Returns a dictionary mapping article id -> full text of the article.
    The id is the file name up to its first "." with the first seven
    characters dropped (e.g. "article123.txt" -> "123").
    Files are decoded as UTF-8.
    """
    articles = {}
    for filename in sorted(glob.glob(os.path.join(folder_name, file_pattern))):
        article_id = os.path.basename(filename).split(".")[0][7:]
        # codecs.open does no newline translation, so character offsets into
        # the returned text match the file contents exactly (the span offsets
        # used elsewhere in this notebook are applied to this text).
        with codecs.open(filename, "r", encoding="utf8") as f:
            articles[article_id] = f.read()
    return articles

In [None]:
def read_predictions_from_file(filename):
    """
    Reader for the gold file and the template output file.

    Each non-blank line must hold four tab-separated fields:
    article id, label (or ? in the case of a template file),
    begin of a fragment, end of a fragment.

    Returns four parallel lists, in this order: article ids, span starts,
    span ends, labels. All values are kept as strings.
    Blank lines are skipped; a non-blank line without exactly four fields
    raises ValueError.
    """
    articles_id, span_starts, span_ends, gold_labels = ([], [], [], [])
    with open(filename, "r") as f:
        for row in f:
            row = row.rstrip()
            if not row:
                # A blank line (e.g. a trailing empty line) would otherwise
                # fail the four-way unpacking below.
                continue
            article_id, gold_label, span_start, span_end = row.split("\t")
            articles_id.append(article_id)
            gold_labels.append(gold_label)
            span_starts.append(span_start)
            span_ends.append(span_end)
    return articles_id, span_starts, span_ends, gold_labels


In [None]:
# loading articles' content from *.txt files in the train folder
articles = read_articles_from_file_list(train_folder)

In [None]:
len(articles)

In [None]:
# loading the training annotations from the train labels file:
# article ids, span start/end offsets and gold labels (all returned as strings)
ref_articles_id, ref_span_starts, ref_span_ends, train_gold_labels = read_predictions_from_file(train_labels_file)
print("Loaded %d annotations from %d articles" % (len(ref_span_starts), len(set(ref_articles_id))))

In [None]:
# reading data from the development set
dev_articles = read_articles_from_file_list(dev_folder)
dev_article_ids, dev_span_starts, dev_span_ends, dev_labels = read_predictions_from_file(dev_template_labels_file)

In [None]:
# reading data from the test set; ids, offsets and labels come from the test template file
test_articles = read_articles_from_file_list(test_folder)
test_article_ids, test_span_starts, test_span_ends, test_labels = read_predictions_from_file(test_template_labels_file)

In [None]:
from sklearn.preprocessing import LabelEncoder

In [None]:
le = LabelEncoder()

In [None]:
le.fit(train_gold_labels)

In [None]:
# encode the gold label strings with the fitted LabelEncoder; `labels` is aligned with train_gold_labels
labels = le.transform(train_gold_labels)

In [None]:
le.classes_

In [None]:
c = Counter(train_gold_labels)
c

In [None]:
def get_spans(articles, articles_ids, span_starts, span_ends):
    """
    Cut the annotated fragments out of the article texts.

    The three id/offset sequences are walked in parallel; for every triple
    the text articles[article_id][start:end] is collected. Offsets are
    passed through int(), so they may be given as strings or ints.
    Returns the fragments as a list, in input order.
    """
    triples = zip(articles_ids, span_starts, span_ends)
    return [articles[doc_id][int(lo):int(hi)] for doc_id, lo, hi in triples]

In [None]:
train_spans = get_spans(articles, ref_articles_id, ref_span_starts, ref_span_ends)

In [None]:
dev_spans = get_spans(dev_articles, dev_article_ids, dev_span_starts, dev_span_ends)

In [None]:
test_spans = get_spans(test_articles, test_article_ids, test_span_starts, test_span_ends)

In [None]:
# one [span text, encoded label] row per training annotation
train_arr = [list(pair) for pair in zip(train_spans, labels)]

In [None]:
train_df = pd.DataFrame(data=train_arr)

In [None]:
y = train_df.iloc[:, 1]

# Transformer training

Weighted training: each class is weighted inversely to its frequency in the training labels.

In [None]:
c = Counter(labels)

In [None]:
# inverse-frequency weight for each encoded class id 0..13: total annotations / count of that class
weights = [sum(c.values()) / c[i] for i in range(14)]

In [None]:
# rescale so that the weights sum to 1
weights = [x / sum(weights) for x in weights]

In [None]:
weights

In [None]:
# roberta-base classifier with 14 output labels, 5 epochs, sequences up to 512 tokens.
# `weight` receives the normalised per-class weights computed above.
# NOTE(review): presumably applied as class weights in the loss — confirm against the simpletransformers docs.
model = ClassificationModel('roberta', 'roberta-base', weight=weights, num_labels=14, args=({'fp16': False, 'overwrite_output_dir': True, 'max_seq_length': 512, 'num_train_epochs': 5}))

In [None]:
model.train_model(train_df)

In [None]:
preds, raw_outputs = model.predict(test_spans)

In [None]:
predictions = le.inverse_transform(preds)

In [None]:
# writing dev predictions to file
# `predictions` at this point holds the labels predicted for test_spans, so
# pairing it with the dev ids/offsets would silently write misaligned rows
# (zip stops at the shorter input). Predict the dev spans here instead.
# NOTE(review): the next cell writes the test predictions to the same file.
dev_preds, dev_raw_outputs = model.predict(dev_spans)
dev_predictions = le.inverse_transform(dev_preds)
with open(task_TC_output_file, "w") as fout:
    for article_id, prediction, span_start, span_end in zip(dev_article_ids, dev_predictions, dev_span_starts, dev_span_ends):
        fout.write("%s\t%s\t%s\t%s\n" % (article_id, prediction, span_start, span_end))
print("Predictions written to file " + task_TC_output_file)

In [None]:
# One tab-separated line per test span: article id, predicted label, span start, span end.
with open(task_TC_output_file, "w") as fout:
    rows = zip(test_article_ids, predictions, test_span_starts, test_span_ends)
    fout.writelines("%s\t%s\t%s\t%s\n" % row for row in rows)
print("Predictions written to file " + task_TC_output_file)

In [None]:
predictions

# Undersampling

In [None]:
train_df = pd.DataFrame(data=train_arr)

In [None]:
# row indices of the two classes that are undersampled below, selected by
# encoded label id (column 1 holds the encoded label; le.classes_ lists the label names)
cat1 = train_df[train_df[1] == 8].index
cat2 = train_df[train_df[1] == 9].index

In [None]:
from random import sample, seed

# Fix the RNG state so the undersampling below drops the same rows on every
# Restart & Run All; without a seed the training set differs between runs.
seed(42)

In [None]:
cat1_drop = sample(list(cat1), int(len(cat1) * 0.5))

In [None]:
cat2_drop = sample(list(cat2), int(len(cat2) * 0.2))

In [None]:
train_df = train_df.drop(cat1_drop)

In [None]:
train_df = train_df.drop(cat2_drop)

In [None]:
len(train_df)

In [None]:
from simpletransformers.classification import ClassificationModel

In [None]:
model = ClassificationModel('roberta', 'roberta-base', num_labels=14, args=({'fp16': False, 'overwrite_output_dir': True, 'max_seq_length': 512, 'num_train_epochs': 5}))

In [None]:
model.train_model(train_df)

In [None]:
preds, raw_outputs = model.predict(dev_spans)

In [None]:
predictions = le.inverse_transform(preds)

In [None]:
# writing predictions to file:
# one tab-separated line per dev span (article id, predicted label, span start, span end)
with open(task_TC_output_file, "w") as fout:
    rows = zip(dev_article_ids, predictions, dev_span_starts, dev_span_ends)
    fout.writelines("%s\t%s\t%s\t%s\n" % row for row in rows)
print("Predictions written to file " + task_TC_output_file)

# Context 

In [None]:
c = Counter(train_gold_labels)
c

In [None]:
def get_spans_with_context(articles, articles_ids, span_starts, span_ends):
    """
    Extract each annotated fragment together with the text that follows it.

    For every (article id, start, end) triple the returned text begins at
    <start> and runs up to, but not including, the first '.', '?', '!', ','
    or newline found at or after <end>. If no such delimiter follows the
    fragment, the text runs to the end of the article.

    Offsets are passed through int(), so they may be strings or ints.
    Returns the extended fragments as a list, in input order.
    """
    delimiters = ('.', '?', '!', '\n', ',')
    result = []

    for article_id, start, end in zip(articles_ids, span_starts, span_ends):
        article = articles[article_id]
        begin, stop = int(start), int(end)

        # str.find returns -1 for "not found"; keep only real positions.
        hits = [pos for pos in (article.find(ch, stop) for ch in delimiters) if pos != -1]
        # min() of an empty list raises ValueError, which happened when the
        # fragment was followed by no delimiter; fall back to the article end.
        final = min(hits, default=len(article))

        result.append(article[begin:final])

    return result

In [None]:
train_spans = get_spans_with_context(articles, ref_articles_id, ref_span_starts, ref_span_ends)

In [None]:
dev_spans = get_spans_with_context(dev_articles, dev_article_ids, dev_span_starts, dev_span_ends)

In [None]:
train_arr = [[text, label] for text, label in zip(train_spans, labels)]

In [None]:
train_df = pd.DataFrame(data=train_arr)

In [None]:
model = ClassificationModel('roberta', 'roberta-base', num_labels=14, args=({'fp16': False, 'overwrite_output_dir': True, 'max_seq_length': 512, 'num_train_epochs': 5}))

In [None]:
model.train_model(train_df)

In [None]:
preds, raw_outputs = model.predict(dev_spans)

In [None]:
predictions = le.inverse_transform(preds)

In [None]:
# writing predictions to file:
# one tab-separated line per dev span (article id, predicted label, span start, span end)
with open(task_TC_output_file, "w") as fout:
    rows = zip(dev_article_ids, predictions, dev_span_starts, dev_span_ends)
    fout.writelines("%s\t%s\t%s\t%s\n" % row for row in rows)
print("Predictions written to file " + task_TC_output_file)

# EDA

In [None]:
def get_len(row):
    """Return the number of whitespace-separated tokens in the string <row>."""
    tokens = row.split()
    return len(tokens)

In [None]:
# span length in whitespace-separated tokens, computed from the text column (column 0)
train_df['size'] = train_df[0].apply(get_len)

In [None]:
def plot_len(train_df):
    """
    Draw one histogram of the 'size' column per class.

    <train_df> must hold the encoded class id in column 1 and a 'size'
    column. Classes 0..13 are laid out on a 4x4 grid of axes; each panel is
    titled with the label name decoded by the notebook-level LabelEncoder `le`.
    """
    n_classes = 14
    fig, axes = plt.subplots(4, 4, figsize=(12, 12))
    flat_axes = axes.ravel()

    for i in range(n_classes):
        ax = flat_axes[i]
        cat = train_df[train_df[1] == i]

        title = le.inverse_transform([i])[0]
        ax.set_title(title, fontsize=10)

        ax.hist(cat['size'])

    # The grid has 16 panels for 14 classes; hide the unused ones instead of
    # leaving empty frames in the figure.
    for ax in flat_axes[n_classes:]:
        ax.set_visible(False)

    # Leave room for the panel titles so they do not run into the row above.
    fig.tight_layout()

In [None]:
def plot_boxes(train_df):
    """
    Draw one box plot of the 'size' column per class on a shared axes.

    <train_df> must hold the encoded class id in column 1 and a 'size'
    column. The box for class i sits at x position i and is coloured "C<i>";
    the legend maps "<i>. <label name>" (decoded with the notebook-level
    LabelEncoder `le`) to each box. The figure is saved as 'boxes.png' and
    then shown.
    """
    fig, ax = plt.subplots(figsize=(10, 10))

    legend_handles = []
    legend_names = []
    for class_id in range(14):
        sizes = list(train_df[train_df[1] == class_id]['size'])
        drawn = ax.boxplot(sizes, positions=[class_id], boxprops={"color": "C" + str(class_id)})
        # the first (only) box artist serves as the legend handle for this class
        legend_handles.append(drawn["boxes"][0])
        legend_names.append(str(class_id) + ". " + le.inverse_transform([class_id])[0])

    ax.legend(legend_handles, legend_names, loc='upper right')

    fig.savefig('boxes.png')
    plt.show()

In [None]:
plot_boxes(train_df)

In [None]:
plot_len(train_df=train_df)