# Download dataset

In [None]:
# ! unrar x articles.rar
# ! unrar x labels.rar

# Imports

In [25]:
import matplotlib.pyplot as plt
import numpy as np
import json
from collections import Counter
from pandas import DataFrame
from transformers import AutoTokenizer
import os

# Read the dataset and store it in variables

In [87]:
# Alphabetically sorted catalogue of technique names.
alltags = sorted({
    'Loaded_Language', 'Obfuscation-Vagueness-Confusion', 'Appeal_to_Hypocrisy',
    'Exaggeration-Minimisation', 'Slogans', 'Repetition', 'Appeal_to_Popularity',
    'Conversation_Killer', 'False_Dilemma-No_Choice', 'Causal_Oversimplification',
    'Straw_Man', 'Appeal_to_Authority', 'Red_Herring', 'Doubt',
    'Guilt_by_Association', 'Flag_Waving', 'Appeal_to_Fear-Prejudice',
    'Name_Calling-Labeling', 'Whataboutism',
})

# 20-slot label layout; 'UNK' fills the slots that carry no named technique.
# NOTE(review): the slot order appears to parallel the 20-entry `tags` list in
# the preprocessing section -- confirm before changing either list.
validtags = [
    'Appeal_to_Authority', 'Appeal_to_Fear-Prejudice', 'UNK', 'UNK',
    'Causal_Oversimplification', 'Doubt', 'Exaggeration-Minimisation',
    'Flag_Waving', 'UNK', 'Loaded_Language', 'Straw_Man',
    'Name_Calling-Labeling', 'Obfuscation-Vagueness-Confusion', 'Red_Herring',
    'UNK', 'Repetition', 'Slogans', 'UNK', 'UNK', 'Whataboutism',
]

# Slot index -> name, and name -> slot index. Because 'UNK' repeats, the
# reverse map keeps only its last slot.
id2tag = dict(enumerate(validtags))
tag2id = {name: slot for slot, name in enumerate(validtags)}

def one_hot(tags):
  """Encode technique names as the string form of a multi-hot vector.

  Args:
    tags: list of tag names. Both ``['']`` (what ``''.split(',')`` yields)
      and an empty list mean "no technique".

  Returns:
    ``str()`` of a float array with ``len(validtags)`` entries holding 1.0 at
    the slot of every given tag, or the sentinel string ``'INVALID'`` when at
    least one tag has no slot in ``validtags``.
  """
  result = np.zeros(len(validtags))

  # An empty list used to reach the fancy-indexing step as a float array and
  # raise IndexError; treat it like the "no tags" case instead.
  if not tags or tags == ['']:
    return str(result)
  # tag2id has exactly the names in validtags as keys, so this is the same
  # membership test without rescanning the list for every tag.
  if any(tag not in tag2id for tag in tags):
    return 'INVALID'
  for tag in tags:
    result[tag2id[tag]] = 1.0
  return str(result)

In [53]:
# Directory holding the extracted article / label files.
data_dir = '/content'
files = os.listdir(data_dir)

# Article files are named 'article' + 9-character id + '.txt' (20 characters).
# Sorting makes the row order independent of the filesystem's listing order.
articles = sorted(
    name for name in files
    if name[:7] == 'article' and name[-3:] == 'txt' and len(name) == 20
)

# Column-wise accumulator; every non-empty line of an article becomes one row.
columns = {'artid': [], 'pid': [], 'paragraph': [], 'label': []}

for article in articles:
  artid = article[7:-4]
  # Open relative to the listed directory (not the current working directory)
  # and decode explicitly so the result does not depend on the platform locale.
  with open(os.path.join(data_dir, article), 'r', encoding='utf-8') as f:
    for line_no, line in enumerate(f, start=1):
      paragraph = line.strip()
      if not paragraph:
        continue
      # pid is the 1-based line number in the file, kept as a string.
      columns['artid'].append(artid)
      columns['pid'].append(str(line_no))
      columns['paragraph'].append(paragraph)
      columns['label'].append('')

dataset = DataFrame(columns)
dataset

Unnamed: 0,artid,pid,paragraph,label
0,700551604,1,Will Trump Continue the CIA’s JFK Cover-Up?,
1,700551604,3,"Last Friday, President Trump made the followin...",
2,700551604,4,I have decided not to block release of the CIA...,
3,700551604,5,"Okay, he didn’t really put it like that. But t...",
4,700551604,6,"Subject to the receipt of further information,...",
...,...,...,...,...
9493,737255982,6,"The ""Jewish Question"" is a term generally used...",
9494,737255982,7,Left-wing advocacy group J Street said it is r...,
9495,737255982,8,"In J Street's home dimension of Oceania, suppo...",
9496,737255982,9,"“We take anti-Semitism quite seriously,” J Str...",


In [91]:
# Label files are named 'article' + 9-character id + '-labels-subtask-3.txt'
# (37 characters in total).
labels_dir = '/content'
labels = [
    name for name in files
    if len(name) == 37 and name[:7] == 'article' and name[-21:] == '-labels-subtask-3.txt'
]

for label in labels:
  artid = label[7:16]
  # Open relative to the directory that was listed, with an explicit encoding.
  with open(os.path.join(labels_dir, label), 'r', encoding='utf-8') as f:
    for line in f:
      if not line.strip():
        continue  # tolerate blank / trailing lines instead of failing the unpack
      label_artid, pid, tags = line.split('\t')
      onehot_tags = one_hot(tags.strip().split(','))
      assert label_artid == artid, f'{label}: unexpected article id {label_artid}'
      # Build the row selector once and reuse it for the check and the write.
      mask = (dataset['artid'] == label_artid) & (dataset['pid'] == pid)
      assert int(mask.sum()) == 1, f'{label}: paragraph {pid} matches {int(mask.sum())} rows'
      dataset.loc[mask, 'label'] = onehot_tags

In [96]:
# Keep only the rows whose label is not the 'INVALID' sentinel.
dataset = dataset[dataset['label'].ne('INVALID')]


In [107]:
# Display the current contents of `dataset`.
dataset

artid
pid
paragraph
label


In [108]:
# Export the frame as a three-column TSV: '<artid>_<pid>', paragraph, label.
with open('extratrain.tsv', 'w', encoding='utf-8') as f:
  f.write('ID\tText\tLabel\n')
  for row in dataset.itertuples(index=False):
    # `row_id`, not `id`: a global named `id` would shadow the builtin for
    # every cell that runs afterwards.
    row_id = row.artid + '_' + row.pid
    # A tab inside the paragraph would add a spurious column to this row.
    text = row.paragraph.replace('\t', ' ')
    f.write(f'{row_id}\t{text}\t{row.label}\n')

In [111]:
! rm *.template

# Preprocessing Data

In [None]:
# The 20 technique names; a name's position is its column in the label vector.
tags = [
    'Appeal to authority',
    'Appeal to fear/prejudice',
    'Bandwagon',
    'Black-and-white Fallacy/Dictatorship',
    'Causal Oversimplification',
    'Doubt',
    'Exaggeration/Minimisation',
    'Flag-waving',
    'Glittering generalities (Virtue)',
    'Loaded Language',
    "Misrepresentation of Someone's Position (Straw Man)",
    'Name calling/Labeling',
    'Obfuscation, Intentional vagueness, Confusion',
    'Presenting Irrelevant Data (Red Herring)',
    'Reductio ad hitlerum',
    'Repetition',
    'Slogans',
    'Smears',
    'Thought-terminating cliché',
    'Whataboutism',
]

# Column index -> name and name -> column index.
ix2tag = dict(enumerate(tags))
tag2ix = {name: position for position, name in enumerate(tags)}

def encode_labels(labels, tags, tag2ix):
  """Build a multi-hot matrix with one row per sample and one column per tag.

  Args:
    labels: sequence of per-sample containers; a tag counts as present when
      ``tag in label`` is true.
    tags: tag names to test for.
    tag2ix: mapping from tag name to column index.

  Returns:
    Float array of shape ``(len(labels), len(tags))`` containing 1 where the
    tag is present in the sample and 0 elsewhere.
  """
  matrix = np.zeros((len(labels), len(tags)))
  for row, sample_tags in enumerate(labels):
    for name in tags:
      # Every cell is written explicitly, for absent tags as well.
      matrix[row, tag2ix[name]] = 1 if name in sample_tags else 0
  return matrix

def all_tags(labels):
  """Return the distinct tags found in `labels`, in order of first appearance.

  Args:
    labels: iterable of per-sample tag iterables.

  Returns:
    List of unique tags.
  """
  seen = {}
  for sample_tags in labels:
    # dict keys keep insertion order, and updating an existing key does not
    # move it, so first-appearance order is preserved.
    seen.update(dict.fromkeys(sample_tags))
  return list(seen)

In [None]:
# Characters that would start a new column or a new record if written raw
# into a TSV cell; each one is replaced by a single space.
_TSV_UNSAFE = str.maketrans({'\t': ' ', '\r': ' ', '\n': ' '})


def _tsv_field(value):
  """Return `value` as text that occupies exactly one TSV cell."""
  return str(value).translate(_TSV_UNSAFE)


def json2tsv_labeled(in_path, out_path, tags=None):
  """Convert a labelled JSON split into an ``ID / Text / Label`` TSV file.

  Args:
    in_path: JSON file holding a list of objects with 'id', 'text' and
      'labels' keys.
    out_path: destination TSV path (overwritten).
    tags: optional fixed tag vocabulary that defines the label columns. When
      omitted, the vocabulary is the sorted set of tags present in this file,
      so a split that lacks a tag gets fewer / shifted columns. Pass the same
      list for every split to keep the columns aligned.

  The Label column is the string form of the sample's multi-hot row.
  """
  with open(in_path, 'r', encoding='utf-8') as f:
    data = json.load(f)

  texts = [sample['text'] for sample in data]
  labels = [sample['labels'] for sample in data]
  ids = [sample['id'] for sample in data]

  tags = sorted(all_tags(labels)) if tags is None else list(tags)
  print(tags)
  tag2ix = {tag: i for i, tag in enumerate(tags)}
  encoded_labels = encode_labels(labels, tags, tag2ix)

  with open(out_path, 'w', encoding='utf-8') as f:
    f.write('ID\tText\tLabel\n')
    for sample_id, text, row in zip(ids, texts, encoded_labels):
      # Plain str(row) wraps long vectors onto several lines (75-character
      # default width); a huge width keeps the vector on one line.
      label = np.array_str(row, max_line_width=1_000_000)
      f.write(f'{sample_id}\t{_tsv_field(text)}\t{label}\n')


def json2tsv_unlabeled(in_path, out_path):
  """Convert an unlabelled JSON split into an ``ID / Text`` TSV file.

  Args:
    in_path: JSON file holding a list of objects with 'id' and 'text' keys.
    out_path: destination TSV path (overwritten).
  """
  with open(in_path, 'r', encoding='utf-8') as f:
    data = json.load(f)

  with open(out_path, 'w', encoding='utf-8') as f:
    f.write('ID\tText\n')
    for sample in data:
      # Tabs and line breaks inside the text would otherwise split the record.
      f.write(f"{sample['id']}\t{_tsv_field(sample['text'])}\n")


In [None]:
# Convert the labelled subtask-1 train and validation splits to TSV.
json2tsv_labeled(
    in_path='data/subtask1/train.json',
    out_path='data/subtask1/train.tsv',
)
json2tsv_labeled(
    in_path='data/subtask1/validation.json',
    out_path='data/subtask1/validation.tsv',
)

FileNotFoundError: [Errno 2] No such file or directory: 'data/subtask1/train.json'

In [None]:
# Convert the unlabelled subtask-1 dev split to TSV.
json2tsv_unlabeled(
    in_path='data/subtask1/dev_unlabeled.json',
    out_path='data/subtask1/dev_unlabeled.tsv',
)

In [None]:
# Convert the v2 unlabelled dev file stored under /content to TSV.
json2tsv_unlabeled(
    in_path='/content/dev_unlabeled_v2.json',
    out_path='/content/dev_unlabeled_v2.tsv',
)