In [8]:
import json
import os
from os import listdir
from os.path import isfile, join
from tqdm import tqdm
import random
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold
import pprint

def preprocess_propaganda(article_folder, label_folder, reduce_redundant=True):
    articles = {int(f.split(".")[0].split("article")[-1]): join(article_folder, f) for f in listdir(article_folder) if isfile(join(article_folder, f)) and f.endswith('txt')}
    labels = {int(f.split(".")[0].split("article")[-1]): join(label_folder, f) for f in listdir(label_folder) if isfile(join(label_folder, f))}
    articles = dict(sorted(articles.items()))
    labels = dict(sorted(labels.items()))
    assert len(articles) == len(labels)
    # label_dict = {
    #     'Loaded_Language': 'Loaded Language',
    #     'Name_Calling,Labeling': 'Name Calling or Labeling',
    #     'Exaggeration,Minimisation': 'Exaggeration or Minimisation',
    #     'Doubt': "Attack Credibility (Doubt)", #Questioning the credibility of someone or something.
    #     'Appeal_to_fear-prejudice': 'Appeal to Fear or Prejudice',
    #     'Flag-Waving': 'Flag-Waving',
    #     'Causal_Oversimplification': 'Causal Oversimplification',
    #     'AppealtoAuthority': 'Appeal to False Authority', 
    #     'Appeal_to_Authority': 'Appeal to False Authority',
    #     'Black-and-White_Fallacy': 'False Dilemma (Black-and-White Fallacy)',
    #     'Thought-terminating_Cliches': 'Thought-terminating Cliches',
    #     'Whataboutism': 'Whataboutism',
    #     'Reductio_ad_hitlerum': 'Reductio Ad Hitlerum',
    #     'Red_Herring': 'Red Herring',
    #     'Straw_Men': 'Straw Man',
    #     'Slogans': 'Slogans', 
    #     'Repetition': 'Repetition',
    #     'Bandwagon': 'Ad Populum (Bandwagon Fallacy)',
    #     'Obfuscation,Intentional_Vagueness,Confusion': 'Obfuscation,Intentional_Vagueness,Confusion', #OIVC
    # }
    label_dict = {
        'Name_Calling,Labeling': 'Name-calling',
        'Doubt': "Doubt Credibility", #Questioning the credibility of someone or something.
        'Appeal_to_fear-prejudice': 'Appeal to Fear',
        'Flag-Waving': 'Flag-Waving',
        'Causal_Oversimplification': 'Causal Oversimplification',
        'AppealtoAuthority': 'Appeal to False Authority', 
        'Appeal_to_Authority': 'Appeal to False Authority',
        'Black-and-White_Fallacy': 'False Dilemma',
        'Whataboutism': 'Whataboutism',
        'Reductio_ad_hitlerum': 'Reductio Ad Hitlerum',
        'Red_Herring': 'Red Herring',
        'Straw_Men': 'Straw Man',
        'Bandwagon': 'Ad Populum',
        'Obfuscation,Intentional_Vagueness,Confusion': 'Equivocation', #OIVC
    }
    label_space = list(set(list(label_dict.values())))
    label_ids = list(range(len(label_space)))
    label_id_map = dict(zip(label_space, label_ids))
    print(f"A total of {len(label_space)} labels.")
    print(label_id_map)
    id_label_map = {v:k for k, v in label_id_map.items()}
    examples= []
    
    df_recs = []
    appeared_fal_spans = []
    for aid, article_pth in articles.items():
        label_pth = labels[aid]
        # print(article_pth)
        # print(label_pth)
        sentences = [t for t in open(article_pth, 'r').readlines() if t!="\n"] # read lines into a list
        title = " ".join(sentences[0].strip().split()).replace("\u2019", "'").encode('ascii', errors='ignore').strip().decode('ascii')
        sentences = sentences[1:]
        article_text = open(article_pth).read()
        appeared = []
        for line in open(label_pth, 'r'):
            _, techs, start, end = line.split()
            #gold_ls = list(set([label_dict[g] for g in techs.split(',')]))
            if techs in ['AppealtoAuthority', 'AppealtoEmotion','AdHominem','Slipperyslope', 'FalseCause']:
                print(label_pth)
            if techs in label_dict:  
                gold_ls = [label_dict[techs]]
                start = int(start)
                end = int(end)
                fal_span_ori = article_text[start:(end+1)]
                fal_span = fal_span_ori.strip()
                start = start + fal_span_ori.find(fal_span)
                end = start + len(fal_span) - 1
                for i, s in enumerate(sentences): 
                    start_idx = article_text.find(s) 
                    end_idx = start_idx + len(s) - 1
                    if (fal_span.strip() in s) and (fal_span not in appeared) and (start >= start_idx) and (end <= end_idx):
                        appeared.append(fal_span)
                        pre_t, post_t = [], []
                        if i == 2:
                            pre_t.append(sentences[i-1])
                        elif i > 2:
                            pre_t.extend([sentences[i-2], sentences[i-1]])
                        if i == (len(sentences)-2):
                            post_t.append(sentences[i+1])
                        elif i < (len(sentences)-2):
                            post_t.extend([sentences[i+1], sentences[i+2]])
                        pre_t = [" ".join(t.split()).replace("\u2019", "'").encode('ascii', errors='ignore').strip().decode('ascii') for t in pre_t]
                        post_t = [" ".join(t.split()).replace("\u2019", "'").encode('ascii', errors='ignore').strip().decode('ascii') for t in post_t]
                        
                        fal_span = " ".join(fal_span.split()).replace("\u2019", "'").encode('ascii', errors='ignore').strip().decode('ascii')
                        if len(fal_span.split()[-1]) == 1: #strip out the single letter
                            fal_span = " ".join(fal_span.split()[:-1])
                        fal_t = " ".join(s.split()).replace("\u2019", "'").encode('ascii', errors='ignore').strip().decode('ascii')
                        assert fal_span in fal_t
                        fal_t = fal_t.replace(fal_span, '<'+fal_span+'>') # 0416 added
                        if reduce_redundant:
                            if fal_span not in appeared_fal_spans:
                                appeared_fal_spans.append(fal_span)
                                one_example = {
                                    "id": len(examples),
                                    "title": title,
                                    "pre_text": pre_t,
                                    "fal_span": fal_span,
                                    "text": fal_t,
                                    "post_text": post_t,
                                    "label": gold_ls,
                                }
                                df_recs.append((one_example['id'], one_example['title'], label_id_map[gold_ls[0]]))
                                examples.append(one_example)
                        else:
                            one_example = {
                                "id": len(examples),
                                "title": title,
                                "pre_text": pre_t,
                                "fal_span": fal_span,
                                "text": fal_t,
                                "post_text": post_t,
                                "label": gold_ls,
                            }
                            df_recs.append((one_example['id'], one_example['title'], label_id_map[gold_ls[0]]))
                            examples.append(one_example)
                        break
            #     break
            # break

    df = pd.DataFrame(df_recs, columns=['eid','title', 'label_for_split'])
    skf = StratifiedKFold(n_splits=4, shuffle=True, random_state=42)
    for train_ids, test_dev_ids in skf.split(np.zeros(len(df.index)), df['label_for_split']):
        print("train")
        print(df.iloc[train_ids]['label_for_split'].value_counts())
        train_ids = df.iloc[train_ids]['eid'].tolist()
        df_test_dev = df.iloc[test_dev_ids].reset_index(drop=True)
        break
    skf = StratifiedKFold(n_splits=2, shuffle=True, random_state=42)
    for dev_ids, test_ids in skf.split(np.zeros(len(df_test_dev.index)), df_test_dev['label_for_split']):
        print("dev")
        print(df_test_dev.iloc[dev_ids]['label_for_split'].value_counts())
        print("test")
        test_class_dist = {id_label_map[k]: v for k, v in dict(df_test_dev.iloc[test_ids]['label_for_split'].value_counts()).items()}
        print(pprint.pformat(test_class_dist))
        dev_ids = df_test_dev.iloc[dev_ids]['eid'].tolist()
        test_ids = df_test_dev.iloc[test_ids]['eid'].tolist()
        break
    
    assert len(train_ids) + len(dev_ids) + len(test_ids) == len(examples)
    train, dev, test = [], [], []
    is_found = False
    for e in examples:
        if e['id'] in train_ids:
            ###################
            if (not is_found) and (e['label'][0] == 'Ad Populum'):
                dev.append(e)
                is_found = True
                continue
            ###################
            train.append(e)
        elif e['id'] in dev_ids:
            dev.append(e)
        elif e['id'] in test_ids:
            test.append(e)
    print(f"total = {len(examples)}; train={len(train)}; dev={len(dev)}; test={len(test)}")
    os.makedirs('./new_data', exist_ok=True)
    for sp, name in zip([train, dev, test], ['train', 'dev', 'test']):
        json.dump(sp, open(f'./new_data/{name}.json', 'w'), indent=4,)
    return

article_folder = "./train-articles/"
label_folder = "./train-labels-FLC/"
preprocess_propaganda(article_folder, label_folder)


A total of 13 labels.
{'Appeal to Fear': 0, 'Name-calling': 1, 'Straw Man': 2, 'Flag-Waving': 3, 'Whataboutism': 4, 'False Dilemma': 5, 'Red Herring': 6, 'Appeal to False Authority': 7, 'Causal Oversimplification': 8, 'Ad Populum': 9, 'Equivocation': 10, 'Reductio Ad Hitlerum': 11, 'Doubt Credibility': 12}
train
label_for_split
1     717
12    278
0     128
3     127
8     114
7      62
5      57
4      33
11     30
6      19
9       7
10      6
2       6
Name: count, dtype: int64
dev
label_for_split
1     119
12     46
0      22
3      21
8      19
5      10
7      10
4       6
11      5
6       3
2       1
9       1
10      1
Name: count, dtype: int64
test
{'Ad Populum': 1,
 'Appeal to False Authority': 10,
 'Appeal to Fear': 21,
 'Causal Oversimplification': 19,
 'Doubt Credibility': 47,
 'Equivocation': 1,
 'False Dilemma': 10,
 'Flag-Waving': 21,
 'Name-calling': 120,
 'Red Herring': 3,
 'Reductio Ad Hitlerum': 5,
 'Straw Man': 2,
 'Whataboutism': 5}
total = 2113; train=1583; dev=

In [3]:
label_dict = {
    'Black-and-White_Fallacy': 'Black-and-White Fallacy',
    'Causal_Oversimplification': 'Causal Oversimplification',
    'Doubt': 'Attack/Question Credibility', #Questioning the credibility of someone or something.
    'Exaggeration': 'Exaggeration or Minimisation',
    'Minimisation': 'Exaggeration or Minimisation',
    'Appeal_to_fear-prejudice': 'Appeal to Fear or Prejudice',
    'Flag-Waving': 'Flag-Waving',
    'AppealtoAuthority': 'Appeal to False Authority', 
    'Appeal_to_Authority': 'Appeal to False Authority', 
    'Loaded_Language': 'Loaded Language',
    'Name_Calling': 'Name Calling or Labeling',
    'Labeling': 'Name Calling or Labeling',
    'Red_Herring': 'Red Herring',
    'Reductio_ad_hitlerum': 'Reductio Ad Hitlerum',
    'Slogans': 'Slogans', 
    'Straw_Men': 'Straw Man',
    'Thought-terminating_Cliches': 'Thought-terminating Cliches',
    'Whataboutism': 'Whataboutism',
    'AppealtoEmotion': 'Appeal to Emotion', 
    'AdHominem': 'Ad Hominem', 
    'Slipperyslope': 'Slippery Slope', 
    'FalseCause': 'False Causality',
    'Repetition': 'Repetition',
    'Bandwagon': 'Bandwagon',
    'Obfuscation': 'Obfuscation',
    'Intentional_Vagueness': 'Intentional Vagueness',
    'Confusion': 'Confusion',
}

for k, v in label_dict.items():
    print(v)


Black-and-White Fallacy
Causal Oversimplification
Attack/Question Credibility
Exaggeration or Minimisation
Exaggeration or Minimisation
Appeal to Fear or Prejudice
Flag-Waving
Appeal to False Authority
Appeal to False Authority
Loaded Language
Name Calling or Labeling
Name Calling or Labeling
Red Herring
Reductio Ad Hitlerum
Slogans
Straw Man
Thought-terminating Cliches
Whataboutism
Appeal to Emotion
Ad Hominem
Slippery Slope
False Causality
Repetition
Bandwagon
Obfuscation
Intentional Vagueness
Confusion
