In [1]:
import os
import nltk
#nltk.download('punkt')
import nltk.data
import json
import regex
# Reader of CONLL file
def _normalize_text(text):
    """Drop non-ASCII characters, join spaced hyphens, and collapse whitespace."""
    ascii_text = text.encode('ascii', errors='ignore').strip().decode('ascii')
    return " ".join(regex.sub(r"\s+-\s+", "-", ascii_text).split())


def _iter_examples(lines, label_dict):
    """Yield ``(words, marked, labels)`` for every blank-line-separated example.

    ``words`` is the space-joined token text, ``marked`` is the same text with
    ``|||`` prepended to every token whose tag starts with ``B`` or ``I``, and
    ``labels`` holds the mapped label of each ``B-`` tag (consecutive
    duplicates are collapsed).
    """
    words, marked, labels = "", "", []
    for line in lines:
        line = line.strip()
        if not line:
            # Runs of blank lines do not delimit an example; skip them.
            if words:
                yield words, marked, labels
                words, marked, labels = "", "", []
            continue

        splits = line.replace("\u2019", "'").split("\t")
        token, tag = splits[1], splits[-1]
        words += token + " "
        if tag[0] in ('B', 'I'):
            marked += '|||'
        marked += token + " "

        tag_parts = tag.split('-')
        if tag_parts[0] == 'B':
            label = label_dict[tag_parts[1]]
            if not labels or label != labels[-1]:
                labels.append(label)

    # A file that does not end with a blank line still has one pending example.
    if words:
        yield words, marked, labels


def _build_fallacy_record(record_id, words, marked, labels, tokenizer):
    """Split one example into pre-text, fallacy text and post-text sentences."""
    words = _normalize_text(words)
    marked = _normalize_text(marked)
    pre_text, text, post_text = [], [], []
    for sent in tokenizer.tokenize(words):
        start = marked.find(sent)
        # A sentence is treated as fallacy text when it cannot be found
        # verbatim in the marked string (i.e. '|||' markers interrupt it) or
        # when its first occurrence is immediately preceded by a marker.
        if start == -1 or (start > 0 and marked[start - 1] == '|'):
            text.append(sent)
        elif text:
            # Unmarked sentence after fallacy text has been collected.
            post_text.append(sent)
        else:
            pre_text.append(sent)
    return {
        "id": record_id,
        "pre-text": " ".join(pre_text),
        "text": '<' + " ".join(text) + '>',
        "post-text": " ".join(post_text),
        "label": labels,
    }


def preprocess_elecdebate(conll_paths, output_dir='./new_data'):
    """Convert tab-separated CoNLL files into JSON files of fallacy examples.

    Each input file holds blank-line-separated examples with one token per
    line; the token is read from the second column and the BIO tag from the
    last column. Every example must carry exactly one fallacy label. Examples
    labelled 'Slogans' are dropped, the rest are renumbered from 1 and written
    to ``<output_dir>/<split>.json``, where ``<split>`` is the input file name
    without directory or extension. Example counts before and after
    filtering, and the per-label counts, are printed for each file.

    Args:
        conll_paths: Iterable of paths to the CoNLL files.
        output_dir: Directory for the JSON output; created if missing.

    Raises:
        AssertionError: If an example does not have exactly one label.
        KeyError: If a ``B-`` tag names a label missing from the mapping.
    """
    os.makedirs(output_dir, exist_ok=True)
    label_dict = {
        'AdHominem': 'Ad Hominem',
        'AppealtoEmotion': 'Appeal to Emotion',
        'AppealtoAuthority': 'Appeal to False Authority',
        'Slogans': 'Slogans',
        'Slipperyslope': 'Slippery Slope',
        'FalseCause': 'False Causality (Post Hoc Fallacy)'
    }
    tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

    for conll_path in conll_paths:
        # Use the base name so that paths containing directories or extra
        # dots (e.g. './data/train.conll') still give a usable split name.
        split = os.path.splitext(os.path.basename(conll_path))[0]

        sentences = []
        # Explicit UTF-8 so that U+2019 is decoded (and then replaced by an
        # apostrophe) regardless of the platform's default encoding.
        with open(conll_path, "r", encoding="utf-8") as f:
            for words, marked, labels in _iter_examples(f, label_dict):
                assert len(labels) == 1, (
                    f"expected exactly one label, got {labels} in {conll_path}"
                )
                sentences.append(
                    _build_fallacy_record(len(sentences), words, marked, labels, tokenizer)
                )

        print(f"{split}: {len(sentences)}")

        new_data = []
        fal_value_count = {}
        for s in sentences:
            assert len(s['label']) == 1
            lb = s['label'][0]
            if lb == 'Slogans':
                continue
            # Renumber the kept examples from 1.
            s['id'] = len(new_data) + 1
            new_data.append(s)
            fal_value_count[lb] = fal_value_count.get(lb, 0) + 1
        print(f"{split}: {len(new_data)}")

        # Context manager so the output is flushed and closed deterministically.
        with open(os.path.join(output_dir, f'{split}.json'), 'w') as out_file:
            json.dump(new_data, out_file, indent=4)
        print(fal_value_count)

    return
# Run the preprocessing over the three CoNLL files, in train/test/dev order.
raw_files = [f"{name}.conll" for name in ("train", "test", "dev")]
preprocess_elecdebate(raw_files)

train: 1267
train: 1222
{'Ad Hominem': 171, 'Slippery Slope': 44, 'Appeal to Emotion': 777, 'Appeal to False Authority': 180, 'False Causality (Post Hoc Fallacy)': 50}
test: 154
test: 150
{'Ad Hominem': 21, 'Appeal to Emotion': 96, 'Appeal to False Authority': 22, 'Slippery Slope': 5, 'False Causality (Post Hoc Fallacy)': 6}
dev: 136
dev: 132
{'False Causality (Post Hoc Fallacy)': 5, 'Appeal to Emotion': 86, 'Appeal to False Authority': 18, 'Slippery Slope': 4, 'Ad Hominem': 19}


In [1]:
# Mapping from compact tag names to spaced display names.
label_dict = dict(
    AdHominem='Ad Hominem',
    AppealtoEmotion='Appeal to Emotion',
    AppealtoAuthority='Appeal to Authority',
    Slogans='Slogans',
    Slipperyslope='Slippery Slope',
    FalseCause='False Cause',
)
# Print every display name on its own line, in insertion order.
for display_name in label_dict.values():
    print(display_name)

Ad Hominem
Appeal to Emotion
Appeal to Authority
Slogans
Slippery Slope
False Cause
