In [8]:
import pandas as pd
import json
import regex
import random
from sklearn.model_selection import StratifiedKFold
import numpy as np

def preprocess_argotario(input_path):
    """Turn the raw Argotario TSV dump into `train.json` and `dev.json`.

    The file at ``input_path`` is read as tab-separated values. Rows without a
    ``Text`` or ``Topic`` are discarded, the label ``Irrelevant Authority`` is
    renamed to ``Appeal to False Authority``, and every remaining row is
    reduced to the columns ``id``, ``text``, ``label``, ``stance`` and
    ``is_gold``. Summary counts are printed along the way.

    All rows whose ``is_gold`` is not 1 are then split with the first fold of a
    10-fold stratified split (seed 42) and written to ``train.json`` and
    ``dev.json`` in the current working directory.

    Args:
        input_path: Path of the TSV file to read.

    Returns:
        None. The results are the two JSON files and the printed statistics.

    Raises:
        ValueError: If a fallacy label of a kept row is not a string, or if
            ``Number of Votes`` cannot be converted to ``int``.
        KeyError: If a label is not one of the six known fallacy names.
    """

    def get_gold_label(row):
        """Derive ``(is_gold, labels)`` for one row.

        ``is_gold`` is 1 when the row has at least 5 votes and the voted label
        equals the intended one (or is ``"-"``), 2 when it has at least 5 votes
        and the two labels differ, and 0 when it has fewer than 5 votes.
        ``labels`` is ``[intent]`` on agreement, ``[voted, intent]`` for
        ``is_gold == 2`` and ``[intent, voted]`` for a disagreeing row with
        fewer than 5 votes.
        """
        try:
            intent = row['Intended Fallacy'].strip()
            voted = row['Voted Fallacy'].strip()
        except AttributeError as exc:
            # Fail with the offending values instead of printing them and then
            # crashing later on an unbound local variable.
            raise ValueError(
                "Expected string fallacy labels but got "
                f"Intended Fallacy={row['Intended Fallacy']!r}, "
                f"Voted Fallacy={row['Voted Fallacy']!r}"
            ) from exc
        n_votes = int(row['Number of Votes'])
        # A voted label of "-" is treated the same as agreement with the intent.
        agreed = (intent == voted) or (voted == "-")
        if n_votes >= 5:
            return (1, [intent]) if agreed else (2, [voted, intent])
        return (0, [intent]) if agreed else (0, [intent, voted])

    def normalize(value):
        """Swap U+2019 for "'", drop other non-ASCII characters, squeeze whitespace."""
        ascii_only = value.strip().replace("\u2019", "'").encode('ascii', errors='ignore').strip().decode('ascii')
        return " ".join(ascii_only.split())

    def get_qa(row):
        """Render topic and text of a row as a two-line ``A: ... / B: ...`` string."""
        topic = normalize(row['Topic'])
        text = normalize(row['Text'])
        return f'''A: {topic}\nB: {text}'''

    def rename_label(label):
        """Map the label 'Irrelevant Authority' onto 'Appeal to False Authority'."""
        return 'Appeal to False Authority' if label == 'Irrelevant Authority' else label

    df = pd.read_csv(input_path, sep='\t')

    # Rows lacking either the topic or the argument text cannot be used.
    incomplete = df['Text'].isna() | df['Topic'].isna()
    df = df.drop(df[incomplete].index)

    for column in ('Intended Fallacy', 'Voted Fallacy'):
        df[column] = df[column].apply(rename_label)

    df[['is_gold', 'label']] = df[['Intended Fallacy', 'Voted Fallacy', 'Number of Votes']].apply(
        get_gold_label, axis=1, result_type="expand"
    )
    df['text'] = df[['Topic', 'Text']].apply(get_qa, axis=1)
    df['stance'] = df['Stance'].eq('pro').astype(int)
    # The index still carries the row positions of the raw file at this point.
    df['id'] = df.index

    # Selecting the output columns already discards every raw column; the copy
    # keeps the column assignment below from writing to a slice of the old frame.
    df = df[['id', 'text', 'label', 'stance', 'is_gold']].copy()

    map_ = {"Appeal to Emotion": 0, "Red Herring":1, "Hasty Generalization":2, "Ad Hominem":3, "Appeal to False Authority":4, "No Fallacy":5}
    # Stratify on the first label only, since `label` may hold two entries.
    df['label_for_split'] = df['label'].apply(lambda x: map_[x[0]])

    print(df['is_gold'].value_counts())
    print(df['label'].value_counts())
    print(df['stance'].value_counts())
    print(df[df['is_gold'] == 1]['label'].value_counts())

    # Everything that is not is_gold == 1 (i.e. values 0 and 2) goes to train/dev.
    df_ng = df[df['is_gold'] != 1].reset_index(drop=True)
    skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
    # Only the first of the ten folds is used.
    train_ids, dev_ids = next(skf.split(np.zeros(len(df_ng.index)), df_ng['label_for_split']))

    # The helper column was only needed for stratification.
    df_train = df_ng.iloc[train_ids].drop(columns=['label_for_split'])
    df_dev = df_ng.iloc[dev_ids].drop(columns=['label_for_split'])

    print(f"total = {len(df_ng.index)}; train={len(df_train.index)}; dev={len(df_dev.index)}\n")
    for name, split in (('train', df_train), ('dev', df_dev)):
        # Serialise before opening so a failure does not leave a truncated file.
        js = split.to_json(orient="records", indent=4)
        with open(f'{name}.json', 'w') as f:
            f.write(js)
    return
# Build the train/dev splits from the raw dump (writes train.json and dev.json).
preprocess_argotario('./arguments-en-2018-01-15.tsv')

# The test set is the concatenation of the two re-annotated files. Each file is
# opened in a `with` block so its handle is closed deterministically.
with open('argotario_data_gold (to 715).json', 'r') as f:
    gold_part = json.load(f)
with open('argotario_relabel(from #724).json', 'r') as f:
    relabel_part = json.load(f)
reannotated = gold_part + relabel_part
print(f"test={len(reannotated)}\n")

test_set = []
for ep in reannotated:
    # Normalise `label` so that it is always a list, even for a single label.
    if not isinstance(ep['label'], list):
        ep['label'] = [ep['label']]
    test_set.append(ep)

# Closing the handle guarantees test.json is fully flushed before it is read again.
with open('test.json', 'w') as f:
    json.dump(test_set, f, indent=4)

is_gold
0    969
1    312
2     42
Name: count, dtype: int64
label
[No Fallacy]                                         407
[Appeal to Emotion]                                  223
[Red Herring]                                        165
[Ad Hominem]                                         152
[Hasty Generalization]                               149
[Appeal to False Authority]                          142
[No Fallacy, Red Herring]                             14
[Appeal to Emotion, No Fallacy]                        7
[Ad Hominem, No Fallacy]                               7
[Appeal to Emotion, Red Herring]                       6
[No Fallacy, Appeal to False Authority]                5
[Appeal to False Authority, Red Herring]               5
[No Fallacy, Hasty Generalization]                     5
[Ad Hominem, Red Herring]                              5
[Red Herring, No Fallacy]                              4
[Red Herring, Hasty Generalization]                    3
[Hasty Generalization

In [13]:
# df = preprocess_argotario('./arguments-en-2018-01-15.tsv')
# df['Intended Fallacy'].value_counts()
# df['Number of Votes'].value_counts()
# df[df['Number of Votes']>=5.]['Number of Votes'].value_counts()
# df1 = df[df['Number of Votes']>=5.]
# df1
# df1[df1['Intended Fallacy'] == df1['Voted Fallacy']]
# df[df['Intended Fallacy']!='No Fallacy']['Number of Votes'].value_counts()
# #df[df['Voted Fallacy']!='No Fallacy']['Number of Votes'].value_counts()
# df.columns
# df['Stance'].value_counts()
# Re-read the raw dump and list the rows that have no intended-fallacy value.
df = pd.read_csv('./arguments-en-2018-01-15.tsv', sep='\t')
df.loc[df['Intended Fallacy'].isna()]

Unnamed: 0,Row number,Mongo ID,Author,Topic,Stance,Intended Fallacy,Voted Fallacy,Number of Votes,Text
173,Whereas this soccer-player gets like 15 euros ...,,,,,,,,
183,Read more at http://www.jamieoliver.com/news-a...,,,,,,,,
679,A daycare-service would take some pressure of ...,,,,,,,,
731,'Animals have the right to equal consideration...,,,,,,,,
732,"However, animals don’t always have the same ri...",,,,,,,,
1184,"If you start regulating private things, like s...",,,,,,,,


In [12]:
import datasets
from datasets import DatasetDict, load_dataset
#data_files = {'train': 'train.json', 'dev': 'dev.json','test': 'test.json'}
# Only the test split is loaded here; the commented line above lists all three files.
data_files = {'test': 'test.json'}
task_raw_datasets_split: datasets.DatasetDict = load_dataset('json', data_files=data_files)
if 'test' in task_raw_datasets_split:
    # An expression nested inside `if` is not echoed by the notebook (only a
    # trailing top-level expression is), so the split is printed explicitly.
    print(task_raw_datasets_split['test'])

