In [1]:
import os
import json
import pprint
import ast
from nltk.tokenize.treebank import TreebankWordDetokenizer    

def preprocess_reddit(splits, root):
    label_map = {
        'authority': 'Appeal to Authority',
        'blackwhite': "False Dilemma",
        'hasty_generalization': "Hasty Generalization",
        'natural': 'Appeal to Nature',
        'population': 'Ad Populum',
        'slippery_slope': 'Slippery Slope',
        'tradition': 'Appeal to Tradition',
        'worse_problems': 'Appeal to Worse Problems'
    }
    all_data = {}
    fal_value_count = {}
    n_fal_span_count = {}
    label_count = {}
    for sp in splits:
        id = 0
        all_data[sp] = []
        n_fal_span_count[sp] = []
        label_count[sp] = []
        file = open(os.path.join(root, f"{sp}.tsv"), 'r') 
        # a = file.readline() 
        # # The first line consist of headings of the record, so we will store it in an array and move to next line in input_file. 
        # titles = [t.strip() for t in a.split('\t')] 
        for line in file:
            id += 1
            items = [t.strip() for t in line.split('\t')]
            tokens = ast.literal_eval(items[1])
            tk_class = ast.literal_eval(items[4])
            text_str = " ".join(TreebankWordDetokenizer().detokenize(tokens).encode('ascii', errors='ignore').strip().decode('ascii').split())
            fal_spans = []
            fal_span_labels = []
            fal_span = []
            for i, (tk, tk_label) in enumerate(zip(tokens, tk_class)):
                if i < (len(tk_class) - 1):
                    if (tk_label != "none"):
                        if (tk_class[i+1] == tk_label):
                            fal_span.append(tk)
                        else:
                            fal_span.append(tk)
                            fal_spans.append(fal_span)
                            fal_span_labels.append(tk_label)
                            fal_span = []
                else:
                    if tk_label != "none":
                        fal_span.append(tk)
                        fal_spans.append(fal_span)
                        fal_span_labels.append(tk_label)
            
            if fal_spans:            
                n_fal_span_count[sp].append(len(fal_spans))
                if len(set(fal_span_labels)) > 1:
                    print(set(fal_span_labels))
                # if len(fal_spans) > 1:
                #     print(fal_spans)
                span_strs = []
                for s_ls in fal_spans:
                    span_str = " ".join(TreebankWordDetokenizer().detokenize(s_ls).strip(" ....").encode('ascii', errors='ignore').strip().decode('ascii').split())
                    try:
                        assert span_str in text_str
                    except:
                        print(span_str)
                        print(text_str)
                        print()
                    text_str = text_str.replace(span_str, "<" + span_str + ">")
                    span_strs.append(span_str)
                #text_str = " ".join(text_str.split())
                if len(span_strs) == 1:
                    all_fal_span_str = span_strs[0]
                else:
                    all_fal_span_str = "; ".join([f'''{str(i+1)}. "{fal}"''' for i, fal in enumerate(span_strs)])
                #all_fal_span_str = " ".join(all_fal_span_str.split())
                assert len(set(fal_span_labels)) == 1
                label = list(set(fal_span_labels))[0]
                assert items[6] != "" and items[6] is not None
                if items[5] != "None":
                    text = f"Topic: {items[6]}\nComment: {items[5]}\nComment Reply: {text_str}"
                else:
                    text = f"Topic: {items[6]}\nComment: {text_str}"
                one_data = {
                    'id': id,
                    'text': text,
                    'fal_text': all_fal_span_str,
                    'label': [label_map[label]]
                }
                
                all_data[sp].append(one_data)
                label_count[sp].append(label_map[label])
    n_span_count = {}
    for sp, count_ls in n_fal_span_count.items():
        n_span_count[sp] = {}
        for c in count_ls:
            if c not in n_span_count[sp]:
                n_span_count[sp][c] = 1
            else:
                n_span_count[sp][c] += 1
    print(pprint.pformat(n_span_count))

    n_label_count = {}
    for sp, lb_ls in label_count.items():
        n_label_count[sp] = {}
        for l in lb_ls:
            if l not in n_label_count[sp]:
                n_label_count[sp][l] = 1
            else:
                n_label_count[sp][l] += 1
    print(pprint.pformat(n_label_count))
    
    json.dump(all_data['train'], open(f'train.json', 'w'), indent=4)
    json.dump(all_data['dev'] + all_data['test'], open(f'test.json', 'w'), indent=4)
    json.dump(all_data['dev'], open(f'dev.json', 'w'), indent=4)

    print(f"train: {len(all_data['train'])}")
    print(f"dev + test: {len(all_data['dev'] + all_data['test'])}")
    print(f"dev: {len(all_data['dev'])}")
    return
splits = ['train', 'test', 'dev']
preprocess_reddit(splits, root='./raw')

#第一个输出的是一个data examples中fallacy spans的数量：例子的个数

{'dev': {1: 316, 2: 25, 3: 1},
 'test': {1: 152, 2: 18, 7: 1},
 'train': {1: 1108, 2: 76, 3: 8, 4: 2, 5: 1}}
{'dev': {'Ad Populum': 39,
         'Appeal to Authority': 43,
         'Appeal to Nature': 42,
         'Appeal to Tradition': 42,
         'Appeal to Worse Problems': 48,
         'False Dilemma': 42,
         'Hasty Generalization': 40,
         'Slippery Slope': 46},
 'test': {'Ad Populum': 20,
          'Appeal to Authority': 21,
          'Appeal to Nature': 21,
          'Appeal to Tradition': 21,
          'Appeal to Worse Problems': 24,
          'False Dilemma': 21,
          'Hasty Generalization': 20,
          'Slippery Slope': 23},
 'train': {'Ad Populum': 137,
           'Appeal to Authority': 148,
           'Appeal to Nature': 145,
           'Appeal to Tradition': 147,
           'Appeal to Worse Problems': 167,
           'False Dilemma': 148,
           'Hasty Generalization': 144,
           'Slippery Slope': 159}}
train: 1195
dev + test: 513
dev: 342


In [None]:
# file = open('./dev_copy.tsv', 'r')
# a = file.readline() 
# # The first line consist of headings of the record, so we will store it in an array and move to next line in input_file. 
# first_line = [t.strip() for t in a.split('\t')]
# first_line

In [77]:
a = [1,2,3,4,5]
b =[5, 4, 3, 2,1]
for i, (m,n) in enumerate(zip(a,b)):
    print(m)
    print(n)

1
5
2
4
3
3
4
2
5
1


In [75]:
import ast
t_list = ast.literal_eval(first_line[1])
print(type(t_list))
TreebankWordDetokenizer().detokenize(t_list)

<class 'list'>


"Getting 12k is 1.5-2h of grinding. You screwed up. Nobody decided that training TV Def was a good idea but you. I don't see any reason why there should be any leeway here. What's next? People being allowed to refight a luma because they accidentally killed it / ran from it? Re-picking your starter because you decided you made the wrong decision? Reverse a breeding process because people forgot to put a strain in? Why not just implement a safe feature if you cater to the carelessness of people"

In [68]:
first_line[1].replace("\"n\'t\"", "'n\\'t'").replace("\"\'s\"", "'\\'s'").replace("', '", "\", \"").replace("['", "[\"").replace("']", "\"]")

'["Getting", "12k", "is", "1.5-2h", "of", "grinding.", "You", "screwed", "up.", "Nobody", "decided", "that", "training", "TV", "Def", "was", "a", "good", "idea", "but", "you.", "I", "do", "n\\\'t", "see", "any", "reason", "why", "there", "should", "be", "any", "leeway", "here.", "What", "\\\'s", "next", "?", "People", "being", "allowed", "to", "refight", "a", "luma", "because", "they", "accidentally", "killed", "it", "/", "ran", "from", "it", "?", "Re-picking", "your", "starter", "because", "you", "decided", "you", "made", "the", "wrong", "decision", "?", "Reverse", "a", "breeding", "process", "because", "people", "forgot", "to", "put", "a", "strain", "in", "?", "Why", "not", "just", "implement", "a", "safe", "feature", "if", "you", "cater", "to", "the", "carelessness", "of", "people"]'

In [72]:
one_dict = {
    "id": first_line[0],
    "ori_coi": first_line[2],
    "multi": ast.literal_eval(first_line[4]),
    "binary_annotations": ast.literal_eval(first_line[3]),
    "tokenized_coi": ast.literal_eval(first_line[1]),
    "parent": first_line[5],
    "title" : first_line[6]
}
one_dict

{'id': 'fjz78w3',
 'ori_coi': "Getting 12k is 1.5-2h of grinding.  You screwed up. Nobody decided that training TV Def was a good idea but you.   I don't see any reason why there should be any leeway here.   What's next? People being allowed to refight a luma because they accidentally killed it / ran from it?    Re-picking your starter because you decided you made the wrong decision?   Reverse a breeding process because people forgot to put a strain in?   Why not just implement a safe feature if you cater to the carelessness of people",
 'multi': ['none',
  'none',
  'none',
  'none',
  'none',
  'none',
  'none',
  'none',
  'none',
  'none',
  'none',
  'none',
  'none',
  'none',
  'none',
  'none',
  'none',
  'none',
  'none',
  'none',
  'none',
  'none',
  'none',
  'none',
  'none',
  'none',
  'none',
  'none',
  'none',
  'none',
  'none',
  'none',
  'none',
  'none',
  'slippery_slope',
  'slippery_slope',
  'slippery_slope',
  'slippery_slope',
  'slippery_slope',
  'slipp

In [63]:
#.replace("\'", "\"")
#.replace("\'", "\"")
#.replace("\'", "\"")
json_str = r"""{{
"id": "{id}", 
"tokenized_coi": {tokenized_coi}, 
"ori_coi": "{ori_coi}", 
"binary_annotations": {binary}, 
"multiclass_annotations": {multi}, 
"parent": "{parent}", 
"title": "{title}"
}}""".format(
id=first_line[0],
tokenized_coi=first_line[1].replace("\"n\'t\"", "'n\\'t'").replace("\"\'s\"", "'\\'s'").replace("', '", "\", \"").replace("['", "[\"").replace("']", "\"]"),
ori_coi=" ".join(first_line[2].split()).replace("'", "\\'").replace("/", ""),
binary=first_line[3],
multi=first_line[4],
parent=first_line[5],
title=first_line[6]
)
print(json_str)
#print(pprint.pformat(json_str))
json.loads(json_str)

{
"id": "fjz78w3", 
"tokenized_coi": ['Getting', '12k', 'is', '1.5-2h', 'of', 'grinding.', 'You', 'screwed', 'up.', 'Nobody', 'decided', 'that', 'training', 'TV', 'Def', 'was', 'a', 'good', 'idea', 'but', 'you.', 'I', 'do', 'n\'t', 'see', 'any', 'reason', 'why', 'there', 'should', 'be', 'any', 'leeway', 'here.', 'What', '\'s', 'next', '?', 'People', 'being', 'allowed', 'to', 'refight', 'a', 'luma', 'because', 'they', 'accidentally', 'killed', 'it', '/', 'ran', 'from', 'it', '?', 'Re-picking', 'your', 'starter', 'because', 'you', 'decided', 'you', 'made', 'the', 'wrong', 'decision', '?', 'Reverse', 'a', 'breeding', 'process', 'because', 'people', 'forgot', 'to', 'put', 'a', 'strain', 'in', '?', 'Why', 'not', 'just', 'implement', 'a', 'safe', 'feature', 'if', 'you', 'cater', 'to', 'the', 'carelessness', 'of', 'people'], 
"ori_coi": "Getting 12k is 1.5-2h of grinding. You screwed up. Nobody decided that training TV Def was a good idea but you. I don\'t see any reason why there should be a

JSONDecodeError: Expecting value: line 3 column 19 (char 38)

In [48]:
data = json.load(open('./dev.json'))
data[1]
json.dumps(data[1])

'{"id": 11, "title": "Kate Steinle\'s death at the hands of a Mexican national became a flashpoint in the immigration debate  here\'s the story behind her killing", "pre_text": ["San Francisco, meanwhile, has adjusted its policy to notify ICE if they are releasing suspected undocumented immigrants who face charges of serious or violent felonies.", "\\"This tragedy could have been prevented if San Francisco had simply turned the alien over to ICE as we requested, instead of releasing him back onto the streets,\\" ICE Director Thomas Homan said in a statement on Thursday."], "fal_span": "politicians across this country continue to endanger the lives of Americans with sanctuary policies while ignoring the harm inflicted on their constituents.\\"", "text": "\\"It is unconscionable that <politicians across this country continue to endanger the lives of Americans with sanctuary policies while ignoring the harm inflicted on their constituents.\\">", "post_text": ["But ICE has faced criticism 

In [66]:
case = {"text": "I'd like to help you. \"You are bad.\" She's my friend. I'dont mind.\"", "list": ['a', 'b', 'c', "she's cute"]}
json.dumps(case)

'{"text": "I\'d like to help you. \\"You are bad.\\" She\'s my friend. I\'dont mind.\\"", "list": ["a", "b", "c", "she\'s cute"]}'