In [1]:
import csv
import re
import collections
import random 

In [2]:
# NOTE(review): machine-specific absolute path; every file read or written
# by this notebook is addressed relative to it.
data_folder_path = '/home/nadia/Documents/CLaC-Lab/TRE/CNC-Task3/data/data/'
task_3a_file_path = f'{data_folder_path}train_subtask1.csv'
task_3b_file_path = f'{data_folder_path}train_subtask2.csv'

# For each span label: [opening tag pattern, closing tag pattern].
regexes = {
    'arg0': [r'<ARG0>', r'</ARG0>'],
    'arg1': [r'<ARG1>', r'</ARG1>'],
    'sig0': [r'<SIG0>', r'</SIG0>'],
    'sig1': [r'<SIG1>', r'</SIG1>'],
    'sig2': [r'<SIG2>', r'</SIG2>'],
}

In [60]:
def load_csv(path):
    """Read a CSV file and separate its header from its data rows.

    Args:
        path: Path of the CSV file to read.

    Returns:
        A ``(rows, header)`` tuple. ``header`` is the first record of the
        file (an empty list when the file is empty) and ``rows`` is a list
        holding every following record as a list of strings.
    """
    # newline='' hands newline handling to the csv module, so line breaks
    # inside quoted fields come back unchanged.
    with open(path, newline='') as f:
        csvreader = csv.reader(f)
        # Default of [] avoids a bare StopIteration on an empty file.
        header = next(csvreader, [])
        rows = list(csvreader)
    return rows, header

def extract_ann_offsets(rows, regexes):
    """Locate each tagged span of an annotated text inside its raw text.

    For every row, column 6 (the annotated text) is scanned for each tag
    pair in ``regexes``; the text between the tags, with any nested tags
    removed, is then looked up in column 5 (the raw text).

    Args:
        rows: Sequence of rows; column 1 is copied through as the row's
            first output value, column 5 is the raw text and column 6 the
            annotated text.
        regexes: Mapping of span label to ``[open_pattern, close_pattern]``.
            The patterns are used as regular expressions to find the tags
            and as literal strings when stripping nested tags.

    Returns:
        One list per input row: ``[row[1], entry, entry, ...]`` with one
        entry per label, in the iteration order of ``regexes``. An entry is

        * ``'start:end'`` - offsets of the first occurrence of the span text
          in the raw text,
        * ``':'`` - the opening tag does not occur in the annotated text,
        * ``'ERR<text>'`` - the span text was not found in the raw text, or
          (with ``<text>`` being the matched opening tag) no closing tag
          follows the opening tag.
    """
    ann_offsets = []
    for row in rows:
        raw_string = row[5]
        ann_string = row[6]
        row_indices = [row[1]]
        for tags in regexes.values():
            open_match = re.search(tags[0], ann_string)
            if open_match is None:
                row_indices.append(':')
                continue
            ann_start = open_match.end()
            # Look for the closing tag only after the opening one.
            close_match = re.compile(tags[1]).search(ann_string, ann_start)
            if close_match is None:
                # Unterminated span: flag it instead of crashing on None.
                row_indices.append(f'ERR{open_match.group(0)}')
                continue
            substr = ann_string[ann_start:close_match.start()]
            # Spans may nest, so drop every known tag from the span text.
            for nested_tags in regexes.values():
                substr = substr.replace(nested_tags[0], "")
                substr = substr.replace(nested_tags[1], "")
            # Literal search: the span text is data, not a pattern, so
            # characters such as '$', '.', '(' or '[' must match themselves.
            start = raw_string.find(substr)
            if start == -1:
                row_indices.append(f'ERR{substr}')
            else:
                row_indices.append(f'{start}:{start + len(substr)}')
        ann_offsets.append(row_indices)
    return ann_offsets

def extract_doc_ids(rows, index):
    """Return the value stored at position ``index`` of every row, in row order."""
    return [record[index] for record in rows]

def generate_span_ann_header(base_header, annotated_doc_ids, regexes):
    """Build a header made of the base columns plus the span-offset columns.

    One group of columns named ``"{i}_{label}"`` (one per key of
    ``regexes``) is appended for each ``i`` in ``range(n)``, where ``n`` is
    the highest number of times a single id occurs in ``annotated_doc_ids``.

    Args:
        base_header: Existing column names; left unmodified.
        annotated_doc_ids: Document ids, one entry per annotation set.
        regexes: Mapping whose keys are the span labels.

    Returns:
        A new list with the base columns followed by the generated ones.
        With no annotated ids the result is just a copy of ``base_header``.
    """
    # Copy so the caller's header is not extended as a side effect (which
    # would append the columns again each time the cell is re-run).
    header = list(base_header)
    counter = collections.Counter(annotated_doc_ids)
    max_ann_sets = max(counter.values(), default=0)
    header.extend(f"{i}_{key}" for i in range(max_ann_sets) for key in regexes)
    return header

def extend_base_anns_to_include_span_anns(header, rows, ann_offsets):
    """Append span offsets to copies of the rows they belong to.

    Args:
        header: Not used by this function; kept so existing calls keep
            working.
        rows: Base rows, keyed by their first column. If several rows share
            a key, only the last one is kept.
        ann_offsets: Lists of the form ``[doc_id, offset, offset, ...]``;
            everything after the id is appended to the row with that id.
            Several entries for one id are appended one after another.

    Returns:
        A list of new rows, in order of first appearance of each key. Rows
        without annotations are returned with their original columns only.

    Raises:
        KeyError: If an entry of ``ann_offsets`` names an id absent from
            ``rows``.
    """
    # Work on copies: extending the caller's lists in place would make
    # re-running the cell append the same offsets a second time.
    task_3a_rows = {row[0]: list(row) for row in rows}
    for ann in ann_offsets:
        task_3a_rows[ann[0]].extend(ann[1:])
    return list(task_3a_rows.values())

def write_csv(path, header, rows):
    """Write a header record followed by data records to a CSV file.

    Args:
        path: Destination file; overwritten if it already exists.
        header: Sequence of column names, written as the first record.
        rows: Iterable of records, written after the header.
    """
    # newline='' stops the text layer from translating the line endings
    # emitted by csv.writer (otherwise '\r\r\n' on Windows).
    with open(path, 'w', newline='') as f:
        csvwriter = csv.writer(f)
        csvwriter.writerow(header)
        csvwriter.writerows(rows)
        
def split_ids(ids, pct, seed=42):
    """Randomly partition ids into a dev part and a train part.

    Each id is sent to the dev part when a fresh draw from ``[0, 1)`` is
    below ``pct``, so ``pct`` is a probability (e.g. ``0.15``), not a
    percentage, and the dev share is only approximately ``pct``.

    Args:
        ids: Iterable of ids. The assignment depends on iteration order, so
            pass an ordered sequence when the split must be reproducible.
        pct: Probability of an id being placed in the dev part.
        seed: Seed of the generator used for the draws.

    Returns:
        A ``(dev_ids, train_ids)`` tuple of lists; note that dev comes first.
    """
    # A private generator yields the same draws as seeding the module-level
    # one, without resetting global random state for the rest of the notebook.
    rng = random.Random(seed)
    dev_ids = []
    train_ids = []
    for e in ids:
        if rng.random() < pct:
            dev_ids.append(e)
        else:
            train_ids.append(e)
    return dev_ids, train_ids

def select_rows(ids, rows):
    """Return the rows whose first column matches each of ``ids``, in id order.

    Rows are indexed by their first column (a later row replaces an earlier
    one with the same key); an id without a matching row raises ``KeyError``.
    """
    rows_by_id = {record[0]: record for record in rows}
    return [rows_by_id[doc_id] for doc_id in ids]

In [63]:
# --- Merge the span annotations into the base rows -------------------------
task_3a_rows, task_3a_header = load_csv(task_3a_file_path)
task_3b_rows, task_3b_header = load_csv(task_3b_file_path)
ann_offsets = extract_ann_offsets(task_3b_rows, regexes)
ann_doc_ids = extract_doc_ids(task_3b_rows, 1)
all_doc_ids = extract_doc_ids(task_3a_rows, 0)
# Sorted: set iteration order for strings changes between interpreter runs,
# which would make the seeded split below irreproducible.
unann_doc_ids = sorted(set(all_doc_ids).difference(ann_doc_ids))
span_ann_header = generate_span_ann_header(task_3a_header, ann_doc_ids, regexes)
task_3a3b_rows = extend_base_anns_to_include_span_anns(span_ann_header, task_3a_rows, ann_offsets)
write_csv(data_folder_path + "subtask3a3b.csv", span_ann_header, task_3a3b_rows)

# --- Dev/train split, done separately for annotated and unannotated docs ---
ann_dev_doc_ids, ann_train_doc_ids = split_ids(sorted(set(ann_doc_ids)), 0.15)
unann_dev_doc_ids, unann_train_doc_ids = split_ids(unann_doc_ids, 0.15)
dev_doc_ids = ann_dev_doc_ids + unann_dev_doc_ids
train_doc_ids = ann_train_doc_ids + unann_train_doc_ids
assert len(dev_doc_ids) + len(train_doc_ids) == len(all_doc_ids)

train_task3a3b_rows = select_rows(train_doc_ids, task_3a3b_rows)
dev_task3a3b_rows = select_rows(dev_doc_ids, task_3a3b_rows)
assert len(dev_task3a3b_rows) + len(train_task3a3b_rows) == len(task_3a3b_rows)

# --- Write the dev file and the training data in fixed-size chunks ---------
write_csv(data_folder_path + "dev_task3a3b.csv", span_ann_header, dev_task3a3b_rows)
train_chunk_size = 250
# Ceiling division so the final, partially filled chunk is written as well.
nos_files = -(-len(train_task3a3b_rows) // train_chunk_size)
for i in range(nos_files):
    chunk_start = i * train_chunk_size
    write_csv(
        data_folder_path + f"train_task3a3b_{i}.csv",
        span_ann_header,
        train_task3a3b_rows[chunk_start:chunk_start + train_chunk_size],
    )

In [42]:
# Sanity check: the two split parts together should cover every document id.
split_total = len(dev_doc_ids) + len(train_doc_ids)
print(split_total)
print(len(all_doc_ids))

2925
2925


In [54]:

# Display the extended header (no trailing comma, which would wrap it in a tuple).
span_ann_header

['index', 'text', 'label', 'agreement', 'num_votes', 'sample_set', '0_arg0', '0_arg1', '0_sig0', '0_sig1', '0_sig2', '1_arg0', '1_arg1', '1_sig0', '1_sig1', '1_sig2', '2_arg0', '2_arg1', '2_sig0', '2_sig1', '2_sig2', '3_arg0', '3_arg1', '3_sig0', '3_sig1', '3_sig2']


In [64]:
# task_3a_rows is the list of rows returned by load_csv; the id is column 0.
all_doc_ids = [row[0] for row in task_3a_rows]
# Sorted so the resulting lists do not depend on set iteration order.
unann_indices = sorted(set(all_doc_ids).difference(ann_doc_ids))
ann_indices = sorted(set(ann_doc_ids))
assert len(all_doc_ids) == len(unann_indices) + len(ann_indices)

In [70]:
# Show the first draw after seeding; the unfinished `if` statement that was
# left here did not parse. The splitting itself is done by split_ids.
random.seed(42)
random.random()

0.6394267984578837