# Development Notebook

### Imports

In [64]:
# -- public imports

import os
import pandas as pd
from brat_parser import get_entities_relations_attributes_groups

In [2]:
# -- private import

In [3]:
# -- dev imports

%load_ext autoreload
%autoreload 2

## Importing the dataset

In [85]:
base_path = '../../data/UCL/dataset2'
path = os.path.join(base_path, 'ArgumentAnnotatedEssays-2.0')
text_file_dir = os.path.join(path, 'brat-project-final')

texts = []
annotated_texts = []
for file in os.listdir(text_file_dir):
    essay_num, file_extension = file.split('.')
    if file_extension == 'ann':
        with open(os.path.join(text_file_dir, file), 'r') as f:
            df_temp = pd.read_csv(f, delimiter='\t', header=None, names=['label_type', 'label', 'text'])
            df_temp[['label', 'label_comp1', 'label_comp2']] = df_temp.label.str.split(expand=True)
            df_temp['doc_id'] = essay_num
            annotated_texts.append(df_temp)
    elif file_extension == 'txt':
        with open(os.path.join(text_file_dir, file), 'r') as f:
            texts.append((essay_num, f.read()))
    else:
        continue

df_texts = pd.DataFrame.from_records(texts, columns={'doc_id', 'text'})
df_annotated = pd.concat(annotated_texts)
assert sorted(df_annotated.doc_id.unique()) == sorted(df_texts.doc_id)

In [95]:
df_texts.to_csv(
    os.path.join(base_path, 'df_texts.csv'), index=None
)

df_annotated.reset_index(drop=True).to_csv(
    os.path.join(base_path, 'df_full_annotated.csv'), index=None
)

In [103]:
ids_argument_segment = df_annotated.label_type.str.startswith('T')

df_arguments = df_annotated[ids_argument_segment]

df_arguments

Unnamed: 0,label_type,label,text,label_comp1,label_comp2,doc_id
0,T1,MajorClaim,alternative means of transportation and intern...,358,464,essay031
1,T2,MajorClaim,alternative forms of transport and internation...,2393,2565,essay031
2,T3,Claim,some people claim the convenience of automobile,2325,2372,essay031
4,T4,Claim,it is crucial to alter automobiles to the othe...,822,933,essay031
6,T5,Premise,an increase in the number of automobiles bring...,561,638,essay031
...,...,...,...,...,...,...
11417,T10,Premise,"Thanks to ads we learn about new products, we ...",1209,1305,essay028
11419,T11,Claim,"These days, not only many businesses, but also...",1307,1393,essay028
11421,T12,Premise,tourism makes up one-third of the Czech Republ...,1408,1466,essay028
11422,T13,Premise,"In order to promote the country's attractions,...",1468,1595,essay028


In [110]:
df_arguments = df_arguments.rename(columns={'label_comp1':'span_start', 'label_comp2':'span_end'}).astype(
    {'span_start': int, 'span_end':int}
)
df_arguments.dtypes

label_type    object
label         object
text          object
span_start     int64
span_end       int64
doc_id        object
dtype: object

In [152]:
records = []
df_arguments = df_arguments.sort_values(['doc_id', 'span_start', 'span_end'])
for (doc_id, text) in df_texts.sort_values('doc_id').itertuples(index=False):
    df_argument = df_arguments[df_arguments.doc_id == doc_id]
    prev_span = 0
    for i, (text_segment, span_start, span_end) in enumerate(df_argument[['text', 'span_start', 'span_end']].itertuples(index=False)):
        try:
            assert text_segment == text[span_start: span_end]
        except Exception as e:
            print(f'{text_segment}\n{text[span_start: span_end]}\n')
        # all the exception were manually checked. These are because of qutoe chars, this is a hot fix.!!!! TODO
            df_arguments['text'][df_arguments.text == text_segment] = text[span_start: span_end]
        records.append(('O', 'Other', text[prev_span: span_start], prev_span, span_start, doc_id))
        prev_span = span_end
    records.append(('O', 'Other', text[prev_span:], prev_span, len(text), doc_id))

In [153]:
df_other = pd.DataFrame.from_records(records, columns=df_arguments.columns)
df_other

Unnamed: 0,label_type,label,text,span_start,span_end,doc_id
0,O,Other,Should students be taught to compete or to coo...,0,503,essay001
1,O,Other,".\nFirst of all,",575,591,essay001
2,O,Other,.,714,716,essay001
3,O,Other,.,851,853,essay001
4,O,Other,.,1086,1088,essay001
...,...,...,...,...,...,...
6486,O,Other,.,1339,1341,essay402
6487,O,Other,", so",1388,1393,essay402
6488,O,Other,". Secondly,",1436,1448,essay402
6489,O,Other,". \nIn conclusion,",1525,1543,essay402


In [154]:
df_combined = pd.concat([df_other, df_arguments])
df_combined = df_combined.sort_values(['doc_id', 'span_start', 'span_end']).reset_index(drop=True)
df_combined[df_combined.doc_id == 'essay001']

Unnamed: 0,label_type,label,text,span_start,span_end,doc_id
0,O,Other,Should students be taught to compete or to coo...,0,503,essay001
1,T1,MajorClaim,we should attach more importance to cooperatio...,503,575,essay001
2,O,Other,".\nFirst of all,",575,591,essay001
3,T3,Claim,"through cooperation, children can learn about ...",591,714,essay001
4,O,Other,.,714,716,essay001
5,T4,Premise,What we acquired from team work is not only ho...,716,851,essay001
6,O,Other,.,851,853,essay001
7,T5,Premise,"During the process of cooperation, children ca...",853,1086,essay001
8,O,Other,.,1086,1088,essay001
9,T6,Premise,All of these skills help them to get on well w...,1088,1191,essay001


In [155]:
df_combined.to_csv(
        os.path.join(base_path, 'df_with_other.csv'), index=None
)