# Labelling Schemes

### Imports

In [79]:
# -- public imports

import os
import pandas as pd

In [80]:
# -- private import

In [81]:
# -- dev imports
%load_ext autoreload
%autoreload 2


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


### Data

In [82]:
# Directory holding the dataset; the JSON outputs written by later cells are
# also placed under this path.
base_path = '../../data/UCL/dataset2'
dataset_path = os.path.join(base_path, 'df_with_other.json')
# Load the annotated segments from JSON into a DataFrame.
df = pd.read_json(dataset_path)
# Preview the first rows.
df.head()

Unnamed: 0,label_type,label,text,span_start,span_end,doc_id
0,O,Other,Should students be taught to compete or to coo...,0,503,essay001
1,T1,MajorClaim,we should attach more importance to cooperatio...,503,575,essay001
2,O,Other,".\nFirst of all,",575,591,essay001
3,T3,Claim,"through cooperation, children can learn about ...",591,714,essay001
4,O,Other,.,714,716,essay001


In [83]:
# List the distinct values found in the `label` column.
print(df.label.unique())

['Other' 'MajorClaim' 'Claim' 'Premise']


In [85]:
# Map each general (segment-level) label to an integer id: the id is simply
# the label's position in the list below.
df_label_map = (
    pd.DataFrame({'label': ['Other', 'MajorClaim', 'Claim', 'Premise']})
    .rename_axis('label_id')
    .reset_index()
)

# Write the label/id table to JSON under base_path.
df_label_map.to_json(os.path.join(base_path, 'df_label_map_general.json'))

### Add Prediction String

In [86]:
# For every row, build the list of consecutive token ids covered by its text.
# Tokens are whitespace-separated words; ids start at 1 and each row continues
# where the previous row stopped.
# NOTE(review): the counter is never reset when `doc_id` changes, so ids keep
# growing across documents instead of restarting per essay -- confirm that
# this is what downstream consumers expect.
prediction_strings = []
start_id = 1
for text in df['text']:
    end_id = start_id + len(text.split())
    prediction_strings.append(list(range(start_id, end_id)))
    start_id = end_id


In [87]:
# Store the per-row token-id lists as a new column (one list per row).
df['predictionString'] = prediction_strings

In [88]:
# Preview the frame with the new `predictionString` column.
df.head()

Unnamed: 0,label_type,label,text,span_start,span_end,doc_id,predictionString
0,O,Other,Should students be taught to compete or to coo...,0,503,essay001,"[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14..."
1,T1,MajorClaim,we should attach more importance to cooperatio...,503,575,essay001,"[83, 84, 85, 86, 87, 88, 89, 90, 91, 92]"
2,O,Other,".\nFirst of all,",575,591,essay001,"[93, 94, 95, 96]"
3,T3,Claim,"through cooperation, children can learn about ...",591,714,essay001,"[97, 98, 99, 100, 101, 102, 103, 104, 105, 106..."
4,O,Other,.,714,716,essay001,[115]


### Labelling strategy

In [89]:
def _label_bio(length, label, add_end=False):
    """
    For cases where argument segment is only 1 word long, beginning given preference over end
    """
    labels = [f'I-{label}'] if label != 'Other' else ['O']
    labels *= length
    
    if add_end:
        if label != 'Other':
            labels[-1] = f'E-{label}'
    
    if label != 'Other':
        labels[0] = f'B-{label}'
        
    return labels
        
# Derive both token-level tag columns from the segment label and the number
# of token ids in `predictionString`; only the BIEO variant marks the last
# token of a segment.
for column, with_end in (('label_bio', False), ('label_bieo', True)):
    df[column] = df[['label', 'predictionString']].apply(
        lambda row, with_end=with_end: _label_bio(
            len(row['predictionString']), row['label'], with_end
        ),
        axis=1,
    )

In [90]:
# Show the BIO and BIEO tag lists of the row at position 1 side by side.
print(df[['label_bio', 'label_bieo']].values[1])

[list(['B-MajorClaim', 'I-MajorClaim', 'I-MajorClaim', 'I-MajorClaim', 'I-MajorClaim', 'I-MajorClaim', 'I-MajorClaim', 'I-MajorClaim', 'I-MajorClaim', 'I-MajorClaim'])
 list(['B-MajorClaim', 'I-MajorClaim', 'I-MajorClaim', 'I-MajorClaim', 'I-MajorClaim', 'I-MajorClaim', 'I-MajorClaim', 'I-MajorClaim', 'I-MajorClaim', 'E-MajorClaim'])]


In [92]:
# Write the frame, including the columns added in this notebook, to JSON
# under base_path.
df.to_json(
    os.path.join(base_path, 'df_labels_general.json'),
)

### Add label keys for other labelling strategies

In [93]:
def _get_label_maps(unique_labels, strategy):
    unique_labels = [label for label in unique_labels if label != 'Other']
    labels = ['O']
    if strategy == 'bio':
        for label in unique_labels:
            labels.append(f'B-{label}')
            labels.append(f'I-{label}')
    elif strategy == 'bieo':
        for label in unique_labels:
            labels.append(f'B-{label}')
            labels.append(f'I-{label}')
            labels.append(f'E-{label}')
    elif strategy == 'bixo':
        labels.append('X')
        for label in unique_labels:
            labels.append(f'B-{label}')
            labels.append(f'I-{label}')
    else:
        raise NotImplementedError(f'Strategy {strategy} has not implementation yet.')
        
    return pd.DataFrame({
        'label': labels
    }).reset_index().rename(columns={'index':'label_id'})
        
# TODO: should we do a biexo as well?

In [95]:
# `unique_labels` is not assigned in any cell of this notebook, so a fresh
# top-to-bottom run would fail with NameError. Derive it from the data here.
# NOTE(review): assumes the labels are meant to be taken in order of first
# appearance in `df` -- confirm against the originally used value.
unique_labels = df['label'].unique()

# Write one tag -> id table per token-level labelling strategy.
for strategy in ('bio', 'bieo', 'bixo'):
    _get_label_maps(unique_labels, strategy).to_json(
        os.path.join(base_path, f'df_label_map_{strategy}.json')
    )