In [19]:
import pandas as pd
import ast

In [20]:
df = pd.read_csv('output_test.csv')
df['A_raw_entities'][0]

"[{'entity': 'MISC', 'score': 0.99962676, 'index': 22, 'word': 'ĠCO', 'start': 108, 'end': 110}, {'entity': 'MISC', 'score': 0.9995419, 'index': 23, 'word': 'VID', 'start': 110, 'end': 113}, {'entity': 'MISC', 'score': 0.998911, 'index': 24, 'word': '-', 'start': 113, 'end': 114}, {'entity': 'MISC', 'score': 0.99878925, 'index': 25, 'word': '19', 'start': 114, 'end': 116}]"

In [21]:
def tag_A_entities(text, raw_entities):
    import ast
    try:
        entities = ast.literal_eval(raw_entities)
    except:
        return text
    
    # sort by start index to avoid messing up positions as we insert
    entities = sorted(entities, key=lambda x: x['start'])
    
    offset = 0
    for ent in entities:
        label = ent.get('entity')
        start = ent.get('start')
        end = ent.get('end')
        if start is None or end is None or not label:
            continue
        
        start += offset
        end += offset
        start_tag = f"[{label}]"
        end_tag = f"[/{label}]"
        text = text[:start] + start_tag + text[start:end] + end_tag + text[end:]
        offset += len(start_tag) + len(end_tag)
    
    return text

def tag_B_entities(text, raw_entities):
    import ast
    try:
        entities = ast.literal_eval(raw_entities)
    except:
        return text
    
    tagged = text
    offset = 0

    for ent in entities:
        word = ent.get('word')
        label = ent.get('entity')
        if not word or not label:
            continue

        start = tagged.find(word, offset)
        if start == -1:
            continue

        end = start + len(word)
        start_tag = f"[{label}]"
        end_tag = f"[/{label}]"
        tagged = tagged[:start] + start_tag + word + end_tag + tagged[end:]
        offset = end + len(start_tag) + len(end_tag)

    return tagged

In [22]:
df['A_tagged'] = df.apply(lambda row: tag_A_entities(row['statement'], row['A_raw_entities']), axis=1)
df['B_tagged'] = df.apply(lambda row: tag_B_entities(row['statement'], row['B_raw_entities']), axis=1)

In [23]:
print(df.head())
df.to_csv('AB_tagged_test.csv', index=False)

                                           statement  label  label_binary  \
0  Three doctors from the same hospital 'die sudd...      1             0   
1                      Say Joe Biden is a pedophile.      0             0   
2  A photo shows President Joe Biden and Ukrainia...      1             0   
3  It will cost $50,000 per enrollee in Obamacare...      1             0   
4  The Federal Register - which houses all Washin...      3             1   

                                      A_raw_entities  \
0  [{'entity': 'MISC', 'score': 0.99962676, 'inde...   
1  [{'entity': 'PER', 'score': 0.9993856, 'index'...   
2  [{'entity': 'PER', 'score': 0.9996147, 'index'...   
3  [{'entity': 'MISC', 'score': 0.99520916, 'inde...   
4  [{'entity': 'ORG', 'score': 0.6887246, 'index'...   

                                      B_raw_entities  \
0  [{'word': 'Three', 'entity': 'CARDINAL'}, {'wo...   
1        [{'word': 'Joe Biden', 'entity': 'PERSON'}]   
2  [{'word': 'Joe Biden', 'entit

In [24]:

entity_labels = set()

for row in df['A_raw_entities']:
    try:
        entities = ast.literal_eval(row)
    except:
        continue
    for ent in entities:
        label = ent.get('entity')
        if label:
            entity_labels.add(label)

# now create special tokens
special_tokens = []
for label in entity_labels:
    special_tokens.append(f"[{label}_]")
    special_tokens.append(f"[/{label}_]")

print(special_tokens)

['[PER_]', '[/PER_]', '[MISC_]', '[/MISC_]', '[LOC_]', '[/LOC_]', '[ORG_]', '[/ORG_]']
