In [1]:
# Path of the raw corpus file that this cell reads.
input_file = './data/presto_max_v5.4.txt'

# Placeholder for the per-sentence records
# (full text, tokens, lemma tokens, POS tags, entity tags); starts out empty.
data_dict = {}

# Read the whole file into one UTF-8 decoded string.
with open(input_file, encoding="utf-8") as f:
    text = f.read()

In [2]:
import json
import pandas as pd

# Small inline excerpt in a tab-separated, one-token-per-line layout:
# every non-empty line below carries eight tab-separated fields and an empty
# line separates two sentences. After `.strip().split("\n")` the value is a
# list of lines, with the separator showing up as an empty string.
# NOTE(review): the parsing loop visible in this notebook iterates over `text`
# (the file contents), not over `data` -- confirm whether this sample is still needed.
data = """
moyen	moyen	Nc	O	O	O	O	_
d'	de	S	O	O	O	O	_
y	y	Pp	O	O	O	O	_
remedier	remédier	Vvn	O	O	O	O	_
Chapitre	chapitre	Nc	O	O	O	O	_
premier	premier	Mo	O	O	O	O	_
.	.	Fs	O	O	O	O	_

BRief	bref	Ag	O	O	O	O	_
traictie	traité	Nc	O	O	O	O	_
par	par	S	O	O	O	O	_
Jehan	Jean	Np	B-pers	B-pers.ind	O	O	_
de	de	Np	I-pers	I-pers.ind	O	O	_
Flores	Flores	Np	I-pers	I-pers.ind	O	O	_
pour	pour	S	O	O	O	O	_
lequel	lequel	Pr	O	O	O	O	_
changea	changer	Vvc	O	O	O	O	_
son	son	Ds	O	O	O	O	_
""".strip().split("\n")

In [3]:
import re


def detokenize(tokens):
    """Rebuild a sentence string from a sequence of token strings.

    The tokens are joined with single spaces and the result is then passed
    through a fixed, ordered series of regex substitutions that tighten
    the spacing:

    1. drop whitespace in front of ``. , : ; ! ?``;
    2. attach an apostrophe to a preceding stand-alone ``d``/``l``/``q``
       (either case) when whitespace separates them;
    3. drop whitespace between an apostrophe and the word character after it;
    4. drop whitespace right after an opening ``(``, ``[`` or ``{``;
    5. drop whitespace right before a closing ``)``, ``]`` or ``}``.

    Args:
        tokens: iterable of strings, in sentence order.

    Returns:
        The detokenized sentence as a single string ('' for no tokens).
    """
    # Order matters: each rule runs on the output of the previous one.
    spacing_rules = (
        (r'\s+([.,:;!?])', r'\1'),
        (r"\b([dDlLQq])\s+'", r"\1'"),
        (r"'\s+(\w)", r"'\1"),
        (r'([\(\[\{])\s+', r'\1'),
        (r'\s+([\)\]\}])', r'\1'),
    )

    sentence = " ".join(tokens)
    for pattern, replacement in spacing_rules:
        sentence = re.sub(pattern, replacement, sentence)
    return sentence


# Parse the tab-separated corpus text (`text`) into per-sentence records.
#
# Each non-blank line is one token row; its first four tab-separated fields
# are read as token, lemma, POS tag and entity tag. A blank line ends the
# current sentence.
#
# Results:
#   sentences -- list of token lists, one per sentence, in input order.
#   data_dict -- maps 'S0', 'S1', ... to a record holding the detokenized
#                'text' and the parallel 'tokens', 'lemmas', 'pos', 'ent' lists.
#
# NOTE(review): only a minimum of four fields is enforced. A row with an extra
# or missing leading field is still accepted with its columns shifted --
# consider validating the full column count once the expected width is confirmed.
def _new_record():
    """Return a fresh, empty per-sentence record."""
    return {'text': '', 'tokens': [], 'lemmas': [], 'pos': [], 'ent': []}


sentences = []
current_sentence = []
data_dict = {}

# Named `record` (not `dict`) so the builtin `dict` type stays usable in the
# rest of the notebook.
record = _new_record()
sentence_id = 0

# Line numbers are relative to the stripped text and are only used in errors.
for line_number, line in enumerate(text.strip().split("\n"), start=1):
    # An empty or whitespace-only line marks a sentence boundary.
    if not line.strip():
        if current_sentence:
            sentences.append(current_sentence)
            record['text'] = detokenize(current_sentence)
            tag = 'S{}'.format(sentence_id)
            data_dict[tag] = record

            # Start a new record so stored sentences never share lists.
            record = _new_record()
            sentence_id += 1
            current_sentence = []
        continue

    parts = line.split("\t")
    if len(parts) < 4:
        # Fail with context instead of a bare IndexError on a short row.
        raise ValueError(
            'line {}: expected at least 4 tab-separated fields, got {}: {!r}'.format(
                line_number, len(parts), line
            )
        )

    token, lemma, pos_tag, entity = parts[:4]
    current_sentence.append(token)

    record['tokens'].append(token)
    record['lemmas'].append(lemma)
    record['pos'].append(pos_tag)
    record['ent'].append(entity)


# Store the last sentence when the input does not end with a blank line.
if current_sentence:
    sentences.append(current_sentence)
    record['text'] = detokenize(current_sentence)
    tag = 'S{}'.format(sentence_id)
    data_dict[tag] = record

In [None]:
import pandas as pd
import ast


# Load the sentence table; the first CSV column becomes the index.
df = pd.read_csv('./data/presto_max_as_csv.csv', index_col=0)

# The 'ent' and 'tokens' cells are read back as text (presumably stringified
# Python lists -- confirm against the code that wrote the CSV); evaluate each
# one as a Python literal to recover the original object.
for list_column in ('ent', 'tokens'):
    df[list_column] = df[list_column].apply(ast.literal_eval)

In [None]:
from collections import Counter

# Collect the entity tags of every row into one flat list, keeping row order,
# then tally how many times each distinct tag occurs.
all_entities = []
for row_entities in df['ent']:
    all_entities.extend(row_entities)

entity_counts = Counter(all_entities)
print(entity_counts)

Counter({'O': 4558670, 'B-pers': 53160, 'I-pers': 28381, 'B-loc': 25665, 'I-loc': 12279, 'I-amount': 6900, 'I-func': 6671, 'B-time': 5010, 'I-time': 4795, 'B-amount': 4460, 'B-func': 3241, 'I-org': 2169, 'B-org': 1114, 'I-event': 627, 'B-prod': 595, 'I-prod': 380, 'B-event': 306, 'Nc': 8, 'S': 7, 'B-func.ind': 6, 'Np': 3, 'B-pers.ind': 1})
