In [1]:
import pandas

In [2]:
def clean_string(s):
    """Return ``s`` with every '#' and every space character removed."""
    # One translation table deletes both characters in a single pass.
    unwanted = {ord('#'): None, ord(' '): None}
    return s.translate(unwanted)

def tokens_checker(data_point: dict):
    """Check that every entity's text matches the tokens it points at.

    For each entity, the first element of every token in the inclusive span
    ``token_start..token_end`` is concatenated and compared with the entity's
    ``text``; both sides go through ``clean_string`` first. The result is
    written to ``entity['token_validity']``, and the conjunction of all entity
    results is written to ``data_point['token_validity']`` (``True`` when
    there are no entities).

    Keys that are missing *or* present with a ``None`` value are treated as
    empty (``tokens``, ``entities``, ``text``) or as an out-of-range span
    (``token_start``, ``token_end``).

    Args:
        data_point: mapping with ``'tokens'`` and ``'entities'`` entries.

    Returns:
        The same ``data_point`` object, mutated in place.
    """
    # `.get(key, [])` does not help when the key exists with a None value,
    # so fall back explicitly.
    tokens = data_point.get('tokens') or []
    entities = data_point.get('entities') or []

    valid = True

    for entity in entities:
        start = entity.get('token_start', -1)
        end = entity.get('token_end', -1)

        # Bounds check: a missing/None/out-of-range span yields empty token
        # text, so the entity only validates if its own text is empty too.
        span_ok = (
            start is not None
            and end is not None
            and 0 <= start <= end < len(tokens)
        )
        if span_ok:
            entity_token_text = ''.join(tok[0] for tok in tokens[start:end + 1])
        else:
            entity_token_text = ''

        entity_text = entity.get('text') or ''
        token_validity = clean_string(entity_text) == clean_string(entity_token_text)
        entity['token_validity'] = token_validity
        valid &= token_validity

    data_point['token_validity'] = valid
    return data_point

In [3]:
import os
import json
import pandas as pd
from glob import glob

folder_path = '../anotations/ispravi'

all_data = []

# glob() returns paths in arbitrary order; sorting keeps the row order stable
# between runs, which matters because later cells pick rows by position
# (df.iloc[...]).
json_files = sorted(glob(os.path.join(folder_path, '*.json')))

for file_path in json_files:
    try:
        # Load the JSON file
        with open(file_path, 'r', encoding='utf-8') as file:
            json_data = json.load(file)

        # Walk through every data point in the file
        for data_point in json_data:
            # Entities and relations; a key that is missing or explicitly
            # null is treated as an empty list so len() below cannot fail.
            entities = data_point.get('entities') or []
            relations = data_point.get('relations') or []

            # One flat row carrying the original fields plus two counts
            row = {
                'id': data_point.get('id'),
                'text': data_point.get('text'),
                'annotation_confidence': data_point.get('annotation_confidence'),
                'entities': entities,  # kept in their original form
                'relations': relations,  # kept in their original form
                'tokens': data_point.get('tokens'),  # kept in case they are needed
                'num_entities': len(entities),
                'num_relations': len(relations),
                'token_validity': data_point.get('token_validity')
            }

            all_data.append(row)
    except Exception as e:
        # Best effort: report the file and move on. Rows appended before the
        # failure are kept; the rest of that file is skipped.
        print(f"Greška prilikom obrade fajla {file_path}: {e}")

# Build the DataFrame from all collected rows
df = pd.DataFrame(all_data)

In [4]:
# Validate each data point. axis=1 is required: DataFrame.apply defaults to
# axis=0, which would hand tokens_checker whole *columns* instead of rows, so
# no entity would be checked and an extra 'token_validity' row would be
# appended to the frame.
df = df.apply(tokens_checker, axis=1)

In [5]:
def prikazi(data_point: dict) -> None:
    """Print a data point's text and, for each displayed entity, two lines.

    An entity is displayed when its ``'token_validity'`` flag is truthy or
    absent. For each one the function prints the entity text and the text
    covered by its inclusive token span ``token_start..token_end``, both
    passed through ``clean_string``.

    Missing or ``None`` ``tokens`` / ``entities`` / ``text`` are treated as
    empty; a ``None`` span bound yields an empty covered text.

    Args:
        data_point: mapping with ``'text'`` (required), ``'tokens'`` and
            ``'entities'``.
    """
    print(data_point['text'])

    tokens = data_point.get('tokens') or []

    for entity in data_point.get('entities') or []:
        if not entity.get('token_validity', 1):
            continue

        start = entity.get('token_start', -1)
        end = entity.get('token_end', -1)
        if start is None or end is None:
            covered = []
        else:
            covered = tokens[start:end + 1]

        # Values are computed outside the f-strings: reusing the same quote
        # character inside an f-string expression is a SyntaxError before
        # Python 3.12.
        entity_text = clean_string(entity.get('text') or '')
        covered_text = clean_string(''.join(tok[0] for tok in covered))
        print(f'text: {entity_text}')
        print(f'tokeni zahvataju: {covered_text}')

In [6]:
# Spot-check a single data point, chosen by position.
prikazi(df.iloc[302])
# Keep only the rows whose 'entities' cell is not equal to True.
# NOTE(review): presumably this removes an artifact row produced by an
# earlier step rather than real data - confirm before relying on it.
df = df.loc[df['entities'].ne(True)]

Around six o'clock in the evening the doorbell rang and Mrs. Black started screaming again. Assuming that Mundungus or some other Order member had come to call, Harry merely settled himself more comfortably against the wall of Buckbeak's room where he was hiding, trying to ignore how hungry he felt as he fed dead rats to the Hippogriff. It came as a slight shock when somebody hammered hard on the door a few minutes later.
text: Mrs.Black
tokeni zahvataju: Mrs.Black
text: Mundungus
tokeni zahvataju: Mundungus
text: Order
tokeni zahvataju: Order
text: Harry
tokeni zahvataju: Harry
text: Hippogriff
tokeni zahvataju: Hippogriff


In [7]:
def clean_string(s):
    """Strip all '#' and space characters from ``s`` and return the result."""
    kept_chars = (ch for ch in s if ch not in ('#', ' '))
    return ''.join(kept_chars)

def ispravi(data_point: dict) -> dict:
    """Try to realign the token span of every entity flagged as invalid.

    Only entities whose ``'token_validity'`` is present and falsy are
    touched. For each of them, the recorded ``token_start`` / ``token_end``
    are shifted independently by small offsets (0, +/-1, +/-2, +/-3). The
    first in-range span whose concatenated token text equals the entity text
    (both cleaned with ``clean_string``) is written back to the entity.
    Entities with empty text, or with a missing/None/negative/inverted span,
    are skipped.

    ``'token_validity'`` flags are *not* updated here; re-run
    ``tokens_checker`` on the result to refresh them.

    Args:
        data_point: mapping with ``'tokens'`` and ``'entities'`` entries;
            missing or ``None`` values are treated as empty.

    Returns:
        The same ``data_point`` object; entity dicts are mutated in place.
    """
    tokens = data_point.get('tokens') or []
    token_count = len(tokens)
    # Order matters: it decides which candidate span wins when several match.
    offsets = [0, 1, -1, 2, -2, -3, 3]

    def span_text(first, last):
        # Cleaned text covered by tokens[first..last], inclusive.
        return clean_string(''.join(tok[0] for tok in tokens[first:last + 1]))

    for entity in data_point.get('entities') or []:
        if entity.get('token_validity', 1):
            continue

        text = clean_string(entity.get('text') or '')
        if not text:  # nothing to match against
            continue

        start = entity.get('token_start', -1)
        end = entity.get('token_end', -1)
        if start is None or end is None or start < 0 or end < 0 or start > end:
            continue

        # Same visiting order as two nested loops (start offset outer, end
        # offset inner); stop at the first span that reproduces the text.
        candidates = ((start + i, end + j) for i in offsets for j in offsets)
        match = next(
            (
                (new_start, new_end)
                for new_start, new_end in candidates
                if 0 <= new_start <= new_end < token_count
                and text == span_text(new_start, new_end)
            ),
            None,
        )
        if match is not None:
            entity['token_start'], entity['token_end'] = match

    return data_point


In [8]:
# Try the span realignment on one data point (row at position 405) and print
# the result. prikazi only lists entities whose 'token_validity' flag is
# truthy or absent.
a = ispravi(df.iloc[405])
prikazi(a)

“Hagrid,” said Hermione timidly, when he joined them at the table and started peeling his potatoes with a brutality that suggested that each tuber had done him a great personal wrong, “we really wanted to carry on with Care of Magical Creatures, you know.” Hagrid gave another great snort. Harry rather thought some bogeys landed on the potatoes, and was inwardly thankful that they were not staying for dinner.
text: Hagrid
tokeni zahvataju: Hagrid
text: CareofMagicalCreatures
tokeni zahvataju: CareofMagicalCreatures
text: Hagrid
tokeni zahvataju: Hagrid
text: Harry
tokeni zahvataju: Harry


In [9]:
# Realign spans and then re-validate every data point. The previous unused
# counter named `id` is gone: it shadowed the builtin id() for the rest of the
# kernel session.
resenje = [tokens_checker(ispravi(row)) for _, row in df.iterrows()]

new_df = pd.DataFrame(resenje)

In [10]:
# Keep only the data points whose data-point-level validity flag equals True.
is_valid = new_df['token_validity'].eq(True)
new_df = new_df.loc[is_valid]
new_df.head()

Unnamed: 0,id,text,annotation_confidence,entities,relations,tokens,num_entities,num_relations,token_validity
0,13221,But Professor McGonagall's voice drowned Fudge...,0.95,"[{'id': 0, 'text': 'Professor McGonagall', 'ty...",[],"[[But, 0], [Professor, 1], [M, 2], [##c, 3], [...",3,0,True
1,15761,Harry heard from Hogwarts one sunny morning ab...,0.99,"[{'id': 0, 'text': 'Harry', 'type': 'CHARACTER...","[{'type': 'located_in', 'head_id': 0, 'tail_id...","[[Harry, 0], [heard, 1], [from, 2], [Ho, 3], [...",12,17,True
2,2914,"Then, by the light of the Mark, he saw Dumbled...",0.99,"[{'id': 0, 'text': 'Dumbledore', 'type': 'CHAR...","[{'type': 'uses', 'head_id': 0, 'tail_id': 1}]","[[Then, 0], [,, 1], [by, 2], [the, 3], [light,...",3,1,True
3,10350,"At last, when the Irish team had left the box ...",0.98,"[{'id': 0, 'text': 'Aidan Lynch', 'type': 'CHA...","[{'type': 'uses', 'head_id': 2, 'tail_id': 3},...","[[At, 0], [last, 1], [,, 2], [when, 3], [the, ...",5,2,True
4,10177,"A short way farther on, they saw two little wi...",0.99,"[{'id': 0, 'text': 'Kevin', 'type': 'CHARACTER...",[],"[[A, 0], [short, 1], [way, 2], [farther, 3], [...",6,0,True


In [11]:
# Show the first two data points. Selecting them by position (head) avoids
# depending on an index label equal to 1 existing in second place; without
# such a label the loop would print the whole frame.
for _, row in df.head(2).iterrows():
    prikazi(row)

But Professor McGonagall's voice drowned Fudge's. “The moment that - that thing entered the room,” she screamed, pointing at Fudge, trembling all over, “it swooped down on Crouch and - and -”
text: ProfessorMcGonagall
tokeni zahvataju: ProfessorMcGonagall
text: Fudge
tokeni zahvataju: Fudge
text: Crouch
tokeni zahvataju: Crouch
Harry heard from Hogwarts one sunny morning about a week after he had arrived at the Burrow. He and Ron went down to breakfast to find Mr. and Mrs. Weasley and Ginny already sitting at the kitchen table. The moment she saw Harry, Ginny accidentally knocked her porridge bowl to the floor with a loud clatter. Ginny seemed very prone to knocking things over whenever Harry entered a room. She dived under the table to retrieve the bowl and emerged with her face glowing like the setting sun. Pretending he hadn’t noticed this, Harry sat down and took the toast Mrs. Weasley offered him.
text: Harry
tokeni zahvataju: Harry
text: Hogwarts
tokeni zahvataju: Hogwarts
text: 

In [12]:
len(new_df)

608

In [15]:
# Export the validated data points in `new_df` to JSON files, 25 data points per file
import math
import json

def save_to_json(output_data, output_file):
    """Serialize ``output_data`` as JSON into ``output_file``.

    The file is written as UTF-8, non-ASCII characters are kept as-is
    (not escaped) and the output is indented by two spaces.
    """
    with open(output_file, mode='w', encoding='utf-8') as handle:
        json.dump(
            output_data,
            handle,
            ensure_ascii=False,
            indent=2,
        )

# One dict per DataFrame row.
remaining_data = new_df.to_dict('records')

# Number of files needed when each file holds at most 25 data points.
num_files_remaining = math.ceil(len(remaining_data) / 25)

output_folder_remaining = '../anotations/remaining_set'
os.makedirs(output_folder_remaining, exist_ok=True)

# Step through the records 25 at a time; files are numbered from 1.
# Slicing past the end of the list is safe, so the last batch is simply shorter.
for file_no, start_idx in enumerate(range(0, len(remaining_data), 25), start=1):
    current_batch_remaining = remaining_data[start_idx:start_idx + 25]

    output_file_remaining = os.path.join(output_folder_remaining, f'ispravljeni_settovi{file_no:03d}.json')

    save_to_json(current_batch_remaining, output_file_remaining)
    print(f"Sačuvan fajl {output_file_remaining} sa {len(current_batch_remaining)} datapoint-a.")

print(f"Ukupno sačuvano {num_files_remaining} JSON fajlova.")

Sačuvan fajl ../anotations/remaining_set/ispravljeni_settovi001.json sa 25 datapoint-a.
Sačuvan fajl ../anotations/remaining_set/ispravljeni_settovi002.json sa 25 datapoint-a.
Sačuvan fajl ../anotations/remaining_set/ispravljeni_settovi003.json sa 25 datapoint-a.
Sačuvan fajl ../anotations/remaining_set/ispravljeni_settovi004.json sa 25 datapoint-a.
Sačuvan fajl ../anotations/remaining_set/ispravljeni_settovi005.json sa 25 datapoint-a.
Sačuvan fajl ../anotations/remaining_set/ispravljeni_settovi006.json sa 25 datapoint-a.
Sačuvan fajl ../anotations/remaining_set/ispravljeni_settovi007.json sa 25 datapoint-a.
Sačuvan fajl ../anotations/remaining_set/ispravljeni_settovi008.json sa 25 datapoint-a.
Sačuvan fajl ../anotations/remaining_set/ispravljeni_settovi009.json sa 25 datapoint-a.
Sačuvan fajl ../anotations/remaining_set/ispravljeni_settovi010.json sa 25 datapoint-a.
Sačuvan fajl ../anotations/remaining_set/ispravljeni_settovi011.json sa 25 datapoint-a.
Sačuvan fajl ../anotations/remai