In [1]:
folder_path = '../anotations/json_files'  # Promenite ovo na vašu stvarnu putanju

In [2]:
import os
import json
import pandas as pd
from glob import glob

all_data = []

json_files = glob(os.path.join(folder_path, '*.json'))

for file_path in json_files:
    try:
        # Učitavanje json fajla
        with open(file_path, 'r', encoding='utf-8') as file:
            json_data = json.load(file)
        
        # Prolazak kroz sve datapoints u json fajlu
        for data_point in json_data.get('data_points', []):
            # Izvlačenje entiteta i relacija
            entities = data_point.get('entities', [])
            relations = data_point.get('relations', [])
            
            # Kreiranje novog reda sa svim originalnim podacima
            row = {
                'id': data_point.get('id'),
                'text': data_point.get('text'),
                'annotation_confidence': data_point.get('annotation_confidence'),
                'entities': entities,  # Čuvamo entitete u originalnom obliku
                'relations': relations,  # Čuvamo relacije u originalnom obliku
                'tokens': data_point.get('tokens'),  # Čuvamo i tokene ako su potrebni
                'num_entities': len(entities),  # Dodajemo broj entiteta
                'num_relations': len(relations),  # Dodajemo broj relacija
                'token_validity': data_point.get('token_validity')
            }
            
            # Dodajemo red u listu svih podataka
            all_data.append(row)
            
    except Exception as e:
        print(f"Greška prilikom obrade fajla {file_path}: {e}")

# Kreiranje DataFrame-a od svih podataka
df = pd.DataFrame(all_data)

# Prikaz strukture DataFrame-a
print(df[['id', 'num_entities', 'num_relations']].head())
print(f"Ukupan broj redova: {len(df)}")
print(f"Kolone: {df.columns.tolist()}")

# Statistika o broju entiteta i relacija
print(f"Prosečan broj entiteta po podatku: {df['num_entities'].mean():.2f}")
print(f"Maksimalan broj entiteta: {df['num_entities'].max()}")
print(f"Prosečan broj relacija po podatku: {df['num_relations'].mean():.2f}")
print(f"Maksimalan broj relacija: {df['num_relations'].max()}")


     id  num_entities  num_relations
0  4259             1              0
1  4815             0              0
2  9829             2              0
3  6453             7              0
4  2732             1              0
Ukupan broj redova: 4975
Kolone: ['id', 'text', 'annotation_confidence', 'entities', 'relations', 'tokens', 'num_entities', 'num_relations', 'token_validity']
Prosečan broj entiteta po podatku: 4.09
Maksimalan broj entiteta: 22
Prosečan broj relacija po podatku: 1.04
Maksimalan broj relacija: 24


In [3]:
df = df[(df['num_entities'] > 0) | (df['num_relations'] > 0)]
df = df[(df['num_entities'] > 1)]

In [4]:
len(df[df['token_validity']==True])/len(df)

0.8253502033438771

In [5]:
len(df[(df['num_entities'] >= 5) & (df['num_relations'] >= 1) & (df['token_validity']==True)])

863

In [6]:
# Definisanje grupa bez preklapanja
mask_A = (df['num_entities'] >= 5) & (df['num_relations'] >= 1) & (df['token_validity']==True)
mask_B = (df['num_entities'] >= 3) & (df['num_relations'] == 0) & (df['token_validity']==True)
mask_C = (df['num_entities'] >= 10) & (df['token_validity']==True) & (~mask_A) & (~mask_B)
mask_D = (~mask_A) & (~mask_B) & (~mask_C)

In [7]:
group_A = df[mask_A].sample(n=min(200, sum(mask_A)), random_state=42)
group_B = df[mask_B].sample(n=min(50, sum(mask_B)), random_state=42)
group_C = df[mask_C].sample(n=min(25, sum(mask_C)), random_state=42)
group_D = df[mask_D].sample(n=min(25, sum(mask_D)), random_state=42)

In [8]:
golden_set = pd.concat([group_A, group_B, group_C, group_D])
golden_set = golden_set.sample(frac=1, random_state=42).reset_index(drop=True)
remaining = df.loc[~df.index.isin(golden_set.index)]

In [9]:
print(f"Golden set size: {len(golden_set)}")
print(f"Remaining data size: {len(remaining)}")
print(f"Total size check: {len(golden_set) + len(remaining) == len(df)}")

Golden set size: 275
Remaining data size: 4190
Total size check: False


In [10]:
import json
import os
import math

def save_to_json(output_data, output_file):
    
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(output_data, f, ensure_ascii=False, indent=2)

golden_data = golden_set.to_dict('records')

num_files = math.ceil(len(golden_data) / 25)

output_folder = '../anotations/golden_set'
os.makedirs(output_folder, exist_ok=True)

for i in range(num_files):
    start_idx = i * 25
    end_idx = min((i + 1) * 25, len(golden_data))
    current_batch = golden_data[start_idx:end_idx]
    
    output_file = os.path.join(output_folder, f'golden_set_part_{i+1:03d}.json')

    save_to_json(current_batch, output_file)
    print(f"Sačuvan fajl {output_file} sa {len(current_batch)} datapoint-a.")

print(f"Ukupno sačuvano {num_files} JSON fajlova.")

Sačuvan fajl ../anotations/golden_set/golden_set_part_001.json sa 25 datapoint-a.
Sačuvan fajl ../anotations/golden_set/golden_set_part_002.json sa 25 datapoint-a.
Sačuvan fajl ../anotations/golden_set/golden_set_part_003.json sa 25 datapoint-a.
Sačuvan fajl ../anotations/golden_set/golden_set_part_004.json sa 25 datapoint-a.
Sačuvan fajl ../anotations/golden_set/golden_set_part_005.json sa 25 datapoint-a.
Sačuvan fajl ../anotations/golden_set/golden_set_part_006.json sa 25 datapoint-a.
Sačuvan fajl ../anotations/golden_set/golden_set_part_007.json sa 25 datapoint-a.
Sačuvan fajl ../anotations/golden_set/golden_set_part_008.json sa 25 datapoint-a.
Sačuvan fajl ../anotations/golden_set/golden_set_part_009.json sa 25 datapoint-a.
Sačuvan fajl ../anotations/golden_set/golden_set_part_010.json sa 25 datapoint-a.
Sačuvan fajl ../anotations/golden_set/golden_set_part_011.json sa 25 datapoint-a.
Ukupno sačuvano 11 JSON fajlova.


In [18]:
ispravi = remaining[remaining['token_validity']==False]
len(ispravi)

732

In [20]:
remaining = remaining[remaining['token_validity']==True]
len(remaining)

3458

In [21]:
# Assuming 'remaining_set' is your DataFrame
remaining_data = remaining.to_dict('records')

num_files_remaining = math.ceil(len(remaining_data) / 25)

output_folder_remaining = '../anotations/remaining_set'
os.makedirs(output_folder_remaining, exist_ok=True)

for i in range(num_files_remaining):
    start_idx = i * 25
    end_idx = min((i + 1) * 25, len(remaining_data))
    current_batch_remaining = remaining_data[start_idx:end_idx]
    
    output_file_remaining = os.path.join(output_folder_remaining, f'remaining_set_part_{i+1:03d}.json')

    save_to_json(current_batch_remaining, output_file_remaining)
    print(f"Sačuvan fajl {output_file_remaining} sa {len(current_batch_remaining)} datapoint-a.")

print(f"Ukupno sačuvano {num_files_remaining} JSON fajlova.")

Sačuvan fajl ../anotations/remaining_set/remaining_set_part_001.json sa 25 datapoint-a.
Sačuvan fajl ../anotations/remaining_set/remaining_set_part_002.json sa 25 datapoint-a.
Sačuvan fajl ../anotations/remaining_set/remaining_set_part_003.json sa 25 datapoint-a.
Sačuvan fajl ../anotations/remaining_set/remaining_set_part_004.json sa 25 datapoint-a.
Sačuvan fajl ../anotations/remaining_set/remaining_set_part_005.json sa 25 datapoint-a.
Sačuvan fajl ../anotations/remaining_set/remaining_set_part_006.json sa 25 datapoint-a.
Sačuvan fajl ../anotations/remaining_set/remaining_set_part_007.json sa 25 datapoint-a.
Sačuvan fajl ../anotations/remaining_set/remaining_set_part_008.json sa 25 datapoint-a.
Sačuvan fajl ../anotations/remaining_set/remaining_set_part_009.json sa 25 datapoint-a.
Sačuvan fajl ../anotations/remaining_set/remaining_set_part_010.json sa 25 datapoint-a.
Sačuvan fajl ../anotations/remaining_set/remaining_set_part_011.json sa 25 datapoint-a.
Sačuvan fajl ../anotations/remai

In [22]:
# Assuming 'remaining_set' is your DataFrame
remaining_data = ispravi.to_dict('records')

num_files_remaining = math.ceil(len(remaining_data) / 25)

output_folder_remaining = '../anotations/ispravi'
os.makedirs(output_folder_remaining, exist_ok=True)

for i in range(num_files_remaining):
    start_idx = i * 25
    end_idx = min((i + 1) * 25, len(remaining_data))
    current_batch_remaining = remaining_data[start_idx:end_idx]
    
    output_file_remaining = os.path.join(output_folder_remaining, f'ispravi{i+1:03d}.json')

    save_to_json(current_batch_remaining, output_file_remaining)
    print(f"Sačuvan fajl {output_file_remaining} sa {len(current_batch_remaining)} datapoint-a.")

print(f"Ukupno sačuvano {num_files_remaining} JSON fajlova.")

Sačuvan fajl ../anotations/ispravi/ispravi001.json sa 25 datapoint-a.
Sačuvan fajl ../anotations/ispravi/ispravi002.json sa 25 datapoint-a.
Sačuvan fajl ../anotations/ispravi/ispravi003.json sa 25 datapoint-a.
Sačuvan fajl ../anotations/ispravi/ispravi004.json sa 25 datapoint-a.
Sačuvan fajl ../anotations/ispravi/ispravi005.json sa 25 datapoint-a.
Sačuvan fajl ../anotations/ispravi/ispravi006.json sa 25 datapoint-a.
Sačuvan fajl ../anotations/ispravi/ispravi007.json sa 25 datapoint-a.
Sačuvan fajl ../anotations/ispravi/ispravi008.json sa 25 datapoint-a.
Sačuvan fajl ../anotations/ispravi/ispravi009.json sa 25 datapoint-a.
Sačuvan fajl ../anotations/ispravi/ispravi010.json sa 25 datapoint-a.
Sačuvan fajl ../anotations/ispravi/ispravi011.json sa 25 datapoint-a.
Sačuvan fajl ../anotations/ispravi/ispravi012.json sa 25 datapoint-a.
Sačuvan fajl ../anotations/ispravi/ispravi013.json sa 25 datapoint-a.
Sačuvan fajl ../anotations/ispravi/ispravi014.json sa 25 datapoint-a.
Sačuvan fajl ../anot