In [1]:
# Checks whether a relation between the two entities is possible at all.
def validna_relacija(entity1, entity2) -> bool:
    """Return True when a relation from ``entity1`` to ``entity2`` is allowed.

    The head (``entity1``) must have type ``'CHARACTER'`` and the tail
    (``entity2``) must have one of the types listed below. An entity without
    a ``'type'`` key never matches.
    """
    if entity1.get('type', '') != 'CHARACTER':
        return False

    dozvoljeni_tipovi = ('CHARACTER', 'HOUSE', 'MAGIC_ITEM', 'SPELL', 'LOCATION')
    return entity2.get('type', '') in dozvoljeni_tipovi

In [2]:
# Checks whether an annotated relation is valid for the given entity pair.
def moguca_relacija(entity1, entity2, relation)->bool:
    """Return True when ``relation`` is an allowed label for this entity pair.

    The pair itself must first pass ``validna_relacija``. After that, the set
    of allowed relation labels is selected by the type of ``entity2`` (the
    tail) only; an unknown tail type allows no label.
    """
    if validna_relacija(entity1, entity2):
        # Allowed relation labels, keyed by the type of the tail entity.
        dozvoljene_po_tipu = {
            'CHARACTER': ['friend-of', 'enemy-of', 'mentor-of', 'student-of',
                          'parent-of', 'sibling-of', 'rival-of', 'ally-of'],
            'HOUSE': ['member-of-house', 'founder-of-house'],
            'MAGIC_ITEM': ['uses', 'owns', 'acquires', 'gives', 'given_to'],
            'SPELL': ['casts', 'knows', 'teaches'],
            'LOCATION': ['located_in'],
        }
        dozvoljene = dozvoljene_po_tipu.get(entity2.get('type'), [])
        return relation in dozvoljene

    return False

In [3]:
def vec_postoji_relacija(id1 : int , id2 : int, relations : list) -> bool:
    """Return True if ``relations`` already holds a directed relation id1 -> id2.

    Only the direction head=``id1``, tail=``id2`` is checked. A relation that
    lacks ``'head_id'`` or ``'tail_id'`` is treated as having ``-1`` there.
    """
    return any(
        id1 == postojeca.get('head_id', -1) and id2 == postojeca.get('tail_id', -1)
        for postojeca in relations
    )

In [4]:
import random

def pronadji(entities, i):
    """Return the first entity whose ``'id'`` equals ``i``, or ``{}`` if none does.

    An entity without an ``'id'`` key is treated as having id ``-1``.
    """
    poklapanja = (kandidat for kandidat in entities if kandidat.get('id', -1) == i)
    return next(poklapanja, {})

def prepravi_moguce_relacije(data_point : dict, keep_ratio : float = 0.25):
    """Add sampled ``no_relation`` negatives and drop impossible annotated relations.

    Two things happen to ``data_point['relations']``:

    1. For every ordered pair of distinct entities that passes
       ``validna_relacija`` and is not already related in that direction, a
       ``{'head_id', 'tail_id', 'type': 'no_relation'}`` candidate is created.
       The candidates are shuffled and only a ``keep_ratio`` share is kept.
    2. Annotated relations that fail ``moguca_relacija`` (stray annotations)
       are removed.

    Args:
        data_point: dict with ``'entities'`` and ``'relations'`` lists.
            It is modified in place.
        keep_ratio: fraction (0..1) of generated negatives to keep.
            Defaults to 0.25.

    Returns:
        The same ``data_point`` object, with ``'relations'`` replaced by the
        filtered annotated relations followed by the sampled negatives.

    Raises:
        ValueError: if ``keep_ratio`` is outside ``[0, 1]``.
    """
    from itertools import combinations

    if not 0 <= keep_ratio <= 1:
        raise ValueError(f"keep_ratio must be in [0, 1], got {keep_ratio!r}")

    entities = data_point.get('entities') or []
    relacije = data_point.get('relations') or []

    # Index entities by their real id instead of assuming ids are 0..n-1.
    # First occurrence wins, matching what pronadji would return.
    po_id = {}
    for entity in entities:
        entity_id = entity.get('id')
        if entity_id is not None:
            po_id.setdefault(entity_id, entity)

    # Find new candidate relations: every unordered pair, tried in both directions.
    nove_relacije = []
    for (id_a, ent_a), (id_b, ent_b) in combinations(po_id.items(), 2):
        if validna_relacija(ent_a, ent_b) and not vec_postoji_relacija(id_a, id_b, relacije):
            nove_relacije.append({'head_id': id_a, 'tail_id': id_b, 'type': 'no_relation'})

        if validna_relacija(ent_b, ent_a) and not vec_postoji_relacija(id_b, id_a, relacije):
            nove_relacije.append({'head_id': id_b, 'tail_id': id_a, 'type': 'no_relation'})

    # Keep only a `keep_ratio` share of the shuffled negatives (25% by default).
    random.shuffle(nove_relacije)
    broj_za_zadrzavanje = int(len(nove_relacije) * keep_ratio)
    nove_relacije = nove_relacije[:broj_za_zadrzavanje]

    # Drop annotated relations that are not possible (stray annotations).
    # .get keeps a relation with a missing id from raising; it simply fails
    # the lookup and is filtered out like any other impossible relation.
    stare_prepravljene_relacje = [
        relacija
        for relacija in relacije
        if moguca_relacija(
            pronadji(entities, relacija.get('head_id')),
            pronadji(entities, relacija.get('tail_id')),
            relacija.get('type'),
        )
    ]

    data_point['relations'] = stare_prepravljene_relacje + nove_relacije
    return data_point

In [5]:
def generisi_recenice_za_datapoint(data_point):
    """Build one marked-up token sequence per relation of ``data_point``.

    For every relation, the head entity is wrapped in ``[E1:<type>]`` ...
    ``[/E1]`` and the tail entity in ``[E2:<type>]`` ... ``[/E2]``, using the
    entities' inclusive ``token_start`` / ``token_end`` indices. Relations
    whose head or tail cannot be found, or whose entity lacks a token span,
    are skipped.

    All four marker positions are computed on the unmodified token list and
    inserted from right to left, so nested entities and entities sharing a
    start or end token get correctly placed, properly nested markers.

    Args:
        data_point: dict with ``'tokens'``, ``'entities'`` and ``'relations'``.
            Missing or ``None`` values are treated as empty.

    Returns:
        A list of ``{"tokens": [...], "relation": <relation type>}`` dicts.
    """
    relations = data_point.get('relations') or []
    entities = data_point.get('entities') or []
    # Only element [0] of each token record is used — presumably the token
    # text; confirm against the annotation format.
    tkns = [x[0] for x in (data_point.get('tokens') or [])]

    # One-time id -> entity index (first occurrence wins; an entity without
    # an 'id' key is filed under -1) instead of a linear scan per relation.
    po_id = {}
    for entity in entities:
        po_id.setdefault(entity.get('id', -1), entity)

    recenice = []

    for relation in relations:
        ent1 = po_id.get(relation.get('head_id'), {})
        ent2 = po_id.get(relation.get('tail_id'), {})

        rasponi = [
            (num, ent.get('token_start'), ent.get('token_end'), ent.get('type', ''))
            for num, ent in ((1, ent1), (2, ent2))
        ]
        if any(start is None or end is None for _, start, end, _ in rasponi):
            continue

        # Each entry: (position, rank, tiebreak, marker). `position` is the
        # index in the ORIGINAL token list before which the marker goes.
        umetanja = []
        for num, start, end, typee in rasponi:
            umetanja.append((start, 0, (end, -num), f'[E{num}:{typee}]'))
            umetanja.append((end + 1, 1, (start, num), f'[/E{num}]'))

        # Insert right-to-left so earlier positions stay valid. At the same
        # position the marker inserted last ends up first in the output, so:
        # opening markers go in before closing ones (closers precede openers),
        # and the outer span's closer / inner span's opener go in first.
        umetanja.sort(key=lambda u: (-u[0], u[1], u[2]))

        tokens = tkns.copy()
        for pozicija, _, _, marker in umetanja:
            tokens.insert(pozicija, marker)

        recenice.append({"tokens": tokens, "relation": relation.get('type')})
    return recenice


In [6]:
def ucitaj_u_ReFormatu(folder_path, lista = False)->list:
    """Load annotation JSON files and convert them to relation-extraction samples.

    Args:
        folder_path: a folder whose ``*.json`` files are loaded, or, when
            ``lista`` is truthy, an iterable of file paths used as-is.
        lista: selects how ``folder_path`` is interpreted (see above).

    Returns:
        A flat list of the samples produced by
        ``generisi_recenice_za_datapoint(prepravi_moguce_relacije(...))`` for
        every data point of every file.

    Any exception raised while reading or processing a file is printed and
    the rest of that file is skipped; samples collected before the failure
    are kept and the remaining files are still processed.
    """
    import os
    import json
    from glob import glob

    putanje = folder_path if lista else glob(os.path.join(folder_path, '*.json'))

    svi_primeri = []
    for file_path in putanje:
        try:
            with open(file_path, 'r', encoding='utf-8') as ulaz:
                sadrzaj = json.load(ulaz)

            for data_point in sadrzaj:
                # Copy only the fields the downstream steps read.
                row = {
                    'id': data_point.get('id'),
                    'text': data_point.get('text'),
                    'entities': data_point.get('entities', []),
                    'tokens': data_point.get('tokens'),
                    'relations': data_point.get('relations', []),
                }
                svi_primeri.extend(
                    generisi_recenice_za_datapoint(prepravi_moguce_relacije(row))
                )

        except Exception as e:
            print(f"Greška prilikom obrade fajla {file_path}: {e}")

    return svi_primeri

In [7]:
def sacuvaj_ReFormatu(data, putanja):
    """Write ``data`` as JSON to the file ``putanja`` (UTF-8, overwriting it).

    Non-ASCII characters are written as-is rather than ``\\uXXXX``-escaped.
    """
    import json

    with open(putanja, mode="w", encoding="utf-8") as izlaz:
        json.dump(data, fp=izlaz, ensure_ascii=False)

In [8]:
# Build the training set: convert every *.json file in the remaining_set
# folder to relation-extraction samples, shuffle them in place and write the
# result to train.json.
remaining_data = ucitaj_u_ReFormatu('../anotations/remaining_set')
random.shuffle(remaining_data)
sacuvaj_ReFormatu(remaining_data, '../anotations/re_data/train.json')

In [9]:
# Load the samples that will be split into validation and test data.
# NOTE(review): despite the name `golden_data`, this reads the same
# '../anotations/remaining_set' folder that train.json is built from, so
# val/test would overlap with the training data — presumably a golden_set
# path was intended; confirm before relying on these splits.
golden_data = ucitaj_u_ReFormatu('../anotations/remaining_set')
random.shuffle(golden_data)

# Separate negative ('no_relation') samples from samples with a real relation
# so each group can be split on its own.
no_r = []
yes_r = []

for x in golden_data:
    if x['relation'] == 'no_relation':
        no_r += [x]
    else:
        yes_r += [x]

In [10]:
# Split each class in half separately: the first halves form the validation
# set, the second halves the test set (which gets the extra sample when a
# class has an odd size).
_pola_yes = len(yes_r) // 2
_pola_no = len(no_r) // 2

val = yes_r[:_pola_yes] + no_r[:_pola_no]
test = yes_r[_pola_yes:] + no_r[_pola_no:]

In [11]:
# Explicit list of golden-set FILES (not folders, despite the variable name);
# passing True as `lista` makes the loader use these paths directly instead of
# globbing a folder.
folderi = ['../anotations/golden_set/golden_set_part_006.json', '../anotations/golden_set/golden_set_part_010.json', '../anotations/golden_set/golden_set_part_011.json']

# Samples from these files are appended to the test set only (not to val).
ner_data = ucitaj_u_ReFormatu(folderi,True)
test += ner_data 

In [12]:
# Persist the validation and test splits next to train.json.
sacuvaj_ReFormatu(val, '../anotations/re_data/val.json')
sacuvaj_ReFormatu(test, '../anotations/re_data/test.json')