# MID-to-name mapping for FB15k-237: transformation + analysis

In [1]:
import os

In [2]:
# Input locations. The defaults are the original machine-specific paths; set the
# FB15K237_DIR / MID2NAME_PATH environment variables to run the notebook on
# another machine without editing this cell.
fb15k237_dir = os.environ.get(
    'FB15K237_DIR',
    '/Users/georgestoica/Desktop/Research/QA/qa_types/src/qa_cpg/temp/FB15k-237/FB15k-237',
)
test_file = os.path.join(fb15k237_dir, 'test.txt')
train_file = os.path.join(fb15k237_dir, 'train.txt')
# Only the test split is listed here; `train_file` is kept as its own variable.
files2read = [test_file]
# Path to the tab-separated MID -> name lookup file.
mid2name_path = os.environ.get('MID2NAME_PATH', '/Users/georgestoica/Desktop/mid2name.txt')

In [3]:
def get_data_from_files(files, mid2name_dict):
    """Collect, per relation, the names of the entities seen on each side.

    Every non-blank line of every file must be a tab-separated triple
    ``e1<TAB>rel<TAB>e2``. Entity ids found in ``mid2name_dict`` are replaced
    by their mapped name and added to the relation's ``'e1'`` or ``'e2'`` set;
    ids that are not in the mapping are collected separately instead.

    Args:
        files: iterable of paths to triple files.
        mid2name_dict: mapping from entity id to entity name.

    Returns:
        A ``(relation_data, missed_entities)`` tuple. ``relation_data`` maps
        each relation string to ``{'e1': set_of_names, 'e2': set_of_names}``
        (a relation gets an entry even if none of its entities could be
        named). ``missed_entities`` is the set of entity ids absent from
        ``mid2name_dict``.

    Raises:
        ValueError: if a non-blank line does not have exactly three
            tab-separated fields.
    """
    relation_data = {}
    missed_entities = set()
    for file in files:
        # UTF-8 is stated explicitly so the result does not depend on the
        # platform's default encoding (read_mid2name opens its file the same way).
        with open(file, 'r', encoding='utf-8') as handle:
            for line_no, line in enumerate(handle, start=1):
                line = line.strip()
                if not line:
                    # Blank lines (e.g. a trailing empty line) carry no triple.
                    continue
                fields = line.split('\t')
                if len(fields) != 3:
                    raise ValueError(
                        '{}:{}: expected 3 tab-separated fields, got {}'.format(
                            file, line_no, len(fields)))
                e1, rel, e2 = fields
                sides = relation_data.setdefault(rel, {'e1': set(), 'e2': set()})
                for side, entity in (('e1', e1), ('e2', e2)):
                    if entity in mid2name_dict:
                        sides[side].add(mid2name_dict[entity])
                    else:
                        missed_entities.add(entity)

    return relation_data, missed_entities

def read_mid2name(mid2name_path, keep_first=False):
    """Read a tab-separated ``mid<TAB>name`` file into a dict.

    Lines that do not split into exactly two tab-separated fields (after
    stripping surrounding whitespace) are skipped.

    Args:
        mid2name_path: path to the UTF-8 encoded mapping file.
        keep_first: what to do when the same mid appears on several lines.
            ``False`` (default) keeps the name from the last such line,
            ``True`` keeps the name from the first one.

    Returns:
        dict mapping each mid to one name.
    """
    mid2name = {}
    with open(mid2name_path, 'r', encoding="utf-8") as handle:
        for line in handle:
            fields = line.strip().split('\t')
            # Explicit length check instead of a bare `except:` around the
            # unpacking, which would also swallow KeyboardInterrupt/SystemExit.
            if len(fields) != 2:
                continue
            mid, entity = fields
            if keep_first and mid in mid2name:
                continue
            mid2name[mid] = entity
    return mid2name

In [4]:
# Load the entity-id -> name lookup from the file configured in `mid2name_path`.
mid2name = read_mid2name(mid2name_path)

In [5]:
# For each split, collect the named 'e1'/'e2' entities per relation together with
# the set of entity ids that have no entry in `mid2name`.
# `files2read` holds the test split; the train split is passed explicitly.
named_rel_test_data, missed_test_entities = get_data_from_files(files2read, mid2name)
named_rel_train_data, missed_train_entities = get_data_from_files([train_file], mid2name)

In [6]:
# Number of distinct train-split entity ids that have no entry in `mid2name`.
len(missed_train_entities)

178

In [8]:
# List every relation found in the test split, numbered in dict iteration order.
for idx, relation in enumerate(named_rel_test_data):
    print(idx, relation)

0 /ice_hockey/hockey_team/current_roster./sports/sports_team_roster/position
1 /medicine/symptom/symptom_of
2 /people/cause_of_death/people
3 /education/educational_institution/students_graduates./education/education/major_field_of_study
4 /base/aareas/schema/administrative_area/administrative_area_type
5 /film/film/language
6 /education/educational_institution/students_graduates./education/education/student
7 /base/popstra/celebrity/dated./base/popstra/dated/participant
8 /film/film/edited_by
9 /people/person/sibling_s./people/sibling_relationship/sibling
10 /education/field_of_study/students_majoring./education/education/major_field_of_study
11 /film/film/release_date_s./film/film_regional_release_date/film_release_region
12 /user/alexander/philosophy/philosopher/interests
13 /sports/sports_team_location/teams
14 /education/educational_degree/people_with_this_degree./education/education/major_field_of_study
15 /film/film/runtime./film/film_cut/film_release_region
16 /location/locatio

In [9]:
# Inspect the named 'e1' entities recorded for one relation in the train split.
# Raises KeyError if the relation does not occur in the train data.
# NOTE(review): this displays the whole set; several displayed names look like
# aliases rather than canonical names (e.g. 'Buckeye State') — presumably the
# mid2name file lists several names per id and the last one wins; confirm.
relation = '/location/administrative_division/country'
named_rel_train_data[relation]['e1']

{'AU-QLD',
 'Aberdeen City Police',
 'Ahnwei',
 'Aisne (departement)',
 'Al memzar, dubai',
 'Alabama (state)',
 'Alsace, France',
 'America Samoa',
 'Amsterdam, The Netherlands',
 'An Clár (county)',
 'Ancient City of Aleppo',
 'Andhara Pradesh',
 'Anglo-Aquitanian',
 'Argyll and Bute Council',
 'Arizona department of commerce',
 'Army of Vanuatu',
 'Arunacal Prades',
 'Asia/Jakarta',
 'Assam (India)',
 'Aude (department)',
 'Austin City Connection',
 "Australia's state of Victoria",
 'Australian capital territory',
 'B. Aires',
 'Baki Sahari, Azerbaijan',
 'Ballinlough, County Meath',
 'Balochistān (Pakistan)',
 'Barcellona',
 'Barcelona (province)',
 'Bas Rhin',
 'Basse-Normandie',
 'Bavarians',
 "Bec-d'Ambès",
 'Bengal, West',
 'Bihar, India',
 'Bordelaise',
 'Bourgogne',
 'Braniborsko',
 'Bucharest, Romania',
 'Buckeye State',
 'Budapest, Hungary',
 'Buenos Aires province',
 'CA-ON',
 'Caerdydd (county borough)',
 'Cagliari Province',
 'Calatria',
 'Cambaleth',
 'Canal de las Estr

In [10]:
# Named 'e2' entities for the same relation (reuses `relation` set in the previous cell).
named_rel_train_data[relation]['e2']

{'Al Hind',
 'Al-Jumhūriyya at-Tūnisiyya',
 'Army of Vanuatu',
 'Arxentina',
 'Aussieland',
 'Austrian Republic',
 'Azerbaydzhan',
 'Belgique',
 'Brazília',
 'Cape Verdian',
 'Chinese civilisation',
 'Confédération Suisse',
 'Congo DR',
 'Denmarc',
 'Federation of Malaysia',
 'Foroyar',
 'Gurcistan',
 'ITALY',
 'Ireland/Éire',
 'Islamic Republic Of Pakistan',
 'Japón',
 'Jermany',
 'Kenadian',
 'Kingdom of Kampuchea',
 'Kingdom of Spain',
 'Kingdom of Thailand',
 'Korea (Pyongyang)',
 'Lehabim',
 'Letonia',
 'Metropolitan Netherlands',
 'Micronesia/Military',
 'N z',
 'Name of Cuba',
 'Ouzbékistan',
 'Philippine archipelago',
 'Polskor',
 'Portuga',
 'Purangsu',
 'Republic of China (1912-49)',
 'Republic of Korea',
 'Republic of Mexico',
 'Roemenië',
 'Souria',
 'South African',
 'The Kingdom of the Netherlands',
 'The Russian federation',
 'Trinidad & Tobago',
 'U.A. Emirates',
 'UK of GB and NI',
 'UKR',
 'Ungheria',
 'Unitary state of the republic of indonesia',
 'Yankee land',
 'Zi