In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib

In [5]:
# Select the PGF backend for all figures produced by this notebook.
matplotlib.use("pgf")

# rcParams applied on top of the defaults: pdflatex as the TeX system,
# a serif font family, TeX text rendering, and pgf.rcfonts switched off.
pgf_rc_settings = {
    "pgf.texsystem": "pdflatex",
    "font.family": "serif",
    "text.usetex": True,
    "pgf.rcfonts": False,
}
matplotlib.rcParams.update(pgf_rc_settings)

In [103]:
# Load the same-as links: the file is space-separated without a header row;
# only fields 0 and 2 are kept and renamed to FB and DB.
# `sep` is passed by keyword because pandas >= 2.0 only accepts the path positionally.
same_as = (
    pd.read_csv("data/DB15K_SameAsLink.nt", sep=" ", header=None)[[0, 2]]
    .rename(columns={0: "FB", 2: "DB"})
)
same_as.head()

Unnamed: 0,FB,DB
0,/m/01m4kpp,<http://dbpedia.org/resource/Andy_Griffith>
1,/m/01s47p,<http://dbpedia.org/resource/Spanish_Empire>
2,/m/0yyts,<http://dbpedia.org/resource/Driving_Miss_Daisy>
3,/m/0r6c4,"<http://dbpedia.org/resource/Mountain_View,_Ca..."
4,/m/04l_pt,<http://dbpedia.org/resource/Thai_Chinese>


In [104]:
# Draw a reproducible sample (fixed random_state) with half as many rows as same_as.
# Note: replace=True samples with replacement, so the same link can be drawn more than once.
sampled = same_as.sample(
    random_state=42,
    replace=True,
    frac=0.5,
)

In [105]:
# Read both entity-triple files fully into memory, one string per line.
# The context managers close each handle as soon as its lines are read
# (DB and FB stay bound afterwards, but refer to closed files).
with open("data/DB15K_EntityTriples.nt") as DB:
    DB_lines = DB.readlines()
with open("data/FB15K_EntityTriples.nt") as FB:
    FB_lines = FB.readlines()

In [106]:
# Preview the first rows of the sampled links (the index shows the original row positions).
sampled.head()

Unnamed: 0,FB,DB
7270,/m/02vkvcz,<http://dbpedia.org/resource/Shirley_Ann_Russell>
860,/m/0gg5kmg,<http://dbpedia.org/resource/Killing_Them_Softly>
5390,/m/01pk8b,<http://dbpedia.org/resource/Jeddah>
5191,/m/0mbhr,<http://dbpedia.org/resource/Joanna_Lumley>
11964,/m/0cw51,<http://dbpedia.org/resource/Bhopal>


In [107]:
# a = 0
# for index, row in sampled.iterrows():
#     #print(row["DB"], row["FB"])
#     label = row["DB"].split("/")[-1].replace(">","")
#     for i_db,l_db in enumerate(DB_lines):
#         if row["DB"] in l_db:
#             print(i_db, l_db)
#     if (a==10):
#         break
#     a += 1

In [108]:
def create_label(sample_row):
    """Add rdfs:label triple lines for both entities of one same-as link.

    The label text is the last ``/``-separated segment of the ``"DB"`` value
    with every ``>`` removed (``<http://dbpedia.org/resource/Bhopal>`` gives
    ``Bhopal``). The same label is used for the ``DB`` and the ``FB`` entity.

    Parameters
    ----------
    sample_row
        Mapping with string entries ``"DB"`` and ``"FB"`` that supports
        ``.copy()`` and item assignment (a pandas Series row or a dict).

    Returns
    -------
    A copy of ``sample_row`` with two extra entries, ``"DB_labeled"`` and
    ``"FB_labeled"``, each holding one newline-terminated triple line.
    The argument itself is left unmodified.
    """
    label = sample_row["DB"].split("/")[-1].replace(">", "")
    label_str = '{resource} <http://www.w3.org/2000/01/rdf-schema#label> "{label}" . \n'
    # Work on a copy so the caller's row is not changed as a side effect.
    labeled_row = sample_row.copy()
    labeled_row["DB_labeled"] = label_str.format(resource=sample_row["DB"], label=label)
    labeled_row["FB_labeled"] = label_str.format(resource=sample_row["FB"], label=label)
    return labeled_row

In [109]:
# Run create_label on every row (axis=1) and keep the resulting frame under the same name.
sampled = sampled.apply(create_label, axis=1)
sampled.head()

Unnamed: 0,FB,DB,DB_labeled,FB_labeled
7270,/m/02vkvcz,<http://dbpedia.org/resource/Shirley_Ann_Russell>,<http://dbpedia.org/resource/Shirley_Ann_Russe...,/m/02vkvcz <http://www.w3.org/2000/01/rdf-sche...
860,/m/0gg5kmg,<http://dbpedia.org/resource/Killing_Them_Softly>,<http://dbpedia.org/resource/Killing_Them_Soft...,/m/0gg5kmg <http://www.w3.org/2000/01/rdf-sche...
5390,/m/01pk8b,<http://dbpedia.org/resource/Jeddah>,<http://dbpedia.org/resource/Jeddah> <http://w...,/m/01pk8b <http://www.w3.org/2000/01/rdf-schem...
5191,/m/0mbhr,<http://dbpedia.org/resource/Joanna_Lumley>,<http://dbpedia.org/resource/Joanna_Lumley> <h...,/m/0mbhr <http://www.w3.org/2000/01/rdf-schema...
11964,/m/0cw51,<http://dbpedia.org/resource/Bhopal>,<http://dbpedia.org/resource/Bhopal> <http://w...,/m/0cw51 <http://www.w3.org/2000/01/rdf-schema...


In [110]:
# Append the generated label lines after the lines read from each triple file.
db_label_lines = sampled["DB_labeled"].to_list()
fb_label_lines = sampled["FB_labeled"].to_list()
DB_lines_labeled = DB_lines + db_label_lines
FB_lines_labeled = FB_lines + fb_label_lines

In [111]:
# Open the two output files for writing; mode "w" truncates any existing content.
DB_label = open("test_DB.nt", mode="w")
FB_label = open("test_FB.nt", mode="w")

In [112]:
# Write the labelled lines and close both handles so buffered data reaches disk.
# Using the already-open files as context managers closes them even if a write fails.
with DB_label, FB_label:
    DB_label.writelines(DB_lines_labeled)
    FB_label.writelines(FB_lines_labeled)

## Testing some accuracy measures

In [113]:
# Read the raw same-as lines; the context manager closes the file once they are loaded
# (same_file stays bound afterwards, but refers to a closed file).
with open("data/DB15K_SameAsLink.nt", "r") as same_file:
    same_list = same_file.readlines()

In [114]:
def _normalise_same_as_line(line):
    """Reduce one raw same-as line to the ``"<FB id> dbp:<path>"`` form.

    Removes the `` <SameAs>`` predicate, shortens the
    ``<http://dbpedia.org/`` prefix to ``dbp:`` and drops trailing whitespace
    plus the closing ``> .`` of the line. The suffix is only removed when it
    is actually present, so a last line without a newline is handled and
    applying the function to an already normalised string is a no-op.
    """
    line = line.replace(" <SameAs>", "").replace("<http://dbpedia.org/", "dbp:").rstrip()
    if line.endswith("> ."):
        line = line[:-3]
    return line


same_list = [_normalise_same_as_line(same) for same in same_list]

In [115]:
# Preview the first three normalised reference links.
same_list[:3]

['/m/01m4kpp dbp:resource/Andy_Griffith',
 '/m/01s47p dbp:resource/Spanish_Empire',
 '/m/0yyts dbp:resource/Driving_Miss_Daisy']

In [116]:
# Load the first two tab-separated columns of the result file (no header row)
# and convert them to a list of two-element lists.
res_frame = pd.read_csv(
    "output/0.5/0/9_eqv.tsv",
    sep="\t",
    header=None,
    usecols=[0, 1],
)
res_list = res_frame.values.tolist()

In [117]:
# Join each two-element pair into one space-separated string.
# Entries that are already strings are kept unchanged, so re-running this cell
# does not split them into space-separated characters.
res_list = [pair if isinstance(pair, str) else " ".join(pair) for pair in res_list]

In [118]:
# Preview the first three joined result pairs.
res_list[:3]

['/m/027rn dbp:resource/Dominican_Republic',
 '/m/06v8s0 dbp:resource/Wendee_Lee',
 '/m/017dcd dbp:resource/Mighty_Morphin_Power_Rangers']

In [119]:
def precision(same_list, res_list):
    """Return the fraction of distinct results that are reference links.

    Both arguments are converted to sets, so duplicates count once.

    Parameters
    ----------
    same_list : iterable of hashable items, the reference links.
    res_list : iterable of hashable items, the produced results.

    Returns
    -------
    float
        ``|same ∩ res| / |res|``, or ``0.0`` when ``res_list`` is empty
        (instead of raising ``ZeroDivisionError``).
    """
    same_set = set(same_list)
    res_set = set(res_list)
    if not res_set:
        return 0.0
    return len(same_set & res_set) / len(res_set)

In [120]:
# Precision of the result pairs measured against the reference links.
prec = precision(same_list=same_list, res_list=res_list)
prec

0.9792908374583291

In [121]:
def recall(same_list, res_list):
    """Return the fraction of distinct reference links found in the results.

    Both arguments are converted to sets, so duplicates count once.

    Parameters
    ----------
    same_list : iterable of hashable items, the reference links.
    res_list : iterable of hashable items, the produced results.

    Returns
    -------
    float
        ``|same ∩ res| / |same|``, or ``0.0`` when ``same_list`` is empty
        (instead of raising ``ZeroDivisionError``).
    """
    same_set = set(same_list)
    res_set = set(res_list)
    if not same_set:
        return 0.0
    return len(same_set & res_set) / len(same_set)

In [122]:
# Recall of the result pairs measured against the reference links.
rec = recall(same_list=same_list, res_list=res_list)
rec

0.754631791997509

In [123]:
def f1_score(precision, recall):
    """Return the harmonic mean of ``precision`` and ``recall``.

    Parameters
    ----------
    precision : number
    recall : number

    Returns
    -------
    float
        ``2 * precision * recall / (precision + recall)``, or ``0.0`` when
        the two values sum to zero (instead of raising ``ZeroDivisionError``).
    """
    denominator = precision + recall
    if denominator == 0:
        return 0.0
    return 2*precision*recall/denominator

In [124]:
# Combine the precision and recall computed above into a single F1 value.
f1 = f1_score(precision=prec, recall=rec)
f1

0.8524071224444933

## Check that we find at least all the seeds

In [125]:
# Show the sampled frame again; its FB/DB columns are the seed pairs checked below.
sampled.head()

Unnamed: 0,FB,DB,DB_labeled,FB_labeled
7270,/m/02vkvcz,<http://dbpedia.org/resource/Shirley_Ann_Russell>,<http://dbpedia.org/resource/Shirley_Ann_Russe...,/m/02vkvcz <http://www.w3.org/2000/01/rdf-sche...
860,/m/0gg5kmg,<http://dbpedia.org/resource/Killing_Them_Softly>,<http://dbpedia.org/resource/Killing_Them_Soft...,/m/0gg5kmg <http://www.w3.org/2000/01/rdf-sche...
5390,/m/01pk8b,<http://dbpedia.org/resource/Jeddah>,<http://dbpedia.org/resource/Jeddah> <http://w...,/m/01pk8b <http://www.w3.org/2000/01/rdf-schem...
5191,/m/0mbhr,<http://dbpedia.org/resource/Joanna_Lumley>,<http://dbpedia.org/resource/Joanna_Lumley> <h...,/m/0mbhr <http://www.w3.org/2000/01/rdf-schema...
11964,/m/0cw51,<http://dbpedia.org/resource/Bhopal>,<http://dbpedia.org/resource/Bhopal> <http://w...,/m/0cw51 <http://www.w3.org/2000/01/rdf-schema...


In [126]:
# Take the FB and DB columns of the sample as a list of [FB, DB] lists and show the last ten.
seed_pairs = sampled[["FB", "DB"]]
seed_list = seed_pairs.values.tolist()
seed_list[-10:]

[['/m/018ym2', '<http://dbpedia.org/resource/Plovdiv>'],
 ['/m/016ppr', "<http://dbpedia.org/resource/Destiny's_Child>"],
 ['/m/03gfvsz', '<http://dbpedia.org/resource/KYMX>'],
 ['/m/0b90_r', '<http://dbpedia.org/resource/Mexico>'],
 ['/m/0gtvpkw', '<http://dbpedia.org/resource/To_Rome_with_Love_(film)>'],
 ['/m/0r2l7', '<http://dbpedia.org/resource/Orange,_California>'],
 ['/m/0dr3sl', '<http://dbpedia.org/resource/Shrek>'],
 ['/m/03t22m', '<http://dbpedia.org/resource/Soprano_saxophone>'],
 ['/m/038rz', '<http://dbpedia.org/resource/German_cuisine>'],
 ['/m/0chnf', '<http://dbpedia.org/resource/Solaris_(operating_system)>']]

In [127]:
def _seed_pair_to_key(pair):
    """Turn one ``[FB, DB]`` pair into the ``"<FB> dbp:<path>"`` string form.

    The two items are joined with a space, the ``<http://dbpedia.org/`` prefix
    is shortened to ``dbp:`` and a trailing ``>`` is dropped when present.
    A value that is already a string is returned unchanged, so re-running the
    cell does not join its individual characters or cut off further text.
    """
    if isinstance(pair, str):
        return pair
    key = " ".join(pair).replace("<http://dbpedia.org/", "dbp:")
    return key[:-1] if key.endswith(">") else key


seed_list = [_seed_pair_to_key(x) for x in seed_list]

In [128]:
# Report the number of seed entries and preview the first three.
seed_count = len(seed_list)
print("the length of the seed_list is {}".format(seed_count))
seed_list[:3]

the length of the seed_list is 6423


['/m/02vkvcz dbp:resource/Shirley_Ann_Russell',
 '/m/0gg5kmg dbp:resource/Killing_Them_Softly',
 '/m/01pk8b dbp:resource/Jeddah']

In [129]:
# Report the number of result entries and preview the first three.
result_count = len(res_list)
print("the length of the result is {}".format(result_count))
res_list[:3]

the length of the result is 9899


['/m/027rn dbp:resource/Dominican_Republic',
 '/m/06v8s0 dbp:resource/Wendee_Lee',
 '/m/017dcd dbp:resource/Mighty_Morphin_Power_Rangers']

In [130]:
# Collect the seed entries that do not occur among the results and print how many there are.
# Membership is tested against a set, so each lookup is constant time
# instead of a scan over the whole res_list.
res_lookup = set(res_list)
not_found = [s for s in seed_list if s not in res_lookup]
c = len(not_found)
print(c)

1536


In [101]:
# Show the last ten seed entries that were not found in the results.
not_found[-10:]

['/m/03205_ dbp:resource/Adelphi_University',
 '/m/05b1062 dbp:resource/Asit_Sen_(director)',
 '/m/02ryyk dbp:resource/Republic_of_Ireland_national_football_team',
 '/m/024hbv dbp:resource/Homicide:_Life_on_the_Street',
 '/m/02g3gw dbp:resource/Saturn_Award_for_Best_Writing',
 '/m/0kn4c dbp:resource/William_Pitt_the_Younger',
 '/m/054gwt dbp:resource/The_Challenge_(TV_series)',
 '/m/0j8cb dbp:resource/Carolina_Hurricanes',
 "/m/026xxv_ dbp:resource/North_Carolina_Tar_Heels_men's_basketball",
 '/m/01tffp dbp:resource/Lebanese_Civil_War']

Credo che il problema stia nel fatto che usiamo `sampled`, che è estratto con la stessa percentuale di sampling ma non è lo stesso campione usato per generare il risultato (quello in `res_list`). Per fare le cose per bene si dovrebbe caricare il file di input usato per generare quel risultato. Credo che si trovi nella directory `data/seeded/`.

In [11]:
# Recall values on [0, 1] at which the curve is evaluated.
x = np.linspace(0, 1, 100)

# F1 as a function of recall for one fixed precision value.
# The constant is deliberately not named `precision`: that would replace the
# precision() function defined earlier in this notebook.
fixed_precision = 0.6
y = 2 * x * fixed_precision / (x + fixed_precision)

# Single square figure; the explicit fig/ax objects are used instead of pyplot state.
fig = plt.figure(figsize=(10, 10))
ax = fig.add_subplot(1, 1, 1)
ax.plot(x, y, 'r')
# Show the full [0, 1] range on both axes.
ax.set_xbound(lower=0.0, upper=1.0)
ax.set_ybound(lower=0.0, upper=1.0)
ax.set_title("F1 score for fixed precision {}".format(fixed_precision), fontsize=17)
ax.set_xlabel('Recall', fontsize=15)
ax.set_ylabel('F1 score', fontsize=15)
fig.savefig("plots/f1_score_plot.pdf")
plt.close(fig)