In [1]:
from src.models import TransE
from src.model_utils import load_goa_files, generate_GOA_valid_triplets, generate_GOA_train_triplets

In [2]:
# Settings used to build the names of the GOA split files loaded below.
path = 'GOA_Dataset'     # folder that holds the split files
sub_dir = ''             # concatenated directly onto `path` when file names are built
version = '2019-01-01'   # tag embedded in every split file name

In [3]:
# One-letter code -> IRI of the relation that the experiments below select.
code2rel = {
    'F': 'http://purl.obolibrary.org/obo/GO_0003674',
    'P': 'http://purl.obolibrary.org/obo/GO_0008150',
    'C': 'http://purl.obolibrary.org/obo/GO_0005575',
}

### Load data

In [4]:
# Training split. NOTE(review): `sub_dir` is joined to `path` without a
# separator, so a non-empty value has to carry its own leading '/'.
df_train = load_goa_files(f'{path}{sub_dir}/train_{version}.txt')

In [5]:
# Frame read from the `valid_sc1_<version>.txt` file. NOTE(review): `sub_dir`
# is joined to `path` without a separator, so a non-empty value has to carry
# its own leading '/'.
df_tpr = load_goa_files(f'{path}{sub_dir}/valid_sc1_{version}.txt')

In [6]:
# Frame read from the `test_sc1_<version>.txt` file. NOTE(review): `sub_dir`
# is joined to `path` without a separator, so a non-empty value has to carry
# its own leading '/'.
df_test = load_goa_files(f'{path}{sub_dir}/test_sc1_{version}.txt')

### Extract relations

In [7]:
# Start from an empty list of relation (predicate) values.
relations = list()

In [8]:
# Collect the predicate ('p' column) of every training row that is not an
# rdf:type statement, in row order.
# The list is assigned in one step instead of being appended to row by row:
# re-running this cell therefore no longer duplicates entries in `relations`,
# and the slow Python-level `iterrows` loop is avoided.
RDF_TYPE = 'http://www.w3.org/1999/02/22-rdf-syntax-ns#type'
relations = df_train.loc[df_train['p'] != RDF_TYPE, 'p'].tolist()

In [9]:
# Number of predicate values collected in `relations` (one per non-rdf:type training row).
len(relations)

165642

In [10]:
# Show how many training rows carry each collected relation.
# `value_counts` tallies all relations in a single pass and yields them in
# descending-count order, so the listing is stable from run to run (iterating
# a `set` of strings has no guaranteed order) and the frame is no longer
# rescanned once per relation.
predicates = df_train['p']
rel_counts = predicates[predicates.isin(set(relations))].value_counts()
for rel, count in rel_counts.items():
    print(rel,' ',count)

http://purl.obolibrary.org/obo/GO_0005575   37197
http://purl.obolibrary.org/obo/GO_0003674   33878
http://purl.obolibrary.org/obo/BFO_0000050   29328
http://purl.obolibrary.org/obo/GO_0008150   65239


### Train and run experiments

In [11]:
# Experiment: keep only the 'F' relation from `code2rel`.
combinations = [code2rel['F']]

# Build the training triplets. The helper's result is read by position:
# triplets, relation count, node count, node dictionary, relation dictionary.
ret = generate_GOA_train_triplets(combinations, relations, df_train)
triplets, rel_count, node_count, node_dict, rels_dict = (ret[i] for i in range(5))
print(node_count, rel_count, len(triplets))

# Convert the validation and test frames using the dictionaries obtained above,
# printing how many triplets were produced next to the size of each frame.
tpr_triplets = generate_GOA_valid_triplets(df_tpr, combinations, node_dict, rels_dict)
print(len(tpr_triplets), len(df_tpr))

test_triplets = generate_GOA_valid_triplets(df_test, combinations, node_dict, rels_dict)
print(len(test_triplets), len(df_test))

# Run 1: train on the training triplets alone, then evaluate on the test triplets.
model_TransE = TransE(node_count, rel_count, emb_dim=400)
model_TransE._train(triplets, [], train_batch_size=1000, num_epoches=500)
model_TransE._eval(test_triplets)

# Run 2: fresh model trained on validation + training triplets, same test set.
model_TransE = TransE(node_count, rel_count, emb_dim=400)
model_TransE._train(tpr_triplets + triplets, [], train_batch_size=1000, num_epoches=500)
model_TransE._eval(test_triplets)

16445 1 33878
1461 7888
1445 7888
epoch 0,	 train loss 1.08
epoch 100,	 train loss 1.00
epoch 150,	 train loss 0.93
epoch 200,	 train loss 0.87
epoch 250,	 train loss 0.80
epoch 350,	 train loss 0.71
epoch 400,	 train loss 0.70
epoch 450,	 train loss 0.69
hits@1  tensor(0.0256) ,hits@10  tensor(0.2567) ,MR  tensor(23.0048) ,MRR  tensor(0.1089)
epoch 0,	 train loss 1.09
epoch 50,	 train loss 1.02
epoch 100,	 train loss 0.99
epoch 150,	 train loss 0.91
epoch 200,	 train loss 0.84
epoch 250,	 train loss 0.79
epoch 300,	 train loss 0.77
epoch 350,	 train loss 0.70
epoch 400,	 train loss 0.67
hits@1  tensor(0.0180) ,hits@10  tensor(0.2941) ,MR  tensor(21.7322) ,MRR  tensor(0.1048)


In [12]:
# Experiment: keep only the 'P' relation from `code2rel`.
combinations = [code2rel['P']]

# Build the training triplets. The helper's result is read by position:
# triplets, relation count, node count, node dictionary, relation dictionary.
ret = generate_GOA_train_triplets(combinations, relations, df_train)
triplets, rel_count, node_count, node_dict, rels_dict = (ret[i] for i in range(5))
print(node_count, rel_count, len(triplets))

# Convert the validation and test frames using the dictionaries obtained above,
# printing how many triplets were produced next to the size of each frame.
tpr_triplets = generate_GOA_valid_triplets(df_tpr, combinations, node_dict, rels_dict)
print(len(tpr_triplets), len(df_tpr))

test_triplets = generate_GOA_valid_triplets(df_test, combinations, node_dict, rels_dict)
print(len(test_triplets), len(df_test))

# Run 1: train on the training triplets alone, then evaluate on the test triplets.
model_TransE = TransE(node_count, rel_count, emb_dim=400)
model_TransE._train(triplets, [], train_batch_size=1000, num_epoches=500)
model_TransE._eval(test_triplets)

# Run 2: fresh model trained on validation + training triplets, same test set.
model_TransE = TransE(node_count, rel_count, emb_dim=400)
model_TransE._train(tpr_triplets + triplets, [], train_batch_size=1000, num_epoches=500)
model_TransE._eval(test_triplets)

23015 1 65239
3209 7888
3202 7888
epoch 50,	 train loss 0.89
epoch 100,	 train loss 0.92
epoch 150,	 train loss 0.87
epoch 250,	 train loss 0.88
epoch 300,	 train loss 0.85
epoch 350,	 train loss 0.86
epoch 400,	 train loss 0.83
epoch 450,	 train loss 0.82
hits@1  tensor(0.0112) ,hits@10  tensor(0.1555) ,MR  tensor(28.4029) ,MRR  tensor(0.0737)
epoch 0,	 train loss 1.06
epoch 50,	 train loss 0.89
epoch 150,	 train loss 0.89
epoch 200,	 train loss 0.81
epoch 250,	 train loss 0.87
epoch 300,	 train loss 0.83
epoch 350,	 train loss 0.82
epoch 450,	 train loss 0.81
hits@1  tensor(0.0128) ,hits@10  tensor(0.1465) ,MR  tensor(28.9447) ,MRR  tensor(0.0733)


In [13]:
# Experiment: keep only the 'C' relation from `code2rel`.
combinations = [code2rel['C']]

# Build the training triplets. The helper's result is read by position:
# triplets, relation count, node count, node dictionary, relation dictionary.
ret = generate_GOA_train_triplets(combinations, relations, df_train)
triplets, rel_count, node_count, node_dict, rels_dict = (ret[i] for i in range(5))
print(node_count, rel_count, len(triplets))

# Convert the validation and test frames using the dictionaries obtained above,
# printing how many triplets were produced next to the size of each frame.
tpr_triplets = generate_GOA_valid_triplets(df_tpr, combinations, node_dict, rels_dict)
print(len(tpr_triplets), len(df_tpr))

test_triplets = generate_GOA_valid_triplets(df_test, combinations, node_dict, rels_dict)
print(len(test_triplets), len(df_test))

# Run 1: train on the training triplets alone, then evaluate on the test triplets.
model_TransE = TransE(node_count, rel_count, emb_dim=400)
model_TransE._train(triplets, [], train_batch_size=1000, num_epoches=500)
model_TransE._eval(test_triplets)

# Run 2: fresh model trained on validation + training triplets, same test set.
model_TransE = TransE(node_count, rel_count, emb_dim=400)
model_TransE._train(tpr_triplets + triplets, [], train_batch_size=1000, num_epoches=500)
model_TransE._eval(test_triplets)

15817 1 37197
2094 7888
2115 7888
epoch 50,	 train loss 1.07
epoch 100,	 train loss 1.02
epoch 150,	 train loss 1.00
epoch 200,	 train loss 0.92
epoch 250,	 train loss 0.90
epoch 300,	 train loss 0.82
epoch 350,	 train loss 0.80
epoch 400,	 train loss 0.77
epoch 450,	 train loss 0.76
hits@1  tensor(0.0312) ,hits@10  tensor(0.2813) ,MR  tensor(27.9745) ,MRR  tensor(0.1116)
epoch 0,	 train loss 1.07
epoch 50,	 train loss 1.09
epoch 100,	 train loss 1.02
epoch 150,	 train loss 1.02
epoch 250,	 train loss 0.90
epoch 300,	 train loss 0.86
epoch 350,	 train loss 0.82
epoch 400,	 train loss 0.80
epoch 450,	 train loss 0.75
hits@1  tensor(0.0199) ,hits@10  tensor(0.2865) ,MR  tensor(26.0681) ,MRR  tensor(0.1046)


In [14]:
# Experiment: use every relation listed in `code2rel` together.
combinations = code2rel.values()

# Build the training triplets. The helper's result is read by position:
# triplets, relation count, node count, node dictionary, relation dictionary.
ret = generate_GOA_train_triplets(combinations, relations, df_train)
triplets, rel_count, node_count, node_dict, rels_dict = (ret[i] for i in range(5))
print(node_count, rel_count, len(triplets))

# Convert the validation and test frames using the dictionaries obtained above,
# printing how many triplets were produced next to the size of each frame.
tpr_triplets = generate_GOA_valid_triplets(df_tpr, combinations, node_dict, rels_dict)
print(len(tpr_triplets), len(df_tpr))

test_triplets = generate_GOA_valid_triplets(df_test, combinations, node_dict, rels_dict)
print(len(test_triplets), len(df_test))

# Run 1: train on the training triplets alone, then evaluate on the test triplets.
model_TransE = TransE(node_count, rel_count, emb_dim=400)
model_TransE._train(triplets, [], train_batch_size=1000, num_epoches=500)
model_TransE._eval(test_triplets)

# Run 2: fresh model trained on validation + training triplets, same test set.
model_TransE = TransE(node_count, rel_count, emb_dim=400)
model_TransE._train(tpr_triplets + triplets, [], train_batch_size=1000, num_epoches=500)
model_TransE._eval(test_triplets)

30281 3 136314
7609 7888
7617 7888
epoch 0,	 train loss 1.02
epoch 50,	 train loss 0.81
epoch 100,	 train loss 0.71
epoch 200,	 train loss 0.69
epoch 250,	 train loss 0.73
epoch 400,	 train loss 0.67
epoch 450,	 train loss 0.67
hits@1  tensor(0.0121) ,hits@10  tensor(0.1101) ,MR  tensor(37.2691) ,MRR  tensor(0.0597)
epoch 0,	 train loss 0.95
epoch 50,	 train loss 0.76
epoch 100,	 train loss 0.72
epoch 150,	 train loss 0.73
epoch 200,	 train loss 0.74
epoch 250,	 train loss 0.71
epoch 300,	 train loss 0.74
epoch 350,	 train loss 0.70
epoch 400,	 train loss 0.71
epoch 450,	 train loss 0.70
hits@1  tensor(0.0102) ,hits@10  tensor(0.1233) ,MR  tensor(36.6224) ,MRR  tensor(0.0610)
