In [2]:
from src.models import TransE
from src.model_utils import load_goa_files, generate_GOA_valid_triplets, generate_GOA_train_triplets

In [12]:
# Dataset location settings used when building input file names.
path = 'GOA_Dataset'
sub_dir = ''
# Snapshot date of the dataset release.
version = '2018-01-01'

In [13]:
# Maps the one-letter codes 'F', 'P' and 'C' to the GO term URI used as the relation.
code2rel = {
    'F': 'http://purl.obolibrary.org/obo/GO_0003674',
    'P': 'http://purl.obolibrary.org/obo/GO_0008150',
    'C': 'http://purl.obolibrary.org/obo/GO_0005575',
}

### Load data

In [15]:
# Training split.
df_train = load_goa_files(f'{path}{sub_dir}/train.txt')

In [55]:
# Validation split (file valid_sc1.txt).
df_tpr = load_goa_files(f'{path}{sub_dir}/valid_sc1.txt')

In [17]:
# Test split (file test_sc1.txt).
df_test = load_goa_files(f'{path}{sub_dir}/test_sc1.txt')

### Extract relations

In [18]:
# Accumulator for the predicate of every non-type statement in the training split.
relations = list()

In [19]:
# Collect the predicate ('p' column) of every training row whose predicate is not rdf:type,
# keeping row order. A boolean mask replaces the row-by-row iterrows() loop, which builds a
# Series object per row and is much slower on a frame of this size.
not_type_stmt = df_train['p'] != 'http://www.w3.org/1999/02/22-rdf-syntax-ns#type'
relations.extend(df_train.loc[not_type_stmt, 'p'].tolist())

In [20]:
# Total number of predicates collected in `relations` (duplicates included).
len(relations)

118298

In [21]:
# show count of each relation
# Count every predicate in a single pass instead of re-filtering df_train once per relation.
predicate_counts = df_train['p'].value_counts()
# dict.fromkeys de-duplicates while keeping first-appearance order, so the listing is the
# same on every run (iteration order of a set of strings is not stable across sessions).
for rel in dict.fromkeys(relations):
    print(rel,' ',predicate_counts.get(rel, 0))

http://purl.obolibrary.org/obo/GO_0003674   25633
http://purl.obolibrary.org/obo/GO_0008150   46569
http://purl.obolibrary.org/obo/BFO_0000050   20525
http://purl.obolibrary.org/obo/GO_0005575   25571


### Train and run experiments

In [56]:
# Relation selection: leave exactly one of the assignments below active.

combinations = [code2rel['F']]
#combinations = [code2rel['P']]
#combinations = [code2rel['C']]
#combinations = code2rel.values()

# Build the training triplets; the helper also returns the relation/node counts and the
# node/relation lookup objects that are reused for the other splits.
ret = generate_GOA_train_triplets(combinations, relations, df_train)
triplets, rel_count, node_count, node_dict, rels_dict = ret[:5]
print(node_count, rel_count, len(triplets))

# Convert the validation and test frames with the lookups built from the training split,
# printing the number of triplets produced next to the number of rows in each frame.
tpr_triplets = generate_GOA_valid_triplets(df_tpr, combinations, node_dict, rels_dict)
print(len(tpr_triplets), len(df_tpr))

test_triplets = generate_GOA_valid_triplets(df_test, combinations, node_dict, rels_dict)
print(len(test_triplets), len(df_test))

# Run 1: fresh model trained on the training triplets only, evaluated on the test triplets.
model_TransE = TransE(node_count, rel_count, emb_dim=400)
model_TransE._train(triplets, [], train_batch_size=1000, num_epoches=500)
model_TransE._eval(test_triplets)

# Run 2: fresh model trained on validation + training triplets, evaluated on the same test triplets.
model_TransE = TransE(node_count, rel_count, emb_dim=400)
model_TransE._train(tpr_triplets + triplets, [], train_batch_size=1000, num_epoches=500)
model_TransE._eval(test_triplets)

14072 1 25633
1223 6059
1314 6060
epoch 0,	 train loss 1.13
epoch 50,	 train loss 1.01
epoch 150,	 train loss 0.95
epoch 200,	 train loss 0.88
epoch 250,	 train loss 0.84
epoch 300,	 train loss 0.78
epoch 350,	 train loss 0.73
epoch 400,	 train loss 0.68
hits@1  tensor(0.0274) ,hits@10  tensor(0.2808) ,MR  tensor(21.9132) ,MRR  tensor(0.1108)
epoch 0,	 train loss 1.08
epoch 100,	 train loss 0.98
epoch 150,	 train loss 0.93
epoch 200,	 train loss 0.82
epoch 300,	 train loss 0.74
epoch 350,	 train loss 0.72
epoch 400,	 train loss 0.69
epoch 450,	 train loss 0.66
hits@1  tensor(0.0183) ,hits@10  tensor(0.2093) ,MR  tensor(27.6804) ,MRR  tensor(0.0913)


In [51]:
# Relation selection: leave exactly one of the assignments below active.

#combinations = [code2rel['F']]
combinations = [code2rel['P']]
#combinations = [code2rel['C']]
#combinations = code2rel.values()

# Build the training triplets; the helper also returns the relation/node counts and the
# node/relation lookup objects that are reused for the other splits.
ret = generate_GOA_train_triplets(combinations, relations, df_train)
triplets, rel_count, node_count, node_dict, rels_dict = ret[:5]
print(node_count, rel_count, len(triplets))

# Convert the validation and test frames with the lookups built from the training split,
# printing the number of triplets produced next to the number of rows in each frame.
tpr_triplets = generate_GOA_valid_triplets(df_tpr, combinations, node_dict, rels_dict)
print(len(tpr_triplets), len(df_tpr))

test_triplets = generate_GOA_valid_triplets(df_test, combinations, node_dict, rels_dict)
print(len(test_triplets), len(df_test))

# Run 1: fresh model trained on the training triplets only, evaluated on the test triplets.
model_TransE = TransE(node_count, rel_count, emb_dim=400)
model_TransE._train(triplets, [], train_batch_size=1000, num_epoches=500)
model_TransE._eval(test_triplets)

# Run 2: fresh model trained on validation + training triplets, evaluated on the same test triplets.
model_TransE = TransE(node_count, rel_count, emb_dim=400)
model_TransE._train(tpr_triplets + triplets, [], train_batch_size=1000, num_epoches=500)
model_TransE._eval(test_triplets)

19187 1 46569
2081 6059
2158 6060
epoch 50,	 train loss 0.95
epoch 150,	 train loss 0.88
epoch 200,	 train loss 0.84
epoch 250,	 train loss 0.84
epoch 300,	 train loss 0.80
epoch 350,	 train loss 0.78
epoch 400,	 train loss 0.79
epoch 450,	 train loss 0.77
hits@1  tensor(0.0088) ,hits@10  tensor(0.1895) ,MR  tensor(26.8976) ,MRR  tensor(0.0790)
epoch 0,	 train loss 1.09
epoch 50,	 train loss 0.86
epoch 100,	 train loss 0.84
epoch 150,	 train loss 0.83
epoch 200,	 train loss 0.86
epoch 250,	 train loss 0.85
epoch 300,	 train loss 0.80
epoch 350,	 train loss 0.78
epoch 400,	 train loss 0.74
epoch 450,	 train loss 0.77
hits@1  tensor(0.0102) ,hits@10  tensor(0.1316) ,MR  tensor(30.6293) ,MRR  tensor(0.0669)


In [52]:
# Relation selection: leave exactly one of the assignments below active.

#combinations = [code2rel['F']]
#combinations = [code2rel['P']]
combinations = [code2rel['C']]
#combinations = code2rel.values()

# Build the training triplets; the helper also returns the relation/node counts and the
# node/relation lookup objects that are reused for the other splits.
ret = generate_GOA_train_triplets(combinations, relations, df_train)
triplets, rel_count, node_count, node_dict, rels_dict = ret[:5]
print(node_count, rel_count, len(triplets))

# Convert the validation and test frames with the lookups built from the training split,
# printing the number of triplets produced next to the number of rows in each frame.
tpr_triplets = generate_GOA_valid_triplets(df_tpr, combinations, node_dict, rels_dict)
print(len(tpr_triplets), len(df_tpr))

test_triplets = generate_GOA_valid_triplets(df_test, combinations, node_dict, rels_dict)
print(len(test_triplets), len(df_test))

# Run 1: fresh model trained on the training triplets only, evaluated on the test triplets.
model_TransE = TransE(node_count, rel_count, emb_dim=400)
model_TransE._train(triplets, [], train_batch_size=1000, num_epoches=500)
model_TransE._eval(test_triplets)

# Run 2: fresh model trained on validation + training triplets, evaluated on the same test triplets.
model_TransE = TransE(node_count, rel_count, emb_dim=400)
model_TransE._train(tpr_triplets + triplets, [], train_batch_size=1000, num_epoches=500)
model_TransE._eval(test_triplets)

12037 1 25571
1763 6059
1663 6060
epoch 100,	 train loss 0.98
epoch 150,	 train loss 0.92
epoch 200,	 train loss 0.82
epoch 250,	 train loss 0.78
epoch 350,	 train loss 0.72
hits@1  tensor(0.0553) ,hits@10  tensor(0.4462) ,MR  tensor(23.5346) ,MRR  tensor(0.1739)
epoch 0,	 train loss 1.06
epoch 50,	 train loss 1.05
epoch 150,	 train loss 0.92
epoch 200,	 train loss 0.86
epoch 250,	 train loss 0.79
epoch 300,	 train loss 0.77
epoch 350,	 train loss 0.74
epoch 400,	 train loss 0.73
epoch 450,	 train loss 0.70
hits@1  tensor(0.0962) ,hits@10  tensor(0.5857) ,MR  tensor(15.5003) ,MRR  tensor(0.2552)


In [53]:
# Relation selection: leave exactly one of the assignments below active.

#combinations = [code2rel['F']]
#combinations = [code2rel['P']]
#combinations = [code2rel['C']]
combinations = code2rel.values()

# Build the training triplets; the helper also returns the relation/node counts and the
# node/relation lookup objects that are reused for the other splits.
ret = generate_GOA_train_triplets(combinations, relations, df_train)
triplets, rel_count, node_count, node_dict, rels_dict = ret[:5]
print(node_count, rel_count, len(triplets))

# Convert the validation and test frames with the lookups built from the training split,
# printing the number of triplets produced next to the number of rows in each frame.
tpr_triplets = generate_GOA_valid_triplets(df_tpr, combinations, node_dict, rels_dict)
print(len(tpr_triplets), len(df_tpr))

test_triplets = generate_GOA_valid_triplets(df_test, combinations, node_dict, rels_dict)
print(len(test_triplets), len(df_test))

# Run 1: fresh model trained on the training triplets only, evaluated on the test triplets.
model_TransE = TransE(node_count, rel_count, emb_dim=400)
model_TransE._train(triplets, [], train_batch_size=1000, num_epoches=500)
model_TransE._eval(test_triplets)

# Run 2: fresh model trained on validation + training triplets, evaluated on the same test triplets.
model_TransE = TransE(node_count, rel_count, emb_dim=400)
model_TransE._train(tpr_triplets + triplets, [], train_batch_size=1000, num_epoches=500)
model_TransE._eval(test_triplets)

25516 3 97773
5827 6059
5837 6060
epoch 0,	 train loss 1.11
epoch 50,	 train loss 0.74
epoch 100,	 train loss 0.71
epoch 150,	 train loss 0.70
epoch 200,	 train loss 0.69
epoch 250,	 train loss 0.69
epoch 300,	 train loss 0.68
epoch 350,	 train loss 0.68
epoch 400,	 train loss 0.67
epoch 450,	 train loss 0.67
hits@1  tensor(0.0094) ,hits@10  tensor(0.1100) ,MR  tensor(39.0730) ,MRR  tensor(0.0568)
epoch 0,	 train loss 1.59
epoch 50,	 train loss 0.72
epoch 100,	 train loss 0.69
epoch 150,	 train loss 0.70
epoch 200,	 train loss 0.73
epoch 250,	 train loss 0.70
epoch 300,	 train loss 0.69
epoch 350,	 train loss 0.67
epoch 400,	 train loss 0.73
epoch 450,	 train loss 0.65
hits@1  tensor(0.0082) ,hits@10  tensor(0.1783) ,MR  tensor(32.2431) ,MRR  tensor(0.0686)
