In [2]:
import numpy as np
import pandas as pd
from gensim import utils
from gensim.models.doc2vec import TaggedDocument
from gensim.models import Doc2Vec
from gensim.parsing.preprocessing import preprocess_string,remove_stopwords
import random
import warnings
from scipy.spatial.distance import pdist
warnings.filterwarnings("ignore")

In [33]:
import scipy.io
from collections import Counter
from itertools import product
from tqdm import trange
from tqdm import tqdm
from copy import deepcopy

In [77]:
def normalize(word_vec):
    """Scale ``word_vec`` to unit L2 norm along axis 0.

    Parameters
    ----------
    word_vec : array-like
        A 1-D vector, or a 2-D array whose columns are normalised
        independently (the norm is taken along axis 0).

    Returns
    -------
    array-like
        ``word_vec`` divided by its L2 norm. Vectors/columns whose norm is
        zero are left unscaled; when every norm is zero the input object
        itself is returned unchanged.
    """
    norm = np.linalg.norm(word_vec, ord=2, axis=0)

    # 1-D input: the norm is a scalar, so a plain comparison is safe.
    if np.ndim(norm) == 0:
        return word_vec if norm == 0 else word_vec / norm

    # 2-D input: one norm per column. Comparing the whole array with
    # ``if norm == 0`` would raise "truth value of an array is ambiguous".
    if not norm.any():
        return word_vec
    # Divide zero-norm columns by 1 so they stay as they are instead of
    # producing NaN/inf.
    return word_vec / np.where(norm == 0, 1, norm)

Calculation of the similarity matrix for PECO

In [59]:
# Read the PECO definition sheet; the preview below shows a 'peco' name
# column and a 'def' free-text definition column.
peco_def_path = r'..\data\peco_def.xlsx'
lines = pd.read_excel(peco_def_path)
print(lines.head())
print(type(lines))

                          peco  \
0     sodium chloride exposure   
1             unknown exposure   
2  Pseudomonas avenae exposure   
3             greenhouse study   
4       abscisic acid exposure   

                                                 def  
0  A salt exposure (PECO:0007185) involving the u...  
1  A plant exposure (PECO:0001001) where there is...  
2  The treatment involving exposure of plant to t...  
3  The treatment where the plants were grown unde...  
4  A growth hormone exposure (PECO:0007165) invol...  
<class 'pandas.core.frame.DataFrame'>


In [3]:
class DocumentDataset(object) :
    """Corpus of ``TaggedDocument`` objects built from one text column.

    Each row of the chosen column is stop-word filtered and tokenised, then
    wrapped in a ``TaggedDocument`` whose single tag is the row's index label.
    NOTE(review): later cells look vectors up with ``range(len(...))``, which
    only matches these tags when the DataFrame has a default 0..N-1 index.
    """

    def __init__(self,data:pd.DataFrame,column) :
        """Preprocess ``data[column]`` and store the tagged documents.

        Parameters
        ----------
        data : pd.DataFrame
            Table holding the raw text.
        column :
            Label of the column containing the text to embed.
        """
        document = data[column].apply(self.preprocess)
        # Series.items() replaces iteritems(), which was removed in pandas 2.0.
        self.documents = [TaggedDocument(text,[index])
                          for index,text in document.items()]

    def preprocess(self,document) :
        """Return the token list for one raw text: stop words removed, then
        gensim's ``preprocess_string`` applied."""
        return preprocess_string(remove_stopwords(document))

    def __iter__(self):
        """Yield the stored tagged documents one by one."""
        for document in self.documents:
            yield document

    def tagged_documents(self,shuffle=False) :
        """Return the list of tagged documents.

        When ``shuffle`` is true the internal list is shuffled in place
        first, so the new order persists for subsequent calls.
        """
        if shuffle:
            random.shuffle(self.documents)
        return self.documents


In [60]:
# Tokenise and tag the PECO definitions held in the 'def' column.
document_dataset = DocumentDataset(data=lines, column='def')

In [7]:
# Configure a 100-dimensional Doc2Vec model and register the vocabulary of
# the current corpus before any training happens.
# NOTE(review): no seed is set, so repeated runs will not give identical vectors.
docVecModel = Doc2Vec(
    vector_size=100,
    window=5,
    min_count=1,
    sample=1e-4,
    negative=5,
    workers=2,
)

docVecModel.build_vocab(document_dataset.tagged_documents())

In [62]:
# Shuffle the corpus once (in place), then run 10 training epochs over it.
shuffled_documents = document_dataset.tagged_documents(shuffle=True)
docVecModel.train(
    shuffled_documents,
    total_examples=docVecModel.corpus_count,
    epochs=10,
)

In [49]:
# Persist the trained PECO model to disk.
peco_model_path = r'..\data\pecoVecModel.d2v'
docVecModel.save(peco_model_path)

In [83]:
# One learned document vector per PECO term, looked up by its integer tag
# (tags are the row index labels assigned in DocumentDataset).
pl = [docVecModel[i] for i in range(len(lines['peco']))]

# Pearson correlation between every pair of PECO vectors. np.corrcoef treats
# each row as one variable, so a single argument already yields the N x N
# matrix; passing the list twice stacks it onto itself and returns a
# redundant 2N x 2N matrix.
PSSM = np.corrcoef(pl)


In [89]:
# Write the PECO similarity matrix as comma-separated fixed-point text.
np.savetxt(fname=r'PSSM.txt', X=PSSM, fmt='%f', delimiter=',')

Calculation of the similarity matrix for Gene

In [4]:
# Read the gene definition sheet; the preview below shows a 'gene' name
# column and a 'def' free-text definition column.
# NOTE: this rebinds `lines`, which previously held the PECO definitions.
gene_def_path = r'..\data\gene_name_def.xlsx'
lines = pd.read_excel(gene_def_path)
print(lines.head())
print(len(lines))

                                       gene  \
0                            LOC_Os01g64660   
1                            LOC_Os03g38000   
2                            LOC_Os10g20630   
3  BTH-induced ERF transcriptional factor 2   
4                    Ent-kaurene synthase 6   

                                                 def  
0  Catalysis of the reaction: D-fructose 1,6-bisp...  
1  Binding to an RNA molecule or a portion thereo...  
2                   Binding to a calcium ion (Ca2+).  
3  A transcription regulator activity that modula...  
4  Reactions, triggered in response to the presen...  
12187


In [5]:
# Tokenise and tag the gene definitions held in the 'def' column.
document_dataset = DocumentDataset(data=lines, column='def')

In [8]:
# Build a fresh model for the gene corpus. Reusing the model from the PECO
# section would keep its PECO vocabulary, its PECO-sized document-vector
# table and its PECO corpus_count, none of which match the gene documents.
docVecModel = Doc2Vec(min_count=1,
                    window=5,
                    vector_size=100,sample=1e-4,
                    negative=5,
                    workers=2)

# The vocabulary (and one vector slot per document tag) must be registered
# from the gene corpus before training on it.
docVecModel.build_vocab(document_dataset.tagged_documents())

# Shuffle the corpus in place and run 10 training epochs.
docVecModel.train(document_dataset.tagged_documents(shuffle=True),
            total_examples = docVecModel.corpus_count,
            epochs=10)

In [9]:
# Persist the gene model with a path relative to the notebook (same folder
# as the PECO model) rather than an absolute path that only exists on one
# machine.
# NOTE(review): anything that loads the model from the old
# D:\MDA-GCNFTG-main\GDA location must be pointed at this file instead.
docVecModel.save(r'..\data\geneVecModel.d2v')

In [11]:
# One learned document vector per gene, looked up by its integer tag
# (tags are the row index labels assigned in DocumentDataset).
gl = [docVecModel[i] for i in range(len(lines['gene']))]

# Pearson correlation between every pair of gene vectors. np.corrcoef treats
# each row as one variable, so a single argument already yields the N x N
# matrix; passing the list twice stacks it onto itself and returns a 2N x 2N
# matrix, quadrupling memory use for no additional information.
GSSM = np.corrcoef(gl)

In [20]:
# Write the gene similarity matrix as text with three decimals per value
# (default delimiter).
np.savetxt(fname='GSSM.txt', X=GSSM, fmt='%.3f')

In [25]:
# Raw strings keep the Windows-style paths byte-identical while avoiding the
# invalid escape sequences (\d, \p, \g, \k) that plain literals would contain.

# Name tables: the row position of each name becomes its integer index.
peco_name = pd.read_excel(r'.\data\peco_name.xlsx', header=None, names=['peco'])
peco_name['peco_idx'] = peco_name.index

gene_name = pd.read_excel(r'.\data\gene_name.xlsx', header=None, names=['gene'])
gene_name['gene_idx'] = gene_name.index

# Known (peco, gene) pairs, annotated with both indices and a positive label.
# The merges are inner joins, so a pair whose peco or gene name is missing
# from the name tables is dropped.
known_associations = (
    pd.read_excel(r'.\data\known_associations.xlsx', header=None, names=['peco', 'gene'])
    .merge(peco_name, on='peco')
    .merge(gene_name, on='gene')
    .assign(label=1)
    .drop_duplicates()
    .reset_index(drop=True)
)
known_associations

Unnamed: 0,peco,gene,peco_idx,gene_idx,label
0,unknown exposure,13113.t00029,1,104,1
1,Magnaporthe grisea exposure,13113.t00029,12,104,1
2,unknown exposure,13113.t00060,1,11064,1
3,continuous dark (no light) exposure,13113.t00060,20,11064,1
4,unknown exposure,13113.t00104,1,10233,1
...,...,...,...,...,...
23553,benzothiadiazole exposure,LOC_Os10g40950,27,2372,1
23554,methyl jasmonate exposure,LOC_Os02g55990,14,4481,1
23555,methyl jasmonate exposure,LOC_Os04g44670,14,3338,1
23556,methyl jasmonate exposure,LOC_Os10g10030,14,4812,1
