In [None]:
import pandas as pd
import numpy as np
import scipy as sp
from drfeelgood import biomart 
bm = biomart.Biomart()
import scipy.linalg
import networkx as nx
import matplotlib.pyplot as plt 


class Diffusion(object):
    """
    Rank drugs for a set of disease genes by diffusing heat over a network.

    For every requested biological level a precomputed adjacency matrix is
    loaded, the disease genes are wired to the matrix's last row/column (the
    "Disease" node), and the diffusion result read from that node is stored
    in ``self.diffl``. ``ranking()`` turns those results into ranked tables.
    Only the "string" (protein-protein interaction) level is implemented.
    """

    # Taxonomy prefix that precedes the protein identifiers in the input files.
    _TAXON_PREFIX = '9606.'

    def __init__(self, genes, b_list=None):
        """
        Variables:
        ------------
        genes = A list of genes that the user wants to investigate.
        b_list = A list of different biological levels (case-insensitive).
                 None is treated as an empty list, in which case nothing is
                 loaded or computed and ranking() returns an empty list.

        """
        self.genes = genes

        # The default is None, which cannot be iterated; fall back to "no levels".
        self.bio_list = [x.lower() for x in (b_list or [])]

        DF = []
        self.gn = []
        self.dn = []

        if "string" in self.bio_list:
            print("diffusion PPI's")

            # Inputs are only read when this level is actually requested.
            string = pd.read_csv("9606.protein.links.v11.0.txt", sep=' ')
            string = string[string.combined_score >= 700]

            matrix_ppi = np.load('heatflow_matrix_3.0.pkl.npy')

            dt = pd.read_csv("mapped_DB_STITCH_actions_first.tsv", sep='\t')
            dt['gName'] = [
                bm.protein_to_name(self._strip_taxon(p)) for p in dt['item_id_b']
            ]
            # NOTE(review): the original also built a copy of this table with
            # rows lacking a gene name dropped, but never used it; the drug
            # list below is therefore taken from the unfiltered table. Confirm
            # which variant the precomputed matrix was built from.

            proteins = set(string.protein1) | set(string.protein2)
            gnp = list(proteins)
            dp = dt[dt['item_id_b'].isin(proteins)]
            dnp = sorted(set(dp.Name))
            filled = self.disease(gnp, dnp, matrix_ppi)
            DF.append(filled)
            self.gn.append(gnp)
            self.dn.append(dnp)

        self.diffl = [self.diffusion(x) for x in DF]

    @classmethod
    def _strip_taxon(cls, protein_id):
        """Return protein_id without the leading taxonomy prefix, if present."""
        # str.lstrip('9606.') would strip any run of the characters 9, 6, 0
        # and '.', which can also eat leading digits of the identifier itself.
        if protein_id.startswith(cls._TAXON_PREFIX):
            return protein_id[len(cls._TAXON_PREFIX):]
        return protein_id

    def disease(self, gn, dn, nmatrix):
        """
        Variables:
        ----------
        gn = list of proteins that are present in the dataset defining the biological levels.
        dn = list of drug names related to those proteins (not used here).
        nmatrix = square matrix whose last row/column is the disease node;
                  it is modified in place.

        Returns:
        ----------
        nmatrix = the same matrix object, with a 1 set in the last row and
                  last column for every disease gene that maps to a protein
                  in gn.
        """
        # Row/column index of each protein: position in the sorted protein list.
        GM = {n: i for i, n in enumerate(sorted(gn))}

        protein_ids = {bm.entrez_to_protein(g) for g in self.genes}
        # Drop genes without a mapping and keep only proteins in the network.
        protein_h = {self._TAXON_PREFIX + p for p in protein_ids if p} & set(gn)

        for r in protein_h:
            nmatrix[-1, GM[r]] = 1
            nmatrix[GM[r], -1] = 1

        return nmatrix

    def laplacian(self, matrix):
        """
        Variables:
        ------------
        matrix = the matrix with proteins and drugs related to the proteins.

        Returns:
        ------------
        The laplacian matrix is returned.
        """
        # Imported here because the file only imports scipy.linalg; relying on
        # scipy.sparse.csgraph being loaded as a side effect is fragile.
        from scipy.sparse import csgraph

        return csgraph.laplacian(matrix)

    def diffusion(self, matrix, beta=0.002):
        """
        Variables:
        ----------
        matrix = the matrix with proteins and drugs related to the proteins.
        beta = scalar the laplacian is multiplied by before the matrix
               exponential is taken. (default = 0.002)

        Returns:
        ----------
        P = The diffusion results with the heatflow: the row of
            expm(beta * laplacian) that belongs to the last node.
        """
        # Unit source vector on the last node (the disease node).
        S = np.zeros(matrix.shape[0])
        S[-1] = 1
        # NOTE(review): a heat kernel is conventionally exp(-beta * L); the
        # sign here is positive. Left as is - confirm it is intended.
        D = sp.linalg.expm(beta * self.laplacian(matrix))
        return np.dot(S, D)

    def ranking(self):
        """
        Returns:
        -----------
        ranking_l = A list with one dataframe per biological level. Each holds
                    only the drug nodes with the columns 'Nodes', 'HeatFlow'
                    and 'Ranking' (1 = highest heat flow, ties get their
                    average rank), sorted by 'Ranking'.
        """
        ranking_l = []
        for x, y, z in zip(self.gn, self.dn, self.diffl):
            # Node order assumed for the matrix: sorted proteins, sorted
            # drugs, then the disease node. zip() stops at the shorter input,
            # so a matrix of a different size is not detected here.
            nodes = sorted(x) + sorted(y) + ["Disease"]

            df = pd.DataFrame(list(zip(nodes, z)), columns=['Nodes', 'HeatFlow'])

            # Copy so that adding a column does not write into a view of df.
            drugsdf = df[df['Nodes'].isin(y)].copy()
            drugsdf['Ranking'] = drugsdf['HeatFlow'].rank(ascending=False)

            ranking_l.append(drugsdf.sort_values(by=['Ranking'], ascending=True))

        return ranking_l
        
        

In [None]:
# Chemical-disease table. Column names are supplied explicitly and anything
# from a '#' to the end of a line is skipped as a comment.
chemdis = pd.read_csv(
    'CTD_chemicals_diseases.tsv',
    sep='\t',
    comment='#',
    names=[
        'ChemicalName',
        'ChemicalID',
        'CasRN',
        'DiseaseName',
        'DiseaseID',
        'DirectEvidence',
        'InferenceGeneSymbol',
        'InferenceScore',
        'OmimIDs',
        'PubMedIDs',
    ],
)
# Keep only rows whose direct evidence is 'therapeutic'.
chemdisT = chemdis.loc[chemdis['DirectEvidence'] == 'therapeutic']

# Chemical-gene interaction table, read the same way.
chemgene = pd.read_csv(
    'CTD_chem_gene_ixns.tsv',
    sep='\t',
    comment='#',
    names=[
        'ChemicalName',
        'ChemicalID',
        'CasRN',
        'GeneSymbol',
        'GeneID',
        'GeneForms',
        'Organism',
        'OrganismID',
        'Interaction',
        'InteractionActions',
        'PubMedIDs',
    ],
)
# Restrict to human rows, then drop the columns that also exist in the
# chemical-disease table (plus OrganismID) so the merge adds no suffixes.
chemgeneH = chemgene.loc[chemgene['Organism'] == 'Homo sapiens']
chemgeneH = chemgeneH.drop(
    columns=['ChemicalID', 'CasRN', 'OrganismID', 'PubMedIDs']
)

# Join genes to therapeutic diseases through the shared chemical name.
THmerge = chemgeneH.merge(chemdisT, on='ChemicalName')
THmerge = THmerge.drop_duplicates()

In [None]:
# Gene IDs of every merged row whose disease is 'Endometrial Neoplasms'.
is_target_disease = THmerge['DiseaseName'] == 'Endometrial Neoplasms'
ACR_genes = set(THmerge.loc[is_target_disease, 'GeneID'])

# Run the diffusion on the "string" level only, then build the ranked tables.
dif = Diffusion(ACR_genes, b_list=['string'])
print('Make ranking')
rank = dif.ranking()

In [None]:
# Notebook cell output: show the list of ranked drug tables returned by dif.ranking().
rank