In [17]:
import numpy as np
import scipy.stats as sstats
import pandas as pd 

from collections import namedtuple

class ProteinSet(object):
    """
    A collection of named protein (or gene) sets coming from one database.

    Every named set can be tested for over-representation in another set,
    relative to a background universe, through a 2x2 contingency table.
    """

    # Result record of a single enrichment test; built once instead of per call.
    _Result = namedtuple("setEnrichmentResult",
                         ['oddsratio', 'c2statistic', 'pvalue', 'table', 'method'])

    def __init__(self, proteindict, database):
        """
        Function:
        Stores the sets and the database label on the instance.

        Variables:
        self.proteindict = a dictionary mapping a term name to a set of its members
        (every value of the given dictionary is converted with set()).
        self.database = the name of the database, reported in the 'Database' column of enrich().
        """
        self.proteindict = {name: set(p) for name, p in proteindict.items()}
        self.database = database

    def enrich(self, otherset, background):
        """
        Function:
        Runs set_enrichment() for every named set against `otherset` within
        `background` and collects the results in one dataframe.

        Variables:
        otherset = iterable with the members of the set of interest.
        background = iterable with the members of the universe.

        Returns:
        A dataframe with one row per named set and the columns
        'Database', 'Name', 'oddsratio', 'c2statistic', 'pvalue', 'table',
        'method' and 'proteins' (the members of the named set as a list).
        The frame has these columns even when there are no named sets.
        """
        columns = ['Database', 'Name', 'oddsratio', 'c2statistic', 'pvalue',
                   'table', 'method', 'proteins']

        # Materialise both inputs once so set_enrichment() copies plain sets
        # instead of re-iterating the caller's containers for every term.
        other = set(otherset)
        universe = set(background)

        rows = []
        for name, pset in self.proteindict.items():
            result = self.set_enrichment(pset, other, universe)
            rows.append([self.database, name] + list(result) + [list(pset)])

        # Passing the column names to the constructor keeps the call valid
        # for an empty list of rows.
        return pd.DataFrame(rows, columns=columns)

    def set_enrichment(self, your_set, other_set, universe, abcd_values=False):
        """
        Function:
        Tests the overlap of two sets within a universe with a 2x2 table
        [[a, b], [c, d]] where
        a = members of both sets,
        b = members of other_set only,
        c = members of your_set only,
        d = members of the universe that are in neither set.
        Members outside the universe are ignored.

        Fisher's exact test is used when any cell of the table is <= 5,
        otherwise scipy's chi2_contingency is used.

        Variables:
        your_set, other_set, universe = iterables of hashable members.
        abcd_values = when True the 'table' field holds the four sets
        themselves instead of their sizes.

        Returns:
        A named tuple (oddsratio, c2statistic, pvalue, table, method);
        c2statistic is None when Fisher's exact test was used.
        """
        universe = set(universe)
        your_set = set(your_set) & universe
        other_set = set(other_set) & universe

        a = your_set & other_set
        b = other_set - your_set
        c = your_set - other_set
        d = universe - (your_set | other_set)

        table = [[len(a), len(b)], [len(c), len(d)]]

        # min(min(table)) would compare the two rows lexicographically and
        # only inspect the smaller row; look at every cell instead.
        smallest_cell = min(min(row) for row in table)

        if smallest_cell <= 5:
            method = 'fisher'
            oddsratio, p = sstats.fisher_exact(table)
            chi2 = None
        else:
            method = 'chi2'
            chi2, p, _dof, _expected = sstats.chi2_contingency(table)
            # Every cell is > 5 in this branch, so the denominator is non-zero.
            oddsratio = table[0][0] * table[1][1] / (table[1][0] * table[0][1])

        cells = [[a, b], [c, d]] if abcd_values else table
        return self._Result(oddsratio, chi2, p, cells, method)

def make_dictio_DT():
    """
    Function:
    This function makes a dictionary with a drug name and a list of Entrez gene id's of the targets of that drug.

    It reads 'mapped_DB_STITCH.tsv' (tab separated), removes a leading '9606.' from the
    'protein' column, drops duplicate rows and maps the proteins to Entrez gene id's
    with protein_to_entrez().

    Variable:
    mapped = dataset with drugs and their targets.
    get_entr_filtered = mapped joined with the Entrez gene id's.
    dictio = a dictionary with a drug ('Name' column) and the corresponding targets ('Entrez' column).
    Rows without a drug name are skipped.
    """
    mapped = pd.read_csv('mapped_DB_STITCH.tsv', sep='\t')
    # str.lstrip('9606.') strips any run of the characters 9, 6, 0 and '.',
    # not the literal prefix; anchor the removal to the exact prefix instead.
    mapped['protein'] = mapped['protein'].str.replace(r'^9606\.', '', regex=True)
    mapped = mapped[['CID', 'InChIKey', 'DrugBank ID', 'Name', 'protein', 'combined_score']].drop_duplicates()

    get_entr_filtered = protein_to_entrez(mapped)

    # One pass over the frame instead of one full scan per drug;
    # sort=False keeps the drugs in order of first appearance.
    dictio = {}
    for drug, entrez_ids in get_entr_filtered.groupby('Name', sort=False)['Entrez']:
        dictio[drug] = entrez_ids.tolist()

    return dictio

def protein_to_entrez(other_set1, biomart_path="biomart.tsv"):
    """
    Function:
    This function takes a dataset containing ensembl protein id's and adds the matching entrez gene id's.

    Variables:
    other_set1 = a dataframe with a 'protein' column; all its other columns are kept.
    biomart_path = path of the header-less, tab separated mapping table whose columns are read as
    gene, transcript, protein, Entrez, Uniprot and name.
    biomart_data = the mapping table.
    get_ens = other_set1 joined with biomart_data on 'protein'; only proteins present in both remain
    and rows without an Entrez id are dropped.

    Returns:
    get_ens without the 'gene', 'transcript' and 'Uniprot' columns and with 'Entrez' as integers.
    """
    biomart_data = pd.read_csv(biomart_path,
                  sep='\t',
                  names=["gene", "transcript", "protein", "Entrez", "Uniprot", "name"])

    get_ens = pd.merge(other_set1, biomart_data, on=["protein"])
    get_ens = get_ens.dropna(subset=['Entrez'])
    # assign() builds a new frame, so the cast never writes into a derived frame in place.
    get_ens = get_ens.assign(Entrez=get_ens['Entrez'].astype(int))
    get_ens_filtered = get_ens.drop(["gene", "transcript", "Uniprot"], axis=1)

    return get_ens_filtered


def entrez_to_protein(other_set1, biomart_path="biomart.tsv"):
    """
    Function:
    This function takes a dataset containing entrez gene id's and adds the matching ensembl protein id's.

    Variables:
    other_set1 = a dataframe with an 'Entrez' column; all its other columns are kept.
    biomart_path = path of the header-less, tab separated mapping table whose columns are read as
    gene, transcript, protein, Entrez, Uniprot and name.
    biomart_data = the mapping table.
    get_ens = other_set1 joined with biomart_data on 'Entrez'; only id's present in both remain
    and rows without a protein id are dropped.

    Returns:
    get_ens without the 'gene', 'transcript' and 'Uniprot' columns.
    """
    biomart_data = pd.read_csv(biomart_path,
                  sep='\t',
                  names=["gene", "transcript", "protein", "Entrez", "Uniprot", "name"])

    get_ens = pd.merge(other_set1, biomart_data, on=["Entrez"])
    get_ens = get_ens.dropna(subset=['protein'])
    get_ens_filtered = get_ens.drop(["gene", "transcript", "Uniprot"], axis=1)

    return get_ens_filtered

def main_ACR_DT():
    """
    Function:
    Runs the drug-target enrichment from start to finish and prints the result table.

    Variables:
    universe_genes = the proteins of 'STITCH_proteins.txt' mapped to entrez gene id's; the
    universe used for the enrichment.
    query_genes = the gene list read from "test_list_genes1.0.txt"; its 'Entrez' column is the
    set that is tested.
    drug_targets = the ProteinSet built from the drug -> targets dictionary, labelled "Genes".
    result = a dataframe with all the enrichment results.
    """
    universe_genes = protein_to_entrez(pd.read_csv('STITCH_proteins.txt'))
    query_genes = pd.read_csv("test_list_genes1.0.txt")

    drug_targets = ProteinSet(make_dictio_DT(), "Genes")
    result = drug_targets.enrich(query_genes['Entrez'], universe_genes['Entrez'])
    print(result)
    
def read_ppis(path='protein_links_v11.0_0.9.tsv'):
    """
    function:
    This function reads in the ppi dataset.

    Variables:
    path = the space separated file to read; it must have a 'protein' and a 'chemical' column.
    protein_protein = a dataframe with the ppi's of the file, with a leading '9606.' removed
    from both id columns. No filtering on score happens here (the default file name suggests
    the file is already restricted to high scores - to be confirmed against the data).
    """
    protein_protein = pd.read_csv(path, sep=' ')
    # str.lstrip('9606.') strips any run of the characters 9, 6, 0 and '.',
    # not the literal prefix; anchor the removal to the exact prefix instead.
    for column in ('protein', 'chemical'):
        protein_protein[column] = protein_protein[column].str.replace(r'^9606\.', '', regex=True)

    return protein_protein

def make_dictio_ppi(protein_protein, get_ensp_filtered):
    """
    Function:
    This function maps the ppi dataset onto the genes of interest and returns, per entrez gene id,
    the entrez gene id's of its interaction partners so that it can be enriched later on.

    Variables:
    protein_protein = a dataframe with a 'protein' and a 'chemical' column holding the two
    interacting protein id's.
    get_ensp_filtered = a dataframe whose 'protein' column holds the protein id's of interest.
    filtered_protein = the interactions whose 'protein' is one of the proteins of interest.
    filtered_chemical = the interactions whose 'chemical' is one of the proteins of interest.
    total = filtered_protein and filtered_chemical put together into one dataset (an interaction
    matching on both sides occurs twice).
    chem_prot_gene = total with an entrez gene id for both sides ('Entrez_protein' and
    'Entrez_chemical'); interactions where either side cannot be mapped are dropped.
    dictio = a dictionary with an entrez gene id and a list of the entrez gene id's it interacts
    with, collected in both directions.
    """
    proteins_of_interest = get_ensp_filtered['protein']
    filtered_protein = protein_protein[protein_protein['protein'].isin(proteins_of_interest)]
    filtered_chemical = protein_protein[protein_protein['chemical'].isin(proteins_of_interest)]
    total = pd.concat([filtered_protein, filtered_chemical], ignore_index=True)

    protein_gene = protein_to_entrez(total)
    protein_gene = protein_gene.rename(columns={"Entrez": "Entrez_protein"})

    # Same mapping table, read with the protein/Entrez columns renamed so that
    # it can be joined on the other side of the interaction.
    biomart_data = pd.read_csv("biomart.tsv",
                  sep='\t',
                  names=["gene", "transcript", "chemical", "Entrez_chemical", "Uniprot", "name"])

    get_ens = pd.merge(protein_gene, biomart_data, on=["chemical"])
    get_ens = get_ens.dropna(subset=['Entrez_chemical'])
    get_ens['Entrez_chemical'] = get_ens['Entrez_chemical'].astype(int)
    chem_prot_gene = get_ens.drop(["gene", "transcript", "Uniprot"], axis=1)

    # Group once per direction instead of rescanning the whole frame for
    # every unique id. sort=False keeps keys in order of first appearance,
    # and the second direction is appended to lists created by the first.
    dictio = {}
    directions = (('Entrez_chemical', 'Entrez_protein'),
                  ('Entrez_protein', 'Entrez_chemical'))
    for key_column, partner_column in directions:
        grouped = chem_prot_gene.groupby(key_column, sort=False)[partner_column]
        for gene_id, partners in grouped:
            dictio.setdefault(gene_id, []).extend(partners.tolist())

    return dictio
    

def main_ppi():
    """
    Function:
    This function calls all the important functions for the protein-protein interaction
    enrichment and prints the result table.

    Variables:
    ensembl = the proteins read from 'STITCH_proteins.txt'.
    get_entr_filtered_ens = ensembl mapped to entrez gene id's; the universe used for the enrichment.
    gene_set = the gene list read from "test_list_genes1.0.txt".
    protein_protein = a dataframe of protein protein interactions returned from the read_ppis() function.
    get_ensp_filtered = gene_set mapped to protein ID's, returned from the entrez_to_protein() function.
    ppi_dictionary = a dictionary with genes and lists of genes that interact with this gene, returned from the
    make_dictio_ppi() function.
    enrichment_call = calls the class: ProteinSet and gives the class the variable: ppi_dictionary.
    df = a dataframe with the results of the enrich() function in the class: ProteinSet.
    sort_df = sorts the dataframe on the column pvalue.
    """
    ensembl = pd.read_csv('STITCH_proteins.txt')
    get_entr_filtered_ens = protein_to_entrez(ensembl)

    gene_set = pd.read_csv("test_list_genes1.0.txt")

    protein_protein = read_ppis()

    get_ensp_filtered = entrez_to_protein(gene_set)
    ppi_dictionary = make_dictio_ppi(protein_protein, get_ensp_filtered)

    # The database label has to be a string; the bare name `ppi` was never
    # defined and raised a NameError before any enrichment could run.
    enrichment_call = ProteinSet(ppi_dictionary, "STRING")
    df = enrichment_call.enrich(gene_set['Entrez'], get_entr_filtered_ens['Entrez'])
    sort_df = df.sort_values('pvalue')
    print(sort_df)
    

# Run the protein-protein interaction enrichment.
main_ppi() 

# Run the drug-target enrichment; it prints its result table.
main_ACR_DT()

     Database    Name  oddsratio c2statistic  pvalue                 table  \
0      STRING     381        NaN        None     1.0  [[0, 0], [0, 19195]]   
1      STRING    3835        NaN        None     1.0  [[0, 0], [0, 19195]]   
2      STRING    9183        NaN        None     1.0  [[0, 0], [0, 19195]]   
3      STRING     662        NaN        None     1.0  [[0, 0], [0, 19195]]   
4      STRING    1315        NaN        None     1.0  [[0, 0], [0, 19195]]   
...       ...     ...        ...         ...     ...                   ...   
8219   STRING  137872        NaN        None     1.0  [[0, 0], [0, 19195]]   
8220   STRING  164668        NaN        None     1.0  [[0, 0], [0, 19195]]   
8221   STRING  256987        NaN        None     1.0  [[0, 0], [0, 19195]]   
8222   STRING    6876        NaN        None     1.0  [[0, 0], [0, 19195]]   
8223   STRING    1837        NaN        None     1.0  [[0, 0], [0, 19195]]   

      method                                           proteins

KeyboardInterrupt: 