# In [5]:
import numpy as np
import scipy.stats as sstats
import pandas as pd

from collections import namedtuple

def read_ppis():
    """
    Function:
    This function reads in the ppi dataset and removes the leading '9606.' prefix from the identifiers in the
    'protein' and 'chemical' columns.

    Variables:
    prefix = the literal text that is removed from the start of every identifier.
    protein_protein = a dataframe read from 'protein_links_v11.0_0.9.tsv' (space separated). Judging by the file
    name it presumably only holds ppi's with a combined_score above 0.9 -- this function does not filter on score.

    Returns:
    protein_protein, with the prefix removed from both identifier columns.
    """
    prefix = '9606.'

    def strip_prefix(identifier):
        # str.lstrip('9606.') would treat its argument as a *set of characters* and keep eating any leading
        # '9', '6', '0' or '.', so an identifier such as '9606.0690ABC' would be cut down to 'ABC'.
        # Only the exact prefix is removed here; identifiers without the prefix are returned unchanged.
        if identifier.startswith(prefix):
            return identifier[len(prefix):]
        return identifier

    protein_protein = pd.read_csv('protein_links_v11.0_0.9.tsv', sep=' ')
    for column in ('protein', 'chemical'):
        protein_protein[column] = protein_protein[column].map(strip_prefix)

    return protein_protein

def entrez_to_protein():
    """
    Function:
    This function looks up the protein ID's that belong to the entrez gene ID's in 'test_list_genes1.0.txt',
    using the biomart table in 'biomart.tsv'.

    Variables:
    biomart_columns = the column names given to the (tab separated) biomart file; they are passed as names=,
    so every line of that file is read as data.
    mapping = a dataframe with all the data from biomart.
    gene_list = a dataframe read from 'test_list_genes1.0.txt'; it has to contain an 'Entrez' column because
    that is the merge key (according to the original notes these are ageing related genes).

    Returns:
    The merge of gene_list and mapping on 'Entrez' (default inner merge), without the rows that have no
    protein ID and without the 'gene', 'transcript' and 'Uniprot' columns.
    """
    biomart_columns = ["gene", "transcript", "protein", "Entrez", "Uniprot", "name"]
    mapping = pd.read_csv("biomart.tsv", sep='\t', names=biomart_columns)
    gene_list = pd.read_csv("test_list_genes1.0.txt")

    return (
        gene_list
        .merge(mapping, on=["Entrez"])
        # Genes without a protein ID cannot be matched against the ppi data.
        .dropna(subset=['protein'])
        .drop(columns=["gene", "transcript", "Uniprot"])
    )

def make_dictio_ppi(protein_protein, get_ensp_filtered):
    """
    Function:
    This function maps the ppi dataset and the given protein ID's onto each other so that it can be enriched
    later on. Only interactions in which at least one side occurs in get_ensp_filtered['protein'] are used.

    Parameters:
    protein_protein = a dataframe with the columns 'protein' and 'chemical', one interaction per row.
    get_ensp_filtered = a dataframe with a 'protein' column that holds the protein ID's to select on.

    Variables:
    dictio = a dictionary with proteins and a list of proteins that interact with this protein.
    filtered_protein = the rows of protein_protein whose 'protein' value is one of the selected ID's.
    filtered_chemical = the rows of protein_protein whose 'chemical' value is one of the selected ID's.
    total = filtered_protein and filtered_chemical put together into one dataset. A row in which both sides
    are selected is present twice, so its partners are listed twice as well (unchanged from the original).

    Returns:
    dictio. For every key the list first holds the 'protein' partners of the rows where the key is the
    'chemical' value, followed by the 'chemical' partners of the rows where the key is the 'protein' value,
    both in row order of total. An empty selection gives an empty dictionary.
    Identifiers are assumed to be non-null.
    """
    selected = get_ensp_filtered['protein']
    filtered_protein = protein_protein[protein_protein['protein'].isin(selected)]
    filtered_chemical = protein_protein[protein_protein['chemical'].isin(selected)]
    total = pd.concat([filtered_protein, filtered_chemical], ignore_index=True)

    # Materialise the pairs once; the previous implementation built a new boolean mask over the whole table
    # for every unique identifier, which scales quadratically with the number of interactions.
    pairs = list(zip(total['chemical'], total['protein']))

    dictio = {}
    # First pass: keys are the 'chemical' side, values collect the 'protein' side.
    for chemical, protein in pairs:
        dictio.setdefault(chemical, []).append(protein)
    # Second pass: the reverse direction is appended after the entries of the first pass, which keeps the
    # key order and the list order of the original two-loop construction.
    for chemical, protein in pairs:
        dictio.setdefault(protein, []).append(chemical)

    return dictio
    

def main_ppi():
    """
    Function:
    This function runs the whole ppi enrichment: it loads the input data, builds the interaction dictionary,
    performs the enrichment and prints the result sorted on pvalue (lowest first).

    Variables:
    background = a dataframe read from 'STITCH_proteins.txt'; its 'protein' column is passed to enrich() as
    second argument.
    interactions = a dataframe of protein protein interactions returned from the read_ppis() function.
    ageing_proteins = a dataframe with protein ID's returned from the entrez_to_protein() function; its
    'protein' column is passed to enrich() as first argument.
    enricher = an instance of the class: ProteinSet, created with the dictionary returned from the
    make_dictio_ppi() function. NOTE(review): ProteinSet is not defined or imported in the visible part of this
    file -- confirm where it comes from.
    results = the return value of enricher.enrich(); it has to be a dataframe with a 'pvalue' column because
    it is sorted on that column before printing.
    """
    background = pd.read_csv('STITCH_proteins.txt')
    interactions = read_ppis()
    ageing_proteins = entrez_to_protein()

    enricher = ProteinSet(make_dictio_ppi(interactions, ageing_proteins))
    results = enricher.enrich(ageing_proteins['protein'], background['protein'])

    print(results.sort_values('pvalue', ascending=True))
    

# Run the ppi enrichment as soon as this cell/script is executed.
main_ppi() 

# Recorded cell output: ModuleNotFoundError: No module named 'Full_enrichment_ACR_DT'