In [None]:
import pandas as pd 
import pandas as pd
import numpy as np
import scipy.stats as sstats

from collections import namedtuple

class ProteinSet(object):
    """
    Function:
    A named collection of protein (or gene) sets coming from one database,
    which can be tested for over-representation against another set.

    Attributes:
    proteindict = dictionary mapping a term name to the set of its members.
    database = the name of the database the terms come from.
    """

    # Field names of the tuple returned by set_enrichment(); enrich() reuses
    # them as column names so both stay in sync.
    _RESULT_FIELDS = ['oddsratio', 'c2statistic', 'pvalue', 'table', 'method']
    _ResultTuple = namedtuple("setEnrichmentResult", _RESULT_FIELDS)

    def __init__(self, proteindict, database):
        """
        Function:
        Stores the term dictionary and the database name on the instance.

        Parameters:
        proteindict = a dictionary with a term and an iterable of members per
        item; every iterable is converted to a set.
        database = the name of the database.
        """
        self.proteindict = {name: set(p) for name, p in proteindict.items()}
        self.database = database

    def enrich(self, otherset, background):
        """
        Function:
        Tests every term in self.proteindict against `otherset`, restricted to
        `background`, using set_enrichment().

        Parameters:
        otherset = iterable of ids to compare each term with.
        background = iterable of ids forming the universe of the test.

        Returns:
        A dataframe with one row per term and the columns 'Database', 'Name',
        'oddsratio', 'c2statistic', 'pvalue', 'table', 'method' and
        'proteins' ('proteins' is the full member list of the term). The frame
        has these columns even when there are no terms.
        """
        # Convert once; set_enrichment() reuses sets without copying them.
        background = set(background)
        otherset = set(otherset)

        rows = []
        for name, pset in self.proteindict.items():
            results = self.set_enrichment(pset, otherset, background)
            rows.append([self.database, name] + list(results) + [list(pset)])

        columns = ['Database', 'Name'] + self._RESULT_FIELDS + ['proteins']
        # Passing the columns to the constructor also works for zero rows,
        # where assigning them afterwards raises a length-mismatch error.
        return pd.DataFrame(rows, columns=columns)

    def set_enrichment(self, your_set, other_set, universe, abcd_values=False):
        """
        Function:
        Builds the 2x2 contingency table of `your_set` versus `other_set`
        inside `universe` and tests it. Fisher's exact test is used when any
        cell is 5 or smaller, otherwise the chi-square contingency test.

        Parameters:
        your_set = iterable of ids (elements outside the universe are ignored).
        other_set = iterable of ids (elements outside the universe are ignored).
        universe = iterable of all ids that are considered.
        abcd_values = when True the 'table' field holds the four sets
        themselves instead of their sizes.

        Returns:
        A named tuple (oddsratio, c2statistic, pvalue, table, method).
        c2statistic is None when Fisher's exact test was used. The table is
        [[in both, only in other_set], [only in your_set, in neither]].
        """
        if not isinstance(universe, (set, frozenset)):
            universe = set(universe)
        your_set = set(your_set) & universe
        other_set = set(other_set) & universe

        a = your_set & other_set
        b = other_set - your_set
        c = your_set - other_set
        d = universe - (your_set | other_set)

        table = [[len(a), len(b)], [len(c), len(d)]]
        # Smallest of the four cells. min(min(table)) would only inspect the
        # lexicographically smallest row and can miss a small cell in the other.
        smallest_cell = min(min(row) for row in table)
        if smallest_cell <= 5:
            method = 'fisher'
            oddsratio, p = sstats.fisher_exact(table)
            chi2 = None
        else:
            method = 'chi2'
            chi2, p, dof, expected = sstats.chi2_contingency(table)
            # Every cell is larger than 5 here, so the denominator is non-zero.
            oddsratio = table[0][0] * table[1][1] / (table[1][0] * table[0][1])

        if abcd_values:
            return self._ResultTuple(oddsratio, chi2, p, [[a, b], [c, d]], method)
        return self._ResultTuple(oddsratio, chi2, p, table, method)
        
def protein_to_entrez(other_set1):
    """
    Function:
    Adds Entrez gene id's to a dataframe that has a 'protein' column, using
    the mapping stored in "biomart.tsv". Rows without an Entrez id are dropped.

    Parameters:
    other_set1 = a dataframe with a 'protein' column (the merge key).

    Returns:
    The merged dataframe with an integer 'Entrez' column and the biomart
    'name' column; the biomart columns 'gene', 'transcript' and 'Uniprot'
    are removed.
    """
    biomart_columns = ["gene", "transcript", "protein", "Entrez", "Uniprot", "name"]
    # The file is read without a header row; the names above label its columns.
    reference = pd.read_csv("biomart.tsv", sep='\t', names=biomart_columns)

    merged = pd.merge(other_set1, reference, on=["protein"])
    with_entrez = merged.dropna(subset=['Entrez'])
    # Missing values are gone at this point, so the cast to int is possible.
    with_entrez = with_entrez.assign(Entrez=with_entrez['Entrez'].astype(int))

    return with_entrez.drop(["gene", "transcript", "Uniprot"], axis=1)

def make_dictio_DT():
    """
    Function:
    This function makes a dictionary with a drug and a list of Entrez gene
    id's of the proteins that are targets of that drug.

    Input:
    Reads 'mapped_DB_STITCH.tsv' (tab separated), which must contain the
    columns 'CID', 'InChIKey', 'DrugBank ID', 'Name', 'protein' and
    'combined_score'. The protein id's are converted with protein_to_entrez().

    Returns:
    dictio = a dictionary with the drug name as key and the list of Entrez
    id's of its targets as value. Rows without a drug name are skipped.
    """
    prefix = '9606.'
    mapped = pd.read_csv('mapped_DB_STITCH.tsv', sep='\t')
    # Remove the literal prefix only. str.lstrip('9606.') would strip every
    # leading '9', '6', '0' or '.' character, which can eat into the id itself.
    mapped['protein'] = mapped['protein'].map(
        lambda x: x[len(prefix):] if x.startswith(prefix) else x)
    mapped = mapped[['CID', 'InChIKey', 'DrugBank ID', 'Name', 'protein', 'combined_score']].drop_duplicates()

    get_entr_filtered = protein_to_entrez(mapped)
    named = get_entr_filtered.dropna(subset=['Name'])

    # One pass over the rows; keys appear in order of first occurrence.
    dictio = {}
    for drug, entrez in zip(named['Name'], named['Entrez']):
        dictio.setdefault(drug, []).append(entrez)

    return dictio

def KEGG_data(): 
    """
    Function: 
    Reads in the enrichment results of the first enrichment and turns it into a dictionary. 
    
    Variable:
    KEGG_ACR_results = reads in the enrichment results of the first enrichment. 
    empty_lijst_KEGG = list of gene ID's per KEGG pathway. 
    per_pathway = list of gene ID's per KEGG pathway.
    gene_list = the gene ID's seperated by '/'
    dictionary = a dictionary with the KEGG pathway and the list of gene ID's that have something to do with the 
    KEGG pathway. 
    
    """
    
    KEGG_ACR_results = pd.read_csv("KEGGACR_enrichment_results.csv")
    
    empty_lijst_KEGG = []
    for x in KEGG_ACR_results['geneID']:
        per_pathway = []
        gene_list = x.split("/")
        per_pathway += gene_list
        per_pathway = list(map(int, per_pathway))
        empty_lijst_KEGG.append(per_pathway)
    KEGG_ACR_results['listID'] = empty_lijst_KEGG
    
    dictionary = {} 
    for index, row in KEGG_ACR_results.iterrows():
        dictionary[row['ID']] = row['listID']
        
    return dictionary

def GO_BP_data(): 
    """
    Function: 
    Reads in the enrichment results of the first enrichment and turns it into a dictionary. 
    
    Variable: 
    GO_BP_ACR_results = reads in enrichment results of the first enrichment. 
    empty_lijst_GO_BP = list of gene ID's per biological process. 
    per_pathway = list of gene ID's for one biological process. 
    gene_list = the gene ID's seperated by '/'
    dictionary = a dictionary with the biological process and the list of gene ID's that have something to do with 
    the biological process. 
    """
        
    GO_BP_ACR_results = pd.read_csv("GO_BPACR_enrichment_results.csv")
    
    empty_lijst_GO_BP = []
    for x in GO_BP_ACR_results['geneID']:
        per_pathway = []
        gene_list = x.split("/")
        per_pathway += gene_list
        per_pathway = list(map(int, per_pathway))
        empty_lijst_GO_BP.append(per_pathway)
    GO_BP_ACR_results['listID'] = empty_lijst_GO_BP
    
    dictionary = {} 
    for index, row in GO_BP_ACR_results.iterrows():
        dictionary[row['ID']] = row['listID']
        
    return dictionary

def GO_CC_data():
    """
    Function: 
    Reads in the enrichment results of the first enrichment and turns it into a dictionary. 
    
    Variable: 
    GO_CC_ACR_results = reads in the enrichment results. 
    empty_lijst_GO_CC = list of gene ID's per cellular component. 
    per_pathway = list of gene ID's for one cellular component. 
    gene_list = the gene ID's seperated by '/'. 
    dictionary = a dictionary with the cellular component and the list of gene ID's that have something to do with 
    the cellular component. 
    """
            
    GO_CC_ACR_results = pd.read_csv("GO_CCACR_enrichment_results.csv")
    
    empty_lijst_GO_CC = []
    for x in GO_CC_ACR_results['geneID']:
        per_pathway = []
        gene_list = x.split("/")
        per_pathway += gene_list
        per_pathway = list(map(int, per_pathway))
        empty_lijst_GO_CC.append(per_pathway)
    GO_CC_ACR_results['listID'] = empty_lijst_GO_CC
    
    dictionary = {} 
    for index, row in GO_CC_ACR_results.iterrows():
        dictionary[row['ID']] = row['listID']
        
    return dictionary

def GO_MF_data(): 
    """
    Function:
    Reads in the enrichment results of the first enrichment and turns it into a dictionary. 
    
    Variable: 
    GO_MF_ACR_results = reads in the enrichment results. 
    empty_lijst_GO_MF = list of gene ID's per molecular function.
    per_pathway = list of gene ID's for one molecular function. 
    gene_list = the gene ID's seperated by '/'
    dictionary = dictionary with the molecular function and the list of gene ID's. 
    """
    
    GO_MF_ACR_results = pd.read_csv("GO_MFACR_enrichment_results.csv")
    
    empty_lijst_GO_MF = []
    for x in GO_MF_ACR_results['geneID']:
        per_pathway = []
        gene_list = x.split("/")
        per_pathway += gene_list
        per_pathway = list(map(int, per_pathway))
        empty_lijst_GO_MF.append(per_pathway)
    GO_MF_ACR_results['listID'] = empty_lijst_GO_MF
    
    dictionary = {} 
    for index, row in GO_MF_ACR_results.iterrows():
        dictionary[row['ID']] = row['listID']
        
    return dictionary

def reactome_data():
    """
    Function:
    Reads in the enrichment results of the first enrichment
    ("reactomeACR_enrichment_results.csv") and turns it into a dictionary.
    The 'geneID' column of that file holds gene names separated by '/';
    they are looked up in the 'name' column of "biomart.tsv" and replaced
    by the matching Entrez gene ID's.

    Returns:
    dictionary = a dictionary with the pathway ID and the list of unique
    Entrez gene ID's (integers) found for the gene names of that pathway.
    Gene names without an Entrez ID in biomart are left out.
    """
    reactome_ACR_results = pd.read_csv("reactomeACR_enrichment_results.csv")

    biomart_data = pd.read_csv("biomart.tsv",
                  sep='\t',
                  names=["gene", "transcript", "protein", "Entrez", "Uniprot", "name"])

    # Rows without an Entrez ID cannot be cast to int, so they are removed
    # once up front instead of failing inside the loop.
    with_entrez = biomart_data.dropna(subset=['Entrez'])

    dictionary = {}
    for pathway_id, genes in zip(reactome_ACR_results['ID'], reactome_ACR_results['geneID']):
        gene_list = genes.split("/")
        # Select into a new Series rather than assigning into a filtered
        # slice, which is what triggered pandas' SettingWithCopyWarning.
        matched = with_entrez.loc[with_entrez['name'].isin(gene_list), 'Entrez']
        dictionary[pathway_id] = matched.astype(int).unique().tolist()

    return dictionary

def entrez_to_protein(other_set1):
    """
    Function:
    Adds ensembl protein id's to a dataframe that has an 'Entrez' column,
    using the mapping stored in "biomart.tsv". Rows without a protein id are
    dropped.

    Parameters:
    other_set1 = a dataframe with an 'Entrez' column (the merge key).

    Returns:
    The merged dataframe with the biomart 'protein' and 'name' columns; the
    biomart columns 'gene', 'transcript' and 'Uniprot' are removed.
    """
    biomart_columns = ["gene", "transcript", "protein", "Entrez", "Uniprot", "name"]
    # The file is read without a header row; the names above label its columns.
    reference = pd.read_csv("biomart.tsv", sep='\t', names=biomart_columns)

    merged = pd.merge(other_set1, reference, on=["Entrez"])
    with_protein = merged.dropna(subset=['protein'])

    return with_protein.drop(["gene", "transcript", "Uniprot"], axis=1)

def main_ACR_DT():
    """
    Function:
    Enriches the target list of every drug against the gene list read from
    "test_list_genes1.0.txt", with the STITCH proteins as universe.

    Input:
    'STITCH_proteins.txt' = the universe; it is converted to Entrez gene id's
    with protein_to_entrez(), so it needs a 'protein' column.
    'test_list_genes1.0.txt' = the gene list; its 'Entrez' column is used.

    Returns:
    A dataframe with the enrichment results, one row per drug, with "Genes"
    in the 'Database' column.
    """
    stitch_proteins = pd.read_csv('STITCH_proteins.txt')
    universe_genes = protein_to_entrez(stitch_proteins)

    query_genes = pd.read_csv("test_list_genes1.0.txt")

    # The drug -> targets dictionary plays the role of the term dictionary.
    drug_sets = ProteinSet(make_dictio_DT(), "Genes")
    return drug_sets.enrich(query_genes['Entrez'], universe_genes['Entrez'])
    
def read_ppis():
    """
    Function:
    This function reads in the ppi dataset 'protein_links_v11.0_0.9.tsv'
    (space separated, with a header that contains the columns 'protein' and
    'chemical') and removes the '9606.' prefix from both id columns.

    Returns:
    protein_protein = a dataframe with the ppi's. NOTE(review): the file name
    suggests the rows were pre-filtered on a combined_score of 0.9; no
    filtering happens in this function.
    """
    prefix = '9606.'

    def strip_prefix(identifier):
        # Remove the literal prefix only. str.lstrip('9606.') would strip every
        # leading '9', '6', '0' or '.' character, which can eat into the id.
        if identifier.startswith(prefix):
            return identifier[len(prefix):]
        return identifier

    protein_protein = pd.read_csv('protein_links_v11.0_0.9.tsv', sep=' ')
    for column in ('protein', 'chemical'):
        protein_protein[column] = protein_protein[column].map(strip_prefix)

    return protein_protein

def make_dictio_ppi(protein_protein, get_ensp_filtered):
    """
    Function:
    This function keeps the ppi's in which at least one partner occurs in
    get_ensp_filtered, converts both partners to Entrez gene ID's and builds
    a dictionary of interaction partners per gene.

    Parameters:
    protein_protein = dataframe of ppi's with the columns 'protein' and
    'chemical' holding the two interacting protein id's.
    get_ensp_filtered = dataframe with a 'protein' column holding the protein
    id's of the genes of interest.

    Returns:
    dictio = a dictionary with a gene (Entrez ID) and the list of genes
    (Entrez ID's) that it interacts with. An interaction is recorded in both
    directions; the lists may contain duplicates.
    """
    proteins_of_interest = get_ensp_filtered['protein']
    filtered_protein = protein_protein[protein_protein['protein'].isin(proteins_of_interest)]
    filtered_chemical = protein_protein[protein_protein['chemical'].isin(proteins_of_interest)]
    # Rows in which both partners are of interest end up in the frame twice.
    total = pd.concat([filtered_protein, filtered_chemical], ignore_index=True)

    # Entrez ID for the 'protein' partner.
    protein_gene = protein_to_entrez(total)
    protein_gene = protein_gene.rename(columns={"Entrez": "Entrez_protein"})

    # Entrez ID for the 'chemical' partner: the same biomart file is read with
    # its protein/Entrez columns labelled for the second partner.
    biomart_data = pd.read_csv("biomart.tsv",
                  sep='\t',
                  names=["gene", "transcript", "chemical", "Entrez_chemical", "Uniprot", "name"])

    get_ens = pd.merge(protein_gene, biomart_data, on=["chemical"])
    get_ens = get_ens.dropna(subset=['Entrez_chemical'])
    get_ens['Entrez_chemical'] = get_ens['Entrez_chemical'].astype(int)
    chem_prot_gene = get_ens.drop(["gene", "transcript", "Uniprot"], axis=1)

    pairs = list(zip(chem_prot_gene['Entrez_chemical'].values,
                     chem_prot_gene['Entrez_protein'].values))

    # Two linear passes instead of rescanning the frame for every unique id.
    # Pass 1 registers every 'chemical' gene with its 'protein' partners,
    # pass 2 adds the reverse direction.
    dictio = {}
    for chemical_id, protein_id in pairs:
        dictio.setdefault(chemical_id, []).append(protein_id)
    for chemical_id, protein_id in pairs:
        dictio.setdefault(protein_id, []).append(chemical_id)

    return dictio

def main():
    """
    Function:
    This function calls all the different functions that make the first
    enrichment. It then enriches all the gene-lists for every biological term
    against the target-list of every drug and keeps, per drug and per
    database, the term with the smallest p-value.

    Input:
    'STITCH_proteins.txt' = the STITCH proteins; converted to Entrez gene
    ID's they form the universe of every test.
    'test_list_genes1.0.txt' = the gene list (Entrez gene ID's) that is used
    to select the ppi's.

    Variables:
    databases = a list with the term dictionaries of KEGG, GO_BP, GO_CC,
    GO_MF, Reactome and the ppi's (STRING).
    db_names = a list with the names of the databases, in the same order.
    dictio_drugs = the dictionary with all the drugs and their targets.
    PS = one ProteinSet per database.

    Returns:
    dfObj = a dataframe with, for every drug, one row per database (the term
    with the best p-value), with the drug name in the column 'drug'.
    """
    ensembl = pd.read_csv('STITCH_proteins.txt')
    get_entr_filtered_ens = protein_to_entrez(ensembl)
    # Build the universe once; it is the same for every drug and database.
    universe = set(get_entr_filtered_ens['Entrez'])

    gene_set = pd.read_csv("test_list_genes1.0.txt")
    get_ensp_filtered = entrez_to_protein(gene_set)
    protein_protein = read_ppis()

    KEGG_results = KEGG_data()
    GO_BP_results = GO_BP_data()
    GO_CC_results = GO_CC_data()
    GO_MF_results = GO_MF_data()
    reactome_results = reactome_data()
    ppi_results = make_dictio_ppi(protein_protein, get_ensp_filtered)

    databases = [KEGG_results, GO_BP_results, GO_CC_results, GO_MF_results, reactome_results, ppi_results]
    db_names  = ['KEGG', 'GO_BP', 'GO_CC', 'GO_MF', 'Reactome', 'STRING']

    dictio_drugs = make_dictio_DT()

    PS = [ProteinSet(db, name) for (db, name) in zip(databases, db_names)]

    super_x = []
    for drugs, targets in dictio_drugs.items():
        df = pd.concat([ps.enrich(targets, universe) for ps in PS])
        df['drug'] = drugs
        # Keep the complete best row per database. groupby(...).first() takes
        # the first non-null value of every column separately, so a missing
        # c2statistic would be filled in from a different, worse term.
        best = (df.sort_values("pvalue")
                  .drop_duplicates(subset="Database", keep="first")
                  .sort_values("Database")
                  .reset_index(drop=True))
        super_x.append(best)
    dfObj = pd.concat(super_x)
    # main_ACR_DT() is not called here: its result was never part of dfObj.
    return dfObj
        
# Run the whole pipeline once and keep the combined table returned by main().
data = main() 

#main_ACR_DT()


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [3]:
# Show the table stored in `data`; the assignment `data = main()` must have run first.
print(data)

NameError: name 'data' is not defined