In [None]:
import pandas as pd
import numpy as np
from rpy2.robjects.vectors import StrVector
import rpy2.robjects as robjects
import rpy2.robjects.packages as rpackages


def protein_to_entrez(other_set1, biomart_path="biomart.tsv"):
    """
    Function:
    Attach Entrez gene id's to a table of Ensembl protein id's by joining it with a BioMart export.

    Variables:
    other_set1 = a DataFrame with a "protein" column (the merge key); any other columns are carried
                 through to the result. A plain list will not work, because the merge needs the column name.
    biomart_path = path of the tab-separated BioMart export. The file is read with the fixed column
                   names gene, transcript, protein, Entrez, Uniprot, name. Defaults to "biomart.tsv"
                   in the current working directory.
    biomart_data = the BioMart export loaded as a DataFrame.
    get_ens = inner merge of other_set1 and biomart_data on "protein", without rows lacking an Entrez id.

    Returns:
    The merged DataFrame without the "gene", "transcript" and "Uniprot" columns; "Entrez" is cast to int.
    Proteins that are absent from the export, or that have no Entrez id, are dropped.
    """
    biomart_data = pd.read_csv(
        biomart_path,
        sep='\t',
        names=["gene", "transcript", "protein", "Entrez", "Uniprot", "name"],
    )

    # Inner join: only proteins present in both tables survive.
    get_ens = pd.merge(other_set1, biomart_data, on=["protein"])
    # Missing Entrez id's are NaN; they must go before the integer cast.
    get_ens = get_ens.dropna(subset=['Entrez'])
    get_ens['Entrez'] = get_ens['Entrez'].astype(int)

    return get_ens.drop(["gene", "transcript", "Uniprot"], axis=1)


def ppi_interactions(gene_list):
    """
    Function:
    Query the R package STRINGdb, through rpy2, for the protein-protein interactions of a gene list.

    Variables:
    gene_list = an rpy2 vector of gene identifiers; its r_repr() text is pasted into the R code.
    run_r = shorthand for robjects.r, which evaluates a string of R code.
    genes_as_r = the R source representation of gene_list.
    build_df = the R statement that turns the identifiers into a one-column data frame named "Entrez".

    Returns:
    The R object produced by string_db$get_interactions for the mapped STRING id's.

    Side effects:
    The R session ends up with the variables string_db, df and mapped defined.
    """
    run_r = robjects.r

    # Importing makes the STRINGdb package available inside the embedded R session.
    rpackages.importr('STRINGdb')
    run_r("string_db <- STRINGdb$new(score_threshold=900)")

    # The same R expression is needed twice in the statement: once for the data, once for its length.
    genes_as_r = gene_list.r_repr()
    build_df = "df <- data.frame(Entrez=matrix(unlist(%s), nrow=length(%s), byrow=T))" % (genes_as_r, genes_as_r)
    run_r(build_df)

    # Map the identifiers to STRING id's; rows that cannot be mapped are removed.
    run_r('mapped <- string_db$map(df, "Entrez", removeUnmappedRows = TRUE)')

    return run_r("string_db$get_interactions(mapped$STRING_id)")
    
def _strip_prefix(identifier, prefix='9606.'):
    # Remove exactly one leading `prefix` from `identifier`, if present.
    # str.lstrip(prefix) is not equivalent: it strips any run of the characters
    # '9', '6', '0' and '.', so '9606.0ABC'.lstrip('9606.') would yield 'ABC'.
    if identifier.startswith(prefix):
        return identifier[len(prefix):]
    return identifier


def _neighbour_lists(left_ids, right_ids):
    # Turn paired id's into a symmetric mapping {id: [unique partner id's]}.
    # Two passes keep the key order: every left id first, then right id's not seen yet.
    neighbours = {}
    for left, right in zip(left_ids, right_ids):
        neighbours.setdefault(left, set()).add(right)
    for left, right in zip(left_ids, right_ids):
        neighbours.setdefault(right, set()).add(left)
    return {node: list(partners) for node, partners in neighbours.items()}


def ppi_dictio(gene_list):
    """
    Function:
    This function turns the interaction results into a dictionary with genes. This is the dictionary that is going to
    be used for the enrichment later on.

    Variables:
    gene_list = the gene list that is handed to ppi_interactions().
    interactions = the result of ppi_interactions(gene_list).
    chemical = a list of the first column of the interaction results (presumably the "from" STRING id's).
    proteins = a list of the second column of the interaction results (presumably the "to" STRING id's).
    df_proteins = a dataframe with the columns "protein" and "chemical", with the '9606.' prefix removed.
    protein_gene = df_proteins with the "protein" id's converted to Entrez id's (column "Entrez_protein").
    biomart_data = the biomart data, with its protein column named "chemical" so it can be merged on that side.
    get_ens = protein_gene merged with biomart_data on "chemical", without rows lacking an Entrez id.
    chem_prot_gene = get_ens without the gene, transcript and Uniprot columns.

    Returns:
    A dictionary that maps every Entrez id found on either side of an interaction to a list of the unique
    Entrez id's it interacts with. The order inside each list is not defined. An empty dictionary is
    returned when there are no interactions.
    """
    interactions = ppi_interactions(gene_list)
    chemical = list(interactions[0])
    proteins = list(interactions[1])

    # Without interactions there is nothing to merge.
    if not chemical:
        return {}

    df_proteins = pd.DataFrame({'protein': proteins, 'chemical': chemical})
    df_proteins['protein'] = df_proteins['protein'].map(_strip_prefix)
    df_proteins['chemical'] = df_proteins['chemical'].map(_strip_prefix)

    protein_gene = protein_to_entrez(df_proteins)
    protein_gene = protein_gene.rename(columns={"Entrez": "Entrez_protein"})

    # Same file as in protein_to_entrez(), but with the protein column named "chemical"
    # so the other side of every interaction can be converted as well.
    biomart_data = pd.read_csv("biomart.tsv",
                    sep='\t',
                    names=["gene", "transcript", "chemical", "Entrez_chemical", "Uniprot", "name"])

    get_ens = pd.merge(protein_gene, biomart_data, on=["chemical"])
    get_ens = get_ens.dropna(subset=['Entrez_chemical'])
    get_ens['Entrez_chemical'] = get_ens['Entrez_chemical'].astype(int)
    chem_prot_gene = get_ens.drop(["gene", "transcript", "Uniprot"], axis=1)

    # Each row is one interaction; record it in both directions.
    return _neighbour_lists(chem_prot_gene['Entrez_chemical'].values,
                            chem_prot_gene['Entrez_protein'].values)

# Read the gene list through R's scan(): entries are separated by newlines (sep='\n'),
# kept as character strings (what=''), and the first line of the file is skipped (skip = 1).
ACR_genes = robjects.r("scan('/home/mhaan/test_list_genes1.0.txt', what='', sep='\n', skip = 1)")
# Build the interaction dictionary for those genes and show it.
ppi_dictionary = ppi_dictio(ACR_genes) 
print(ppi_dictionary)