# Gene Table

In [1]:
import pandas as pd
import mygene
from functools import reduce

In [7]:
# Generate secrecon dataframe
secrecon = pd.read_excel("Input/working_secRecon.xlsx", sheet_name="master_curated_09202021")
secrecon

Unnamed: 0,Index,Flag,Initials,Edited,GENE SYMBOL,ALIAS,ENSEMBL,ENTREZID,GENENAME,PMID,...,Process 10 Ref,Process 10 Genes,Process 11,Process 11 Mean Score,Process 11 Score SD,Process 11 Score Count,Process 11 Raw Scores,Process 11 Curators,Process 11 Ref,Process 11 Genes
0,44.0,,,HM,A3GALT2,"A3GALT2P,IGB3S,IGBS3S,A3GALT2",ENSG00000184389,127550.0,"alpha 1,3-galactosyltransferase 2",186309882187363523378701,...,,,,,,,,,,
1,45.0,,,HM,A4GALT,"A14GALT,A4GALT1,Gb3S,P(k),P1,P1PK,PK,A4GALT",ENSG00000128274,53947.0,"alpha 1,4-galactosyltransferase (P blood group)","10591208,10747952,10748143,10854428,10993874,1...",...,,,,,,,,,,
2,46.0,,,HM,A4GNT,"alpha4GnT,A4GNT",ENSG00000118017,51146.0,"alpha-1,4-N-acetylglucosaminyltransferase","10430883,11304796,12477932,12594234,15489334,1...",...,,,,,,,,,,
3,19.0,,,HM - PD,ABL1,"ABL,BCR-ABL,CHDSKM,JTK7,bcr/abl,c-ABL,c-ABL1,p...",ENSG00000097007,25.0,"ABL proto-oncogene 1, non-receptor tyrosine ki...","10187839,10194451,10212258,10372803,10391249,1...",...,,,,,,,,,,
4,42.0,,,HM,ABL2,"ABLL,ARG,ABL2",ENSG00000143322,27.0,"ABL proto-oncogene 2, non-receptor tyrosine ki...","10521789,10590083,10706884,10892742,11278261,1...",...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
910,743.0,,,JT,YOD1,"DUBA8,OTUD2,PRO0907,YOD1",ENSG00000180667,55432.0,YOD1 deubiquitinase,"12477932,14702039,16710414,19322201,19615732,1...",...,,,,,,,,,,
911,,,,JT,USP19,,,,,,...,,,,,,,,,,
912,,,,,,,,,,,...,,,,,,,,,,
913,,,,,,,,,,,...,,,,,,,,,,


In [8]:
# Extract all the unique genes
uGenes = secrecon['GENE SYMBOL']

In [11]:
# Create an instance of the MyGeneInfo class
mg = mygene.MyGeneInfo()

# Find which genes are symbol keys
symKEY = mg.querymany(uGenes, scopes='symbol', fields='symbol', species='human')


INFO:biothings.client:querying 1-915...
INFO:biothings.client:done.
INFO:biothings.client:Finished.
INFO:biothings.client:Pass "returnall=True" to return complete lists of duplicate or missing query terms.


In [15]:
# genes that did not map
diff_genes = set(uGenes) - set([x['query'] for x in symKEY if 'notfound' not in x])
diff_genes

{'ALG1L', 'GALNT19', 'GALNT20', 'MICALCL', 'TMED11', nan}

In [23]:
# check to see if they match ALIAS key
alias_genes = mg.querymany(diff_genes, scopes='alias', fields='alias', species='human')


INFO:biothings.client:querying 1-6...
INFO:biothings.client:done.
INFO:biothings.client:Finished.
INFO:biothings.client:Pass "returnall=True" to return complete lists of duplicate or missing query terms.


In [27]:
# Table to map gene symbols to aliases
IDtype = ['alias', 'ensembl.gene', 'entrezgene', 'name', 'pmid', 'refseq', 'uniprot']
IDtables = {k: pd.DataFrame(mg.querymany(uGenes, scopes='symbol', fields=k, species='human')) for k in IDtype}

INFO:biothings.client:querying 1-915...
INFO:biothings.client:done.
INFO:biothings.client:Finished.
INFO:biothings.client:Pass "returnall=True" to return complete lists of duplicate or missing query terms.
INFO:biothings.client:querying 1-915...
INFO:biothings.client:done.
INFO:biothings.client:Finished.
INFO:biothings.client:Pass "returnall=True" to return complete lists of duplicate or missing query terms.
INFO:biothings.client:querying 1-915...
INFO:biothings.client:done.
INFO:biothings.client:Finished.
INFO:biothings.client:Pass "returnall=True" to return complete lists of duplicate or missing query terms.
INFO:biothings.client:querying 1-915...
INFO:biothings.client:done.
INFO:biothings.client:Finished.
INFO:biothings.client:Pass "returnall=True" to return complete lists of duplicate or missing query terms.
INFO:biothings.client:querying 1-915...
INFO:biothings.client:done.
INFO:biothings.client:Finished.
INFO:biothings.client:Pass "returnall=True" to return complete lists of dupl

In [28]:
IDtables

{'alias':        query     _id     _score  \
 0    A3GALT2  127550  18.807713   
 1     A4GALT   53947  18.499306   
 2      A4GNT   51146  18.325245   
 3       ABL1      25  17.682743   
 4       ABL2      27  17.656471   
 ..       ...     ...        ...   
 915     YOD1   55432  17.776705   
 916    USP19   10869  17.654112   
 917      nan     NaN        NaN   
 918      nan     NaN        NaN   
 919      nan     NaN        NaN   
 
                                                  alias notfound  
 0                            [A3GALT2P, IGB3S, IGBS3S]      NaN  
 1         [A14GALT, A4GALT1, Gb3S, P(k), P1, P1PK, PK]      NaN  
 2                                            alpha4GnT      NaN  
 3    [ABL, BCR-ABL, CHDSKM, JTK7, bcr/abl, c-ABL, c...      NaN  
 4                                          [ABLL, ARG]      NaN  
 ..                                                 ...      ...  
 915                            [DUBA8, OTUD2, PRO0907]      NaN  
 916                 

In [29]:
# Add uGenes into the IDtables dictionary
IDtables['geneList'] = pd.DataFrame({"SYMBOL": uGenes})


In [30]:
IDtables

{'alias':        query     _id     _score  \
 0    A3GALT2  127550  18.807713   
 1     A4GALT   53947  18.499306   
 2      A4GNT   51146  18.325245   
 3       ABL1      25  17.682743   
 4       ABL2      27  17.656471   
 ..       ...     ...        ...   
 915     YOD1   55432  17.776705   
 916    USP19   10869  17.654112   
 917      nan     NaN        NaN   
 918      nan     NaN        NaN   
 919      nan     NaN        NaN   
 
                                                  alias notfound  
 0                            [A3GALT2P, IGB3S, IGBS3S]      NaN  
 1         [A14GALT, A4GALT1, Gb3S, P(k), P1, P1PK, PK]      NaN  
 2                                            alpha4GnT      NaN  
 3    [ABL, BCR-ABL, CHDSKM, JTK7, bcr/abl, c-ABL, c...      NaN  
 4                                          [ABLL, ARG]      NaN  
 ..                                                 ...      ...  
 915                            [DUBA8, OTUD2, PRO0907]      NaN  
 916                 

In [32]:
import pyreadr

In [33]:
result = pyreadr.read_r('geneTable.RDS')

In [34]:
result

OrderedDict([(None,
                    SYMBOL                                              ALIAS  \
              0    A3GALT2                      A3GALT2P,IGB3S,IGBS3S,A3GALT2   
              1     A4GALT        A14GALT,A4GALT1,Gb3S,P(k),P1,P1PK,PK,A4GALT   
              2      A4GNT                                    alpha4GnT,A4GNT   
              3       ABL1  ABL,BCR-ABL,CHDSKM,JTK7,bcr/abl,c-ABL,c-ABL1,p...   
              4       ABL2                                      ABLL,ARG,ABL2   
              ..       ...                                                ...   
              904    XYLT1       DBQD2,PXYLT1,XT-I,XT1,XTI,XYLTI,xylT-I,XYLT1   
              905    XYLT2                 PXYLT2,SOS,XT-II,XT2,xylT-II,XYLT2   
              906     YKT6                                               YKT6   
              907     YOD1                           DUBA8,OTUD2,PRO0907,YOD1   
              908    USP19                                       ZMYND9,USP19   
        