<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Obtain-Ensembl-IDs" data-toc-modified-id="Obtain-Ensembl-IDs-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Obtain Ensembl IDs</a></span></li><li><span><a href="#Read-in-Ensembl-to-Uniprot-mapping-on-perform-conversion" data-toc-modified-id="Read-in-Ensembl-to-Uniprot-mapping-on-perform-conversion-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Read in Ensembl to Uniprot mapping on perform conversion</a></span></li><li><span><a href="#Read-in-Uniref-to-KEGG-mapping" data-toc-modified-id="Read-in-Uniref-to-KEGG-mapping-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Read in Uniref to KEGG mapping</a></span></li><li><span><a href="#Perform-Ensembl-to-KEGG-conversion" data-toc-modified-id="Perform-Ensembl-to-KEGG-conversion-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>Perform Ensembl to KEGG conversion</a></span></li></ul></div>

In [1]:
!which python

/mnt/home/jmorton/miniconda3/envs/qiime2-2021.4/bin/python


In [13]:
from Bio import SeqIO
import skbio
import pandas as pd

In [20]:
# from woltka
# https://github.com/qiyunzhu/woltka/blob/master/woltka/file.py
from os import listdir
from os.path import basename, dirname, splitext, isfile, join
from shutil import which
from subprocess import Popen, PIPE
import gzip
import bz2
import lzma


zipfmts = {'.gz':   'gzip', '.gzip':   'gzip',
           '.bz2': 'bzip2', '.bzip2': 'bzip2',
           '.xz':     'xz', '.lz':       'xz', '.lzma': 'xz'}
ziplibs = {'gzip': gzip, 'bzip2': bz2, 'xz': lzma}


def openzip(fp, mode='rt'):
    """Open a regular or compressed file by matching filename extension to
    proper library.
    Parameters
    ----------
    fp : str
        Input filepath.
    mode : str, optional
        Python file mode. Default: "rt" (read as text).
    Returns
    -------
    file handle
        Text stream ready to be read.
    Notes
    -----
    This is a simple and universal solution which uses Python's built-in
    compression modules. It supports reading and writing. However it is not as
    fast as `readzip` in reading compressed files.
    See Also
    --------
    readzip
    """
    ext = splitext(fp)[1]
    zipper = getattr(ziplibs[zipfmts[ext]], 'open') if ext in zipfmts else open
    return zipper(fp, mode)

# Obtain Ensembl IDs

In [34]:
fname = '/mnt/home/jmorton/ceph/sfari/data/recount3/ensembl_ids.txt'
ensembl_ids = pd.read_table(fname, header=None).values.ravel()
ensembl_ids = list(map(lambda x: x.split('.')[0], ensembl_ids))

# Read in Ensembl to Uniprot mapping on perform conversion

In [45]:
ensembl2uniprot = pd.read_table('/mnt/home/jmorton/databases/ensembl/Homo_sapiens.GRCh38.104.uniprot.tsv')
ensembl2uniprot = ensembl2uniprot.set_index('gene_stable_id')

In [47]:
# only look at ensembl ids in uniprot
ensembl_ids = list(set(ensembl_ids) & set(ensembl2uniprot.index))

In [49]:
ensembl2uniprot = ensembl2uniprot.loc[ensembl_ids].reset_index()

In [50]:
ensembl2uniprot

Unnamed: 0,gene_stable_id,transcript_stable_id,protein_stable_id,xref,db_name,info_type,source_identity,xref_identity,linkage_type
0,ENSG00000088888,ENST00000416600,ENSP00000413749,Q7Z434,Uniprot/SWISSPROT,DIRECT,-,-,-
1,ENSG00000088888,ENST00000416600,ENSP00000413749,Q7Z434-4,Uniprot_isoform,DIRECT,-,-,-
2,ENSG00000088888,ENST00000428216,ENSP00000401980,Q7Z434,Uniprot/SWISSPROT,DIRECT,100,100,-
3,ENSG00000088888,ENST00000428216,ENSP00000401980,Q7Z434-1,Uniprot_isoform,DIRECT,-,-,-
4,ENSG00000151929,ENST00000369085,ENSP00000358081,O95817,Uniprot/SWISSPROT,DIRECT,100,100,-
...,...,...,...,...,...,...,...,...,...
129776,ENSG00000167900,ENST00000301634,ENSP00000301634,A0A384MDV9,Uniprot/SPTREMBL,SEQUENCE_MATCH,100,100,-
129777,ENSG00000167900,ENST00000588734,ENSP00000468425,K7ERV3,Uniprot/SPTREMBL,DIRECT,100,100,-
129778,ENSG00000167900,ENST00000590862,ENSP00000468556,K7ES52,Uniprot/SPTREMBL,DIRECT,100,100,-
129779,ENSG00000167900,ENST00000590430,ENSP00000467121,K7ENW5,Uniprot/SPTREMBL,DIRECT,100,100,-


In [28]:
!ls /mnt/home/jmorton/ceph/sfari/data/recount3/ensembl_ids.txt

age_sex_matched_posterior  filter_genes.py	table.biom.qza
diff_abs.sh		   qiime_diff_abs.sh	table_filtered.biom
diff_abs.sh~		   sample_metadata.txt
ensembl_ids.txt		   table.biom


In [31]:
from Bio import SeqIO
fname = '/mnt/home/jmorton/ceph/seq-databases/swissprot/uniprot_sprot.dat'
gen = SeqIO.parse(fname, format='swiss')

In [32]:
def kegg_uniprot_f(y):
    kegg_toks = list(filter(lambda x: 'KEGG' in x, y.dbxrefs))
    swiss_toks = list(filter(lambda x: 'Swiss' in x, y.dbxrefs))
    kegg_toks = list(map(lambda x: x.split('KEGG:')[-1], kegg_toks))
    swiss_toks = list(map(lambda x: x.split(':')[-1], swiss_toks))

    res = []
    for w in swiss_toks:
        for k in kegg_toks:
            res.append((w, k))
    return res

In [33]:
ku = map(kegg_uniprot_f, gen)
ku = sum(ku, [])

In [34]:
import pandas as pd
ku = pd.DataFrame(ku)

In [35]:
ku.to_csv('../results/ensembl')

# Read in Uniref to KEGG mapping

In [73]:
import os, xml.etree.ElementTree as ET
fname = '/mnt/home/jmorton/ceph/seq-databases/swissprot/uniprot_sprot.xml'
tree = ET.parse(fname)
root = tree.getroot()
children = root.getchildren()

def get_uniref_kegg(x):
    if x.find('{http://uniprot.org/uniprot}accession') is None:
        return []
    uniref = x.find('{http://uniprot.org/uniprot}accession').text
    kos = []
    for y in x.getchildren():
        lookup = dict(y.items())
        if 'type' in lookup and 'KEGG' in lookup['type']:
            kos.append((uniref, lookup['id']))
    return kos

uniref2ko = list(map(get_uniref_kegg, children))

keggfile = '/mnt/home/jmorton/ceph/seq-databases/swissprot/uniprot2kegg.csv'
uniref2ko = sum(uniref2ko, [])
df = pd.DataFrame(uniref2ko)
df.to_csv(keggfile, header=None, index=False, sep='\t')

In [178]:
uniref2ko = df
uniref2ko.columns = ['Uniref', 'KO']

# Perform Ensembl to KEGG conversion

In [180]:
ensembl2ko = pd.merge(ensembl2uniprot, uniref2ko, left_on='xref', right_on='Uniref')

In [182]:
fname = '/mnt/home/jmorton/ceph/sfari/data/recount3/ensembl2kegg.txt'
ensembl2ko.to_csv(fname, sep='\t')