In [1]:
import pandas as pd
import ete3
import re
import community
import networkx as nx
import itertools
import numpy as np
from sklearn import manifold
from scipy.spatial.distance import squareform, pdist
from matplotlib import pyplot as plt
import seaborn as sns
import igraph as ig
import plotly
import chart_studio.plotly as ptl
import plotly.graph_objects as go
import colorlover as cl
from IPython.display import HTML
import multiprocessing
import pickle as pkl
import random
from sklearn import mixture
from collections import Counter
import random
from scipy.stats import mannwhitneyu

import os

# Chart Studio credentials are read from the environment rather than being
# committed with the notebook. Export PLOTLY_API_KEY (and optionally
# PLOTLY_USERNAME) before starting the kernel; a missing key raises KeyError
# here instead of failing later during an upload.
# NOTE(review): an API key was previously hard-coded on this line; treat that
# key as leaked and rotate it.
ptl.sign_in(os.environ.get('PLOTLY_USERNAME', 'lthiberiol'),
            os.environ['PLOTLY_API_KEY'])

# ete3 NCBI taxonomy handle used below for lineage and rank look-ups.
ncbi = ete3.NCBITaxa()

%cd /work/eggNOG/

/work/eggNOG


In [2]:
# Tab-separated table of the sampled genomes; the first column becomes the
# index. The `species_taxid` column is consumed by the lineage cell below.
genomes_path    = '/work/kelsey/genomes.tab'
sampled_genomes = pd.read_csv(genomes_path, sep='\t', index_col=0)

In [3]:
# Build one {rank: taxid} record per distinct species taxid in the sampled
# genomes, then assemble the table in a single step. (Growing a DataFrame
# with `DataFrame.append` inside the loop is quadratic and that method no
# longer exists in pandas >= 2.0.)
lineage_records = []
for taxid in sampled_genomes.species_taxid.unique():
    if pd.isna(taxid):
        continue
    rank_by_taxid = ncbi.get_rank(ncbi.get_lineage(taxid))
    # Invert taxid -> rank into rank -> taxid; when several lineage nodes
    # carry the same rank label, the last one iterated wins.
    lineage_records.append({tax_rank: tmp_taxid
                            for tmp_taxid, tax_rank in rank_by_taxid.items()})

lineages = pd.DataFrame(lineage_records)

# Keep only the ranks of interest (missing ranks become NaN columns) ...
lineages = lineages.reindex(columns=['class', 'family',  'genus', 'phylum',
                                     'order', 'species', 'superkingdom']).copy()
# ... and only rows whose superkingdom taxid is 2
# (presumably Bacteria in the NCBI taxonomy -- confirm).
lineages = lineages.query('superkingdom == 2').copy()

In [4]:
# eggNOG taxid table: tab-separated, '#'-prefixed lines skipped, no header
# row in the file, so column names are supplied here; 'Taxid' is the index.
taxid_info_columns = ['Taxid', 'Sci.Name', 'Rank', 'Named Lineage', 'Taxid Lineage']
eggNOG_sample = pd.read_csv(
    'e5.bacteria.taxid_info.tsv',
    sep='\t',
    comment='#',
    names=taxid_info_columns,
    header=None,
    index_col=0,
)

In [5]:
# Build one {rank: taxid} record per eggNOG taxid and create the table once,
# indexed by the eggNOG taxid so later cells can look rows up with `.loc`.
# (Appending a Series per iteration with `DataFrame.append` is quadratic and
# that method no longer exists in pandas >= 2.0.)
lineage_taxids  = []
lineage_records = []
for taxid in eggNOG_sample.index.unique():
    if pd.isna(taxid):
        continue
    rank_by_taxid = ncbi.get_rank(ncbi.get_lineage(taxid))
    lineage_taxids.append(taxid)
    # Invert taxid -> rank into rank -> taxid; when several lineage nodes
    # carry the same rank label, the last one iterated wins.
    lineage_records.append({tax_rank: tmp_taxid
                            for tmp_taxid, tax_rank in rank_by_taxid.items()})

eggNOG_lineage = pd.DataFrame(lineage_records, index=lineage_taxids)

# Keep only the ranks of interest (missing ranks become NaN columns) ...
eggNOG_lineage = eggNOG_lineage.reindex(columns=['class', 'family',  'genus', 'phylum',
                                                 'order', 'species', 'superkingdom']).copy()
# ... and only rows whose superkingdom taxid is 2
# (presumably Bacteria in the NCBI taxonomy -- confirm).
eggNOG_lineage = eggNOG_lineage.query('superkingdom == 2').copy()


taxid 1344012 was translated into 480813


taxid 443255 was translated into 1901


taxid 1525715 was translated into 1545044


taxid 861530 was translated into 29382


taxid 1317118 was translated into 1379903


taxid 67281 was translated into 67351


taxid 1353531 was translated into 1708715


taxid 1288963 was translated into 1232681


taxid 1345697 was translated into 1921421


taxid 1552758 was translated into 1885902


taxid 469595 was translated into 1639133


taxid 469596 was translated into 100884


taxid 1434929 was translated into 1820025


taxid 1104325 was translated into 1158600


taxid 911239 was translated into 122355


taxid 265729 was translated into 246786


taxid 1122931 was translated into 1203610


taxid 1118055 was translated into 33037


taxid 1219084 was translated into 1123384


taxid 667632 was translated into 863227


taxid 1166016 was translated into 1905730


taxid 1408427 was translated into 1094555


taxid 520709 was translated into 1530123


taxid 13362

In [6]:
# Orthologous-group membership table: tab-separated, no header row; only file
# columns 1-4 are read and named here.
eggNOG_groups = pd.read_csv('2_members.tsv',
                            sep='\t',
                            header=None,
                            usecols=[1,2,3,4],
                            names=['group_id', 'num_proteins', 'num_taxa', 'members'])

# Captures the run of digits immediately before a '.' in each member entry
# (entries are matched up to the next comma). Written as a raw string so that
# `\d` and `\.` are not treated as (invalid) Python string escapes, and
# compiled once instead of on every row.
member_taxid_pattern = re.compile(r'(\d+)\.(?:[^,]+)')

# Add a `taxa` column holding, per group, the list of integer taxids parsed
# from `members` (one entry per matched member, duplicates kept).
eggNOG_groups = eggNOG_groups.assign(
    taxa=eggNOG_groups.members.map(
        lambda cell: [int(taxid) for taxid in member_taxid_pattern.findall(cell)]
    )
)

In [7]:
def get_phyla_overlap(taxa):
    """Return the phyla shared by a set of eggNOG taxa and the sampled genomes.

    Parameters
    ----------
    taxa : list
        Index labels of `eggNOG_lineage` (the taxids of one group's members).

    Returns
    -------
    set
        Distinct `phylum` values found for `taxa` in the global
        `eggNOG_lineage` table that also occur in the `phylum` column of the
        global `lineages` table.
    """
    phyla_in_group = eggNOG_lineage.loc[taxa, 'phylum'].unique()
    phyla_sampled  = lineages.phylum.unique()
    return set(phyla_in_group).intersection(phyla_sampled)

# Keep the groups whose members cover more than one phylum that is also
# present among the sampled genomes.
spans_multiple_phyla = eggNOG_groups.taxa.map(
    lambda cell: len(get_phyla_overlap(cell)) > 1
)
eggNOG_target_groups = eggNOG_groups[spans_multiple_phyla]

In [8]:
# Tree table: tab-separated, no header row; file columns 1-3 are read and
# named here, with `group_id` as the index. The result is then re-indexed to
# the target groups, in their order (groups absent from the file yield
# all-NaN rows).
tree_columns = ['group_id', 'fast', 'tree']
eggNOG_trees = pd.read_csv(
    '2_trees.tsv',
    sep='\t',
    header=None,
    usecols=[1,2,3],
    index_col=0,
    names=tree_columns,
).reindex(index=eggNOG_target_groups.group_id)

In [33]:
# Retain groups with fewer than three proteins per taxon on average
# (num_proteins < 3 * num_taxa), then pull the matching tree rows.
under_three_per_taxon = (eggNOG_target_groups.num_proteins
                         < eggNOG_target_groups.num_taxa * 3)
working_groups = eggNOG_target_groups[under_three_per_taxon]
working_trees  = eggNOG_trees.loc[working_groups.group_id]

In [42]:
# Persist the working tables as gzip-compressed parquet files, written with
# the fastparquet engine.
for frame, parquet_path in ((working_groups, 'working_eggNOG_groups.parquet'),
                            (working_trees,  'working_eggNOG_trees.parquet'),
                            (eggNOG_lineage, 'eggNOG_taxonomy.parquet')):
    frame.to_parquet(parquet_path, compression='gzip', engine='fastparquet')

In [39]:
# NOTE(review): this cell is an older copy (lower execution count) of the
# export cell directly above it and writes the same three files. It used to
# omit `engine=`, so a top-to-bottom run would overwrite the fastparquet files
# with whatever pandas' default engine is. The engine is now stated explicitly
# so both cells produce the same files; consider deleting this cell.
working_groups.to_parquet('working_eggNOG_groups.parquet', compression='gzip', engine='fastparquet')
working_trees.to_parquet( 'working_eggNOG_trees.parquet',  compression='gzip', engine='fastparquet')
eggNOG_lineage.to_parquet('eggNOG_taxonomy.parquet', compression='gzip', engine='fastparquet')