In [36]:
from matplotlib import pyplot as plt
import seaborn as sns
import pandas  as pd
import numpy   as np
import os
import re
import itertools
from scipy.spatial.distance import squareform
from collections import Counter

# NOTE(review): hardcoded absolute working directory. Every relative path used
# by the cells below ('2157_members.tsv', 'trees/', ...) is resolved against it.
%cd /work/clusterEvo/new_tests/eggNOG/

/work/clusterEvo/new_tests/eggNOG


In [85]:
# Load the eggNOG member table: no header row, keeping file columns 1-4 as
# group id, protein count, taxon count and the comma-separated member list.
eggNOG_groups = pd.read_csv('2157_members.tsv',
                            sep='\t',
                            header=None,
                            usecols=[1,2,3,4],
                            names=['group_id', 'num_proteins', 'num_taxa', 'members'])

# Every member is written as "<digits>.<protein id>"; capture the leading
# digits (used as the taxon id) and consume the rest of the entry up to the
# next comma. A raw string is required here: '\d' and '\.' inside a plain
# literal are invalid escape sequences and raise a Deprecation/SyntaxWarning.
member_taxid_pattern = re.compile(r'(\d+)\.(?:[^,]+)')

# One list of integer taxon ids per group, in the order the members appear.
# `tmp` stays bound at module level in case later cells still refer to it.
tmp           = eggNOG_groups.members.map(
    lambda cell: [int(taxid) for taxid in member_taxid_pattern.findall(cell)]
).rename('taxa')
eggNOG_groups = eggNOG_groups.join(tmp)

In [86]:
# Single-copy candidates: groups holding exactly as many proteins as taxa,
# restricted to groups that cover at least 100 taxa.
single_copy = eggNOG_groups[
    (eggNOG_groups.num_proteins == eggNOG_groups.num_taxa)
    & (eggNOG_groups.num_taxa >= 100)
]

# Persist the selection as a tab-separated file (index included), then
# preview the first rows as the cell output.
single_copy.to_csv('single_copy_genes.tsv', sep='\t')
single_copy.head()

Unnamed: 0,group_id,num_proteins,num_taxa,members,taxa
1277,arCOG00079,112,112,"1041930.Mtc_1976,1054217.TALC_01539,1070774.J0...","[1041930, 1054217, 1070774, 1071085, 1094980, ..."
1279,arCOG00081,166,166,"1006006.Mcup_0318,1033806.HTIA_0777,1041930.Mt...","[1006006, 1033806, 1041930, 1042877, 1054217, ..."
1519,arCOG00354,112,112,"1033806.HTIA_1426,1041930.Mtc_0671,1042877.GQS...","[1033806, 1041930, 1042877, 1070774, 1071085, ..."
1520,arCOG00355,106,106,"1033806.HTIA_0952,1041930.Mtc_0275,1042877.GQS...","[1033806, 1041930, 1042877, 1070774, 1071085, ..."
1561,arCOG00410,168,168,"1006006.Mcup_0164,1033806.HTIA_1385,1041930.Mt...","[1006006, 1033806, 1041930, 1042877, 1054217, ..."


In [89]:
# Flatten the per-group taxon lists into one flat list: one entry per
# (group, member) pair, so a taxon is repeated for every group it occurs in.
taxa = [taxon for group_taxa in single_copy.taxa for taxon in group_taxa]

In [90]:
# Occurrences of each taxon across the selected groups, most frequent first;
# ties keep the order in which the taxa were first encountered.
Counter(taxa).most_common()

[(1227499, 91),
 (348780, 91),
 (358396, 91),
 (797210, 91),
 (797304, 91),
 (1114856, 90),
 (1227500, 90),
 (1230457, 90),
 (1230460, 90),
 (1293047, 90),
 (1293048, 90),
 (469382, 90),
 (543526, 90),
 (662479, 90),
 (797299, 90),
 (797303, 90),
 (1041930, 89),
 (1071085, 89),
 (1121945, 89),
 (1202768, 89),
 (1227453, 89),
 (1227487, 89),
 (1227488, 89),
 (1227495, 89),
 (1227497, 89),
 (222984, 89),
 (29540, 89),
 (304371, 89),
 (406552, 89),
 (485914, 89),
 (519442, 89),
 (523841, 89),
 (547559, 89),
 (634497, 89),
 (644281, 89),
 (694430, 89),
 (1227457, 88),
 (1333523, 88),
 (1457250, 88),
 (268739, 88),
 (309800, 88),
 (413816, 88),
 (521011, 88),
 (523845, 88),
 (573063, 88),
 (573064, 88),
 (589924, 88),
 (797114, 88),
 (797302, 88),
 (1197130, 87),
 (1261545, 87),
 (1324957, 87),
 (267377, 87),
 (323259, 87),
 (362976, 87),
 (402880, 87),
 (416348, 87),
 (419665, 87),
 (426368, 87),
 (444158, 87),
 (579137, 87),
 (593750, 87),
 (647113, 87),
 (693661, 87),
 (751944, 87),
 (79

In [26]:
# Read the tree table (no header row; file columns 1-3 become group_id, fast
# and tree, with group_id as the index) and immediately align it to the
# single-copy selection: rows follow the order of single_copy.group_id, and
# any group without a row in the file ends up with missing values.
eggNOG_trees = (
    pd.read_csv(
        '2157_trees.tsv',
        sep='\t',
        header=None,
        usecols=[1, 2, 3],
        index_col=0,
        names=['group_id', 'fast', 'tree'],
    )
    .reindex(index=single_copy.group_id)
)

In [27]:
# Create the output directory when missing. exist_ok=True keeps the cell
# re-runnable and removes the check-then-create gap of isdir() + mkdir().
os.makedirs('trees/', exist_ok=True)

# Labels of the form "<digits>.<rest>" are shortened to "<digits>": the match
# starts at '(', ')' or ',' followed by digits and drops everything from the
# dot up to (but not including) the next ':'. Raw string, compiled once:
# '\d' / '\.' in a plain literal are invalid escapes that raise warnings.
# NOTE(review): since ')' is part of the character class, a numeric
# internal-node label such as ")0.95:" would be cut to ")0:" as well --
# confirm the trees carry no such labels.
leaf_label_pattern = re.compile(r'([(),]\d+)\.[^:]+')

# One Newick file per group, named after the group id.
# NOTE(review): eggNOG_trees was reindexed on the single-copy group ids, so a
# group missing from the tree file would carry NaN here and sub() would raise
# TypeError -- presumably every selected group has a tree; verify.
for index, row in eggNOG_trees.iterrows():
    with open(f'trees/{index}.nwk', 'w') as out:
        out.write(leaf_label_pattern.sub(r'\1', row.tree))

In [29]:
# Create the output directory when missing. exist_ok=True keeps the cell
# re-runnable and removes the check-then-create gap of isdir() + mkdir().
os.makedirs('alignments/', exist_ok=True)

# Build the group -> taxa lookup once. The previous per-group
# DataFrame.query() rescanned the whole table on every iteration and spliced
# the id into a query string, which breaks on ids containing a quote.
taxa_by_group = dict(zip(eggNOG_groups.group_id, eggNOG_groups.taxa))

for group_id in eggNOG_trees.index:
    with open(f'alignments/{group_id}.fas', 'w') as out:
        # One FASTA record per taxon whose sequence is a single gap character.
        # A group id absent from eggNOG_groups still yields an empty file, as
        # the query-based version did.
        for taxon in taxa_by_group.get(group_id, ()):
            out.write(f'>{taxon}\n-\n')