In [1]:
import pandas as pd

In [2]:
# coverage data
cov = pd.read_csv('/stor/work/Marcotte/project/rmcox/leca/ppi_ml/results/coverage/og_coverage_by_species_phylo.150p.filtdollo.csv')

# species names & clades (eukaryotes only)
species = pd.read_csv('/stor/work/Marcotte/project/rmcox/leca/ppi_ml/data/meta/speciesinfo_clades.csv')
# .copy() makes the filtered rows an independent frame, so the column
# assignment below neither warns (SettingWithCopyWarning) nor depends on
# whether the filter returned a view of the original table.
species = species[species.tax_group == 'eukaryota'].copy()
# lower-case the species codes with the vectorised string accessor
species['code'] = species['code'].str.lower()
species = species[['code', 'clade']]
print(species)

# ordered codes based on phylogeny; the context manager closes the file
# handle as soon as the lines have been read
with open('/stor/work/Marcotte/project/rmcox/leca/ppi_ml/data/meta/euk_codes_ordered_phylo.txt', 'r') as codes_file:
    species_ordered = [line.strip() for line in codes_file]

# clade order
clade_order = ['Amorphea','Excavate','TSAR','Archaeplastida']

     code           clade
0   arath  Archaeplastida
1   braol  Archaeplastida
2   brart        Amorphea
3   caeel        Amorphea
5   cansa  Archaeplastida
6   cerri  Archaeplastida
7   cheqi  Archaeplastida
8   chlre  Archaeplastida
9   cocnu  Archaeplastida
11  dicdi        Amorphea
12  drome        Amorphea
14  euggr        Excavate
16  human        Amorphea
17  maize  Archaeplastida
18  mouse        Amorphea
19  nemve        Amorphea
20  orysj  Archaeplastida
21  phatc            TSAR
22    pig        Amorphea
23  plaba            TSAR
24  plaf7            TSAR
25  plakh            TSAR
27  selml  Archaeplastida
28  sollc  Archaeplastida
29  soybn  Archaeplastida
31  strpu        Amorphea
32  tetts            TSAR
33  tryb2        Excavate
34  wheat  Archaeplastida
35  xenla        Amorphea
36  yeast        Amorphea


In [3]:
# Count the rows of `species` per clade and expose the result as a plain
# {clade: count} dict, used as the denominator lookup for coverage fractions.
gb = species.groupby(['clade'])
species_counts = gb.size().to_frame(name='counts')
# Select the 'counts' column by label and convert it in one step. The earlier
# per-clade `species_counts.loc[clade][0]` used an integer key as a position
# on a label-indexed Series, which pandas has deprecated.
species_dict = species_counts['counts'].to_dict()
species_dict

{'Amorphea': 11, 'Archaeplastida': 13, 'Excavate': 2, 'TSAR': 5}

In [4]:
def calc_frac(n, clade, d):
    """Return ``n`` as a fraction of the total stored for ``clade``.

    Parameters
    ----------
    n : number
        Observed count for ``clade``.
    clade : hashable
        Key looked up in ``d``.
    d : dict
        Maps each clade to the total used as the denominator.

    Returns
    -------
    number
        ``0`` when ``n`` is zero (``d`` is not consulted in that case),
        otherwise ``n / d[clade]`` rounded to 3 decimal places.

    Raises
    ------
    KeyError
        If ``n`` is non-zero and ``clade`` is not a key of ``d``. Previously
        the "not found" text was used as the denominator, so the failure
        surfaced as an unrelated ``TypeError`` from the division.
    """
    if n == 0:
        return 0
    if clade not in d:
        raise KeyError(f'{clade} not found')
    return round(n / d[clade], 3)

In [5]:
# Wide -> long: keep 'ID' as the identifier and stack every remaining column,
# then give the two melt-generated columns descriptive names.
cov_long = (
    cov
    .melt(id_vars=['ID'], value_vars=cov.columns[1:])
    .rename(columns={'variable':'species','value':'presence'})
)

In [6]:
# Attach the clade label to each long-form row by matching its 'species'
# value against the 'code' column of `species`; the left join keeps every
# row of `cov_long` even when no code matches.
cov_clade = pd.merge(
    cov_long,
    species,
    how='left',
    left_on='species',
    right_on='code',
)

In [7]:
# Sum 'presence' within each (ID, clade) group. The column is selected
# explicitly: the earlier `.sum('presence')` passed the string as the
# `numeric_only` flag, which only worked because a non-empty string is truthy
# and would also have summed any other numeric column present.
counts = cov_clade.groupby(['ID', 'clade'], as_index=False)['presence'].sum()

# Convert each group's sum into a fraction of that clade's species total.
counts['frac'] = [
    calc_frac(n_obs, clade, species_dict)
    for n_obs, clade in zip(counts['presence'], counts['clade'])
]
counts = counts.rename(columns={'presence':'n_species_obs'})

# Display: within each ID, highest fraction first.
counts.sort_values(['ID', 'frac'], ascending=[True,False])

Unnamed: 0,ID,clade,n_species_obs,frac
2,ENOG502QPHT,Excavate,1,0.500
0,ENOG502QPHT,Amorphea,0,0.000
1,ENOG502QPHT,Archaeplastida,0,0.000
3,ENOG502QPHT,TSAR,0,0.000
5,ENOG502QPHW,Archaeplastida,12,0.923
...,...,...,...,...
23947,KOG4849,TSAR,0,0.000
23948,KOG4850,Amorphea,9,0.818
23949,KOG4850,Archaeplastida,0,0.000
23950,KOG4850,Excavate,0,0.000


In [8]:
# Reshape to one row per ID with one column per clade holding `frac`,
# move ID back into a regular column, and write the table to CSV.
df = (
    counts
    .pivot(index="ID", columns="clade", values="frac")
    .reset_index()
)
df.to_csv('../ppi_ml/annotations/leca_nogs_cov_fraction.csv', index=False)