In [1]:
import pandas as pd
import json
# import numpy as np
from os import listdir
from ete3 import Tree


In [41]:
#### get list of files for relax folder ####
path="/home/maeperez/scratch/Clams/hyphy2.5/absrel"
path_out="/home/maeperez/projects/def-bacc/maeperez/Clams/VesicSymb_Evolution/dowstream_analyses_and_data/"

genes=[]
for file in listdir(path):
    if file[-5:]==".json":
        genes+=[str(file.split('.')[0])]

genes=sorted(genes)

pval_threshold=0.05

#### init all dictionaries and functions ####
branches_dic={'Bathy_MS2': 'Bathy',
 'SUP05_SP6': 'SUP05',
 'R_fausta_': 'R.fausta',
 'R_magnifi': 'R.magnifica',
 'R_pacific': 'R.pacifica',
 'R_phaseol': 'R.phaseoliformis',
 'R_pliocar': 'R.pliocardia',
 'R_rectima': 'R.rectimargo',
 'R_southwa': 'R.southwardae',
 'V_diagona': 'V.diagonalis',
 'V_extenta': 'V.extenta',
 'V_gigas1_': 'V.gigas1',
 'V_gigas2_': 'V.gigas2',
 'V_marissi': 'V.marissinica',
 'V_okutani': 'V.okutanii',
 'V_soyoae1': 'V.soyoae1',
 'V_soyoae2': 'V.soyoae2'}

fl=['Bathy_MS2', 'SUP05_SP6']
ruthia=['R_fausta_', 'R_magnifi', 'R_pacific', 'R_phaseol', 'R_pliocar', 'R_rectima', 'R_southwa']
gigas=['V_diagona', 'V_extenta', 'V_gigas1_', 'V_gigas2_', 'V_marissi', 'V_okutani', 'V_soyoae1', 'V_soyoae2']
partitions={'a':[set(fl),set(branches_dic.keys())-set(fl)],
    'b':[set(ruthia),set(branches_dic.keys())-set(ruthia)],
    'c':[set(gigas),set(branches_dic.keys())-set(gigas)],
    'd':[set(['V_okutani', 'V_soyoae1', 'V_soyoae2']),set(branches_dic.keys())-set(['V_okutani', 'V_soyoae1', 'V_soyoae2'])],
    'e':[set(['V_diagona', 'V_extenta', 'V_gigas1_', 'V_gigas2_']),set(branches_dic.keys())-set(['V_diagona', 'V_extenta', 'V_gigas1_', 'V_gigas2_'])],
    'f':[set(['V_diagona', 'V_extenta']),set(branches_dic.keys())-set(['V_diagona', 'V_extenta'])],
    'g':[set(['R_fausta_', 'R_pacific', 'R_phaseol', 'R_pliocar', 'R_rectima', 'R_southwa']),set(branches_dic.keys())-set(['R_fausta_', 'R_pacific', 'R_phaseol', 'R_pliocar', 'R_rectima', 'R_southwa'])],
    'h':[set(['R_fausta_', 'R_pacific', 'R_phaseol', 'R_rectima']),set(branches_dic.keys())-set(['R_fausta_', 'R_pacific', 'R_phaseol', 'R_rectima'])],
    'i':[set(['R_fausta_', 'R_pacific', 'R_rectima']),set(branches_dic.keys())-set(['R_fausta_', 'R_pacific', 'R_rectima'])],
    'j':[set(['R_pliocar', 'R_southwa']),set(branches_dic.keys())-set(['R_pliocar', 'R_southwa'])],
    'k':[set(['V_soyoae1', 'V_soyoae2']),set(branches_dic.keys())-set(['V_soyoae1', 'V_soyoae2'])],
    'l':[set(['V_gigas1_', 'V_gigas2_']),set(branches_dic.keys())-set(['V_gigas1_', 'V_gigas2_'])],
    'm':[set(['V_diagona', 'V_extenta', 'V_gigas1_', 'V_gigas2_', 'V_okutani', 'V_soyoae1', 'V_soyoae2']),set(branches_dic.keys())-set(['V_diagona', 'V_extenta', 'V_gigas1_', 'V_gigas2_', 'V_okutani', 'V_soyoae1', 'V_soyoae2'])],
    'n':[set(['R_fausta_', 'R_rectima']),set(branches_dic.keys())-set(['R_fausta_', 'R_rectima'])]}

def find_node_partition(br,tree_str):
    t = Tree(tree_str+';', format=1) # load ete3 trees from tree string
    sup05=[l for l in t.get_leaf_names() if 'SUP05' in l][0]
    bathy=[l for l in t.get_leaf_names() if 'MS2017' in l][0]
    ancestor = t.get_common_ancestor(sup05,bathy)
    t.set_outgroup(ancestor) # set ancestor of Sup05 & Bathy as root
    node=t&br # go to node in interest
    leaves=set([l[:9] for l in node.get_leaf_names()]) # get all leaves for that node    
    category='NA'
    for k,v in partitions.items(): # find category associated to that leaves set
        if leaves in v:
            category=k
            break
    if category=='NA': # in tree does not follow main, topology, try to assess in selection is within broader groups
#         print(br, leaves)
#         print(set(branches_dic.keys())-leaves)
        if len(set(leaves)-set(ruthia))==0:
            category='Ruthia'
        elif len(set(leaves)-set(gigas))==0:
            category='Gigas'
        elif len(set(leaves)-set(gigas+ruthia))==0:
            category='Symbionts'
#     print(category)
#     print(t.get_ascii(show_internal=True))
    return(category)

df=pd.DataFrame([],columns=['gene_id']+list(partitions.keys())+list(branches_dic.values())+['Ruthia','Gigas','Symbionts','NA'])

#####################

branches=[]
for gene in genes:
#     print(gene)
    remote_file = open(path+'/'+gene+'.absrel.json')
    absrel_json = json.load(remote_file)
    if pval_threshold==0.05:
        if absrel_json['test results']["positive test results"]==0:
            continue
    new_row={'gene_id':gene}
    for branch in absrel_json['branch attributes']['0'].keys():
        branches+=[branch]
        branch_pval=absrel_json['branch attributes']['0'][branch]['Corrected P-value']
        if branch_pval<pval_threshold:
            br=branch[:9]
#             print(br)
            if br[:4]=='Node':
                tree_str=absrel_json['input']['trees']['0']
                cat=find_node_partition(br,tree_str)
                new_row[cat]=1
            else:
                cat=branches_dic[br]
                new_row[cat]=1
    if len(new_row.keys())>1:
        df=df.append(new_row,ignore_index=True)

####### Edit table to add different combinations of categories

df['All']=df.sum(axis=1)
df['All non terminal branches']=df[['Gigas','Ruthia','Symbionts','NA','a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j',
                                   'k', 'l','m', 'n']].sum(axis=1) # selection on branches including these that are not part of main tree
df['Free living']=df[['Bathy','SUP05']].sum(axis=1)
df['Gigas']+=df[['d', 'e', 'f', 'k', 'l','m','V.diagonalis', 'V.extenta',
               'V.gigas1', 'V.gigas2', 'V.marissinica','V.okutanii', 'V.soyoae1',
               'V.soyoae2']].sum(axis=1)
df['Ruthia']+=df[['g', 'h', 'i', 'j', 'n', 'R.fausta', 'R.magnifica', 'R.pacifica',
       'R.phaseoliformis', 'R.pliocardia', 'R.rectimargo', 'R.southwardae']].sum(axis=1)
df['Symbionts']+=df[['Gigas','Ruthia','b','c']].sum(axis=1)
df['All groups']=df[['Gigas','Ruthia','Free living']].sum(axis=1)
df=df[['gene_id','All','All groups','Symbionts','Gigas','Ruthia','Free living','All non terminal branches',
      'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l','m', 'n', 
      'Bathy', 'SUP05', 'R.fausta', 'R.magnifica', 'R.pacifica',
       'R.phaseoliformis', 'R.pliocardia', 'R.rectimargo', 'R.southwardae',
       'V.diagonalis', 'V.extenta', 'V.gigas1', 'V.gigas2', 'V.marissinica',
       'V.okutanii', 'V.soyoae1', 'V.soyoae2']]
df=df.fillna(0)

df.to_csv(path_out+'absrel_results_pval_'+str(pval_threshold)+'.txt',header=True,index=False,sep='\t')

In [36]:
df

Unnamed: 0,gene_id,All,All groups,Symbionts,Gigas,Ruthia,Free living,All non terminal branches,a,b,...,R.rectimargo,R.southwardae,V.diagonalis,V.extenta,V.gigas1,V.gigas2,V.marissinica,V.okutanii,V.soyoae1,V.soyoae2
0,Rmag_0002,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,...,0,0,0,0,0,0,1,0,0,0
1,Rmag_0006,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0,1,...,0,0,0,0,0,0,0,0,0,0
2,Rmag_0010,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Rmag_0015,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,Rmag_0018,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
180,Rmag_1025,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0,1,...,0,0,0,0,0,0,0,0,0,0
181,Rmag_1032,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0,0,...,0,0,0,0,0,0,0,0,0,0
182,Rmag_1044,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,...,0,0,0,0,1,0,0,0,1,0
183,Rmag_1055,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0,0,...,0,0,0,0,0,0,0,0,0,0
