### Goal

Count number of CNEs betweeen every species pair and combine with evolutionary distance data

### Input

- 'pairwise_links.json': list of all one to one links between CNEs (for example: [sp1_cne_1, sp2_cne_13], [sp1_cne_2, sp4_cne_129], ...]) generated with merge_homologous_cnes.py
- node_file: matrix of name of node for each pair of species
- filtered_clusters.csv: file of CNE clusters post-convergent filtering (to retrieve CNEs that survived all filtering steps).

### Output

- cne_count_vs_dist_paired.tsv: dataframe of CNE counts VS evolutionary distance (pre-filtering)
- cne_count_vs_dist_paired_filtered.tsv: idem but post-filtering

In [1]:
import json
import pandas as pd
import glob
import csv
import itertools
from collections import defaultdict

In [2]:
pairwise_file = 'pairwise_links.json'

In [3]:
node_file = '../plot_evol_dist/cnidaria_evol_dist_matrix_no_space.csv'

In [4]:
cluster_file = '../post_parsimony_filtering/filtered_clusters.csv'

### Read pairwise links file

In [5]:
with open(pairwise_file) as json_file:
    pairwise_links = json.load(json_file)

In [6]:
type(pairwise_links)

list

In [7]:
len(pairwise_links)

542510

### Read node matrix

In [8]:
nodes_df = pd.read_csv(node_file, index_col=0)
nodes_df

Unnamed: 0,aaur,adig,amil,aten,chem,dgig,epal,hsym,hvul,mvir,nvec,ofav,pdam,spis
aaur,-,cnidaria,cnidaria,cnidaria,medusozoa,cnidaria,cnidaria,medusozoa,medusozoa,acraspeda,cnidaria,cnidaria,cnidaria,cnidaria
adig,cnidaria,-,acropora,hexacorallia,cnidaria,anthozoa,hexacorallia,cnidaria,cnidaria,cnidaria,hexacorallia,scleractinia,scleractinia,scleractinia
amil,cnidaria,acropora,-,hexacorallia,cnidaria,anthozoa,hexacorallia,cnidaria,cnidaria,cnidaria,hexacorallia,scleractinia,scleractinia,scleractinia
aten,cnidaria,hexacorallia,hexacorallia,-,cnidaria,anthozoa,enthemonae,cnidaria,cnidaria,cnidaria,actiniaria,hexacorallia,hexacorallia,hexacorallia
chem,medusozoa,cnidaria,cnidaria,cnidaria,-,cnidaria,cnidaria,leptothecata,hydrozoa,medusozoa,cnidaria,cnidaria,cnidaria,cnidaria
dgig,cnidaria,anthozoa,anthozoa,anthozoa,cnidaria,-,anthozoa,cnidaria,cnidaria,cnidaria,anthozoa,anthozoa,anthozoa,anthozoa
epal,cnidaria,hexacorallia,hexacorallia,enthemonae,cnidaria,anthozoa,-,cnidaria,cnidaria,cnidaria,actiniaria,hexacorallia,hexacorallia,hexacorallia
hsym,medusozoa,cnidaria,cnidaria,cnidaria,leptothecata,cnidaria,cnidaria,-,hydrozoa,medusozoa,cnidaria,cnidaria,cnidaria,cnidaria
hvul,medusozoa,cnidaria,cnidaria,cnidaria,hydrozoa,cnidaria,cnidaria,hydrozoa,-,medusozoa,cnidaria,cnidaria,cnidaria,cnidaria
mvir,acraspeda,cnidaria,cnidaria,cnidaria,medusozoa,cnidaria,cnidaria,medusozoa,medusozoa,-,cnidaria,cnidaria,cnidaria,cnidaria


In [9]:
# Using estimates from Hydractinia genome paper
div_times = {
    'cnidaria':570,
    'medusozoa':539,
    'acraspeda':466,
    'hexacorallia':341,
    'anthozoa':497,
    'scleractinia':161,
    'enthemonae': 'unknown', # no good reference found
    'actiniaria':242,
    'leptothecata':417,
    'hydrozoa':500,
    'robusta':108,
    'pocilloporidae':56
}

In [10]:
sp_list = list(nodes_df.index)
sp_list.remove('amil') # Previous version included another acropora species acropora millepora
sp_combs = list(itertools.combinations(set(sp_list), 2))

In [11]:
link_counts = defaultdict(lambda:0)
for link in pairwise_links:
    #print(link)
    cne1, cne2 = link[0], link[1]
    sp1, sp2 = cne1[:4], cne2[:4]
    sp_tup = (sp1, sp2)
    if (sp2, sp1) in link_counts:
        print('inverse tup: ', (sp2, sp1), 'is already here' )
        link_counts[(sp2, sp1)] +=1
    else:
        link_counts[sp_tup] +=1
# Add 0 for species pairs not seen:
for comb in sp_combs:
    sp1, sp2 = comb[0], comb[1]
#    if (comb in link_counts) and (sp2, sp1):
    if comb not in link_counts and (sp2, sp1) not in link_counts:
        link_counts[comb] = 0
print(len(link_counts))
print(len(sp_combs))

78
78


In [12]:
link_counts

defaultdict(<function __main__.<lambda>()>,
            {('adig', 'pdam'): 30448,
             ('dgig', 'pdam'): 1019,
             ('hsym', 'spis'): 488,
             ('mvir', 'hvul'): 851,
             ('mvir', 'ofav'): 212,
             ('epal', 'chem'): 182,
             ('nvec', 'dgig'): 867,
             ('hsym', 'nvec'): 174,
             ('dgig', 'chem'): 269,
             ('hvul', 'epal'): 161,
             ('nvec', 'pdam'): 724,
             ('aten', 'pdam'): 1578,
             ('ofav', 'dgig'): 1147,
             ('hsym', 'chem'): 12045,
             ('aten', 'aaur'): 305,
             ('mvir', 'pdam'): 153,
             ('mvir', 'epal'): 153,
             ('chem', 'aaur'): 186,
             ('hsym', 'mvir'): 14469,
             ('nvec', 'chem'): 72,
             ('adig', 'ofav'): 83234,
             ('aten', 'epal'): 10130,
             ('adig', 'aaur'): 187,
             ('aten', 'adig'): 4336,
             ('ofav', 'epal'): 1558,
             ('epal', 'spis'): 3083,
     

In [13]:
dist_df = pd.DataFrame(columns=['sp1', 'sp2', 'node', 'distance', 'cne_count'])
for sp_tup, count in link_counts.items():
    sp1, sp2 = sp_tup[0], sp_tup[1]
    lca = nodes_df.loc[sp1, sp2]
    evol_dist = div_times[lca]
    if evol_dist != 'unknown':
        new_row = [sp1, sp2, lca, evol_dist, count]
        dist_df.loc[len(dist_df)] = new_row

In [14]:
dist_df

Unnamed: 0,sp1,sp2,node,distance,cne_count
0,adig,pdam,scleractinia,161,30448
1,dgig,pdam,anthozoa,497,1019
2,hsym,spis,cnidaria,570,488
3,mvir,hvul,medusozoa,539,851
4,mvir,ofav,cnidaria,570,212
...,...,...,...,...,...
72,hvul,aaur,medusozoa,539,581
73,mvir,spis,cnidaria,570,320
74,adig,mvir,cnidaria,570,261
75,pdam,chem,cnidaria,570,119


In [15]:
sum(dist_df['cne_count'])

532380

In [16]:
dist_df.to_csv("cne_count_vs_dist_paired.tsv", sep="\t", index=False)

### Same, but only keep CNEs post-convergent filtering step

In [17]:
filtered_cnes = []
with open (cluster_file,'r') as csv_file:
    reader =csv.reader(csv_file)
    for row in reader:
        cluster_id=row[0]
        cnes = row[1:]
        filtered_cnes = filtered_cnes + cnes
filtered_cnes

['adig_cne_21',
 'adig_cne_26279',
 'adig_cne_57',
 'aten_cne_3077',
 'aten_cne_4904',
 'aten_cne_6242',
 'aten_cne_6390',
 'aten_cne_7490',
 'aten_cne_7492',
 'aten_cne_7627',
 'aten_cne_7631',
 'aten_cne_8798',
 'dgig_cne_4494',
 'dgig_cne_4495',
 'dgig_cne_5400',
 'dgig_cne_5401',
 'dgig_cne_6050',
 'epal_cne_2173',
 'epal_cne_2174',
 'epal_cne_303',
 'epal_cne_6751',
 'ofav_cne_24625',
 'ofav_cne_24626',
 'ofav_cne_24627',
 'ofav_cne_24628',
 'ofav_cne_24629',
 'ofav_cne_24630',
 'ofav_cne_24631',
 'ofav_cne_24644',
 'ofav_cne_24645',
 'ofav_cne_47939',
 'pdam_cne_18340',
 'pdam_cne_28954',
 'pdam_cne_62276',
 'pdam_cne_62277',
 'pdam_cne_62278',
 'pdam_cne_62280',
 'pdam_cne_62281',
 'spis_cne_13146',
 'spis_cne_136216',
 'spis_cne_136217',
 'spis_cne_136218',
 'spis_cne_154140',
 'spis_cne_18236',
 'spis_cne_2361',
 'spis_cne_63969',
 'spis_cne_85117',
 'spis_cne_85120',
 'spis_cne_85121',
 'spis_cne_85124',
 'spis_cne_85126',
 'spis_cne_85127',
 'adig_cne_13312',
 'adig_cne_1700

In [18]:
link_counts = defaultdict(lambda:0)
for link in pairwise_links:
    #print(link)
    cne1, cne2 = link[0], link[1]
    if cne1 in filtered_cnes and cne2 in filtered_cnes:
        sp1, sp2 = cne1[:4], cne2[:4]
        sp_tup = (sp1, sp2)
        if (sp2, sp1) in link_counts:
            print('inverse tup: ', (sp2, sp1), 'is already here' )
            link_counts[(sp2, sp1)] +=1
        else:
            link_counts[sp_tup] +=1
# Add 0 for species pairs not seen:
for comb in sp_combs:
    sp1, sp2 = comb[0], comb[1]
#    if (comb in link_counts) and (sp2, sp1):
    if comb not in link_counts and (sp2, sp1) not in link_counts:
        link_counts[comb] = 0
print(len(link_counts))
print(len(sp_combs))

78
78


In [19]:
dist_df = pd.DataFrame(columns=['sp1', 'sp2', 'node', 'distance', 'cne_count'])
for sp_tup, count in link_counts.items():
    sp1, sp2 = sp_tup[0], sp_tup[1]
    lca = nodes_df.loc[sp1, sp2]
    evol_dist = div_times[lca]
    if evol_dist != 'unknown':
        new_row = [sp1, sp2, lca, evol_dist, count]
        dist_df.loc[len(dist_df)] = new_row

In [20]:
dist_df

Unnamed: 0,sp1,sp2,node,distance,cne_count
0,adig,pdam,scleractinia,161,26348
1,dgig,pdam,anthozoa,497,941
2,hsym,spis,cnidaria,570,357
3,mvir,hvul,medusozoa,539,123
4,mvir,ofav,cnidaria,570,54
...,...,...,...,...,...
72,hvul,aaur,medusozoa,539,112
73,mvir,spis,cnidaria,570,136
74,adig,mvir,cnidaria,570,120
75,pdam,chem,cnidaria,570,101


In [21]:
sum(dist_df['cne_count'])

492293

In [22]:
dist_df.to_csv("cne_count_vs_dist_paired_filtered.tsv", sep="\t", index=False)