

### Goal

Calculate the lengths of all CNEs and write them to a separate file for each pairwise species comparison.

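### Input

- Pairwise CNE output files (`<species1>_vs_<species2>.out`, tab-separated)
- Filtered CNE coordinates for each species (`<species>_coords_filtered.tsv`)
- Matrix naming the common ancestor of every species pair (`cnidaria_evol_dist_matrix_no_space.csv`)

### Output

- One tab-separated file per species pair (`<species1>_vs_<species2>_cne_lengths.tsv`) with the columns `distance (Mya)` and `length`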

#### Note: modified from code written by summer student Quinn Hauck

In [1]:
import pandas as pd
from Bio import SeqIO
import gzip
from Bio.Seq import Seq
from Bio import pairwise2
from statistics import mean
import os



In [2]:
# Directory holding the raw pairwise CNE output files (e.g. aaur_vs_adig.out).
paired_cne_dir = "../../cnidaria_final/cnefinder_output/stranded/"
# Directory holding the filtered CNE coordinate files.
cne_coord_dir = "../post_parsimony_filtering/"
# CSV matrix with the species on both axes; each cell names the pair's common ancestor.
evolution_matrix_file = "cnidaria_evol_dist_matrix_no_space.csv"
# Directory the per-pair CNE length tables are written to.
cne_lengths_out_dir = "cne_lengths_no_enthemonae/"

In [3]:
# Species codes to compare; every ordered pair of distinct codes is considered.
species = [
    "spis",
    "hsym",
    "aaur",
    "dgig",
    "chem",
    "ofav",
    "aten",
    "mvir",
    "hvul",
    "adig",
    "pdam",
    "nvec",
    "epal",
]

In [4]:
# Load the common-ancestor matrix; the first CSV column becomes the row index,
# so a cell can be looked up by (row species, column species).
evo_matrix = pd.read_csv(evolution_matrix_file, index_col=0)
# Bare expression so the notebook renders the table as the cell output.
evo_matrix

Unnamed: 0,aaur,adig,amil,aten,chem,dgig,epal,hsym,hvul,mvir,nvec,ofav,pdam,spis
aaur,-,cnidaria,cnidaria,cnidaria,medusozoa,cnidaria,cnidaria,medusozoa,medusozoa,acraspeda,cnidaria,cnidaria,cnidaria,cnidaria
adig,cnidaria,-,acropora,hexacorallia,cnidaria,anthozoa,hexacorallia,cnidaria,cnidaria,cnidaria,hexacorallia,scleractinia,scleractinia,scleractinia
amil,cnidaria,acropora,-,hexacorallia,cnidaria,anthozoa,hexacorallia,cnidaria,cnidaria,cnidaria,hexacorallia,scleractinia,scleractinia,scleractinia
aten,cnidaria,hexacorallia,hexacorallia,-,cnidaria,anthozoa,enthemonae,cnidaria,cnidaria,cnidaria,actiniaria,hexacorallia,hexacorallia,hexacorallia
chem,medusozoa,cnidaria,cnidaria,cnidaria,-,cnidaria,cnidaria,leptothecata,hydrozoa,medusozoa,cnidaria,cnidaria,cnidaria,cnidaria
dgig,cnidaria,anthozoa,anthozoa,anthozoa,cnidaria,-,anthozoa,cnidaria,cnidaria,cnidaria,anthozoa,anthozoa,anthozoa,anthozoa
epal,cnidaria,hexacorallia,hexacorallia,enthemonae,cnidaria,anthozoa,-,cnidaria,cnidaria,cnidaria,actiniaria,hexacorallia,hexacorallia,hexacorallia
hsym,medusozoa,cnidaria,cnidaria,cnidaria,leptothecata,cnidaria,cnidaria,-,hydrozoa,medusozoa,cnidaria,cnidaria,cnidaria,cnidaria
hvul,medusozoa,cnidaria,cnidaria,cnidaria,hydrozoa,cnidaria,cnidaria,hydrozoa,-,medusozoa,cnidaria,cnidaria,cnidaria,cnidaria
mvir,acraspeda,cnidaria,cnidaria,cnidaria,medusozoa,cnidaria,cnidaria,medusozoa,medusozoa,-,cnidaria,cnidaria,cnidaria,cnidaria


In [5]:
# Age of each ancestral clade, using Sally's estimates.
# The values end up in the 'distance (Mya)' output column. A clade whose age
# is not known carries the string 'unknown', and pairs resolving to it are
# skipped by the length calculation.
evolution_ages = {
    "cnidaria": 570,
    "medusozoa": 539,
    "acraspeda": 466,
    "hexacorallia": 341,
    "anthozoa": 497,
    "scleractinia": 161,
    "enthemonae": "unknown",
    "actiniaria": 242,
    "leptothecata": 417,
    "hydrozoa": 500,
    "robusta": 108,
    "pocilloporidae": 56,
}

In [6]:
from bisect import bisect_right
from itertools import accumulate


def _build_interval_index(coords):
    """Prepare filtered intervals for fast containment queries.

    Args:
        coords: DataFrame with ``start`` and ``end`` columns, one filtered
            CNE interval per row.

    Returns:
        A ``(starts, max_ends)`` tuple of equally long lists: the interval
        starts in ascending order and, aligned with them, the largest end
        seen among all intervals up to and including that position.
    """
    # Keep every (start, end) pair. A start -> end dict would silently drop
    # all but the last interval whenever two intervals share a start.
    intervals = sorted(zip(coords['start'].tolist(), coords['end'].tolist()))
    starts = [start for start, _ in intervals]
    max_ends = list(accumulate((end for _, end in intervals), max))
    return starts, max_ends


def _is_contained(seq_start, seq_end, starts, max_ends):
    """Return True if [seq_start, seq_end] lies inside any indexed interval.

    Args:
        seq_start: Start coordinate of the raw CNE.
        seq_end: End coordinate of the raw CNE.
        starts: Sorted interval starts from ``_build_interval_index``.
        max_ends: Running maximum of interval ends, aligned with ``starts``.

    Returns:
        True when some interval has ``start <= seq_start`` and
        ``seq_end <= end``; False otherwise (including for an empty index).
    """
    # Only intervals starting at or before seq_start can contain the CNE;
    # one of them does exactly when the largest end among them reaches seq_end.
    candidates = bisect_right(starts, seq_start)
    return candidates > 0 and seq_end <= max_ends[candidates - 1]


# Create the output directory up front so to_csv does not fail on a fresh checkout.
if cne_lengths_out_dir:
    os.makedirs(cne_lengths_out_dir, exist_ok=True)

for species1 in species:
    for species2 in species:
        # Skip self-comparisons, and skip any orientation of a pair for which
        # no '<species1>_vs_<species2>.out' file exists.
        if species1 == species2:
            continue
        pair_name = species1 + '_vs_' + species2
        paired_cne_file = os.path.join(paired_cne_dir, pair_name + '.out')
        if not os.path.exists(paired_cne_file):
            continue

        print('calculating distance and loading files between ' + species1 + ' and ' + species2)
        # Look up the pair's common ancestor and that ancestor's age.
        last_known_ancestor = evo_matrix.loc[species1, species2]
        evolutionary_distance = evolution_ages[last_known_ancestor]
        if evolutionary_distance == 'unknown':
            print("Ancestor age unknown, skip.")
            continue

        # All CNEs found between the two species, with their original lengths.
        full_cne_set = pd.read_csv(paired_cne_file, sep='\t')
        # Filtered CNE coordinates for each species (identifier column is not used here).
        coord_columns = ['identifier', 'start', 'end']
        species1_filtered_coords = pd.read_csv(
            os.path.join(cne_coord_dir, species1 + '_coords_filtered.tsv'),
            sep='\t', names=coord_columns)
        species2_filtered_coords = pd.read_csv(
            os.path.join(cne_coord_dir, species2 + '_coords_filtered.tsv'),
            sep='\t', names=coord_columns)

        species1_index = _build_interval_index(species1_filtered_coords)
        species2_index = _build_interval_index(species2_filtered_coords)

        print('looping through each cne found between the two species')
        # Lengths come from the raw output, not the filtered coordinates, so a
        # raw CNE counts only if it lies inside a filtered interval in BOTH
        # species; the shorter of its two lengths is then recorded.
        cne_rows = zip(
            full_cne_set['ref_start'], full_cne_set['ref_end'],
            full_cne_set['query_start'], full_cne_set['query_end'],
            full_cne_set['ref_length'], full_cne_set['query_length'],
        )
        cne_lengths = [
            min(ref_length, query_length)
            for ref_start, ref_end, query_start, query_end, ref_length, query_length in cne_rows
            if _is_contained(ref_start, ref_end, *species1_index)
            and _is_contained(query_start, query_end, *species2_index)
        ]
        # Every CNE of a pair shares the same evolutionary distance.
        evolutionary_distances = [evolutionary_distance] * len(cne_lengths)

        print('lengths pulled, writing to file')

        # Store lengths and evolutionary distances in a dataframe and write it out.
        data = {'distance (Mya)': evolutionary_distances, 'length': cne_lengths}
        cne_length_df = pd.DataFrame(data)
        cne_length_df.to_csv(
            os.path.join(cne_lengths_out_dir, pair_name + '_cne_lengths.tsv'),
            sep='\t', index=False)

calculating distance and loading files between spis and aaur
looping through each cne found between the two species
lengths pulled, writing to file
calculating distance and loading files between hsym and spis
looping through each cne found between the two species
lengths pulled, writing to file
calculating distance and loading files between hsym and aaur
looping through each cne found between the two species
lengths pulled, writing to file
calculating distance and loading files between hsym and dgig
looping through each cne found between the two species
lengths pulled, writing to file
calculating distance and loading files between hsym and chem
looping through each cne found between the two species
lengths pulled, writing to file
calculating distance and loading files between hsym and ofav
looping through each cne found between the two species
lengths pulled, writing to file
calculating distance and loading files between hsym and aten
looping through each cne found between the two spec

lengths pulled, writing to file
calculating distance and loading files between adig and dgig
looping through each cne found between the two species
lengths pulled, writing to file
calculating distance and loading files between adig and chem
looping through each cne found between the two species
lengths pulled, writing to file
calculating distance and loading files between adig and ofav
looping through each cne found between the two species
lengths pulled, writing to file
calculating distance and loading files between adig and mvir
looping through each cne found between the two species
lengths pulled, writing to file
calculating distance and loading files between adig and hvul
looping through each cne found between the two species
lengths pulled, writing to file
calculating distance and loading files between adig and pdam
looping through each cne found between the two species
lengths pulled, writing to file
calculating distance and loading files between adig and nvec
looping through eac