In [37]:
import pandas as pd
import glob
import seaborn as sns
%matplotlib inline
from collections import defaultdict

#### BLASTn results

In [5]:
blastn_dir = "../../blast_metazoa/blastn_output_files/"

In [6]:
blastn_files = glob.glob(blastn_dir + "*blastn")
blastn_files

['../../blast_metazoa/blastn_output_files/mollusca_vs_nematoda.blastn',
 '../../blast_metazoa/blastn_output_files/cnidaria_vs_priapulida.blastn',
 '../../blast_metazoa/blastn_output_files/cnidaria_vs_arthropoda.blastn',
 '../../blast_metazoa/blastn_output_files/arthropoda_vs_annelida.blastn',
 '../../blast_metazoa/blastn_output_files/cnidaria_vs_mollusca.blastn',
 '../../blast_metazoa/blastn_output_files/arthropoda_vs_nemertea.blastn',
 '../../blast_metazoa/blastn_output_files/arthropoda_vs_hemichordata.blastn',
 '../../blast_metazoa/blastn_output_files/arthropoda_vs_rotifera.blastn',
 '../../blast_metazoa/blastn_output_files/arthropoda_vs_orthonectida.blastn',
 '../../blast_metazoa/blastn_output_files/mollusca_vs_platyhelminthes.blastn',
 '../../blast_metazoa/blastn_output_files/mollusca_vs_chordata.blastn',
 '../../blast_metazoa/blastn_output_files/mollusca_vs_tardigrada.blastn',
 '../../blast_metazoa/blastn_output_files/cnidaria_vs_onychophora.blastn',
 '../../blast_metazoa/blastn_o

In [35]:
coord_dirs = ['../retrieve_original_coords/filtered_coords/',
              '../../arthropoda_06_22/retrieve_original_coords/filtered_coords/',
              '../../mollusca_06_22/retrieve_original_coords/filtered_coords/']

In [68]:
phyla = ['cnidaria', 'arthropoda', 'mollusca']
cne_count_df = pd.DataFrame(columns=['phylum', 'species', 'cne_count'])
for i in range(3):
    sp_count_dict = {}
    phylum = phyla[i]
    print(phylum)
    coord_dir = coord_dirs[i]
    coord_files = glob.glob(coord_dir + '*orig_coords.tsv')
    #print(coord_files)
    for coord_file in coord_files:
        species = coord_file.split("/")[-1].split("_")[0]
        #print(species)
        coord_df = pd.read_csv(coord_file, sep="\t")
        num_cne = len(coord_df)
        sp_count_dict[species] = num_cne
    sp_count_df = pd.DataFrame(sp_count_dict.items(), columns=['species', 'cne_count'])
    sp_count_df['phylum'] = phylum
    cne_count_df = pd.concat([cne_count_df, sp_count_df], axis=0)

cnidaria
arthropoda
mollusca


In [69]:
cne_count_df

Unnamed: 0,phylum,species,cne_count
0,cnidaria,spis,110941
1,cnidaria,hsym,6532
2,cnidaria,aaur,2049
3,cnidaria,dgig,3644
4,cnidaria,chem,2352
5,cnidaria,ofav,43953
6,cnidaria,aten,5576
7,cnidaria,mvir,2730
8,cnidaria,hvul,2540
9,cnidaria,adig,50811


In [57]:
sp_count_dict

{'aaeg': 877,
 'aamp': 1371,
 'aaur': 2049,
 'acal': 4553,
 'adig': 50811,
 'agra': 1633,
 'ahyp': 1078,
 'alic': 1999,
 'amel': 19700,
 'anas': 4661,
 'apis': 806,
 'aten': 5576,
 'bgla': 3217,
 'cdip': 93,
 'cfel': 4575,
 'cgig': 24676,
 'chem': 2352,
 'cscu': 18140,
 'csec': 2383,
 'cvir': 32653,
 'dgig': 3644,
 'dmel': 101,
 'eaff': 1289,
 'echl': 3559,
 'epal': 5313,
 'gocc': 106,
 'hazt': 1497,
 'hruf': 3468,
 'hsym': 6532,
 'hvul': 2540,
 'lgig': 1387,
 'lpol': 1173,
 'mgal': 2951,
 'mvir': 2730,
 'myes': 96983,
 'nvec': 3824,
 'obim': 40836,
 'obir': 19911,
 'ofav': 43953,
 'pcan': 1164,
 'pdam': 85198,
 'phum': 194,
 'pmax': 91747,
 'ptri': 8881,
 'pvan': 7275,
 'smar': 1557,
 'smim': 25687,
 'spha': 66624,
 'spis': 110941,
 'tcas': 534,
 'tpal': 421}

#### How many CNEs are annotated as non-coding RNAs?

In [90]:
def annotate_rnas(blastn_file):
    file_df = pd.DataFrame()
    blastn_df = pd.read_csv(blastn_file, sep="\t",
                       names = ['cne_id', 'description', 'sseqid', 'pident', 'length', 'mismatch', 'gapopen',
                        'qstart', 'qend', 'sstart', 'send', 'evalue', 'bitscore'])
    all_cne_ids = list(set(blastn_df['cne_id']))
    print(len(all_cne_ids))
    all_rna_cnes = list(set(blastn_df[blastn_df['description'].str.contains('RNA')]['cne_id']))
    snRNA_cnes = list(set(blastn_df[blastn_df['description'].str.contains('snRNA')]['cne_id']))
    ncRNA_cnes = list(set(blastn_df[blastn_df['description'].str.contains('ncRNA')]['cne_id']))
    #non_RNA_cnes = list(set(blastn_df[~blastn_df['description'].str.contains('RNA')]['cne_id']))
    all_cne_df = pd.DataFrame({'cne_id': all_cne_ids})
    all_rna_cne_df = pd.DataFrame({'cne_id': all_rna_cnes, 'RNA':'yes'})
    snRNA_cne_df = pd.DataFrame({'cne_id': snRNA_cnes, 'snRNA':'yes'})
    ncRNA_cne_df = pd.DataFrame({'cne_id': ncRNA_cnes, 'ncRNA':'yes'})
    file_df = all_cne_df.merge(all_rna_cne_df, how='left', ).merge(snRNA_cne_df, how='left')\
                .merge(ncRNA_cne_df, how='left').fillna('no')
    return(file_df)

In [94]:
moll_chord = annotate_rnas( '../../blast_metazoa/blastn_output_files/mollusca_vs_chordata.blastn')

13270


In [95]:
moll_chord

Unnamed: 0,cne_id,RNA,snRNA,ncRNA
0,spha_cne_33204,yes,no,no
1,spha_cne_80016,no,no,no
2,pmax_cne_89217,yes,no,no
3,spha_cne_51652,yes,no,no
4,pmax_cne_17241,no,no,no
...,...,...,...,...
13265,acal_cne_6903,no,no,no
13266,spha_cne_84942,no,no,no
13267,spha_cne_54755,no,no,no
13268,pcan_cne_4150,yes,no,no


In [96]:
moll_chord[moll_chord['RNA'] == 'yes']

Unnamed: 0,cne_id,RNA,snRNA,ncRNA
0,spha_cne_33204,yes,no,no
2,pmax_cne_89217,yes,no,no
3,spha_cne_51652,yes,no,no
6,spha_cne_66343,yes,no,yes
9,obim_cne_6821,yes,no,yes
...,...,...,...,...
13257,spha_cne_26123,yes,no,yes
13259,obim_cne_17029,yes,no,yes
13261,obim_cne_52837,yes,no,yes
13264,spha_cne_46828,yes,no,no


In [98]:
moll_chord[moll_chord['ncRNA'] == 'yes']

Unnamed: 0,cne_id,RNA,snRNA,ncRNA
6,spha_cne_66343,yes,no,yes
9,obim_cne_6821,yes,no,yes
10,agra_cne_974,yes,no,yes
13,spha_cne_41768,yes,no,yes
15,spha_cne_68670,yes,no,yes
...,...,...,...,...
13251,lgig_cne_1756,yes,yes,yes
13254,pmax_cne_18466,yes,no,yes
13257,spha_cne_26123,yes,no,yes
13259,obim_cne_17029,yes,no,yes


In [99]:
moll_chord[moll_chord['snRNA'] == 'yes']

Unnamed: 0,cne_id,RNA,snRNA,ncRNA
121,pcan_cne_320,yes,yes,yes
219,obim_cne_16134,yes,yes,yes
611,bgla_cne_10474,yes,yes,yes
990,echl_cne_6602,yes,yes,yes
1218,obim_cne_18903,yes,yes,yes
...,...,...,...,...
12682,bgla_cne_7846,yes,yes,yes
12825,lgig_cne_3517,yes,yes,yes
13143,acal_cne_5564,yes,yes,yes
13165,obim_cne_9326,yes,yes,yes


#### Organize BLAST results

In [14]:
blastn_file = blastn_files[0]
file_df = pd.DataFrame()
blastn_df = pd.read_csv(blastn_file, sep="\t",
                   names = ['cne_id', 'description', 'sseqid', 'pident', 'length', 'mismatch', 'gapopen',
                    'qstart', 'qend', 'sstart', 'send', 'evalue', 'bitscore'])
blastn_df

Unnamed: 0,cne_id,description,sseqid,pident,length,mismatch,gapopen,qstart,qend,sstart,send,evalue,bitscore
0,acal_cne_1263,Trichinella spiralis isolate ISS534 chromosome 2,gi|1755770147|gb|CP032377.1|,82.857,105,14,4,57,159,10501195,10501093,1.390000e-16,91.6
1,acal_cne_1263,Trichinella spiralis isolate TY2 chromosome 2,gi|1755770144|gb|CP032374.1|,82.857,105,14,4,57,159,10503647,10503545,1.390000e-16,91.6
2,acal_cne_1263,Trichinella spiralis isolate Shisler1 chromoso...,gi|1755770141|gb|CP032371.1|,82.857,105,14,4,57,159,10481868,10481766,1.390000e-16,91.6
3,acal_cne_1263,Caenorhabditis briggsae isolate VX34 chromosome I,gi|2202840552|gb|CP092620.1|,94.340,53,2,1,72,124,3830875,3830926,3.020000e-13,80.5
4,acal_cne_1263,Caenorhabditis briggsae isolate VX34 chromosome I,gi|2202840552|gb|CP092620.1|,92.453,53,3,1,72,124,3824164,3824113,1.400000e-11,75.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
294074,spha_cne_93162,Rhabditophanes sp. KR3021 genome assembly Rhab...,gi|687916023|emb|LK995555.1|,95.000,40,2,0,67,106,496068,496107,2.140000e-08,63.9
294075,spha_cne_93162,Rhabditophanes sp. KR3021 genome assembly Rhab...,gi|687915983|emb|LK995699.1|,95.000,40,2,0,67,106,32255,32294,2.140000e-08,63.9
294076,spha_cne_93162,"Oscheius dolichura genome assembly, chromosome: V",gi|2211271299|emb|OW051496.1|,97.222,36,1,0,76,111,8896691,8896726,7.690000e-08,62.1
294077,spha_cne_93162,"Oscheius onirici genome assembly, chromosome: III",gi|2211271102|emb|OW051470.1|,97.222,36,1,0,78,113,9054206,9054171,7.690000e-08,62.1


In [17]:
summary_df = pd.DataFrame()
for file in blastn_files:
    query_phylum = file.split('/')[-1].split("_vs_")[0]
    target_phylum = file.split('/')[-1].split("_vs_")[1].split(".")[0]
    print("Query phylum:", query_phylum, "Target phylum:", target_phylum)
    file_df = pd.DataFrame()
    blastn_df = pd.read_csv(file, sep="\t",
                   names = ['cne_id', 'description', 'sseqid', 'pident', 'length', 'mismatch', 'gapopen',
                    'qstart', 'qend', 'sstart', 'send', 'evalue', 'bitscore'])
    #ll_cne_ids = list(set(blastn_df['cne_id']))
    all_cne_df = blastn_df[['cne_id', 'description', 'pident', 'length', 'evalue']]
    all_cne_df['query_phylum'] = query_phylum
    all_cne_df['target_phylum'] = target_phylum
    #cluster_df = cluster_dfs[query_phylum]
    #rna_df = rna_df.merge(cluster_df, how='left')
    #rna_df['cluster_id'] = query_phylum + "_" + rna_df['cluster_id'] 
    summary_df = pd.concat([summary_df, all_cne_df], axis=0)
summary_df

Query phylum: mollusca Target phylum: nematoda


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  del sys.path[0]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Query phylum: cnidaria Target phylum: priapulida
Query phylum: cnidaria Target phylum: arthropoda
Query phylum: arthropoda Target phylum: annelida
Query phylum: cnidaria Target phylum: mollusca
Query phylum: arthropoda Target phylum: nemertea
Query phylum: arthropoda Target phylum: hemichordata
Query phylum: arthropoda Target phylum: rotifera
Query phylum: arthropoda Target phylum: orthonectida
Query phylum: mollusca Target phylum: platyhelminthes
Query phylum: mollusca Target phylum: chordata
Query phylum: mollusca Target phylum: tardigrada
Query phylum: cnidaria Target phylum: onychophora
Query phylum: arthropoda Target phylum: onychophora
Query phylum: arthropoda Target phylum: nematoda
Query phylum: mollusca Target phylum: ctenophora
Query phylum: cnidaria Target phylum: bryozoa
Query phylum: arthropoda Target phylum: xenacoelomorpha
Query phylum: cnidaria Target phylum: porifera
Query phylum: cnidaria Target phylum: echinodermata
Query phylum: arthropoda Target phylum: phoronida
Q

Unnamed: 0,cne_id,description,pident,length,evalue,query_phylum,target_phylum
0,acal_cne_1263,Trichinella spiralis isolate ISS534 chromosome 2,82.857,105,1.390000e-16,mollusca,nematoda
1,acal_cne_1263,Trichinella spiralis isolate TY2 chromosome 2,82.857,105,1.390000e-16,mollusca,nematoda
2,acal_cne_1263,Trichinella spiralis isolate Shisler1 chromoso...,82.857,105,1.390000e-16,mollusca,nematoda
3,acal_cne_1263,Caenorhabditis briggsae isolate VX34 chromosome I,94.340,53,3.020000e-13,mollusca,nematoda
4,acal_cne_1263,Caenorhabditis briggsae isolate VX34 chromosome I,92.453,53,1.400000e-11,mollusca,nematoda
...,...,...,...,...,...,...,...
0,hsym_cne_10568,"Southwellina hispida mitochondrion, complete g...",72.606,449,2.510000e-32,cnidaria,acanthocephala
1,hsym_cne_10568,Leptorhynchoides thecatus isolate Withbass2 cy...,73.294,337,3.270000e-26,cnidaria,acanthocephala
2,hsym_cne_10568,Pomphorhynchus tereticollis isolate Ouche mito...,81.295,139,4.230000e-25,cnidaria,acanthocephala
3,hsym_cne_10568,Pomphorhynchus tereticollis isolate Straslund ...,80.576,139,1.970000e-23,cnidaria,acanthocephala


In [18]:
summary_df

Unnamed: 0,cne_id,description,pident,length,evalue,query_phylum,target_phylum
0,acal_cne_1263,Trichinella spiralis isolate ISS534 chromosome 2,82.857,105,1.390000e-16,mollusca,nematoda
1,acal_cne_1263,Trichinella spiralis isolate TY2 chromosome 2,82.857,105,1.390000e-16,mollusca,nematoda
2,acal_cne_1263,Trichinella spiralis isolate Shisler1 chromoso...,82.857,105,1.390000e-16,mollusca,nematoda
3,acal_cne_1263,Caenorhabditis briggsae isolate VX34 chromosome I,94.340,53,3.020000e-13,mollusca,nematoda
4,acal_cne_1263,Caenorhabditis briggsae isolate VX34 chromosome I,92.453,53,1.400000e-11,mollusca,nematoda
...,...,...,...,...,...,...,...
0,hsym_cne_10568,"Southwellina hispida mitochondrion, complete g...",72.606,449,2.510000e-32,cnidaria,acanthocephala
1,hsym_cne_10568,Leptorhynchoides thecatus isolate Withbass2 cy...,73.294,337,3.270000e-26,cnidaria,acanthocephala
2,hsym_cne_10568,Pomphorhynchus tereticollis isolate Ouche mito...,81.295,139,4.230000e-25,cnidaria,acanthocephala
3,hsym_cne_10568,Pomphorhynchus tereticollis isolate Straslund ...,80.576,139,1.970000e-23,cnidaria,acanthocephala


#### Only keep BLAST hits with >75% similarity over >50 bp

In [25]:
filtered_df = summary_df.query('pident>75' and 'length> 50')
filtered_df

Unnamed: 0,cne_id,description,pident,length,evalue,query_phylum,target_phylum
0,acal_cne_1263,Trichinella spiralis isolate ISS534 chromosome 2,82.857,105,1.390000e-16,mollusca,nematoda
1,acal_cne_1263,Trichinella spiralis isolate TY2 chromosome 2,82.857,105,1.390000e-16,mollusca,nematoda
2,acal_cne_1263,Trichinella spiralis isolate Shisler1 chromoso...,82.857,105,1.390000e-16,mollusca,nematoda
3,acal_cne_1263,Caenorhabditis briggsae isolate VX34 chromosome I,94.340,53,3.020000e-13,mollusca,nematoda
4,acal_cne_1263,Caenorhabditis briggsae isolate VX34 chromosome I,92.453,53,1.400000e-11,mollusca,nematoda
...,...,...,...,...,...,...,...
0,hsym_cne_10568,"Southwellina hispida mitochondrion, complete g...",72.606,449,2.510000e-32,cnidaria,acanthocephala
1,hsym_cne_10568,Leptorhynchoides thecatus isolate Withbass2 cy...,73.294,337,3.270000e-26,cnidaria,acanthocephala
2,hsym_cne_10568,Pomphorhynchus tereticollis isolate Ouche mito...,81.295,139,4.230000e-25,cnidaria,acanthocephala
3,hsym_cne_10568,Pomphorhynchus tereticollis isolate Straslund ...,80.576,139,1.970000e-23,cnidaria,acanthocephala


### Count how many unique CNEs have a match to each phylum

In [29]:
no_dup_df = filtered_df.drop_duplicates(['cne_id', 'query_phylum', 'target_phylum'])

In [77]:
phylum_count_df = cne_count_df[['phylum', 'cne_count']].groupby('phylum').sum().reset_index().\
    rename({'phylum':'query_phylum', 'cne_count':'phylum_cne_count'}, axis=1)
phylum_count_df

Unnamed: 0,query_phylum,phylum_cne_count
0,arthropoda,124309
1,cnidaria,325463
2,mollusca,375451


In [82]:
blast_hit_count_df = no_dup_df[['cne_id', 'query_phylum', 'target_phylum']].groupby(['query_phylum', 'target_phylum' ]).count().\
    reset_index().rename({'cne_id': 'cne_count'}, axis=1)
blast_hit_count_df = blast_hit_count_df.merge(phylum_count_df)
blast_hit_count_df['pct_cnes'] = 100 *blast_hit_count_df['cne_count'] / blast_hit_count_df['phylum_cne_count']

In [83]:
blast_hit_count_df

Unnamed: 0,query_phylum,target_phylum,cne_count,phylum_cne_count,pct_cnes
0,arthropoda,acanthocephala,19,124309,0.015284
1,arthropoda,annelida,2910,124309,2.340941
2,arthropoda,arthropoda,75380,124309,60.639214
3,arthropoda,brachiopoda,422,124309,0.339477
4,arthropoda,bryozoa,1902,124309,1.530058
...,...,...,...,...,...
63,mollusca,porifera,37,375451,0.009855
64,mollusca,priapulida,101,375451,0.026901
65,mollusca,rotifera,492,375451,0.131042
66,mollusca,tardigrada,6,375451,0.001598


### Sort data frame to arrange heatmap according to phylogeny

In [86]:
sorter = ['ctenophora', 'placozoa', 'porifera', 'cnidaria', 'xenacoelomorpha', 'priapulida', 'nematoda', 
          'arthropoda', 'tardigrada', 'onychophora', 'rotifera', 'acanthocephala', 'mollusca', 'annelida', 'nemertea',
          'dicyemida', 'orthonectida','platyhelminthes', 'phoronida', 'brachiopoda', 'bryozoa', 'echinodermata', 
          'hemichordata', 'chordata']


In [87]:
blast_hit_count_df.target_phylum = blast_hit_count_df.target_phylum.astype("category")
blast_hit_count_df.target_phylum.cat.set_categories(sorter, inplace=True)
blast_hit_count_df = blast_hit_count_df.sort_values(["target_phylum"])
blast_hit_count_df

Unnamed: 0,query_phylum,target_phylum,cne_count,phylum_cne_count,pct_cnes
31,cnidaria,ctenophora,8,325463,0.002458
7,arthropoda,ctenophora,7,124309,0.005631
53,mollusca,ctenophora,8,375451,0.002131
39,cnidaria,placozoa,11,325463,0.003380
61,mollusca,placozoa,6,375451,0.001598
...,...,...,...,...,...
10,arthropoda,hemichordata,70,124309,0.056311
34,cnidaria,hemichordata,25,325463,0.007681
5,arthropoda,chordata,8564,124309,6.889284
29,cnidaria,chordata,1146,325463,0.352114


#### Write to file for plotting in R

In [88]:
blast_hit_count_df.to_csv('blast_hit_counts.tsv', sep="\t", index=False)