### Goal

- Calculate stats about CNEs before and after filtering.
- Generate a filtered CNE dictionary for downstream analyses.

### Input

- cne_dict: path to the JSON dictionary of CNEs created by generate_cne_ids.py (unique_non_overlap_cnes.txt)
- filter_file: path to the list of CNEs to filter out. Two-column TSV file without a header row. First column: CNE id. Second column: filter type ('blastn' or 'ORF').

### Output

- Table of stats: filtered_stats.tsv
- Filtered CNE dictionary (JSON): filtered_cne_dict.txt

In [1]:
import pandas as pd
from os import listdir
from os.path import isfile, join
import os
import csv
import statistics
import json
import sys

#### Input paths

In [2]:
# Locations of the two inputs, relative to this notebook.
filter_file = 'combined_filter.tsv'  # CNE ids to drop
cne_dict = '../generate_cne_ids/unique_non_overlap_cnes.txt'  # CNE dictionary file

#### Open dictionary of non_overlapping cnes created with generate_cne_ids.py

In [3]:
# Parse the JSON file at `cne_dict` into a nested dictionary.
print("reading dictionary of CNEs")
with open(cne_dict) as json_file:
    all_species_cne_dict = json.loads(json_file.read())

reading dictionary of CNEs


#### Create stats data frame

In [4]:
# Empty stats table; rows are added once the CNE dictionary has been summarised.
stats_columns = ["species", "total_cne_count", "mean_cne_length", "filtered_cne_length"]
stats_df = pd.DataFrame(columns=stats_columns)

#### Calculate number of CNEs and CNE length

In [5]:
print("Calculating stats from CNE dict")
# Per-species pre-filtering stats: number of CNEs and mean CNE length.
# The loop variable is deliberately not named `cne_dict`, so the input path
# stored under that name survives and the loading cell can be re-run.
for species, species_cnes in all_species_cne_dict.items():
    # generate_cne_ids.py adds a [0, 0] placeholder as the first CNE; leave it
    # out so it is neither counted nor averaged.
    cne_lengths = [coords[1] - coords[0]
                   for coords in species_cnes.values()
                   if coords != [0, 0]]
    if cne_lengths:  # Sometimes, species don't have CNEs
        stats_df.loc[species, 'mean_cne_length'] = round(statistics.mean(cne_lengths))
    # Count the entries actually measured instead of assuming exactly one
    # placeholder is present (an empty species dict would otherwise give -1).
    stats_df.loc[species, 'total_cne_count'] = len(cne_lengths)
stats_df['species'] = stats_df.index
stats_df = stats_df.sort_values('species')

Calculating stats from CNE dict


#### Read list of CNEs to filter out

In [6]:
# Load the tab-separated filter list, supplying the two column names explicitly.
print("Reading file of CNEs to filter out")
filter_columns = ['cne_id', 'filter_type']
filtered_cnes = pd.read_csv(filter_file, sep="\t", names=filter_columns)

Reading file of CNEs to filter out


#### Calculate filtering stats

In [7]:
print("Gathering filtered CNE stats")
# Add species column to filtered_cnes dataframe (the id prefix before the first "_")
filtered_cnes['species'] = filtered_cnes['cne_id'].str.split("_").str[0]
# Count number of filtered cnes for each species. A CNE that appears on several
# rows (e.g. once per filter type) is removed only once, so count it only once.
filtered_counts = filtered_cnes.drop_duplicates(subset='cne_id')['species'].value_counts()
filtered_counts_df = pd.DataFrame({'species': filtered_counts.index, 'filtered_cne_count': filtered_counts.values})
# Add filtered counts to stats data frame.
# A left merge keeps species that had nothing filtered (the default inner merge
# would drop them from the table); their count is set to 0.
# Dropping any existing count column first makes the cell safe to re-run.
stats_df = (
    stats_df
    .drop(columns='filtered_cne_count', errors='ignore')
    .merge(filtered_counts_df, on='species', how='left')
)
stats_df['filtered_cne_count'] = stats_df['filtered_cne_count'].fillna(0).astype(int)
# Make the totals numeric so a later percentage calculation yields NaN/inf for
# species with zero CNEs instead of raising ZeroDivisionError on object values.
stats_df['total_cne_count'] = pd.to_numeric(stats_df['total_cne_count'])

Gathering filtered CNE stats


#### Make new CNE dictionary with filtered CNEs removed

In [29]:
print("Creating dictionary of fitered CNEs")
filtered_cne_ids = list(filtered_cnes['cne_id'])
# Set copy of the ids: membership tests against the list would scan it once
# per CNE, which is quadratic overall.
filtered_cne_id_set = set(filtered_cne_ids)
filtered_cne_dict = {}
# The loop variable is not named `cne_dict`, so the input path stored under
# that name is not overwritten.
for species, species_cnes in all_species_cne_dict.items():
    # Keep every CNE that is not on the filter list; also leave out the
    # [0, 0] placeholder entry.
    filtered_cne_dict[species] = {
        cne_id: coords
        for cne_id, coords in species_cnes.items()
        if cne_id not in filtered_cne_id_set and coords != [0, 0]
    }

Creating dictionary of fitered CNEs


#### Calculate post-filtering stats

In [30]:
# Mean CNE length per species after filtering.
# The loop variable is not named `cne_dict`, so the input path stored under
# that name is not overwritten.
for species, species_cnes in filtered_cne_dict.items():
    filt_cne_lengths = [coords[1] - coords[0] for coords in species_cnes.values()]
    if filt_cne_lengths:  # species with no remaining CNEs keep a missing value
        mean_cne_length = round(statistics.mean(filt_cne_lengths))
        stats_df.loc[stats_df['species'] == species, 'filtered_cne_length'] = mean_cne_length
stats_df['post_filter_cne_count'] = stats_df['total_cne_count'] - stats_df['filtered_cne_count']
# Divide by a numeric copy of the totals: on an object-dtype column a species
# with zero CNEs would raise ZeroDivisionError; numeric division gives NaN/inf.
total_cne_counts = pd.to_numeric(stats_df['total_cne_count'])
stats_df['percent_filtered'] = 100 * stats_df['filtered_cne_count'] / total_cne_counts
stats_df['percent_filtered'] = stats_df['percent_filtered'].astype(float).round(2)

#### Write stats table to file

In [31]:
# Save the per-species stats table as a tab-separated file.
print("Writing stats to file: filtered_stats.tsv")
# `index` is documented as a bool; pass False explicitly instead of relying on
# None being falsy. The row index is not written either way.
stats_df.to_csv("filtered_stats.tsv", sep="\t", index=False)

Writing stats to file: filtered_stats.tsv


#### Write filtered dictionary to file

In [32]:
# Serialise the filtered dictionary as JSON text.
print("Writing dictionary of filtered CNEs to: filtered_cne_dict.txt")
with open('filtered_cne_dict.txt', 'w') as file:
    json.dump(filtered_cne_dict, file)  # use 'json.load' to do the reverse

Writing dictionary of filtered CNEs to: filtered_cne_dict.txt


In [34]:
# Spot-check: display the filtered CNE entries stored under the 'adig' key.
filtered_cne_dict['adig']

{'adig_cne_1': [7653, 7740],
 'adig_cne_2': [7858, 8030],
 'adig_cne_3': [8899, 9127],
 'adig_cne_4': [18713, 18892],
 'adig_cne_5': [20774, 20982],
 'adig_cne_6': [21631, 21820],
 'adig_cne_7': [23121, 23208],
 'adig_cne_8': [26970, 27149],
 'adig_cne_9': [28027, 28139],
 'adig_cne_12': [71436, 71490],
 'adig_cne_13': [117194, 117366],
 'adig_cne_15': [117756, 117990],
 'adig_cne_16': [166949, 167003],
 'adig_cne_17': [203758, 203851],
 'adig_cne_18': [220246, 220328],
 'adig_cne_19': [225023, 225076],
 'adig_cne_20': [290718, 290778],
 'adig_cne_21': [302664, 302832],
 'adig_cne_22': [307383, 307548],
 'adig_cne_23': [333067, 333126],
 'adig_cne_24': [356708, 356926],
 'adig_cne_25': [365164, 365485],
 'adig_cne_26': [380200, 380420],
 'adig_cne_27': [405222, 405272],
 'adig_cne_28': [415695, 415750],
 'adig_cne_29': [417430, 417709],
 'adig_cne_30': [426308, 426589],
 'adig_cne_31': [426618, 426728],
 'adig_cne_32': [441507, 441631],
 'adig_cne_33': [456263, 456345],
 'adig_cne_34':