### Goal

- Filter convergent CNEs for downstream analyses
- Summarize filering results


### Input

- convergent_clusters.pickle: pickled dictionary of convergent clusters, generated by parsimony_analysis_part2.py
- pre_filtering_clusters.csv: CNE cluster composition. Output of assign_cluster_ids.py
- coord_dir: directory of CNE coordinates (1 file per species) made with generate_cne_ids.py

### Output

- filtered CNE coordinates (1 file per species). Use for all subsequent steps.
- filtered CNE clusters (idem)

In [1]:
import pickle
import csv
import glob
import pandas as pd
from collections import defaultdict

In [2]:
# Path of pickled list of convergent clusters (convergent_clusters.pickle)
convergent_clusters_path = "../parsimony_analysis/convergent_clusters.pickle"

In [3]:
# Cluster path
cluster_path = "../assign_cluster_ids/pre_filtering_clusters.csv"

In [4]:
# Path of coord dir
coord_dir = "../generate_cne_ids/cne_coords/"

In [5]:
coord_files = [f for f in glob.glob(coord_dir + "*coords.tsv")]
print("Found ", len(coord_files), " files in : ", coord_dir)

Found  13  files in :  ../generate_cne_ids/cne_coords/


In [6]:
coord_files

['../generate_cne_ids/cne_coords/aaur_cne_coords.tsv',
 '../generate_cne_ids/cne_coords/nvec_cne_coords.tsv',
 '../generate_cne_ids/cne_coords/dgig_cne_coords.tsv',
 '../generate_cne_ids/cne_coords/pdam_cne_coords.tsv',
 '../generate_cne_ids/cne_coords/hsym_cne_coords.tsv',
 '../generate_cne_ids/cne_coords/aten_cne_coords.tsv',
 '../generate_cne_ids/cne_coords/mvir_cne_coords.tsv',
 '../generate_cne_ids/cne_coords/ofav_cne_coords.tsv',
 '../generate_cne_ids/cne_coords/chem_cne_coords.tsv',
 '../generate_cne_ids/cne_coords/hvul_cne_coords.tsv',
 '../generate_cne_ids/cne_coords/epal_cne_coords.tsv',
 '../generate_cne_ids/cne_coords/adig_cne_coords.tsv',
 '../generate_cne_ids/cne_coords/spis_cne_coords.tsv']

In [7]:
with open(convergent_clusters_path, 'rb') as f:
    convergent_clusters = pickle.load(f)

In [8]:
len(convergent_clusters)

11419

In [9]:
convergent_clusters

['cluster_1009',
 'cluster_1013',
 'cluster_1014',
 'cluster_1018',
 'cluster_1025',
 'cluster_1026',
 'cluster_1036',
 'cluster_1037',
 'cluster_1041',
 'cluster_1046',
 'cluster_1048',
 'cluster_1051',
 'cluster_1052',
 'cluster_1058',
 'cluster_1059',
 'cluster_1062',
 'cluster_1068',
 'cluster_1069',
 'cluster_1074',
 'cluster_1077',
 'cluster_1078',
 'cluster_1079',
 'cluster_1081',
 'cluster_1085',
 'cluster_1088',
 'cluster_1091',
 'cluster_1097',
 'cluster_1099',
 'cluster_1102',
 'cluster_1103',
 'cluster_1104',
 'cluster_1105',
 'cluster_1109',
 'cluster_1113',
 'cluster_1116',
 'cluster_1117',
 'cluster_1120',
 'cluster_1123',
 'cluster_1125',
 'cluster_1127',
 'cluster_1138',
 'cluster_1140',
 'cluster_1141',
 'cluster_1145',
 'cluster_1149',
 'cluster_1153',
 'cluster_1157',
 'cluster_1161',
 'cluster_1162',
 'cluster_1164',
 'cluster_1165',
 'cluster_1167',
 'cluster_117',
 'cluster_1170',
 'cluster_1174',
 'cluster_1175',
 'cluster_1177',
 'cluster_1181',
 'cluster_1186'

### Identify CNEs to filter out

In [10]:
convergent_cnes = []
filtered_cnes = []
cne_count = 0
orig_cne_count_per_sp = defaultdict(int)
output_clusters = 'filtered_clusters.csv'
with open (cluster_path,'r') as csv_file:
    reader = csv.reader(csv_file)
    with open(output_clusters, "w") as output_file:
        writer = csv.writer(output_file, delimiter=',')
        for row in reader:
            cluster_id=row[0]
            for cne in row[1:]:
                species = cne.split("_cne_")[0]
                orig_cne_count_per_sp[species] += 1
            cne_count += len(row) - 1 # for summary output
            if cluster_id in convergent_clusters:
                convergent_cnes.extend(row[1:])
            else:
                filtered_cnes.extend(row[1:])
                writer.writerow(row)

In [11]:
with open (cluster_path,'r') as csv_file:
    cluster_count = sum(1 for _ in csv_file) # for summary output

In [12]:
len(convergent_cnes)

40128

In [13]:
convergent_cnes

['adig_cne_123',
 'pdam_cne_98740',
 'adig_cne_234',
 'adig_cne_44335',
 'adig_cne_79579',
 'pdam_cne_97014',
 'adig_cne_10068',
 'adig_cne_309',
 'adig_cne_74617',
 'dgig_cne_1844',
 'dgig_cne_2247',
 'dgig_cne_2725',
 'dgig_cne_3381',
 'dgig_cne_3398',
 'pdam_cne_98923',
 'adig_cne_328',
 'adig_cne_330',
 'pdam_cne_69649',
 'adig_cne_358',
 'adig_cne_77213',
 'pdam_cne_93975',
 'adig_cne_10417',
 'adig_cne_10421',
 'adig_cne_15295',
 'adig_cne_16491',
 'adig_cne_16494',
 'adig_cne_16956',
 'adig_cne_16958',
 'adig_cne_17782',
 'adig_cne_18294',
 'adig_cne_19600',
 'adig_cne_20595',
 'adig_cne_23133',
 'adig_cne_23134',
 'adig_cne_23135',
 'adig_cne_248',
 'adig_cne_39717',
 'adig_cne_41582',
 'adig_cne_46591',
 'adig_cne_47534',
 'adig_cne_47535',
 'adig_cne_48592',
 'adig_cne_491',
 'adig_cne_55424',
 'adig_cne_55587',
 'adig_cne_56417',
 'adig_cne_56418',
 'adig_cne_64583',
 'adig_cne_65414',
 'adig_cne_66247',
 'adig_cne_67739',
 'adig_cne_67809',
 'adig_cne_68055',
 'adig_cne_754

In [14]:
with open('convergent_cnes.pickle', 'wb') as handle:
    pickle.dump(convergent_cnes, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [15]:
filtered_cnes

['adig_cne_21',
 'adig_cne_26279',
 'adig_cne_57',
 'aten_cne_3077',
 'aten_cne_4904',
 'aten_cne_6242',
 'aten_cne_6390',
 'aten_cne_7490',
 'aten_cne_7492',
 'aten_cne_7627',
 'aten_cne_7631',
 'aten_cne_8798',
 'dgig_cne_4494',
 'dgig_cne_4495',
 'dgig_cne_5400',
 'dgig_cne_5401',
 'dgig_cne_6050',
 'epal_cne_2173',
 'epal_cne_2174',
 'epal_cne_303',
 'epal_cne_6751',
 'ofav_cne_24625',
 'ofav_cne_24626',
 'ofav_cne_24627',
 'ofav_cne_24628',
 'ofav_cne_24629',
 'ofav_cne_24630',
 'ofav_cne_24631',
 'ofav_cne_24644',
 'ofav_cne_24645',
 'ofav_cne_47939',
 'pdam_cne_18340',
 'pdam_cne_28954',
 'pdam_cne_62276',
 'pdam_cne_62277',
 'pdam_cne_62278',
 'pdam_cne_62280',
 'pdam_cne_62281',
 'spis_cne_13146',
 'spis_cne_136216',
 'spis_cne_136217',
 'spis_cne_136218',
 'spis_cne_154140',
 'spis_cne_18236',
 'spis_cne_2361',
 'spis_cne_63969',
 'spis_cne_85117',
 'spis_cne_85120',
 'spis_cne_85121',
 'spis_cne_85124',
 'spis_cne_85126',
 'spis_cne_85127',
 'adig_cne_13312',
 'adig_cne_1700

In [16]:
len(filtered_cnes)

325464

In [17]:
cne_count

365592

In [18]:
orig_cne_count_per_sp

defaultdict(int,
            {'aaur': 3259,
             'adig': 62452,
             'aten': 7368,
             'chem': 3160,
             'dgig': 5391,
             'epal': 6697,
             'hsym': 9874,
             'hvul': 4091,
             'mvir': 6295,
             'nvec': 4804,
             'ofav': 44939,
             'pdam': 87139,
             'spis': 120123})

#### Count convergent CNEs for each species

In [19]:
conv_count_per_species = defaultdict(int)
for cne in convergent_cnes:
    species = cne.split("_cne_")[0]
    conv_count_per_species[species] += 1
conv_count_per_species

defaultdict(int,
            {'aaur': 1210,
             'adig': 11641,
             'aten': 1792,
             'chem': 808,
             'dgig': 1747,
             'epal': 1384,
             'hsym': 3342,
             'hvul': 1551,
             'mvir': 3565,
             'nvec': 980,
             'ofav': 985,
             'pdam': 1941,
             'spis': 9182})

#### Count filtered CNEs

In [20]:
filt_count_per_species = defaultdict(int)
for cne in filtered_cnes:
    species = cne.split("_cne_")[0]
    filt_count_per_species[species] += 1
filt_count_per_species

defaultdict(int,
            {'aaur': 2049,
             'adig': 50811,
             'aten': 5576,
             'chem': 2352,
             'dgig': 3644,
             'epal': 5313,
             'hsym': 6532,
             'hvul': 2540,
             'mvir': 2730,
             'nvec': 3824,
             'ofav': 43954,
             'pdam': 85198,
             'spis': 110941})

#### Filter CNEs

In [21]:
for coord_file in coord_files:
    species = coord_file.split("/")[-1].split("_")[0]
    output_filename = species + "_coords_filtered.tsv"
    coord_df = pd.read_csv(coord_file, sep="\t", names=["cne_id", "start", "end"])
    output_df = coord_df[coord_df.cne_id != species + "_cne_0"]
    output_df = output_df[output_df.cne_id.isin(filtered_cnes)]
    output_df.to_csv(output_filename, sep="\t", index=False, header=False)

#### Write summary file

In [22]:
count_per_sp_summary = str() 
for sp, count in orig_cne_count_per_sp.items():
    count_per_sp_summary += sp + ": " + str(count) + "\n"
count_per_sp_summary

'adig: 62452\naten: 7368\ndgig: 5391\nepal: 6697\nofav: 44939\npdam: 87139\nspis: 120123\naaur: 3259\nchem: 3160\nhsym: 9874\nhvul: 4091\nmvir: 6295\nnvec: 4804\n'

In [23]:
conv_count_summary =  str() 
for sp, count in conv_count_per_species.items():
    conv_count_summary += sp + ": " + str(count) + "\n"
conv_count_summary

'adig: 11641\npdam: 1941\ndgig: 1747\naten: 1792\naaur: 1210\nepal: 1384\nchem: 808\nnvec: 980\nhsym: 3342\nspis: 9182\nhvul: 1551\nmvir: 3565\nofav: 985\n'

In [24]:
filt_count_summary =  str() 
for sp, count in filt_count_per_species.items():
    filt_count_summary += sp + ": " + str(count) + "\n"
filt_count_summary

'adig: 50811\naten: 5576\ndgig: 3644\nepal: 5313\nofav: 43954\npdam: 85198\nspis: 110941\naaur: 2049\nchem: 2352\nhsym: 6532\nhvul: 2540\nmvir: 2730\nnvec: 3824\n'

In [25]:
with open("parsimony_filtering_summary.txt", 'w') as outfile:
    outfile.write("Original cluster file: " + cluster_path + "\n" +
                 "Original number of clusters: " + str(cluster_count) + "\n" +
                  "Original number of CNEs (no singletons): " + str(cne_count) + "\n" +
                  "Original CNE count per species: " + "\n" +
                  count_per_sp_summary +
                  "Number of convergent clusters: " + str(len(convergent_clusters)) + "\n" +
                  "Convergent CNE count per species: " + "\n" +
                  conv_count_summary + 
                  "Remaining CNEs after convergent filtering: " + str(len(filtered_cnes)) + "\n" +
                  "Post filtering CNE count per species: " + "\n" +
                  filt_count_summary
                 )

### Write summary table

In [34]:
orig_cne_count_df = pd.DataFrame(orig_cne_count_per_sp.items(), columns=['species', 'orig_cne_count'])
conv_count_df = pd.DataFrame(conv_count_per_species.items(), columns=['species', 'conv_cne_count'])
filt_count_df = pd.DataFrame(filt_count_per_species.items(), columns=['species', 'final_cne_count'])
sumamry_df = orig_cne_count_df.merge(conv_count_df).merge(filt_count_df).sort_values('species')
sumamry_df

Unnamed: 0,species,orig_cne_count,conv_cne_count,final_cne_count
7,aaur,3259,1210,2049
0,adig,62452,11641,50811
1,aten,7368,1792,5576
8,chem,3160,808,2352
2,dgig,5391,1747,3644
3,epal,6697,1384,5313
9,hsym,9874,3342,6532
10,hvul,4091,1551,2540
11,mvir,6295,3565,2730
12,nvec,4804,980,3824


In [35]:
sumamry_df.to_csv("post_parsimony_filt_summary.tsv", sep="\t", index=False)