### Goal

- Assign an identifier to each cluster
- Discard singletons (clusters whose CNEs all come from a single species, i.e. CNEs with no surviving homologous sequence in other species after filtering)

#### Input

merged_cne_clusters.csv (output of merge_homologous_cnes.py)

#### Output

pre_filtering_clusters.csv (input of parsimony_analysis)


In [1]:
import csv
from collections import defaultdict

In [5]:
# Input CSV (output of merge_homologous_cnes.py); each row is read as one cluster of CNE ids.
merged_clusters = "merged_cne_clusters.csv"

In [6]:
# Output CSV: the multi-species clusters, each row prefixed with its cluster id.
output_clusters = "pre_filtering_clusters.csv"

In [7]:
# Read every merged cluster, drop the single-species ones, and write the
# survivors with a sequential "cluster_<n>" identifier in the first column.
# Both files are opened with newline='' as the csv module requires; without
# it the writer produces "\r\r\n" row endings (i.e. blank rows) on Windows.
with open(merged_clusters, newline='') as input_file, \
        open(output_clusters, "w", newline='') as output_file:
    cne_file = csv.reader(input_file, delimiter=',')
    writer = csv.writer(output_file, delimiter=',')
    i = 1  # next cluster id to hand out; only advances when a row is kept
    for row in cne_file:
        # The species name is the part of each CNE id before "_cne_".
        # Blank fields are skipped so they cannot count as an extra species.
        species_set = {cne.split("_cne_")[0] for cne in row if cne}
        if len(species_set) < 2:
            # Singleton: all CNEs of the cluster belong to one species (or
            # the row is empty), so no homolog exists in another species.
            continue
        cluster_id = 'cluster_' + str(i)
        row.insert(0, cluster_id)
        writer.writerow(row)
        i += 1

#### Count CNEs for each species

In [8]:
# Tally, per species, the number of CNEs listed in the output_clusters file.
sp_cne_summary = defaultdict(int)
with open(output_clusters, 'r') as csvfile:
    for row in csv.reader(csvfile, delimiter=','):
        # Column 0 holds the cluster id; every remaining field is a CNE id
        # whose prefix before "_cne_" is the species name.
        for cne in row[1:]:
            sp_cne_summary[cne.partition("_cne_")[0]] += 1
sp_cne_summary

defaultdict(int,
            {'aaur': 3259,
             'adig': 62452,
             'aten': 7368,
             'chem': 3160,
             'dgig': 5391,
             'epal': 6697,
             'hsym': 9874,
             'hvul': 4091,
             'mvir': 6295,
             'nvec': 4804,
             'ofav': 44939,
             'pdam': 87139,
             'spis': 120123})