### Goal

Generate basic stats about multi-species CNE clusters

### Input

pre_filtering_clusters.csv: cluster file generated by merge_homologous_cnes.py and processed with assign_clusters_ids.ipynb


### Output

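- cluster_stats.tsv: number of species and number of CNEs per cluster
- cluster_counts.tsv: number of clusters for each species count
- clusters_no_repeats_<repeat_thresh>.csv: clusters in which no single species contributes more than `repeat_thresh` CNEs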

In [13]:
import csv
import itertools
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
from collections import Counter
from collections import defaultdict
import glob

#### Cluster file path

In [15]:
# Input CSV, one cluster per row: the first field is the cluster ID and the
# remaining fields are the identifiers of the CNEs in that cluster.
cluster_file_path = '../assign_cluster_ids/pre_filtering_clusters.csv'

#### Repeat threshold

In [16]:
# Per-species cap: a cluster is treated as containing repeats when any single
# species contributes more than this many CNEs to it.
repeat_thresh = 10

#### Output file path

In [17]:
# Destination for the clusters that pass the repeat filter; the threshold
# value is embedded in the file name.
output_filepath = f"clusters_no_repeats_{repeat_thresh}.csv"

### CNE count per cluster

In [18]:
# Map every cluster ID (first field of a row) to the number of CNE
# identifiers that follow it on the same row.
with open(cluster_file_path) as csvfile:
    cne_counts = {
        fields[0]: len(fields) - 1
        for fields in csv.reader(csvfile, delimiter=',')
    }

# One row per cluster: its ID and its CNE count.
cne_counts_df = pd.DataFrame(
    cne_counts.items(),
    columns=['cluster_id', 'cne_count'],
)
cne_counts_df

Unnamed: 0,cluster_id,cne_count
0,cluster_1,52
1,cluster_2,152
2,cluster_3,151
3,cluster_4,89
4,cluster_5,1721
...,...,...
81724,cluster_81725,2
81725,cluster_81726,4
81726,cluster_81727,2
81727,cluster_81728,2


### Number of species per cluster

In [19]:
def retrieve_species_set(cluster_row):
    """Return the set of species IDs represented among a cluster's CNEs.

    The species ID of a CNE is the part of its identifier that precedes the
    first "_cne_" marker; an identifier without the marker is used whole.

    Args:
        cluster_row: iterable of CNE identifier strings.

    Returns:
        A set with one entry per distinct species ID.
    """
    return {cne_id.partition("_cne_")[0] for cne_id in cluster_row}

In [20]:
# Single pass over the cluster file. For every cluster this records the number
# of distinct species it contains, and files the cluster ID under "with
# repeats" when any one species contributes more than repeat_thresh CNEs
# (otherwise under "without repeats").
cne_count = defaultdict(int)
sp_counts = {}
with open(cluster_file_path) as csvfile:
    all_cne_counts_per_sp = []
    clusters_with_repeats = []
    clusters_without_repeats = []
    for row in csv.reader(csvfile, delimiter=','):
        cluster_id, cnes = row[0], row[1:]
        species_set = retrieve_species_set(cnes)

        # Tally the CNEs contributed by each species; the species ID is the
        # part of the CNE identifier before "_cne_".
        cne_counts_per_sp = defaultdict(int)
        for cne in cnes:
            cne_counts_per_sp[cne.split("_cne_")[0]] += 1
        per_species_totals = list(cne_counts_per_sp.values())

        # Pick the list this cluster belongs to, then record its ID there.
        exceeds_thresh = any(total > repeat_thresh for total in per_species_totals)
        bucket = clusters_with_repeats if exceeds_thresh else clusters_without_repeats
        bucket.append(cluster_id)

        # Number of distinct species in this cluster
        sp_counts[cluster_id] = len(species_set)
        # Per-species CNE counts for this cluster, kept for all clusters
        all_cne_counts_per_sp.append(per_species_totals)

In [21]:
# Tabulate the per-cluster species counts: one row per cluster.
sp_count_rows = list(sp_counts.items())
sp_counts_df = pd.DataFrame(sp_count_rows, columns=['cluster_id', 'sp_count'])
sp_counts_df

Unnamed: 0,cluster_id,sp_count
0,cluster_1,7
1,cluster_2,3
2,cluster_3,4
3,cluster_4,6
4,cluster_5,13
...,...,...
81724,cluster_81725,2
81725,cluster_81726,2
81726,cluster_81727,2
81727,cluster_81728,2


### Write stats to file

In [22]:
# Combine species counts and CNE counts into one table. The two frames share
# only the cluster_id column, so it is named explicitly as the join key.
cluster_stats_df = pd.merge(
    sp_counts_df,
    cne_counts_df,
    on='cluster_id',
    how='inner',
)
cluster_stats_df

Unnamed: 0,cluster_id,sp_count,cne_count
0,cluster_1,7,52
1,cluster_2,3,152
2,cluster_3,4,151
3,cluster_4,6,89
4,cluster_5,13,1721
...,...,...,...
81724,cluster_81725,2,2
81725,cluster_81726,2,4
81726,cluster_81727,2,2
81727,cluster_81728,2,2


In [23]:
# Save the per-cluster stats (cluster_id, sp_count, cne_count) as
# tab-separated text, without the DataFrame index.
cluster_stats_df.to_csv("cluster_stats.tsv", sep="\t", index=False)

#### Number of clusters without, then with, more than `repeat_thresh` (10) CNEs from the same species

In [24]:
# Clusters in which no species contributes more than repeat_thresh CNEs
len(clusters_without_repeats)

79471

In [25]:
# Clusters in which at least one species contributes more than repeat_thresh CNEs
len(clusters_with_repeats)

2258

#### Write clusters without repeats to file and make table of number of species per cluster

In [31]:
# Second pass over the cluster file. Every cluster in which no species
# contributes more than repeat_thresh CNEs is copied unchanged to
# output_filepath; the species count of every cluster (filtered or not) is
# collected for the distribution table built afterwards.
#
# NOTE: sp_counts is rebound here to a list of per-cluster species counts; the
# earlier cell used the same name for a dict keyed by cluster ID.
cne_count = defaultdict(int)  # NOTE(review): not read in any visible cell; kept so notebook state is unchanged
sp_counts = []
all_cne_counts_per_sp = []
clusters_with_repeats = []
clusters_without_repeats = []

# The csv module expects its files to be opened with newline=''. Without it,
# the writer's "\r\n" row terminator becomes "\r\r\n" on platforms that
# translate "\n" on write.
with open(cluster_file_path, newline='') as csvfile, \
        open(output_filepath, "w", newline='') as out_file:
    cne_file = csv.reader(csvfile, delimiter=',')
    writer = csv.writer(out_file, delimiter=',')
    for row in cne_file:
        # csv.reader yields an empty list for a blank line; there is no
        # cluster ID to read from it, so skip it instead of raising IndexError.
        if not row:
            continue
        cluster_id = row[0]
        cnes = row[1:]
        species_set = retrieve_species_set(cnes)

        # Count the CNEs contributed by each species; the species ID is the
        # part of the CNE identifier before "_cne_".
        cne_counts_per_sp = defaultdict(int)
        for cne in cnes:
            species = cne.split("_cne_")[0]
            cne_counts_per_sp[species] += 1

        if any(repeat_count > repeat_thresh for repeat_count in cne_counts_per_sp.values()):
            clusters_with_repeats.append(cluster_id)
        else:
            clusters_without_repeats.append(cluster_id)
            # Only clusters that pass the repeat filter are written out
            writer.writerow(row)

        # Append the number of species in this cluster to the list
        sp_counts.append(len(species_set))
        # Append the per-species CNE counts of this cluster to the final list
        all_cne_counts_per_sp.append(list(cne_counts_per_sp.values()))

In [32]:
# Tally how many clusters have each species count. This relies on sp_counts
# being the list built by the preceding cell; an earlier cell binds the same
# name to a dict (cluster ID -> species count), which Counter would instead
# interpret as a mapping of ready-made counts.
sp_count_dist = Counter(sp_counts)

In [33]:
# Turn the tally into a two-column table (species count, number of clusters)
# ordered by species count.
species_counts_df = (
    pd.DataFrame.from_dict(sp_count_dist, orient='index')
    .reset_index()
    .rename(columns={'index': 'Number of Species in cluster', 0: 'Number of clusters'})
    .sort_values('Number of Species in cluster')
)

In [34]:
# Display the species-count distribution table
species_counts_df

Unnamed: 0,Number of Species in cluster,Number of clusters
6,2,74959
1,3,4962
2,4,1307
5,5,264
3,6,120
0,7,63
8,8,24
7,9,15
9,10,7
10,11,4


In [36]:
# Save the species-count distribution as tab-separated text, without the
# DataFrame index.
species_counts_df.to_csv("cluster_counts.tsv", index=False, sep="\t")