### Goal

Create cluster file without clusters containing only CNEs from two sister-species (because we assume that these CNEs are not convergent and originate from the commmon ancestor of these two species). Saves a lot of time for ancestral character state reconstruction step because most CNEs are from sister-species.

### Input

pre_filtering_clusters.csv: cluster CNE composition file, output of assign_cluster_ids.ipynb

### Output

pre_filtering_clusters_for_parsimony.csv: cluster CNE composition file, ready for parsimony_analysis.py

In [14]:
import csv
import pandas as pd

In [2]:
cluster_file_repeats = '../assign_cluster_ids/pre_filtering_clusters.csv' #cluster file
cluster_file_no_repeats = '../cluster_stats/clusters_no_repeats_10.csv' # clusters with > 10 CNEs from same sp. filtered out

#### Create set list of species pairs to exclude

In [3]:
#exclude_setlist = [{'cgig', 'cvir'}, {'myes', 'pmax'}, {'obim', 'ovul'}, {'bgla', 'echl'}]
exclude_setlist = [{'pdam', 'spis'}, {'epal', 'aten'}, {'mvir', 'aaur'}, {'hsym', 'chem'}]

#### Function that retrieves species IDs in cluster

In [4]:
def retrieve_species_set(cluster_row):
    species_set = set()
    for cne in cluster_row:
        #print(cne)
        # Retrieve species ID and add to set
        species = cne.split("_cne_")[0]
        species_set.add(species)
    return(species_set)

#### Function that removes clusters 

In [6]:
def remove_clusters(cluster_file):
    output_file_name = cluster_file.split("/")[-1].split(".")[0] + "_for_parsimony.csv"
    # Dict to hold excluded clusters
    # excluded_clusters['cgig_cvir'] = ['cluster_x', 'cluster_y', ...]
    excluded_clusters = {}
    for exclude_set in exclude_setlist:
        exclude_name = "_".join(sorted(exclude_set))
        excluded_clusters[exclude_name] = []
    with open(cluster_file) as input_csv:
        with open(output_file_name, "w") as parsimony_output_csv:
            cne_file = csv.reader(input_csv, delimiter = ',')
            parsimony_writer = csv.writer(parsimony_output_csv, delimiter=',')
            # Each row is a cluster
            for row in cne_file:
                cluster_id = row[0]
                cnes = row[1:]
                species_set = retrieve_species_set(cnes)
                set_id = "_".join(sorted(species_set))
                if species_set not in exclude_setlist:
                    parsimony_writer.writerow(row)
                else:
                    excluded_clusters[set_id].append(row)
        # Print number of clusters at node between species pairs - Important result
        for comp, cluster_list in excluded_clusters.items():
            print(comp)
            print(len(cluster_list))

In [8]:
remove_clusters(cluster_file_repeats)

pdam_spis
49371
aten_epal
906
aaur_mvir
792
chem_hsym
471


In [9]:
remove_clusters(cluster_file_no_repeats)

pdam_spis
49108
aten_epal
873
aaur_mvir
782
chem_hsym
434


#### Update: also output cluster:node dict for downstream analyses (e.g. transposon cluster analysis)

In [10]:
def output_excluded_clusters(cluster_file):
    #output_file_name = cluster_file.split("/")[-1].split(".")[0] + "_for_parsimony.csv"
    # Dict to hold excluded clusters
    # excluded_clusters['cgig_cvir'] = ['cluster_x', 'cluster_y', ...]
    excluded_clusters = {}
    for exclude_set in exclude_setlist:
        exclude_name = "_".join(sorted(exclude_set))
        excluded_clusters[exclude_name] = []
    with open(cluster_file) as input_csv:
        cne_file = csv.reader(input_csv, delimiter = ',')
        # Each row is a cluster
        for row in cne_file:
            cluster_id = row[0]
            cnes = row[1:]
            species_set = retrieve_species_set(cnes)
            set_id = "_".join(sorted(species_set))
            if species_set in exclude_setlist:
                excluded_clusters[set_id].append(cluster_id)
    return(excluded_clusters)

In [11]:
excluded_cluster_dict = output_excluded_clusters(cluster_file_repeats)

In [12]:
excluded_cluster_dict

{'pdam_spis': ['cluster_30687',
  'cluster_30688',
  'cluster_30689',
  'cluster_30690',
  'cluster_30691',
  'cluster_30692',
  'cluster_30693',
  'cluster_30694',
  'cluster_30695',
  'cluster_30696',
  'cluster_30697',
  'cluster_30698',
  'cluster_30699',
  'cluster_30700',
  'cluster_30701',
  'cluster_30702',
  'cluster_30703',
  'cluster_30704',
  'cluster_30705',
  'cluster_30706',
  'cluster_30707',
  'cluster_30708',
  'cluster_30709',
  'cluster_30710',
  'cluster_30711',
  'cluster_30712',
  'cluster_30713',
  'cluster_30714',
  'cluster_30715',
  'cluster_30716',
  'cluster_30717',
  'cluster_30718',
  'cluster_30719',
  'cluster_30720',
  'cluster_30721',
  'cluster_30722',
  'cluster_30723',
  'cluster_30724',
  'cluster_30725',
  'cluster_30726',
  'cluster_30727',
  'cluster_30728',
  'cluster_30729',
  'cluster_30730',
  'cluster_30731',
  'cluster_30732',
  'cluster_30733',
  'cluster_30734',
  'cluster_30735',
  'cluster_30736',
  'cluster_30737',
  'cluster_30738',

In [15]:
node_set_id_dict = {'pdam_spis': 'pocilloporidae', 'aten_epal': 'enthemonae', 'aaur_mvir': 'acraspeda',
                    'chem_hsym': 'leptothecata'}
node_set_id_df = pd.DataFrame(node_set_id_dict.items(), columns=['set_id', 'node'])
node_set_id_df

Unnamed: 0,set_id,node
0,pdam_spis,pocilloporidae
1,aten_epal,enthemonae
2,aaur_mvir,acraspeda
3,chem_hsym,leptothecata


In [16]:
excluded_cluster_nodes = {}
for set_id, cluster_list in excluded_cluster_dict.items():
    node_id = node_set_id_dict[set_id]
    for cluster_id in cluster_list:
        excluded_cluster_nodes[cluster_id]=node_id
excluded_cluster_nodes

{'cluster_30687': 'pocilloporidae',
 'cluster_30688': 'pocilloporidae',
 'cluster_30689': 'pocilloporidae',
 'cluster_30690': 'pocilloporidae',
 'cluster_30691': 'pocilloporidae',
 'cluster_30692': 'pocilloporidae',
 'cluster_30693': 'pocilloporidae',
 'cluster_30694': 'pocilloporidae',
 'cluster_30695': 'pocilloporidae',
 'cluster_30696': 'pocilloporidae',
 'cluster_30697': 'pocilloporidae',
 'cluster_30698': 'pocilloporidae',
 'cluster_30699': 'pocilloporidae',
 'cluster_30700': 'pocilloporidae',
 'cluster_30701': 'pocilloporidae',
 'cluster_30702': 'pocilloporidae',
 'cluster_30703': 'pocilloporidae',
 'cluster_30704': 'pocilloporidae',
 'cluster_30705': 'pocilloporidae',
 'cluster_30706': 'pocilloporidae',
 'cluster_30707': 'pocilloporidae',
 'cluster_30708': 'pocilloporidae',
 'cluster_30709': 'pocilloporidae',
 'cluster_30710': 'pocilloporidae',
 'cluster_30711': 'pocilloporidae',
 'cluster_30712': 'pocilloporidae',
 'cluster_30713': 'pocilloporidae',
 'cluster_30714': 'pocillopo

In [17]:
excluded_cluster_nodes_df = pd.DataFrame(excluded_cluster_nodes.items(), columns=['cluster_id', 'node'])
excluded_cluster_nodes_df

Unnamed: 0,cluster_id,node
0,cluster_30687,pocilloporidae
1,cluster_30688,pocilloporidae
2,cluster_30689,pocilloporidae
3,cluster_30690,pocilloporidae
4,cluster_30691,pocilloporidae
...,...,...
51535,cluster_5060,leptothecata
51536,cluster_5061,leptothecata
51537,cluster_5062,leptothecata
51538,cluster_5063,leptothecata


In [18]:
excluded_cluster_nodes_df.to_csv('excluded_cluster_nodes.tsv', sep="\t", index=False)