# In [None]:
# Usage: parsimony_analysis_part1.py merged_clusters tree
# Goal: run the ancestral character state reconstruction tool PastML to identify potentially convergent CNEs.
# Input:
# merged_clusters: CSV with the results of the clustering analysis (e.g. merged_cne_clusters.csv), generated by merge_homologous_cnes.py
# tree: phylogenetic tree in Newick format (e.g. cnidarian_tree.nwk)
# Output:
# pastml_data.csv: species x cluster presence/absence table that is passed to PastML
# pastml_output_compressed.html: compressed map visualisation
# pastml_output.html: contains a dictionary of character states at each node for each CNE. Needs to be parsed
### with parsimony_analysis_part2.py

from pastml.acr import pastml_pipeline
import csv
import itertools
import pandas as pd
import sys
import pickle
import ast
from collections import Counter


# #### Input files
# Example inputs:
#merged_clusters = '../../results_for_paper/cnidaria_analysis_version2/merged_cne_clusters.csv'
#tree = "cnidarian_tree.nwk"

# Stop with the usage line instead of a bare IndexError when an argument is missing.
if len(sys.argv) < 3:
    sys.exit("Usage: parsimony_analysis_part1.py merged_clusters tree")

merged_clusters = sys.argv[1]  # path to the merged CNE clusters CSV
tree = sys.argv[2]  # path to the phylogenetic tree (Newick)


# #### Species list
# Species codes, in the order in which they appear as rows of the pastml input table.
species_list = [
    'dgig', 'ofav', 'pdam', 'spis', 'adig',
    'nvec', 'epal', 'aten',
    'mvir', 'aaur',
    'chem', 'hvul', 'hech', 'hsym',
]


# #### Dictionary of parent-child relationships
# Maps every taxon (species code or clade name) to its parent clade.
# The two top-level clades, anthozoa and medusozoa, point to "root".
# NOTE: "amil" has an entry here although it is not part of species_list.
tax_dict = {
    # species -> parent clade
    "aaur": "acraspeda",
    "adig": "acropora",
    "amil": "acropora",
    "aten": "enthemonae",
    "chem": "hydrozoa",
    "dgig": "anthozoa",
    "epal": "enthemonae",
    "hech": "hydractinia",
    "hsym": "hydractinia",
    "hvul": "anthoathecata",
    "mvir": "acraspeda",
    "nvec": "actiniaria",
    "ofav": "robusta",
    "pdam": "pocilloporidae",
    "spis": "pocilloporidae",
    # clade -> parent clade
    "pocilloporidae": "robusta",
    "robusta": "scleractinia",
    "acropora": "scleractinia",
    "scleractinia": "hexacorallia",
    "enthemonae": "actiniaria",
    "actiniaria": "hexacorallia",
    "hexacorallia": "anthozoa",
    "anthozoa": "root",
    "hydractinia": "anthoathecata",
    "anthoathecata": "hydrozoa",
    "hydrozoa": "medusozoa",
    "acraspeda": "medusozoa",
    "medusozoa": "root",
}


# ### Create pastml input table
#
# Rows: species ID
# Columns: cluster id
# 0: species not in cluster
# 1: species in cluster
print("Creating pasml input table, this may take some time")
# Collect all cluster columns first and build the DataFrame once at the end.
# Inserting the columns one by one into an existing DataFrame fragments it and
# gets slower with every cluster added.
cluster_columns = {}
# newline='' lets the csv module handle line endings itself.
with open(merged_clusters, newline='') as csvfile:
    cne_file = csv.reader(csvfile, delimiter=',')
    # Cluster ids follow the 1-based row number of the input file, so rows that
    # are skipped below leave gaps in the numbering.
    for i, row in enumerate(cne_file, start=1):
        # The species name is the part of each CNE id before "_cne_".
        species_set = {cne.split("_cne_")[0] for cne in row}
        if len(species_set) > 1:  # Skip clusters whose CNEs all come from one species
            # One 0/1 entry per species, in species_list order; species that
            # are not in species_list are ignored.
            cluster_columns['cluster_' + str(i)] = [
                int(species in species_set) for species in species_list
            ]
# 'id' stays the first column, followed by the clusters in file order.
pastml_data = pd.DataFrame({'id': species_list, **cluster_columns})
print("pastml table created. Writing to file: pastml_data.csv")
pastml_data_file = 'pastml_data.csv'
pastml_data.to_csv(pastml_data_file, index=False)
print("Done")

# Columns for which we want to reconstruct ancestral states
columns = list(pastml_data.columns)[1:] # everything except id column
# Without this check an empty table would fail below with an IndexError on columns[0].
if not columns:
    sys.exit("No clusters containing more than one species were found; nothing to reconstruct.")
# Path to the output compressed map visualisation
html_compressed = "pastml_output_compressed.html"
# (Optional) path to the output tree visualisation
html = "pastml_output.html"
print("Running pastml, this may take some time.")
# The table written above is passed to pastml by file name.
# NOTE(review): name_column is simply the first cluster column - confirm that this is the
# intended column for naming nodes in the compressed visualisation.
pastml_pipeline(data=pastml_data_file, data_sep=',', columns=columns, name_column=columns[0], tree=tree,
                html_compressed=html_compressed, html=html, verbose=True)
print("pastml run complete.")
# Tell the user how to extract the character-state dictionary for part 2.
print("parse output html file using:")
print("grep elements pastml_output.html | sed 's/elements://g' | tr -d ' \t\n\r' | sed 's/.$//g' > pastml_output_dict.txt")
print("Then run parsimony_analysis_part2.py")

