# phylip to nexus
### This was run on Brown_2012 and Li_2008 to get their Rev scripts to run
For an unknown reason, the .phy subset files for Brown and Li were not recognized by RevBayes, so I tried to see if their .nex equivalents would be recognized. In fact, they were, so I made this script to automate the creation of .nex files from .phy files.

# Contents
<ol>
<li><a href="#The-Function">The Function</a>: The end product of this Jupyter notebook</li>
<li><a href="#Apply-to-Brown">Application to Datasets</a>: Using the function on Brown and Li</li>
<li><a href="#Derivation">Derivation</a>: How I created the function; not the most readable</li>
</ol>

First, we're loading in my favorite bCU functions.

In [139]:
# REQUIREMENTS
from os import listdir

# LISTDIR_NH FUNCTION
def listdir_nh(path):
    '''
    List the visible entries of a directory.

    Input: Path of directory (string)
    Output: Names inside that directory that do not begin with "." (list of strings)
    '''
    visible = []
    for name in listdir(path):
        # Dot-prefixed names (e.g. .DS_Store) are treated as hidden and skipped
        if name.startswith("."):
            continue
        visible.append(name)
    return visible

# The Function
This function collects the data in the first line of the .phy file, then puts it into the format of .nex files.

In [151]:
# MAIN FUNCTION
def phy_to_nex(phy, display=True):
    '''
    Convert a PHYLIP alignment file into NEXUS text.

    The first line of the .phy file is read as "<ntax> <nchar>"; every line
    after it is copied unchanged into the MATRIX section of a DNA DATA block.

    Input: phy - path of the .phy file (string)
           display - if True, return the NEXUS text; if False, write it to a
                     file with the same path but the extension replaced by
                     ".nex"
    Output: The NEXUS text (display=True), or a confirmation message naming
            the file that was written (display=False)
    Raises: ValueError if the first line does not hold exactly two fields
    '''
    # Imported here so the function does not rely on another notebook cell
    from os.path import splitext

    # READ IN DATA
    with open(phy, "r") as this_file:
        data = this_file.read()

    # GET INFO FROM PHY
    # Everything before the first newline is the header; the rest is the matrix
    header, _, matrix = data.partition("\n")
    # Split on any run of whitespace so leading or repeated spaces in the
    # header do not break the unpacking
    fields = header.split()
    if len(fields) != 2:
        raise ValueError(
            "Expected '<ntax> <nchar>' on the first line of " + phy
            + ", got: " + repr(header)
        )
    taxa, nchar = fields

    # REFORMAT TO NEX
    intro = "#NEXUS\n\nBEGIN DATA;\n"
    dimensions = "\tDIMENSIONS NTAX="+taxa+" NCHAR="+nchar+";\n"
    end_intro = "\tFORMAT DATATYPE=DNA MISSING=? GAP=-;\nMATRIX\n"
    end = "\n;\nEND;"
    full_nex = intro+dimensions+end_intro+matrix+end

    # NEW FILE NAME
    # splitext only strips the final extension, so dots in directory names or
    # earlier in the file name are kept
    new_nex = splitext(phy)[0] + ".nex"

    # RETURN NEW NEXUS
    if display:
        return full_nex

    with open(new_nex, "w") as f:
        f.write(full_nex)
    return "File written to disk for: "+new_nex

In [152]:
# EXAMPLE NEW FILE
# display is left at its default (True), so the NEXUS text is returned and
# printed here; nothing is written to disk by this call.
print(phy_to_nex("/Users/treehouse3/Edie/step_2_errors/phy_unknown/Brown_2012_phy/ND2_codon_1.phy"))

#NEXUS

BEGIN DATA;
	DIMENSIONS NTAX=38 NCHAR=337;
	FORMAT DATATYPE=DNA MISSING=? GAP=-;
MATRIX
Cyrtodactylus.annulatus.KU314944            aacaaaagaaataactccgttgcgcaataaccgaaatccccgggggaatttacgaggagactgaaaagtcagctaaacg-acaaaccaaagaacacgggcgtattccgccgaaaaagttatatcatgcctcctaataccacaaacccgcatacaggtggcacacacaacgtttagcagtcaagcacacacaacaacatacaaagatacc-gaagaaaagagactcctcagcaaaaggcatcggccctagtacatcacagcttggctcagacccctaccactttaccgtcaataacccaaaagcatccaaaccacaccaaaatacccccaaaataaat
Cyrtodactylus.philippinicus.KU304784        aaccaaacaaataactttgttgcgcaacaaccaaaatccccgagggaattcacgaggaaaccgaaaagtcagctaaaca-acagaccaaggaacatgagcatattccgtcgaaaaagcaatatcacgccacctaacactacgaacctgcctaaaggtggcacacacaaagtttagcagtcaagaatacacgacaatatacaacgatacc--aaaaagagagaatcctcgacaaacaacattggccccagtacatcacagcttagctccgaaccctaccactttaccgtcaacaaccaaaaagtatctaaacgacagcaaagtactccaaaactaaat
Gehyra.australis.AMS139934                  aactaaaaaaaaacctcagtcgcgcaacaaccaaaacccccgaggaaattcgcgcgggaattgaaaagtcagctgatttaaaaaaaaa-agaaaac

## Apply to Brown

In [155]:
# brown = "/Users/treehouse3/Edie/step_2_errors/phy_unknown/Brown_2012_phy/"
# brown_phy_paths = [brown+x for x in listdir_nh(brown)]
# [phy_to_nex(x, display=False) for x in brown_phy_paths]

['File written to disk for: /Users/treehouse3/Edie/step_2_errors/phy_unknown/Brown_2012_phy/ND2_codon_1.nex',
 'File written to disk for: /Users/treehouse3/Edie/step_2_errors/phy_unknown/Brown_2012_phy/ND2_codon_2.nex',
 'File written to disk for: /Users/treehouse3/Edie/step_2_errors/phy_unknown/Brown_2012_phy/ND2_codon_3.nex',
 'File written to disk for: /Users/treehouse3/Edie/step_2_errors/phy_unknown/Brown_2012_phy/Pho_codon_1.nex',
 'File written to disk for: /Users/treehouse3/Edie/step_2_errors/phy_unknown/Brown_2012_phy/Pho_codon_2.nex',
 'File written to disk for: /Users/treehouse3/Edie/step_2_errors/phy_unknown/Brown_2012_phy/Pho_codon_3.nex']

## Apply to Li

In [154]:
# li = "/Users/treehouse3/Edie/step_2_errors/phy_unknown/Li_2008_phy/"
# li_phy_paths = [li+x for x in listdir_nh(li)]
# [phy_to_nex(x, display=False) for x in li_phy_paths]

['File written to disk for: /Users/treehouse3/Edie/step_2_errors/phy_unknown/Li_2008_phy/ENC1_codon_1.nex',
 'File written to disk for: /Users/treehouse3/Edie/step_2_errors/phy_unknown/Li_2008_phy/ENC1_codon_2.nex',
 'File written to disk for: /Users/treehouse3/Edie/step_2_errors/phy_unknown/Li_2008_phy/ENC1_codon_3.nex',
 'File written to disk for: /Users/treehouse3/Edie/step_2_errors/phy_unknown/Li_2008_phy/Glyt_codon_1.nex',
 'File written to disk for: /Users/treehouse3/Edie/step_2_errors/phy_unknown/Li_2008_phy/Glyt_codon_2.nex',
 'File written to disk for: /Users/treehouse3/Edie/step_2_errors/phy_unknown/Li_2008_phy/Glyt_codon_3.nex',
 'File written to disk for: /Users/treehouse3/Edie/step_2_errors/phy_unknown/Li_2008_phy/myh6_codon_1.nex',
 'File written to disk for: /Users/treehouse3/Edie/step_2_errors/phy_unknown/Li_2008_phy/myh6_codon_2.nex',
 'File written to disk for: /Users/treehouse3/Edie/step_2_errors/phy_unknown/Li_2008_phy/myh6_codon_3.nex',
 'File written to disk for: 

# Derivation

In [8]:
# READ IN THE PHYLIP FILE
file = "/Users/treehouse3/Edie/step_2_errors/data/subsets_no_outgroups/Brown_2012/ND2_codon_1.phy"
with open (file, "r") as this_file:
    data = this_file.read()

In [12]:
print(data)

38 337
Cyrtodactylus.annulatus.KU314944            aacaaaagaaataactccgttgcgcaataaccgaaatccccgggggaatttacgaggagactgaaaagtcagctaaacg-acaaaccaaagaacacgggcgtattccgccgaaaaagttatatcatgcctcctaataccacaaacccgcatacaggtggcacacacaacgtttagcagtcaagcacacacaacaacatacaaagatacc-gaagaaaagagactcctcagcaaaaggcatcggccctagtacatcacagcttggctcagacccctaccactttaccgtcaataacccaaaagcatccaaaccacaccaaaatacccccaaaataaat
Cyrtodactylus.philippinicus.KU304784        aaccaaacaaataactttgttgcgcaacaaccaaaatccccgagggaattcacgaggaaaccgaaaagtcagctaaaca-acagaccaaggaacatgagcatattccgtcgaaaaagcaatatcacgccacctaacactacgaacctgcctaaaggtggcacacacaaagtttagcagtcaagaatacacgacaatatacaacgatacc--aaaaagagagaatcctcgacaaacaacattggccccagtacatcacagcttagctccgaaccctaccactttaccgtcaacaaccaaaaagtatctaaacgacagcaaagtactccaaaactaaat
Gehyra.australis.AMS139934                  aactaaaaaaaaacctcagtcgcgcaacaaccaaaacccccgaggaaattcgcgcgggaattgaaaagtcagctgatttaaaaaaaaa-agaaaacgggcgccttcggacgaacaggccatatcaagcagcctaacaatccacactagcctgccggtggcacacacaaagcttagcagtccagct

In [18]:
# The first line of the file holds the two header fields (taxon count, then
# character count). This unpacking needs them separated by exactly one space.
taxa, nchar = data.split("\n")[0].split(" ")

In [25]:
# NOTE: joining with "" drops the line breaks between taxa (see the printed
# matrix below, where one taxon runs straight into the next). The finished
# phy_to_nex function joins with "\n" instead.
matrix = "".join(data.split("\n")[1:])

In [26]:
print(matrix)

Cyrtodactylus.annulatus.KU314944            aacaaaagaaataactccgttgcgcaataaccgaaatccccgggggaatttacgaggagactgaaaagtcagctaaacg-acaaaccaaagaacacgggcgtattccgccgaaaaagttatatcatgcctcctaataccacaaacccgcatacaggtggcacacacaacgtttagcagtcaagcacacacaacaacatacaaagatacc-gaagaaaagagactcctcagcaaaaggcatcggccctagtacatcacagcttggctcagacccctaccactttaccgtcaataacccaaaagcatccaaaccacaccaaaatacccccaaaataaatCyrtodactylus.philippinicus.KU304784        aaccaaacaaataactttgttgcgcaacaaccaaaatccccgagggaattcacgaggaaaccgaaaagtcagctaaaca-acagaccaaggaacatgagcatattccgtcgaaaaagcaatatcacgccacctaacactacgaacctgcctaaaggtggcacacacaaagtttagcagtcaagaatacacgacaatatacaacgatacc--aaaaagagagaatcctcgacaaacaacattggccccagtacatcacagcttagctccgaaccctaccactttaccgtcaacaaccaaaaagtatctaaacgacagcaaagtactccaaaactaaatGehyra.australis.AMS139934                  aactaaaaaaaaacctcagtcgcgcaacaaccaaaacccccgaggaaattcgcgcgggaattgaaaagtcagctgatttaaaaaaaaa-agaaaacgggcgccttcggacgaacaggccatatcaagcagcctaacaatccacactagcctgccggtggcacacacaaagcttagcagtccagctcacccgaca

In [39]:
intro = "#NEXUS\n\nBEGIN DATA;\n"

In [34]:
print(intro)

#NEXUS

BEGIN DATA;


In [40]:
dimensions = "\tDIMENSIONS NTAX="+taxa+" NCHAR="+nchar+";\n"

In [30]:
print(dimensions)

	DIMENSIONS NTAX=38 NCHAR=337;


In [41]:
end_intro = "\tFORMAT DATATYPE=DNA MISSING=? GAP=-;\nMATRIX\n"

In [42]:
print(end_intro)

	FORMAT DATATYPE=DNA MISSING=? GAP=-;
MATRIX



In [43]:
end = "\n;\nEND;"

In [44]:
full_nex = intro+dimensions+end_intro+matrix+end

In [45]:
print(full_nex)

#NEXUS

BEGIN DATA;
	DIMENSIONS NTAX=38 NCHAR=337;
	FORMAT DATATYPE=DNA MISSING=? GAP=-;
MATRIX
Cyrtodactylus.annulatus.KU314944            aacaaaagaaataactccgttgcgcaataaccgaaatccccgggggaatttacgaggagactgaaaagtcagctaaacg-acaaaccaaagaacacgggcgtattccgccgaaaaagttatatcatgcctcctaataccacaaacccgcatacaggtggcacacacaacgtttagcagtcaagcacacacaacaacatacaaagatacc-gaagaaaagagactcctcagcaaaaggcatcggccctagtacatcacagcttggctcagacccctaccactttaccgtcaataacccaaaagcatccaaaccacaccaaaatacccccaaaataaatCyrtodactylus.philippinicus.KU304784        aaccaaacaaataactttgttgcgcaacaaccaaaatccccgagggaattcacgaggaaaccgaaaagtcagctaaaca-acagaccaaggaacatgagcatattccgtcgaaaaagcaatatcacgccacctaacactacgaacctgcctaaaggtggcacacacaaagtttagcagtcaagaatacacgacaatatacaacgatacc--aaaaagagagaatcctcgacaaacaacattggccccagtacatcacagcttagctccgaaccctaccactttaccgtcaacaaccaaaaagtatctaaacgacagcaaagtactccaaaactaaatGehyra.australis.AMS139934                  aactaaaaaaaaacctcagtcgcgcaacaaccaaaacccccgaggaaattcgcgcgggaattgaaaagtcagctgatttaaaaaaaaa-agaaaacgg

In [49]:
# Keep everything before the first "." in the path and append ".nex".
# NOTE(review): assumes no "." appears in any directory name or earlier in the
# file name; otherwise the path is cut short at that dot.
new_nex = file.split(".")[0] + ".nex"

In [None]:
# WRITE THE NEXUS TEXT TO DISK
# Uses new_nex and full_nex built in the cells above. A bare `return` is not
# valid outside a function, so the confirmation is printed instead.
with open(new_nex, "w") as f:
    f.write(full_nex)
print("File written to disk for: "+str(new_nex.split("/")[-1]))