In [18]:
# %load LuiseUtil.py
import os
import shutil
import math
from ete3 import Tree
from Bio import Phylo
import matplotlib
import matplotlib.pyplot as plt
from codecs import decode
import struct


tree_dir = '../data/trees/'
alignment_dir = '../data/language_alignments/'
sitelh_dir = '../data/siteLH/'
drawings_dir = '../output/drawings/'
weight_calibration_dir = '../data/weight_calibration/'
site_congruence_dir = '../data/site_congruence/'
lh_dir = '../data/lh/'

#tree_space_name = 'space.trees'
geo_tree_name = "geo_science.tree"
cognate_tree_name = "cognate_ie_compatible.tree"
cognate_ml_tree_name = "cognate_ml.tree"

morpho_alignment_name = "morpho.phy"



def read_trees_from_ete(tree_set_names):
    trees = []
    for tree_set in tree_set_names:
        l_file = open(tree_dir + tree_set, 'r')
        lines = l_file.readlines()
        for line in lines:
            trees.append(Tree(line))
    return trees

#def read_tree_space_ete():
#    return read_trees_from_ete([tree_space_name])


def eliminate_topological_duplicates_ete(tree_set_name):
    unique_list = []
    tree_set_ete = read_trees_from_ete([tree_set_name])
    i = 0
    for t1 in tree_set_ete:
        print("tree " + str(i))
        unique = True
        for t2 in unique_list:
            rf = rf_distance_ete(t1, t2)
            if rf == 0:
                unique = False
                break
        if unique:
            unique_list.append(t1)
            print(str(len(unique_list)) + " unique trees")
        i = i + 1
    file_name = tree_dir + rm_end(tree_set_name) + '_unique.trees'
    with open(file_name, 'w+') as tree_file:
        for tree in unique_list:
            tree_file.write(tree.write()+"\n")



#def create_tree_space_from(tree_set_names):
#    tree_space_ete = read_trees_from_ete(tree_set_names)
#    file_name = tree_dir + tree_space_name
#    with open(file_name, 'w+') as tree_file:
#        for tree in tree_space_ete:
#            tree_file.write(tree.write()+"\n")
#    print(str(len(tree_space_ete)) + " trees written to " + file_name)

def read_geo_tree_ete():
    return Tree(tree_dir + geo_tree_name)

def read_cognate_tree_ete():
    return Tree(tree_dir + cognate_tree_name)

def rf_distance_ete(t1, t2):
    rf, max_rf, common_leaves, parts_t1, parts_t2,discard_t1, discart_t2 = t1.robinson_foulds(t2, unrooted_trees = True)
    if max_rf == 0:
        print("?!")
        return 0
    return rf/max_rf

def rf_distances_ete(ref_tree, tree_set):
    distances = []
    for tree in tree_set:
        distances.append(rf_distance_ete(ref_tree, tree))
    return distances


def calculate_rf_distances_raxml(ref_tree_name, tree_set_names):
    shutil.rmtree("temp/", ignore_errors=True)
    os.mkdir("temp/")
    dir_string = tree_dir + ref_tree_name
    for tree_set in tree_set_names:
        dir_string = dir_string + tree_dir + tree_set
    os.system("cat " + dir_string + " > temp/all.trees")
    os.system("./../raxml-ng/build/bin/raxml-ng --rfdist --tree temp/all.trees --prefix temp/foo > temp/bar.txt")
    l_file = open('temp/foo.raxml.rfDistances', 'r')
    lines = l_file.readlines()
    i = 0
    line = lines[i].split("\t")
    distances = []
    while(line[0] == '0'):
        distances.append(float(line[3]))
        i+=1
        line = lines[i].split("\t")
    shutil.rmtree("temp/", ignore_errors=True)
    return distances


def evaluate_lh_raxml(tree_name, alignment_name, optimize = True):
    optimize_string = ""
    if not optimize:
        optimize_string = " --opt-branches off "
    os.system('./../raxml-ng/build/bin/raxml-ng --evaluate --msa ' + alignment_dir + alignment_name +
            ' --threads 2 --model BIN+G --tree '  + tree_dir + tree_name +  ' --prefix foo --nofiles' +
              optimize_string + '> out.txt')
    l_file = open('out.txt', 'r')
    lines = l_file.readlines()
    lh = 0
    for line in lines:
        if(line.startswith('Final LogLikelihood:')):
            lh = float(line.split(" ")[2].strip())
    os.remove("out.txt")
    return lh




def calculate_site_lh_raxml_ete(tree_ete, alignment_name, optimize= True):
    shutil.rmtree("temp/", ignore_errors=True)
    os.mkdir("temp/")
    tree_ete.write(outfile="temp/foo.tree")
    optimize_string = ""
    if not optimize:
        optimize_string = " --opt-branches off "
    os.system('./../raxml-ng/build/bin/raxml-ng --sitelh --msa ' + alignment_dir + alignment_name +
            ' --threads 2 --model BIN+G --tree temp/foo.tree --prefix temp/foo ' +
              optimize_string + '> temp/bar.txt')
    l_file = open('temp/foo.raxml.log', 'r')
    lines = l_file.readlines()
    lh = 0
    for line in lines:
        if(line.startswith('Final LogLikelihood:')):
            lh = float(line.split(" ")[2].strip())
    with open('temp/foo.raxml.siteLH' , 'r') as file:
        data = (file.read().replace('\n', '')).split(" ")
    #siteLH = [float(data[i]) for i in range(5, len(data))]
    siteLH = [data[i] for i in range(5, len(data))]
    shutil.rmtree("temp/", ignore_errors=True)
    return [lh, siteLH]




def print_tree_with_phylo(tree_name, save = False):
    tree = Phylo.read(tree_dir + tree_name, "newick")
    tree.ladderize()
    fig = plt.figure(figsize=(10, 10), dpi=100)
    axes = fig.add_subplot(1, 1, 1)
    axes.set_title(tree_name)
    Phylo.draw(tree, axes=axes, do_show=False)
    if save:
        plt.savefig(drawings_dir + tree_name + '.png', dpi=fig.dpi)


def fix_beast_output(tree_set_name):
    beast_file = open(tree_dir + tree_set_name, 'r')
    lines = beast_file.readlines()
    i = 0
    while not lines[i].startswith("\tTranslate"):
        i = i+1
    translate = []
    while not lines[i].startswith(";"):
        if lines[i].endswith(",\n"):
            translate.append(lines[i].split(" ")[-1][:-2])
        else:
            translate.append(lines[i].split(" ")[-1][:-1])
        i=i+1
    i = i+1
    with open(tree_dir + rm_end(tree_set_name) + "_fixed.trees" , 'w+') as fixed_file:
        for j in range(i, len(lines)-1):
            tree = Tree(lines[j].split(" ")[-1])
            for leaf in tree.iter_leaves():
                leaf.name = translate[int(leaf.name)]
            fixed_file.write(tree.write() + "\n")

def rm_end(file_name):
    return '.'.join(file_name.split('.') [:-1])

def lh_file_name(tree_set_name, alignment_name, optimize):
    optimize_string = "_opt-branches="
    if optimize:
        optimize_string = optimize_string + "on"
    else:
        optimize_string = optimize_string + "off"
    return lh_dir + rm_end(alignment_name) + '_' + rm_end(tree_set_name)  + optimize_string + '.lh'

def lh_raw_file_name(tree_set_name, alignment_name, optimize):
    optimize_string = "_opt-branches="
    if optimize:
        optimize_string = optimize_string + "on"
    else:
        optimize_string = optimize_string + "off"
    return lh_dir + rm_end(alignment_name) + '_' + rm_end(tree_set_name)  + optimize_string + '_raw.lh'


def site_lh_file_name(tree_name, alignment_name, optimize):
    optimize_string = "_opt-branches="
    if optimize:
        optimize_string = optimize_string + "on"
    else:
        optimize_string = optimize_string + "off"
    return sitelh_dir + rm_end(alignment_name) + '_' + rm_end(tree_name)  + optimize_string + '.raxml.siteLH'

def site_lh_raw_file_name(tree_name, alignment_name, optimize):
    optimize_string = "_opt-branches="
    if optimize:
        optimize_string = optimize_string + "on"
    else:
        optimize_string = optimize_string + "off"
    return sitelh_dir + rm_end(alignment_name) + '_' + rm_end(tree_name)  + optimize_string + '_raw.raxml.siteLH'

def weight_calibration_file_name(tree_name, alignment_name):
    return weight_calibration_dir + rm_end(alignment_name) + '_' + rm_end(tree_name)  + '.raxml.weightCalibration'

def site_congruence_file_name(tree_name, alignment_name):
    return site_congruence_dir + rm_end(alignment_name) + '_' + rm_end(tree_name)  + '.raxml.siteCongruence'

def optimized_tree_file_name(tree_name, alignment_name):
    return tree_dir + rm_end(tree_name) + '_optimized_' + rm_end(alignment_name)  + '.tree'


def read_lhs(tree_set_name, alignment_name, optimize):
    lhs = []
    site_lhs = []
    with open(lh_file_name(tree_set_name, alignment_name, optimize) , 'r') as file:
        lines = file.read().split("\n")
    for line in lines[:-1]:
        line_data = line.split("\t")
        lhs.append(float(line_data[0]))
        site_lhs.append([float(el) for el in line_data[1].split(" ")[:-1]])
    return (lhs, site_lhs)

def read_lhs_raw(tree_set_name, alignment_name, optimize):
    lhs = []
    site_lhs = []
    with open(lh_raw_file_name(tree_set_name, alignment_name, optimize) , 'r') as file:
        lines = file.read().split("\n")
    for line in lines[:-1]:
        line_data = line.split("\t")
        lhs.append(float(line_data[0]))
        site_lhs.append([bin_to_float(el) for el in line_data[1].split(" ")[:-1]])
    return (lhs, site_lhs)

def read_site_lh(tree_name, alignment_name, optimize):
    with open(site_lh_file_name(tree_name, alignment_name, optimize) , 'r') as file:
        data = (file.read().replace('\n', '')).split(" ")
    return [float(data[i]) for i in range(5, len(data))]

def read_site_lh_raw(tree_name, alignment_name, optimize):
    with open(site_lh_raw_file_name(tree_name, alignment_name, optimize) , 'r') as file:
        data = (file.read().replace('\n', '')).split(" ")
    return [bin_to_float(data[i]) for i in range(5, len(data))]

def read_weight_calibration(tree_name, alignment_name):
    with open(weight_calibration_file_name(tree_name, alignment_name) , 'r') as file:
        data = file.read().split(" ")
    return [int(data[i]) for i in range(len(data) - 1)]

def read_site_congruence(tree_name, alignment_name):
    with open(site_congruence_file_name(tree_name, alignment_name) , 'r') as file:
        data = file.read().split("\n")
    return [float(data[i].split(" ")[1]) for i in range(len(data) - 1)]

def read_optimized_tree(tree_name, alignment_name):
    return Tree(optimized_tree_file_name(tree_name, alignment_name))

def calculate_lhs_raxml(tree_set_name, alignment_name, optimize = False):
    tree_set = read_trees_from_ete([tree_set_name])
    results = [calculate_site_lh_raxml_ete(tree, alignment_name, optimize) for tree in tree_set]
    with open(lh_raw_file_name(tree_set_name, alignment_name, optimize), 'w+') as out_file:
        for result in results:
            out_file.write(str(result[0]))
            out_file.write("\t")
            for site_lh in result[1]:
                #out_file.write(str(site_lh))
                out_file.write(site_lh)
                out_file.write(" ")
            out_file.write("\n")

def calculate_site_lh_raxml(tree_name, alignment_name, optimize= True):
    shutil.rmtree("temp/", ignore_errors=True)
    os.mkdir("temp/")
    optimize_string = ""
    if not optimize:
        optimize_string = " --opt-branches off "
    os.system('./../raxml-ng/build/bin/raxml-ng --sitelh --msa ' + alignment_dir + alignment_name +
            ' --threads 2 --model BIN+G --tree '  + tree_dir + tree_name +  ' --prefix temp/foo ' +
              optimize_string + '> temp/bar.txt')
    l_file = open('temp/foo.raxml.log', 'r')
    lines = l_file.readlines()
    lh = 0
    for line in lines:
        if(line.startswith('Final LogLikelihood:')):
            lh = float(line.split(" ")[2].strip())
    #os.system("cat temp/foo.raxml.siteLH > " + site_lh_file_name(tree_name, alignment_name, optimize))
    os.system("cat temp/foo.raxml.siteLH > " + site_lh_raw_file_name(tree_name, alignment_name, optimize))
    with open('temp/foo.raxml.siteLH' , 'r') as file:
        data = (file.read().replace('\n', '')).split(" ")
    #siteLH = [float(data[i]) for i in range(5, len(data))]
    siteLH = [data[i] for i in range(5, len(data))]
    shutil.rmtree("temp/", ignore_errors=True)

def calculate_weight_calibration_raxml(tree_name, alignment_name):
    os.system('./../standard-RAxML-master/raxmlHPC -f u -p 12345 -t ' + tree_dir + tree_name +
              ' -m BINGAMMA -s ' + alignment_dir + alignment_name +
              ' -n calibration > bar.txt')
    os.system('cat RAxML_weights.calibration > '
              + weight_calibration_file_name(tree_name, alignment_name))
    os.remove('bar.txt')
    os.remove('RAxML_weights.calibration')
    os.remove('RAxML_info.calibration')

def calculate_site_congruence_raxml(tree_name, alignment_name):
    os.system('./../standard-RAxML-master/raxmlHPC-AVX -f S -t ' + tree_dir + tree_name +
              ' -m BINGAMMA -s ' + alignment_dir + alignment_name +
              ' -n congruence > bar.txt')
    os.system('cat RAxML_SiteSpecificPlacementBias.congruence > '
              + site_congruence_file_name(tree_name, alignment_name))
    os.remove('bar.txt')
    os.remove('RAxML_SiteSpecificPlacementBias.congruence')
    os.remove('RAxML_info.congruence')

def calculate_optimized_tree_raxml(tree_name, alignment_name):
    shutil.rmtree("temp/", ignore_errors=True)
    os.mkdir("temp/")
    os.system('./../raxml-ng/build/bin/raxml-ng --evaluate --msa ' + alignment_dir + alignment_name +
            ' --threads 2 --model BIN+G --tree '  + tree_dir + tree_name +  ' --prefix temp/foo ' + '> out.txt')
    os.system('cat temp/foo.raxml.bestTree > ' + tree_dir + rm_end(tree_name) + "_optimized_" + rm_end(alignment_name) + '.tree')
    shutil.rmtree("temp/", ignore_errors=True)


def get_site_lh(tree_name, alignment_name, optimize):
    if not os.path.isfile(site_lh_file_name(tree_name, alignment_name, optimize)):
        print("Currently not possible, use site_lh_raw")
        #calculate_site_lh_raxml(tree_name, alignment_name, optimize)
    return read_site_lh(tree_name, alignment_name, optimize)

def get_site_lh_raw(tree_name, alignment_name, optimize):
    if not os.path.isfile(site_lh_raw_file_name(tree_name, alignment_name, optimize)):
        calculate_site_lh_raxml(tree_name, alignment_name, optimize)
    return read_site_lh_raw(tree_name, alignment_name, optimize)


def get_weight_calibration(tree_name, alignment_name):
    if not os.path.isfile(weight_calibration_file_name(tree_name, alignment_name)):
        calculate_weight_calibration_raxml(tree_name, alignment_name)
    return read_weight_calibration(tree_name, alignment_name)

def get_site_congruence(tree_name, alignment_name):
    if not os.path.isfile(site_congruence_file_name(tree_name, alignment_name)):
        calculate_site_congruence_raxml(tree_name, alignment_name)
    return read_site_congruence(tree_name, alignment_name)

def get_optimized_tree(tree_name, alignment_name):
    if not os.path.isfile(optimized_tree_file_name(tree_name, alignment_name)):
        calculate_optimized_tree_raxml(tree_name, alignment_name)
    return read_optimized_tree(tree_name, alignment_name)

def get_double_optimized_tree(tree_name, alignment_name):
    if not os.path.isfile(optimized_tree_file_name(tree_name, alignment_name)):
        calculate_optimized_tree_raxml(tree_name, alignment_name)
    optimized_tree_name = optimized_tree_file_name(tree_name, alignment_name).split('/')[-1]
    if not os.path.isfile(optimized_tree_file_name(optimized_tree_name, alignment_name)):
        calculate_optimized_tree_raxml(optimized_tree_name, alignment_name)
    return read_optimized_tree(optimized_tree_name, alignment_name)


def get_optimized_tree(tree_name, alignment_name):
    if not os.path.isfile(optimized_tree_file_name(tree_name, alignment_name)):
        calculate_optimized_tree_raxml(tree_name, alignment_name)
    return read_optimized_tree(tree_name, alignment_name)


def get_lhs(tree_set_name, alignment_name, optimize):
    if not os.path.isfile(lh_file_name(tree_set_name, alignment_name, optimize)):
        #calculate_lhs_raxml(tree_set_name, alignment_name, optimize)
        print("Currently not possible, use lhs_raw")
    return read_lhs(tree_set_name, alignment_name, optimize)

def get_lhs_raw(tree_set_name, alignment_name, optimize):
    if not os.path.isfile(lh_raw_file_name(tree_set_name, alignment_name, optimize)):
        calculate_lhs_raxml(tree_set_name, alignment_name, optimize)
    return read_lhs_raw(tree_set_name, alignment_name, optimize)


def average_branch_length(tree_set):
    avg = 0
    cnt = 0
    for tree in tree_set:
        for node in tree.traverse():
            avg = avg + node.dist
            cnt = cnt + 1
    avg = avg / cnt
    return avg



def interval_branch_length(tree_set):
    lower = 1
    upper = 0
    for tree in tree_set:
        for node in tree.traverse():
            lower = min(lower, node.dist)
            upper = max(upper, node.dist)
    return (lower, upper)



def variance_branch_length(tree_set):
    avg = average_branch_length(tree_set)
    var = 0
    cnt = 0
    for tree in tree_set:
        for node in tree.traverse():
            diff = node.dist - avg
            var = var + (diff * diff)
            cnt = cnt + 1
    return var / cnt


def bin_to_float(b):
    return struct.unpack('>d', decode('%%0%dx' % (8 << 1) % int(b, 2), 'hex')[-8:])[0]







In [19]:
# %load iqtree_statstest_parser.py
import regex
import warnings

# define some regex stuff
blanks = r"\s+"  # matches >=1  subsequent whitespace characters
sign = r"[-+]?"  # contains either a '-' or a '+' symbol or none of both
# matches ints or floats of forms '1.105' or '1.105e-5' or '1.105e5' or '1.105e+5'
float_re = r"\d+(?:\.\d+)?(?:[e][-+]?\d+)?"

tree_id_re = r"\d+"  # tree ID is an int
llh_re = rf"{sign}{float_re}"  # likelihood is a signed floating point
deltaL_re = rf"{sign}{float_re}"  # deltaL is a signed floating point
# test result entry is of form '0.123 +'
test_result_re = rf"{float_re}{blanks}{sign}"

stat_test_name = r"[a-zA-Z-]+"

# table header is of form:
# Tree      logL    deltaL  bp-RELL    p-KH     p-SH    p-WKH    p-WSH       c-ELW       p-AU
table_header = rf"Tree{blanks}logL{blanks}deltaL{blanks}(?:({stat_test_name})\s*)*"
table_header_re = regex.compile(table_header)

# a table entry in the .iqtree file looks for example like this:
# 5 -5708.931281 1.7785e-06  0.0051 -  0.498 +  0.987 +  0.498 +  0.987 +      0.05 +    0.453 +
table_entry = rf"({tree_id_re}){blanks}({llh_re}){blanks}({deltaL_re}){blanks}(?:({test_result_re})\s*)*"
table_entry_re = regex.compile(table_entry)

START_STRING = "USER TREES"
END_STRING = "TIME STAMP"


def get_relevant_section(input_file):
    """
    Returns the content of input_file between START_STRING and END_STRING.

    Args:
        input_file: Path to the iqtree test summary file.

    Returns:
        String containing the content between START_STRING and END_STRING.

    Raises:
        ValueError if the section between START_STRING and END_STRING is empty.
    """
    with open(input_file) as f:
        content = f.readlines()

    # now let's find the relevant lines
    # the relevant lines are only between the start and end string
    start = 0
    end = 0

    for i, line in enumerate(content):
        if START_STRING in line:
            start = i
        if END_STRING in line:
            end = i

    if start == end:
        raise ValueError(
            f"The section between START_STRING {START_STRING} and END_STRING {END_STRING} is empty. Please check the input file {input_file}."
        )

    return content[start:end]


def get_names_of_performed_tests(table_section):
    """
    Returns the names of the performed iqtree tests as stated in the table header.

    Args:
        table_section: String containing the iqtree test result table.

    Returns:
        A list of strings, each string is the name of a performed statistical test.

    Raises:
        ValueError if the section does not contain a table header matching the defined regex.
    """
    test_names = []

    for line in table_section:
        line = line.strip()
        m = regex.match(table_header_re, line)
        if m:
            # m captures 2 groups: the first is (Tree, logL, deltaL), the second are the tests
            test_names = m.captures(1)

    if not test_names:
        raise ValueError(
            "No line in the given section matches the regex. Compare the regex and the given section. Maybe the format has changed."
        )
    return test_names


def get_cleaned_table_entries(
    table_section):
    """
    Returns the content of the table in the given section.

    Args:
        table_section: String containing the iqtree test result table.

    Returns:
        A list of tuples, each containing the tree_id, llh, deltaL and a list of test results.

    Raises:
        ValueError if the section does not contain table entries matching the defined regex.
    """
    entries = []
    for line in table_section:
        line = line.strip()
        # match the line against the regex defined above for a table entry
        m = regex.match(table_entry_re, line)
        if m:
            # if a match was found: capture the results in variables
            tree_id, llh, deltaL, result_group = m.groups()
            # to capture all test results individually we have to explicitly unpack it
            test_results = m.captures(4)
            entry = (int(tree_id), float(llh), float(deltaL), test_results)
            entries.append(entry)

    if not entries:
        raise ValueError(
            "No line in the given section matches the regex. Compare the regex and the given section. Maybe the format has changed."
        )

    return entries


def _get_default_entry():
    return {
                "deltaL": 0,
                "plausible": 1,
                "tests": {
                    'bp-RELL': {
                        'score': 1,
                        'significant': True
                    },
                    'p-KH': {
                        'score': 1,
                        'significant': True
                    },
                    'p-SH': {
                        'score': 1,
                        'significant': True
                    },
                    'p-WKH': {
                        'score': 1,
                        'significant': True
                    },
                    'p-WSH': {
                        'score': 1,
                        'significant': True
                    },
                    'c-ELW': {
                        'score': 1,
                        'significant': True
                    },
                    'p-AU': {
                        'score': 1,
                        'significant': True
                    }
                }
            }


def get_iqtree_results(iqtree_file):
    """
    Returns a list of dicts, each dict contains the iqtree test results for the respective tree.

    Args:
        iqtree_file: Path to the iqtree test summary file.

    Returns:
        A list of dicts. Each dict contains the tree_id, llh, deltaL and all results of the performed
            iqtree tests.
    """
    section = get_relevant_section(iqtree_file)
    try:
        entries = get_cleaned_table_entries(section)
        test_names = get_names_of_performed_tests(section)
    except ValueError as e:
        warnings.warn(str(e))
        warnings.warn("Falling back to default case.")
        return [_get_default_entry()]

    results = []

    for tree_id, llh, deltaL, test_results in entries:
        assert len(test_names) == len(test_results)

        data = {}
        data["logL"] = llh
        data["tree_id"] = tree_id
        data["deltaL"] = deltaL
        data["tests"] = {}

        num_passed = 0

        for i, test in enumerate(test_names):
            test_result = test_results[i]
            score, significant = test_result.split(" ")
            score = score.strip()
            significant = significant.strip()
            data["tests"][test] = {}
            data["tests"][test]["score"] = float(score)
            data["tests"][test]["significant"] = True if significant == "+" else False

            if data["tests"][test]["significant"]:
                num_passed += 1

        data["plausible"] = num_passed == len(data["tests"].keys())

        results.append(data)
    return results


def get_iqtree_results_for_eval_tree_str(iqtree_results, eval_tree_str, clusters):
    # returns the results for this eval_tree_id as well as the cluster ID
    for i, cluster in enumerate(clusters):
        if eval_tree_str.strip() in cluster:
            return iqtree_results[i], i

    raise ValueError("This newick_string belongs to no cluster. newick_str: ", eval_tree_str[:10])
    
def get_clusters(iqtree_file):
    with open(iqtree_file) as f:
        lines = f.readlines()
        i = 0
        u_trees = []
        while(not lines[i].endswith("p-AU\n")):
            i = i + 1
        i = i + 2
        cnt = 0
        while(not lines[i].startswith("\n")):
            data = lines[i].split(" ")
            s = 0
            while(data[s] == ""):
                s = s +1
            if data[s + 2]== "=":
                #u_trees.append(int(float(data[s + 4])))
                ref = int(float(data[s + 4]))
                u_trees.append(u_trees[ref])
            else:
                #u_trees.append(int(float(data[s])))
                u_trees.append(cnt)
                cnt = cnt +1
            i = i + 1
        return u_trees


In [20]:
#print(get_clusters("/home/luise/master_thesis/output/iqtree/BIN.iqtree"))

In [24]:
import os
import shutil
from ete3 import Tree
import random
tree_dir = '../data/trees/'


def shuffle_trees(run_prefix, model, seed):
    random.seed(seed)
    tree_file = open('../data/trees/' + run_prefix + ".raxml.mlTrees", 'r')
    lines = tree_file.readlines()
    permutation = [i for i in range(len(lines))]
    random.shuffle(permutation)
    #inverse_permutaion = [0 for i in range(len(lines))]
    #for i in range(len(lines)):
    #    inverse_permutaion[permutation[i]] = i
    lines = [lines[permutation[i]] for i in range(len(lines))]  
    file_name = "temp/" + run_prefix + "_shuffled.raxml.mlTrees"
    shutil.rmtree("temp/", ignore_errors=True)
    os.mkdir("temp/")
    with open(file_name, 'w+') as tree_file2:
        tree_file2.write("\n".join(lines))
        
    os.system("./../iqtree-2.2.0-Linux/bin/iqtree2 -s ../data/language_alignments/morpho.phy  -pre temp/" + 
        run_prefix + " -m " + model + " -z "+ file_name + " -te ../data/trees/" +
        run_prefix + ".raxml.bestTree -n 0 -zb 10000 -zw -au -seed 0 -treediff > temp/bar.txt")
    res = get_iqtree_results("temp/" + run_prefix + ".iqtree")
    plausible_trees = []
    for r in res:
        if(r['plausible']):
            plausible_trees.append(Tree(lines[r['tree_id'] - 1]))
    shutil.rmtree("temp/", ignore_errors=True)
    return plausible_trees
        
    

def get_plausible_tree_set(run_prefix, model):
    shutil.rmtree("temp/", ignore_errors=True)
    os.mkdir("temp/")
    os.system("./../iqtree-2.2.0-Linux/bin/iqtree2 -s ../data/language_alignments/morpho.phy  -pre temp/" + 
        run_prefix + " -m " + model + " -z ../data/trees/" + run_prefix + ".raxml.mlTrees -te ../data/trees/" +
        run_prefix + ".raxml.bestTree -n 0 -zb 10000 -zw -au -seed 0 -treediff > temp/bar.txt")
    res = get_iqtree_results("temp/" + run_prefix + ".iqtree")
    p_count = 0
    plausible_trees = []
    for r in res:
        if(r['plausible']):
            plausible_trees.append(Tree(lines[r['tree_id'] - 1]))
    shutil.rmtree("temp/", ignore_errors=True)
    return plausible_trees
    
def get_plausible_tree_set_with_shuffle(run_prefix, model, num_permutations):
    plausible_trees = {}
    for i in range(num_permutations):
        curr_plausible_trees = shuffle_trees(run_prefix, model, i)
        for (i1 ,t1) in enumerate(curr_plausible_trees):
            found = False
            for (t2, count) in plausible_trees.items():
                if rf_distance_ete(t1, t2) == 0:
                    plausible_trees[t2] = plausible_trees[t2] + 1
                    found = True
                    break
            if not found:
                plausible_trees[t1] = 1
    plausible_trees_final = []
    tree_file = open('../data/trees/' + run_prefix + ".raxml.plausibleTrees", 'w+')
    for (t2, count) in plausible_trees.items():
        if count == num_permutations:
            plausible_trees_final.append(t2)
            tree_file.write(t2.write() + "\n")
    return plausible_trees_final
        
get_plausible_tree_set_with_shuffle("pars{100}rand{100}_bin", "GTR2+FO", 20)
get_plausible_tree_set_with_shuffle("pars{100}rand{100}", "GTR2+FO+G", 20)

[Tree node '' (0x7f6d9ae1e7a),
 Tree node '' (0x7f6d9ae1d72),
 Tree node '' (0x7f6d9ae1c5e),
 Tree node '' (0x7f6d9ae1f67),
 Tree node '' (0x7f6d9ae1e8f),
 Tree node '' (0x7f6d9ae1e65),
 Tree node '' (0x7f6d9ae1d75),
 Tree node '' (0x7f6d9af33a6),
 Tree node '' (0x7f6d9ae1f8e),
 Tree node '' (0x7f6d9ae1f6d),
 Tree node '' (0x7f6d9ae4fe2),
 Tree node '' (0x7f6d9ae1feb),
 Tree node '' (0x7f6d9ae4ca9)]

NewickError: Unexisting tree file or Malformed newick tree structure.
You may want to check other newick loading flags like 'format' or 'quoted_node_names'.