In [1]:
#Analysis comparing the results for morphological data in BIN and MULTI representation
#Evaluate the likelihood (LH) of the tree found with the BIN alignment under the MULTI alignment and model,
#and the other way around

#Distances of best trees / consensus trees

#Correlations

In [2]:
import os
import pandas as pd
import matplotlib.pyplot as plt
from plotly import graph_objects as go
from plotly.subplots import make_subplots
import plotly.express as px
from ete3 import Tree


In [10]:
def rf_distance(t1, t2):
    """Return the normalized Robinson-Foulds distance between two trees.

    Calls ``t1.robinson_foulds(t2, unrooted_trees=True)`` and divides the
    reported distance by the reported maximum.

    Returns 0 (after printing "?!") when the reported maximum is 0, so the
    division is never attempted in that case.
    """
    comparison = t1.robinson_foulds(t2, unrooted_trees=True)
    # Only the first two entries (distance, maximum distance) are needed.
    distance, max_distance = comparison[0], comparison[1]
    if max_distance != 0:
        return distance / max_distance
    print("?!")
    return 0

def read_consensus_trees(model):
    """Load the majority-rule consensus tree of every dataset for one model.

    Parameters
    ----------
    model : str
        One of "BIN", "GTR" or "MK"; selects the result directory to scan.

    Returns
    -------
    dict
        Maps ``<dataset>.phy`` (sub-directory name up to the first ".",
        plus ".phy") to the tree read from
        ``consense.raxml.consensusTreeMR`` in that sub-directory. When the
        file is missing, a message is printed and an empty ``Tree()`` is
        stored instead.

    Raises
    ------
    ValueError
        If ``model`` is not one of the supported names. Previously the
        function printed a message and then failed with an
        UnboundLocalError on the undefined directory variable.
    """
    model_dirs = {
        "BIN": "morph_parquets/bin/",
        "GTR": "morph_parquets/multi/GTR/",
        "MK": "morph_parquets/multi/MK/",
    }
    if model not in model_dirs:
        raise ValueError(str(model) + " does not exist!")
    d = model_dirs[model]

    consensus_trees = {}
    with os.scandir(d) as it:
        for entry in it:
            if not entry.is_dir():
                continue
            tree_path = os.path.join(d, entry.name, "consense.raxml.consensusTreeMR")
            if not os.path.exists(tree_path):
                print("No consensus tree for " + model + " and " + entry.name)
                # Placeholder so that every scanned dataset has an entry.
                tree = Tree()
            else:
                tree = Tree(tree_path)
            name = entry.name.split(".")[0] + ".phy"
            consensus_trees[name] = tree
    return consensus_trees

def rfd_consensus_trees(model1, model2):
    """Compute the normalized RF distance between the consensus trees of two models.

    Parameters
    ----------
    model1, model2 : str
        Model names accepted by ``read_consensus_trees``.

    Returns
    -------
    dict
        Maps each dataset name found for ``model1`` to the RF distance
        between its consensus trees under both models, or to NaN (with a
        printed message) when ``model2`` has no entry for that dataset.
    """
    consensus_trees1 = read_consensus_trees(model1)
    consensus_trees2 = read_consensus_trees(model2)
    rfd_dict = {}
    # The loop variable was previously called `tree` while the body used the
    # undefined name `tree1`, which raised a NameError.
    for name, tree1 in consensus_trees1.items():
        if name in consensus_trees2:
            tree2 = consensus_trees2[name]
            rfd_dict[name] = rf_distance(tree1, tree2)
        else:
            rfd_dict[name] = float("nan")
            print("No consensus rf distance for " + name)
    return rfd_dict
    
    
def rfd_eval_trees(morph_data_multistate, morph_data_binarized):
    """Compute the RF distance between the MULTI and BIN eval trees of each dataset.

    For every row of ``morph_data_multistate`` the first row of
    ``morph_data_binarized`` whose ``BIN_verbose_name`` equals
    ``<stem>.BIN.phy`` is looked up, where ``<stem>`` is the row's
    ``MULTI_verbose_name`` up to the first ".".

    Returns a dict mapping ``<stem>`` to the normalized RF distance between
    the trees parsed from ``MULTI_newick_eval`` and ``BIN_newick_eval``.
    """
    distances = {}
    for _, multi_row in morph_data_multistate.iterrows():
        multi_tree = Tree(multi_row["MULTI_newick_eval"])
        stem = multi_row['MULTI_verbose_name'].split('.')[0]
        is_partner = morph_data_binarized['BIN_verbose_name'] == stem + ".BIN.phy"
        partner_rows = morph_data_binarized.loc[is_partner]
        # The first matching row is used; there is no handling for zero matches.
        bin_tree = Tree(partner_rows.iloc[0]["BIN_newick_eval"])
        distances[stem] = rf_distance(multi_tree, bin_tree)
    return distances

def plot_rfds(rfd_dict):
    """Show two histograms of the RF distances in ``rfd_dict``.

    The values of the dict are plotted first with 100 bins and then with
    30 bins; each figure is shown immediately.
    """
    rfds = list(rfd_dict.values())
    for bin_count in (100, 30):
        plt.xlabel("RF Distance")
        plt.ylabel("Number of datasets")
        plt.hist(rfds, bins=bin_count)
        plt.show()

In [24]:
#Original model: Under this model a best tree was calculated with 100 tree searches
#Cross model: Now determine the likelihood of the best tree under this model

def cross_data_csv():
    """Merge the three cross-evaluation CSVs into ``temp/all_lhs.csv``.

    Reads ``temp/lhs_GTR.csv``, ``temp/lhs_MK.csv`` and
    ``temp/lhs_mk_gtr.csv`` (first line of each is skipped as a header) and
    writes one row ``name,original_model,cross_model,lh`` per input row,
    where ``lh`` is the fourth input field.

    * BIN-vs-multistate files: if the first field ends with ``.BIN.phy``
      the tree comes from the multistate model and is evaluated under BIN
      (name = second field); otherwise the tree comes from BIN and is
      evaluated under the multistate model (name = first field).
    * ``lhs_mk_gtr.csv``: if the third field ends with ``GTR`` the tree
      comes from MK and is evaluated under GTR, otherwise the reverse
      (name = first field in both cases).

    All files are closed deterministically. Blank lines are skipped and the
    last data row is kept even when an input file lacks a trailing newline
    (the former ``[1:-1]`` slice dropped it).
    """

    def _bin_versus(multi_model):
        # Build the row classifier for a BIN <-> multi_model file.
        def classify(data):
            if data[0].endswith(".BIN.phy"):
                return data[1], multi_model, "BIN"
            return data[0], "BIN", multi_model
        return classify

    def _mk_versus_gtr(data):
        # Classify a row of the MK <-> GTR file.
        if data[2].endswith("GTR"):
            return data[0], "MK", "GTR"
        return data[0], "GTR", "MK"

    sources = [
        ("temp/lhs_GTR.csv", _bin_versus("GTR")),
        ("temp/lhs_MK.csv", _bin_versus("MK")),
        ("temp/lhs_mk_gtr.csv", _mk_versus_gtr),
    ]

    with open("temp/all_lhs.csv", "w") as out_file:
        out_file.write("name,original_model,cross_model,lh\n")
        for path, classify in sources:
            with open(path, "r") as in_file:
                next(in_file, None)  # skip the header line
                for raw_line in in_file:
                    line = raw_line.rstrip("\n")
                    if not line:
                        continue
                    data = line.split(",")
                    name, original_model, cross_model = classify(data)
                    out_file.write(name + "," + original_model + "," + cross_model + "," + data[3] + "\n")
        
# Regenerate temp/all_lhs.csv, which add_cross_data reads.
cross_data_csv()

In [27]:
#The column cross_lh_<tree_model> contains the likelihood of the best tree found under tree_model, evaluated under eval_model
def add_cross_data(df, eval_model, tree_model):
    """Attach cross-evaluation likelihoods from ``temp/all_lhs.csv`` to ``df``.

    Rows of the CSV with ``original_model == tree_model`` and
    ``cross_model == eval_model`` are matched to ``df`` via
    ``verbose_name``. Two columns are added to ``df`` in place:

    * ``cross_lh_<tree_model>``   - the likelihood from the CSV
    * ``cross_diff_<tree_model>`` - that likelihood minus the row's ``llh_eval``

    Datasets without a CSV entry get NaN in both columns and a printed
    message. The (mutated) ``df`` is returned.
    """
    all_lhs = pd.read_csv("temp/all_lhs.csv")
    from_tree_model = all_lhs[all_lhs["original_model"] == tree_model]
    selected = from_tree_model[from_tree_model["cross_model"] == eval_model]
    lh_by_name = {entry["name"]: float(entry["lh"]) for _, entry in selected.iterrows()}

    missing = float("nan")
    cross_lhs, diffs = [], []
    for _, record in df.iterrows():
        dataset = record['verbose_name']
        if dataset not in lh_by_name:
            print("For " + dataset + " no cross evaluation with original model " + tree_model + " and cross model " +eval_model)
            cross_lhs.append(missing)
            diffs.append(missing)
            continue
        cross_lh = lh_by_name[dataset]
        cross_lhs.append(cross_lh)
        diffs.append(cross_lh - record["llh_eval"])

    df["cross_lh_" + tree_model] = cross_lhs
    df["cross_diff_" + tree_model] = diffs
    return df


def merge_dfs(morph_data_bin, morph_data_gtr, morph_data_mk):
    """Inner-join the BIN, GTR and MK result frames on ``verbose_name``.

    Every column of each input frame is prefixed with its model name
    (``BIN_``, ``GTR_``, ``MK_``); this relabelling is applied to the
    passed-in frames in place. The prefixed name column is then renamed
    back to ``verbose_name`` on a copy and used as the join key.

    Rows in which any of the six ``*_cross_lh_*`` columns equals 1 are
    dropped from the merged frame before it is returned.
    """
    # Relabel the caller's frames in place.
    for prefix, frame in (("BIN_", morph_data_bin), ("GTR_", morph_data_gtr), ("MK_", morph_data_mk)):
        frame.columns = prefix + frame.columns.values

    bin_part = morph_data_bin.rename(columns={'BIN_verbose_name': 'verbose_name'})
    gtr_part = morph_data_gtr.rename(columns={'GTR_verbose_name': 'verbose_name'})
    mk_part = morph_data_mk.rename(columns={'MK_verbose_name': 'verbose_name'})

    merged = pd.merge(bin_part, gtr_part, on='verbose_name', how='inner')
    merged = pd.merge(merged, mk_part, on='verbose_name', how='inner')

    cross_lh_columns = [
        "BIN_cross_lh_GTR", "BIN_cross_lh_MK",
        "GTR_cross_lh_BIN", "GTR_cross_lh_MK",
        "MK_cross_lh_BIN", "MK_cross_lh_GTR",
    ]
    keep = merged[cross_lh_columns[0]] != 1
    for column_name in cross_lh_columns[1:]:
        keep = keep & (merged[column_name] != 1)
    return merged[keep]


def add_rf_data(df):
    """Add pairwise RF-distance columns between the BIN, GTR and MK trees.

    For every row, the consensus trees (looked up by ``verbose_name`` in
    the dicts returned by ``read_consensus_trees``) and the eval trees
    (parsed from ``<MODEL>_newick_eval``) are compared for the model pairs
    bin/gtr, bin/mk and gtr/mk. The results are stored in the columns
    ``consensus_dist_<a>_<b>`` and ``eval_dist_<a>_<b>``, which are added
    to ``df`` in place; ``df`` is returned.
    """
    models = ("bin", "gtr", "mk")
    model_pairs = (("bin", "gtr"), ("bin", "mk"), ("gtr", "mk"))

    consensus_by_model = {m: read_consensus_trees(m.upper()) for m in models}
    # Insertion order fixes the order in which the columns are appended to df.
    distance_columns = {
        kind + "_dist_" + first + "_" + second: []
        for kind in ("consensus", "eval")
        for first, second in model_pairs
    }

    for _, record in df.iterrows():
        dataset = record["verbose_name"]
        consensus_trees = {m: consensus_by_model[m][dataset] for m in models}
        eval_trees = {m: Tree(record[m.upper() + "_newick_eval"]) for m in models}
        for kind, trees in (("consensus", consensus_trees), ("eval", eval_trees)):
            for first, second in model_pairs:
                distance_columns[kind + "_dist_" + first + "_" + second].append(
                    rf_distance(trees[first], trees[second])
                )

    for column_name, values in distance_columns.items():
        df[column_name] = values

    return df
    
# Load the per-model result tables; the MK table is restricted to its
# "multistate" group.
morph_data_gtr = pd.read_parquet("training_data/morph_data_multistate.parquet")
morph_data_mk = pd.read_parquet("training_data/morph_data_with_tree_characteristics_mk_model.parquet")
morph_data_mk = morph_data_mk.groupby(morph_data_mk.state_type).get_group("multistate")
morph_data_bin = pd.read_parquet("training_data/morph_data_binarized.parquet")

# Rewrite the BIN dataset names to "<stem>.phy" (stem = text before the first ".").
names = [verbose_name.split('.')[0] + '.phy' for verbose_name in morph_data_bin['verbose_name']]
morph_data_bin['verbose_name'] = names

# Attach the cross-evaluation likelihoods: each table gets the trees of the
# two other models evaluated under its own model.
morph_data_gtr = add_cross_data(add_cross_data(morph_data_gtr, "GTR", "BIN"), "GTR", "MK")
morph_data_mk = add_cross_data(add_cross_data(morph_data_mk, "MK", "BIN"), "MK", "GTR")
morph_data_bin = add_cross_data(add_cross_data(morph_data_bin, "BIN", "GTR"), "BIN", "MK")

# Join everything into one frame and add the RF-distance columns.
df = add_rf_data(merge_dfs(morph_data_bin, morph_data_gtr, morph_data_mk))

For 20361_0.phy no cross evaluation with original model MK and cross model GTR
For 20361_0.phy no cross evaluation with original model MK and cross model BIN
No consensus tree for GTR and 12916_0.phy
No consensus tree for GTR and 10045_0.phy
No consensus tree for GTR and 170_0.phy
No consensus tree for GTR and 12880_0.phy
No consensus tree for GTR and 12895_0.phy
No consensus tree for GTR and 12893_0.phy
No consensus tree for GTR and 12931_0.phy
No consensus tree for GTR and 12527_0.phy
No consensus tree for GTR and 12936_0.phy
No consensus tree for GTR and 298_0.phy
No consensus tree for GTR and 12447_0.phy
No consensus tree for GTR and 13353_0.phy
No consensus tree for GTR and 12957_0.phy
No consensus tree for GTR and 13182_0.phy
No consensus tree for GTR and 12933_0.phy
No consensus tree for GTR and 12905_0.phy
No consensus tree for GTR and 12883_0.phy
No consensus tree for GTR and 10075_0.phy
No consensus tree for GTR and 12850_0.phy
No consensus tree for GTR and 11700_1.phy
No con

In [28]:

# Scatter the cross-evaluation differences of each model pair against each other.
for x_column, y_column in (
    ("BIN_cross_diff_GTR", "GTR_cross_diff_BIN"),
    ("BIN_cross_diff_MK", "MK_cross_diff_BIN"),
    ("MK_cross_diff_GTR", "GTR_cross_diff_MK"),
):
    fig = px.scatter(df, x=x_column, y=y_column)
    fig.show()

In [23]:
# Scatter each model's own eval likelihood against a cross-evaluated likelihood.
for x_column, y_column in (
    ("BIN_llh_eval", "BIN_cross_lh_GTR"),
    ("BIN_llh_eval", "BIN_cross_lh_MK"),
    ("MK_llh_eval", "MK_cross_lh_GTR"),
):
    fig = px.scatter(df, x=x_column, y=y_column)
    fig.show()

In [29]:
# GTR/MK only: cross likelihood versus its difference, then both differences.
for x_column, y_column in (
    ("GTR_cross_lh_MK", "GTR_cross_diff_MK"),
    ("MK_cross_lh_GTR", "MK_cross_diff_GTR"),
    ("GTR_cross_diff_MK", "MK_cross_diff_GTR"),
):
    fig = px.scatter(df, x=x_column, y=y_column)
    fig.show()

In [31]:
# Scatter the BIN_difficult column against the MK_difficult column.
difficulty_axes = {"x": "BIN_difficult", "y": "MK_difficult"}
fig = px.scatter(df, **difficulty_axes)
fig.show()

In [36]:
# For each model pair, scatter the consensus-tree RF distance against the
# eval-tree RF distance.
for model_pair in ("bin_gtr", "bin_mk", "gtr_mk"):
    fig = px.scatter(df, x="consensus_dist_" + model_pair, y="eval_dist_" + model_pair)
    fig.show()

In [38]:
from plotly import graph_objects as go
from plotly.subplots import make_subplots
# Overlay the histograms of the three consensus-tree RF distance columns
# in a single subplot.
consensus_distance_columns = [
    'consensus_dist_bin_gtr',
    'consensus_dist_bin_mk',
    'consensus_dist_gtr_mk',
]
fig = make_subplots(rows=1, cols=1, subplot_titles=["RF Distances between consensus trees"])
for distance_column in consensus_distance_columns:
    histogram = go.Histogram(x=df[distance_column], showlegend=True)
    fig.append_trace(histogram, row=1, col=1)
fig.update_layout(template="plotly_white", height=900)

In [39]:
from plotly import graph_objects as go
from plotly.subplots import make_subplots
# Overlay the histograms of three cross-evaluation difference columns
# (cross likelihood minus the model's own llh_eval) in a single subplot.
# The title used to be a copy of the consensus-distance cell's title, and the
# traces were unnamed, so the legend could not tell them apart.
cross_diff_columns = [
    'BIN_cross_diff_GTR',
    'BIN_cross_diff_MK',
    'GTR_cross_diff_MK',
]
fig = make_subplots(
    rows=1,
    cols=1,
    subplot_titles=["Cross-evaluation likelihood differences (cross_lh - llh_eval)"]
)
for diff_column in cross_diff_columns:
    fig.append_trace(
        go.Histogram(
            x=df[diff_column],
            name=diff_column,
            showlegend=True
        ),
        row=1,
        col=1
    )
fig.update_layout(template="plotly_white", height=900)

In [None]:
def split_df(df, threshold, absolute=True,
             diff_cols=("MULTI_cross_diff", "BIN_cross_diff"),
             name_col="MULTI_verbose_name"):
    """Split ``df`` into outlier rows and remaining rows by difference columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the columns in ``diff_cols`` and ``name_col``.
    threshold : number
        A row is an outlier when any difference column exceeds this value;
        it is "remaining" when all difference columns are <= this value.
    absolute : bool, default True
        Compare absolute values of the differences instead of signed values.
    diff_cols : sequence of str
        Difference columns to test. The defaults are the column names that
        used to be hard-coded in the body.
    name_col : str
        Column printed to identify each outlier row.

    Returns
    -------
    (outlier_df, remaining_df) : tuple of pandas.DataFrame
        Rows with a NaN difference satisfy neither comparison and therefore
        appear in neither frame.

    Prints the number of outliers, then for each outlier its name followed
    by the values of ``diff_cols`` in order, then the number of remaining rows.
    """
    diffs = df[list(diff_cols)]
    if absolute:
        diffs = diffs.abs()
    outlier_df = df[(diffs > threshold).any(axis=1)]
    remaining_df = df[(diffs <= threshold).all(axis=1)]

    print(str(len(outlier_df.index)) + " outliers")
    for _, row in outlier_df.iterrows():
        print(row[name_col])
        for diff_col in diff_cols:
            print(row[diff_col])
    print(str(len(remaining_df.index)) + " remaining")
    return (outlier_df, remaining_df)
# Split the datasets at an absolute difference of 50.
# NOTE(review): the df built by merge_dfs/add_cross_data above has columns such
# as BIN_cross_diff_GTR and verbose_name, not the BIN_cross_diff /
# MULTI_cross_diff / MULTI_verbose_name columns that split_df reads by
# default - confirm which frame this cell expects.
(outlier_df, remaining_df) = split_df(df, 50, True)

In [None]:

def plots_outlier(col):
    """Build a figure comparing one column between outliers and the rest.

    Two histograms of ``col`` are placed in the same subplot: one for the
    module-level ``outlier_df`` (named "Outlier") and one for the
    module-level ``remaining_df`` (named "Others"). The figure is returned
    without being shown.
    """
    fig = make_subplots(rows=1, cols=1, subplot_titles=["Outlier comparison"])
    for label, frame in (("Outlier", outlier_df), ("Others", remaining_df)):
        histogram = go.Histogram(x=frame[col], name=label, showlegend=True)
        fig.append_trace(histogram, row=1, col=1)
    return fig

# Column whose distribution is compared between outliers and the other datasets.
column = "MULTI_num_taxa"
outlier_fig = plots_outlier(column)
outlier_fig.update_layout(template="plotly_white", height=600)
#plots_outlier(column).write_image(column + ".png")