In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
from create_plots import import_dataframe, show_stats, show_plots, quality_stats

# Notebook-wide figure defaults, set in a single call.
# A dpi of e.g. 200 looks really fine, but rendering is slower.
plt.rcParams.update({
    'figure.figsize': [12, 8],
    'figure.dpi': 100,
})

Select the dataset name to evaluate:

In [2]:
# Result CSV files that can be evaluated by this notebook.
result_files = [
    "A_10_1_results.csv",
    "A_20_2_results.csv",
    "A_30_3_results.csv",
    "A_norandom_40_1_results.csv",
    "A_norandom_40_2_results.csv",
    "A_norandom_40_3_results.csv",
    "A_norandom_40_4_results.csv",
]

# Position of the file to evaluate; valid positions run from 0 to 6.
selected_index = 5
filename = result_files[selected_index]

In [3]:
# Announce which result file the rest of the notebook is evaluating.
banner = "This evaluation is for: " + filename
print(banner)

This evaluation is for: A_norandom_40_3_results.csv


In [4]:
# Suffix shared by every entry of `result_files`.
results_suffix = "_results.csv"

# This cell rebinds `filename` from the CSV name to the bare dataset name,
# which the later cells use as a prefix for their outputs. Derive the dataset
# name first and rebuild the CSV name from it, so that running this cell a
# second time still loads the same file instead of asking for
# "result_files/<dataset name>" without the suffix.
# NOTE(review): assumes `filename` follows the "<dataset>_results.csv"
# convention used by `result_files` -- confirm before selecting other files.
dataset_name = filename.split(results_suffix)[0]
df = import_dataframe("result_files/" + dataset_name + results_suffix)
filename = dataset_name

# List the available columns for reference.
print(df.columns)

Index(['name', 'n_taxa', 'n_trees', 'n_reticulations', 'msa_size',
       'sites_per_tree', 'sampling_type', 'simulation_type', 'celine_params',
       'brlen_scaler', 'seqgen_params', 'near_zero_branches_raxml',
       'n_equal_tree_pairs', 'true_network_weirdness', 'true_network_path',
       'fixed_reticulation_prob', 'runtime_raxml', 'inferred_network_path',
       'likelihood_type', 'brlen_linkage_type', 'start_type', 'timeout',
       'n_random_start_networks', 'n_parsimony_start_networks',
       'runtime_inference', 'use_partitioned_msa', 'n_reticulations_inferred',
       'bic_true', 'logl_true', 'bic_inferred', 'logl_inferred', 'bic_raxml',
       'logl_raxml', 'rf_absolute_raxml', 'rf_relative_raxml',
       'rf_absolute_inferred', 'rf_relative_inferred', 'msa_patterns',
       'unrooted_softwired_network_distance',
       'unrooted_hardwired_network_distance',
       'unrooted_displayed_trees_distance',
       'rooted_softwired_network_distance',
       'rooted_hardwired_ne

# Original Simulated Dataset Statistics 

In [None]:
# Dataset statistics are shown for the rows with likelihood_type == "AVERAGE".
df_average_rows = df.query('likelihood_type == "AVERAGE"')
show_stats(filename, df_average_rows)

Total number of datasets: 48


# Evaluation starting from the raxml-ng best tree

In [None]:
# Rows whose start_type is "FROM_RAXML".
df_start_from_raxml = df.query('start_type == "FROM_RAXML"')

if len(df_start_from_raxml) == 0:
    print("No data found for StartType.FROM_RAXML")
else:
    # Columns summarised in the LaTeX tables printed below.
    summary_columns = [
        'n_reticulations_inferred',
        'bic_diff',
        'bic_diff_relative',
        'aic_diff',
        'aic_diff_relative',
        'aicc_diff',
        'aicc_diff_relative',
        'logl_diff',
        'logl_diff_relative',
        'unrooted_softwired_network_distance',
        'unrooted_hardwired_network_distance',
        'unrooted_displayed_trees_distance',
        'rooted_softwired_network_distance',
        'rooted_hardwired_network_distance',
        'rooted_displayed_trees_distance',
        'rooted_tripartition_distance',
        'rooted_path_multiplicity_distance',
        'rooted_nested_labels_distance',
        'runtime_raxml',
        'runtime_inference',
    ]
    percentile_cuts = [0.1, 0.25, 0.5, 0.75, 0.9]

    # Summary table (one row per column) for the AVERAGE rows, as LaTeX.
    df_less_cols_average = df_start_from_raxml.query('likelihood_type == "AVERAGE"')[summary_columns]
    average_summary = df_less_cols_average.describe(percentiles=percentile_cuts).transpose()
    print(average_summary.to_latex())

    # Same table for the BEST rows.
    df_less_cols_best = df_start_from_raxml.query('likelihood_type == "BEST"')[summary_columns]
    best_summary = df_less_cols_best.describe(percentiles=percentile_cuts).transpose()
    print(best_summary.to_latex())

    quality_stats(filename + "_norandom", df_start_from_raxml)

    # One set of plots per likelihood type; the frame is filtered right
    # before each call, i.e. after quality_stats has run.
    for plot_suffix, likelihood_filter in (
        ("_average_norandom", 'likelihood_type == "AVERAGE"'),
        ("_best_norandom", 'likelihood_type == "BEST"'),
    ):
        show_plots(filename + plot_suffix, df_start_from_raxml.query(likelihood_filter))

# Evaluation starting from 3 random + 3 parsimony trees

In [None]:
# Rows whose start_type is "RANDOM".
df_random = df.query('start_type == "RANDOM"')

if len(df_random) == 0:
    print("No data found for StartType.RANDOM")
else:
    # Columns summarised in the LaTeX tables printed below.
    summary_columns = [
        'n_reticulations_inferred',
        'bic_diff',
        'bic_diff_relative',
        'aic_diff',
        'aic_diff_relative',
        'aicc_diff',
        'aicc_diff_relative',
        'logl_diff',
        'logl_diff_relative',
        'unrooted_softwired_network_distance',
        'unrooted_hardwired_network_distance',
        'unrooted_displayed_trees_distance',
        'rooted_softwired_network_distance',
        'rooted_hardwired_network_distance',
        'rooted_displayed_trees_distance',
        'rooted_tripartition_distance',
        'rooted_path_multiplicity_distance',
        'rooted_nested_labels_distance',
        'runtime_inference',
    ]
    percentile_cuts = [0.1, 0.25, 0.5, 0.75, 0.9]

    # Summary table (one row per column) for the AVERAGE rows, as LaTeX.
    df_less_cols_average = df_random.query('likelihood_type == "AVERAGE"')[summary_columns]
    average_summary = df_less_cols_average.describe(percentiles=percentile_cuts).transpose()
    print(average_summary.to_latex())

    # Same table for the BEST rows.
    df_less_cols_best = df_random.query('likelihood_type == "BEST"')[summary_columns]
    best_summary = df_less_cols_best.describe(percentiles=percentile_cuts).transpose()
    print(best_summary.to_latex())

    quality_stats(filename + "_random", df_random)

    # One set of plots per likelihood type; the frame is filtered right
    # before each call, i.e. after quality_stats has run.
    for plot_suffix, likelihood_filter in (
        ("_average_random", 'likelihood_type == "AVERAGE"'),
        ("_best_random", 'likelihood_type == "BEST"'),
    ):
        show_plots(filename + plot_suffix, df_random.query(likelihood_filter))