In [1]:
# Plot the distributions of columns in the training_data/**/*.parquet files

In [2]:
import pandas as pd
from plotly import graph_objects as go
from plotly.subplots import make_subplots
import math

In [3]:
def add_treeness(df):
    """Add a ``treeness`` column to ``df`` in place.

    For every row the value is
    ``total_internal_brlens / total_all_brlens``.

    The division is done once on whole columns rather than row by row
    through ``iterrows``. A zero ``total_all_brlens`` therefore follows
    pandas float-division semantics (``inf`` or ``NaN``).

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``total_internal_brlens`` and
        ``total_all_brlens``. Modified in place.

    Returns
    -------
    None
    """
    df["treeness"] = df["total_internal_brlens"] / df["total_all_brlens"]
    
def add_pattern_proportion(df):
    """Add a ``pattern_proportion`` column to ``df`` in place.

    For every row the value is ``num_patterns / num_sites``.

    The division is done once on whole columns rather than row by row
    through ``iterrows``. A zero ``num_sites`` therefore follows pandas
    float-division semantics (``inf`` or ``NaN``).

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``num_patterns`` and ``num_sites``.
        Modified in place.

    Returns
    -------
    None
    """
    df["pattern_proportion"] = df["num_patterns"] / df["num_sites"]
    
def add_pattern_entropy(df):
    """Add a ``pattern_entropy`` column to ``df`` in place.

    For every row the value is
    ``-1 * (bollback + (num_sites + log(num_sites)))`` with the natural
    logarithm.

    The arithmetic is done on whole columns rather than row by row
    through ``iterrows``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``num_sites`` and ``bollback``.
        Modified in place.

    Returns
    -------
    None

    Raises
    ------
    ValueError
        Raised by ``math.log`` if any ``num_sites`` value is not positive.
    """
    num_sites = df["num_sites"]
    # math.log is mapped element-wise so the result matches the scalar
    # computation exactly and no additional import is required.
    log_num_sites = num_sites.map(math.log)
    # NOTE(review): the inner term adds num_sites and log(num_sites); confirm
    # this is intended and not num_sites * log(num_sites). Kept unchanged here.
    df["pattern_entropy"] = -1 * (df["bollback"] + (num_sites + log_num_sites))
    
def add_brlen_ratio(df):
    """Add a ``brlen_ratio`` column to ``df`` in place.

    For every row the value is
    ``median_external_brlens / median_internal_brlens``.

    The division is done once on whole columns rather than row by row
    through ``iterrows``. A zero ``median_internal_brlens`` therefore
    follows pandas float-division semantics (``inf`` or ``NaN``).

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``median_external_brlens`` and
        ``median_internal_brlens``. Modified in place.

    Returns
    -------
    None
    """
    df["brlen_ratio"] = df["median_external_brlens"] / df["median_internal_brlens"]
    
def add_additional_columns(df):
    """Add every derived column to ``df`` in place.

    Applies, in this order, ``add_treeness``, ``add_pattern_proportion``,
    ``add_pattern_entropy`` and ``add_brlen_ratio`` to the same frame.
    Nothing is returned.
    """
    column_adders = (
        add_treeness,
        add_pattern_proportion,
        add_pattern_entropy,
        add_brlen_ratio,
    )
    for add_column in column_adders:
        add_column(df)

In [4]:
# Load each training-data table and immediately add the derived columns
# (treeness, pattern_proportion, pattern_entropy, brlen_ratio) to it in place.
morph_data = pd.read_parquet("training_data/morph/full_GTR.parquet")
add_additional_columns(morph_data)

morph_data_mk = pd.read_parquet("training_data/morph/full_MK.parquet")
add_additional_columns(morph_data_mk)

morph_data_binary = pd.read_parquet("training_data/morph/binary.parquet")
add_additional_columns(morph_data_binary)

morph_data_multistate = pd.read_parquet("training_data/morph/MULTI_GTR.parquet")
add_additional_columns(morph_data_multistate)

morph_data_binarized = pd.read_parquet("training_data/morph/binarized.parquet")
add_additional_columns(morph_data_binarized)

lang_data = pd.read_parquet("training_data/lang/binary.parquet")
add_additional_columns(lang_data)

# Row-wise stack of the binary and binarized tables. Both inputs were
# augmented above, so the combined frame carries the derived columns too.
morph_data_bin_full = pd.concat([morph_data_binary, morph_data_binarized])

In [5]:
#for col_name in morph_data.columns: 
#    print(col_name)

In [5]:
# Split the MK table by its `state_type` column into the "binary" and
# "multistate" subsets; get_group raises KeyError if a label is absent.
grouped = morph_data_mk.groupby(morph_data_mk["state_type"])
morph_data_mk_binary, morph_data_mk_multistate = (
    grouped.get_group(state_type) for state_type in ("binary", "multistate")
)

In [6]:
# Label every language row with the second-to-last dot-separated token of its
# `verbose_name`, then split the table by that label.
# NOTE(review): presumably names look like "<dataset>.<type>.<suffix>" - confirm.
# Iterating the single column avoids materialising a full row Series per
# record, which is what iterrows would do.
language_data_type = [
    verbose_name.split(".")[-2] for verbose_name in lang_data["verbose_name"]
]
lang_data["language_data_type"] = language_data_type

grouped = lang_data.groupby(lang_data["language_data_type"])
# get_group raises KeyError if a label does not occur in the data.
cc_lang_data = grouped.get_group("cc")
sc_lang_data = grouped.get_group("sc")
ms_lang_data = grouped.get_group("ms")
mp_lang_data = grouped.get_group("mp")

In [8]:
def plots_for_column(col, lower_q=0.1, upper_q=0.9):
    """Build a two-row histogram figure for the column ``col``.

    Row 1 ("lang vs morph") overlays ``morph_data_bin_full`` and
    ``lang_data``. Row 2 ("lang types") overlays the four language subsets
    ``cc_lang_data``, ``sc_lang_data``, ``ms_lang_data`` and
    ``mp_lang_data``. All frames are read from the notebook's global
    namespace.

    Each histogram only shows values inside the
    ``[lower_q, upper_q]`` quantile range of the pooled data.

    Parameters
    ----------
    col : str
        Name of the column to plot; must exist in every frame listed above.
    lower_q : float, optional
        Lower quantile of the pooled data used as the left cut-off
        (default 0.1).
    upper_q : float, optional
        Upper quantile of the pooled data used as the right cut-off
        (default 0.9).

    Returns
    -------
    plotly.graph_objects.Figure
        The figure with all six histogram traces added.
    """
    fig = make_subplots(
        rows=2,
        cols=1,
        subplot_titles=["lang vs morph", "lang types"],
    )

    # One entry per histogram: (subplot row, frame, legend name, bar colour).
    trace_specs = [
        (1, morph_data_bin_full, "Morph", "red"),
        (1, lang_data, "Lang", "cornflowerblue"),
        (2, cc_lang_data, "Cognate", "midnightblue"),
        (2, sc_lang_data, "Soundclass", "blue"),
        (2, ms_lang_data, "Morpho-Syn", "deepskyblue"),
        (2, mp_lang_data, "Morpho-Phon", "cyan"),
    ]

    # NOTE(review): the pool contains lang_data and the four subsets derived
    # from it, so language rows are counted more than once when the quantiles
    # are computed. Kept as in the original; confirm this weighting is wanted.
    all_data = pd.concat(
        [
            morph_data_bin_full,
            lang_data,
            cc_lang_data,
            sc_lang_data,
            ms_lang_data,
            mp_lang_data,
        ],
        join="inner",
    )
    lower = all_data[col].quantile(lower_q)
    upper = all_data[col].quantile(upper_q)

    for subplot_row, frame, legend_name, color in trace_specs:
        in_range = frame[col].between(lower, upper)
        # add_trace(row=, col=) replaces the deprecated append_trace.
        fig.add_trace(
            go.Histogram(
                x=frame.loc[in_range, col],
                name=legend_name,
                marker_color=color,
                showlegend=True,
            ),
            row=subplot_row,
            col=1,
        )
    return fig

# Render the lang-vs-morph comparison for one column; the styled figure is the
# cell's last expression and is therefore displayed.
column = "num_sites"
(
    plots_for_column(column)
    .update_layout(template="plotly_white", height=600)
)
# Disabled PNG export of the same plot:
#plots_for_column(column).write_image(column + ".png") 

In [12]:
def plots_for_column_bin_multi(col, lower_q=0.1, upper_q=0.9):
    """Build a histogram figure comparing multistate and binarized data.

    Overlays the column ``col`` of ``morph_data_multistate`` and
    ``morph_data_binarized`` (both read from the notebook's global
    namespace) in a single subplot titled "multistate vs. binarized".

    Each histogram only shows values inside the
    ``[lower_q, upper_q]`` quantile range of the two frames pooled together.

    Parameters
    ----------
    col : str
        Name of the column to plot; must exist in both frames.
    lower_q : float, optional
        Lower quantile of the pooled data used as the left cut-off
        (default 0.1).
    upper_q : float, optional
        Upper quantile of the pooled data used as the right cut-off
        (default 0.9).

    Returns
    -------
    plotly.graph_objects.Figure
        The figure with both histogram traces added.
    """
    fig = make_subplots(
        rows=1,
        cols=1,
        subplot_titles=["multistate vs. binarized"],
    )

    # One entry per histogram: (frame, legend name).
    trace_specs = [
        (morph_data_multistate, "multistate"),
        (morph_data_binarized, "binarized"),
    ]

    all_data = pd.concat(
        [morph_data_multistate, morph_data_binarized],
        join="inner",
    )
    lower = all_data[col].quantile(lower_q)
    upper = all_data[col].quantile(upper_q)

    for frame, legend_name in trace_specs:
        in_range = frame[col].between(lower, upper)
        # add_trace(row=, col=) replaces the deprecated append_trace.
        fig.add_trace(
            go.Histogram(
                x=frame.loc[in_range, col],
                name=legend_name,
                showlegend=True,
            ),
            row=1,
            col=1,
        )
    return fig

# Render the multistate-vs-binarized comparison for one column; the styled
# figure is the cell's last expression and is therefore displayed.
column = "difficult"
(
    plots_for_column_bin_multi(column)
    .update_layout(template="plotly_white", height=600)
)
# Disabled PNG export of the same plot:
#plots_for_column_bin_multi(column).write_image("morpho_bin_multi_" + column + ".png") 


    

In [31]:
def plots_for_column_bin_multi_lang(col, lower_q=0.05, upper_q=0.95):
    """Build a histogram figure comparing language and morphological data.

    Overlays the column ``col`` of ``lang_data``, ``morph_data_multistate``
    and ``morph_data_binarized`` (all read from the notebook's global
    namespace) in a single subplot titled with the column name.

    Each histogram only shows values inside the
    ``[lower_q, upper_q]`` quantile range of the three frames pooled
    together.

    Parameters
    ----------
    col : str
        Name of the column to plot; must exist in all three frames.
    lower_q : float, optional
        Lower quantile of the pooled data used as the left cut-off
        (default 0.05).
    upper_q : float, optional
        Upper quantile of the pooled data used as the right cut-off
        (default 0.95).

    Returns
    -------
    plotly.graph_objects.Figure
        The figure with the three histogram traces added.
    """
    fig = make_subplots(
        rows=1,
        cols=1,
        subplot_titles=[col],
    )

    # One entry per histogram, in legend order: (frame, legend name).
    trace_specs = [
        (lang_data, "lang bin"),
        (morph_data_multistate, "bio-morph multi"),
        (morph_data_binarized, "bio-morph bin"),
    ]

    all_data = pd.concat(
        [morph_data_multistate, morph_data_binarized, lang_data],
        join="inner",
    )
    lower = all_data[col].quantile(lower_q)
    upper = all_data[col].quantile(upper_q)

    for frame, legend_name in trace_specs:
        in_range = frame[col].between(lower, upper)
        # add_trace(row=, col=) replaces the deprecated append_trace.
        fig.add_trace(
            go.Histogram(
                x=frame.loc[in_range, col],
                name=legend_name,
                showlegend=True,
            ),
            row=1,
            col=1,
        )
    return fig

# Render the language-vs-morphological comparison for one column; the styled
# figure is the cell's last expression and is therefore displayed.
column = "num_sites"
(
    plots_for_column_bin_multi_lang(column)
    .update_layout(template="plotly_white", height=400)
)
# Disabled PNG export of the same plot:
#plots_for_column_bin_multi_lang(column).write_image("lang_morpho_bin_multi_" + column + ".png") 

In [26]:
def plots_for_column_morph_models(col, lower_q=0.1, upper_q=0.9):
    """Build a histogram figure comparing the morphological data variants.

    Overlays the column ``col`` of ``morph_data_binarized`` ("Binarized"),
    ``morph_data_multistate`` ("Multivalue - GTR") and
    ``morph_data_mk_multistate`` ("Multivalue - MK") in a single subplot
    titled "Biological Morphological Data". All frames are read from the
    notebook's global namespace.

    Each histogram only shows values inside the
    ``[lower_q, upper_q]`` quantile range of the three frames pooled
    together.

    Parameters
    ----------
    col : str
        Name of the column to plot; must exist in all three frames.
    lower_q : float, optional
        Lower quantile of the pooled data used as the left cut-off
        (default 0.1).
    upper_q : float, optional
        Upper quantile of the pooled data used as the right cut-off
        (default 0.9).

    Returns
    -------
    plotly.graph_objects.Figure
        The figure with the three histogram traces added.
    """
    fig = make_subplots(
        rows=1,
        cols=1,
        subplot_titles=["Biological Morphological Data"],
    )

    gtr_df = morph_data_multistate
    mk_df = morph_data_mk_multistate
    bin_df = morph_data_binarized

    # One entry per histogram, in legend order: (frame, legend name).
    trace_specs = [
        (bin_df, "Binarized"),
        (gtr_df, "Multivalue - GTR"),
        (mk_df, "Multivalue - MK"),
    ]

    all_data = pd.concat([gtr_df, mk_df, bin_df], join="inner")
    lower = all_data[col].quantile(lower_q)
    upper = all_data[col].quantile(upper_q)

    for frame, legend_name in trace_specs:
        in_range = frame[col].between(lower, upper)
        # add_trace(row=, col=) replaces the deprecated append_trace.
        fig.add_trace(
            go.Histogram(
                x=frame.loc[in_range, col],
                name=legend_name,
                showlegend=True,
            ),
            row=1,
            col=1,
        )
    return fig

# Export the morphological-model comparison for one column to a PNG file.
# NOTE(review): the image is written without the plotly_white layout that the
# disabled display line below would apply - confirm that is intended.
column = "num_topos_parsimony"
#plots_for_column_morph_models(column).update_layout(template="plotly_white", height=600)
(
    plots_for_column_morph_models(column)
    .write_image("morph_models_" + column + ".png")
)