In [2]:
# Plot the distribution of columns in the training data (Parquet files under training_data/)

In [2]:
import pandas as pd
from plotly import graph_objects as go
from plotly.subplots import make_subplots
import math

In [3]:
from colour import Color

class color_concept:
    """Colour lookup for the plots in this notebook.

    An instance is bound to one data type (``'morph'`` or ``'lang'``) and maps
    model names ("BIN", "GTR", "MK"), pairs of models, and language subtypes
    ("cc", "sc", "ms") to hex colour strings.
    """

    # Named hex colours used to build the palettes below.
    kit_green = "#009682"
    kit_orange = "#DF9B1B"
    kit_blue = "#0A64AA"
    kit_maigreen = "#8CB63C"
    kit_yellow = "#FCE500"
    kit_red = "#A22223"
    kit_lila = "#A3107C"
    kit_cyan = "#23A1E0"

    # Class-level defaults; __init__ sets data_type on the instance.
    data_type = ""
    # Not read anywhere in this class; kept so existing attribute access keeps working.
    model_color = {}

    morph_model_colors = {
        "BIN": kit_cyan,
        "GTR": kit_orange,
        "MK": kit_lila,
    }

    morph_diff_colors = {
        "BIN_GTR": kit_maigreen,
        "BIN_MK": kit_blue,
        "GTR_MK": kit_red,
    }

    lang_model_colors = {
        "BIN": kit_red,
        "GTR": kit_maigreen,
        "MK": kit_blue,
    }

    lang_diff_colors = {
        "BIN_GTR": kit_orange,
        "BIN_MK": kit_lila,
        "GTR_MK": kit_cyan,
    }

    # No colour is assigned to the "mp" subtype.
    lang_subtype_colors = {
        "cc": kit_lila,
        "sc": kit_orange,
        "ms": kit_maigreen,
    }

    # data_type -> (model colours, pairwise difference colours)
    _palettes = {
        "morph": (morph_model_colors, morph_diff_colors),
        "lang": (lang_model_colors, lang_diff_colors),
    }

    def __init__(self, data_type):
        """Bind the instance to the palettes of ``data_type``.

        Args:
            data_type: ``'morph'`` or ``'lang'``.

        Raises:
            ValueError: if ``data_type`` is not a known data type. Failing here
                avoids an AttributeError on the first colour lookup later on.
        """
        if data_type not in self._palettes:
            raise ValueError(
                "unknown data_type %r, expected one of %s"
                % (data_type, sorted(self._palettes))
            )
        self.data_type = data_type
        self.model_colors, self.diff_colors = self._palettes[data_type]

    def get_model_color(self, model):
        """Return the hex colour of ``model``; raises KeyError for unknown models."""
        return self.model_colors[model]

    def get_diff_color(self, model1, model2):
        """Return the hex colour assigned to the pair ``(model1, model2)``.

        The pair is looked up as ``"<model1>_<model2>"`` first and, if that key
        does not exist, in reversed order, so the argument order does not matter.

        Raises:
            KeyError: if neither ordering of the pair has a colour.
        """
        key = model1 + "_" + model2
        if key in self.diff_colors:
            return self.diff_colors[key]
        reversed_key = model2 + "_" + model1
        if reversed_key in self.diff_colors:
            return self.diff_colors[reversed_key]
        raise KeyError(key)

    def get_subtype_color(self, subtype):
        """Return the hex colour of a language subtype; raises KeyError if unknown."""
        return self.lang_subtype_colors[subtype]

In [4]:
def add_treeness(df):
    """Add a ``treeness`` column to ``df`` in place.

    ``treeness`` is ``total_internal_brlens / total_all_brlens`` per row. The
    division is done column-wise, so a zero denominator yields ``inf``/``NaN``
    for that row instead of aborting the whole computation.

    Args:
        df: DataFrame with ``total_internal_brlens`` and ``total_all_brlens``
            columns; it is modified in place.
    """
    df["treeness"] = df["total_internal_brlens"] / df["total_all_brlens"]
    
def add_pattern_proportion(df):
    """Add a ``pattern_proportion`` column to ``df`` in place.

    ``pattern_proportion`` is ``num_patterns / num_sites`` per row, computed
    column-wise rather than with a per-row Python loop.

    Args:
        df: DataFrame with ``num_patterns`` and ``num_sites`` columns; it is
            modified in place.
    """
    df["pattern_proportion"] = df["num_patterns"] / df["num_sites"]
    
def add_pattern_entropy(df):
    """Add a ``pattern_entropy`` column to ``df`` in place.

    Per row the value is ``-(bollback + (num_sites + log(num_sites)))`` with
    the natural logarithm, computed column-wise.

    Args:
        df: DataFrame with ``num_sites`` and ``bollback`` columns; it is
            modified in place.

    Raises:
        ValueError: if any ``num_sites`` value is not positive (from ``math.log``).
    """
    num_sites = df["num_sites"]
    # math.log is applied element-wise so non-positive values still fail loudly.
    log_sites = num_sites.map(math.log)
    # NOTE(review): the inner term adds num_sites and log(num_sites); if the goal
    # is to undo an N*log(N) term this should probably be a product. Left
    # unchanged here; confirm against the feature definition.
    df["pattern_entropy"] = -1 * (df["bollback"] + (num_sites + log_sites))
    
def add_brlen_ratio(df):
    """Add a ``brlen_ratio`` column to ``df`` in place.

    ``brlen_ratio`` is ``median_external_brlens / median_internal_brlens`` per
    row. The division is done column-wise, so a zero internal median yields
    ``inf``/``NaN`` for that row instead of aborting the whole computation.

    Args:
        df: DataFrame with ``median_external_brlens`` and
            ``median_internal_brlens`` columns; it is modified in place.
    """
    df["brlen_ratio"] = df["median_external_brlens"] / df["median_internal_brlens"]
    
def add_additional_columns(df):
    """Extend ``df`` in place with all derived columns.

    Runs, in order, the helpers that add ``treeness``, ``pattern_proportion``,
    ``pattern_entropy`` and ``brlen_ratio``.
    """
    derived_column_steps = (
        add_treeness,
        add_pattern_proportion,
        add_pattern_entropy,
        add_brlen_ratio,
    )
    for add_column in derived_column_steps:
        add_column(df)

In [5]:
# Colour lookups for the two data sets.
cc_morph = color_concept("morph")
cc_lang = color_concept("lang")

# Load each feature table and extend it in place with the derived columns.
morph_data = pd.read_parquet("training_data/morph/binarized.parquet")
add_additional_columns(morph_data)

lang_data = pd.read_parquet("training_data/lang/binary.parquet")
add_additional_columns(lang_data)


In [4]:
#for col_name in morph_data.columns: 
#    print(col_name)

In [6]:
# The data type of a language data set is the second-to-last dot-separated
# token of its verbose_name.
language_data_type = [
    verbose_name.split(".")[-2] for verbose_name in lang_data["verbose_name"]
]
lang_data["language_data_type"] = language_data_type

# Split the language data by type; "ms" and "mp" are pooled into one frame.
grouped = lang_data.groupby("language_data_type")
cc_lang_data, sc_lang_data = (grouped.get_group(kind) for kind in ("cc", "sc"))
ms_lang_data = pd.concat([grouped.get_group(kind) for kind in ("ms", "mp")])

In [28]:

def plots_for_column(col, title, eliminate_outlier=False):
    """Build a two-row histogram figure for one feature column.

    Row 1 overlays the biological (``morph_data``) and language (``lang_data``)
    values; row 2 shows the language data split into the cognate, sound-class
    and morphological subsets. All five histograms share the same value range.

    Args:
        col: Name of the column to plot; must exist in every plotted frame.
        title: Title of the first subplot (the second subplot has no title).
        eliminate_outlier: If true, only values between the 5% and 95%
            quantiles of the combined morph + language data are plotted;
            otherwise the full min..max range is used.

    Returns:
        The plotly figure holding the five histogram traces.
    """
    fig = make_subplots(
        rows=2,
        cols=1,
        subplot_titles=[title, ""]
    )

    # The range is derived from both data sets together so that every
    # histogram is clipped to the same interval.
    all_values = pd.concat([morph_data[col], lang_data[col]])
    if eliminate_outlier:
        lower = all_values.quantile(0.05)
        upper = all_values.quantile(0.95)
    else:
        lower = all_values.min()
        upper = all_values.max()

    # (subplot row, source frame, legend name, bar colour), in drawing order.
    trace_specs = [
        (1, morph_data, "Biological", cc_morph.get_model_color("BIN")),
        (1, lang_data, "Language", cc_lang.get_model_color("BIN")),
        (2, cc_lang_data, "Cognate", cc_lang.get_subtype_color("cc")),
        (2, sc_lang_data, "Soundclass", cc_lang.get_subtype_color("sc")),
        (2, ms_lang_data, "Morphological", cc_lang.get_subtype_color("ms")),
    ]
    for row, data, name, color in trace_specs:
        in_range = data[col].between(lower, upper)
        # add_trace(row=, col=) replaces the deprecated append_trace.
        fig.add_trace(
            go.Histogram(
                x=data.loc[in_range, col],
                name=name,
                marker_color=color,
                showlegend=True
            ),
            row=row,
            col=1
        )
    return fig

# Feature to plot; outliers outside the 5%-95% quantile range are dropped.
column = "median_internal_brlens"
plots_for_column(
    column,
    "Median of internal branch lengths",
    eliminate_outlier=True,
).update_layout(template="plotly_white", height=700)
# Export to an image file instead of displaying (note: plots_for_column also needs a title):
#plots_for_column(column, "Median of internal branch lengths").write_image(column + ".png")