In [1]:
# Plot the distributions of columns in the training data (parquet files)

In [2]:
import pandas as pd
from plotly import graph_objects as go
from plotly.subplots import make_subplots
import math

In [3]:
from colour import Color

class color_concept:
    """Colour lookup for plots, selected by data type ('morph' or 'lang').

    The class holds a set of named hex colours and, per data type, one
    mapping from model name ("BIN", "GTR", "MK") to a colour and one mapping
    from a model pair ("BIN_GTR", "BIN_MK", "GTR_MK") to a colour.  Subtype
    colours ("cc", "sc", "ms") are shared by all instances.

    Parameters
    ----------
    data_type : str
        'morph' or 'lang'.  Any other value leaves the model and pair
        mappings empty, so later model/pair lookups raise KeyError.
    """

    # Named hex colours used to build the mappings below.
    kit_green = "#009682"
    kit_orange = "#DF9B1B"
    kit_blue = "#0A64AA"
    kit_maigreen = "#8CB63C"
    kit_yellow = "#FCE500"
    kit_red = "#A22223"
    kit_lila = "#A3107C"
    kit_cyan = "#23A1E0"

    data_type = ""
    # Kept only for backward compatibility; the methods read `model_colors`.
    model_color = {}
    # Defaults so that an unrecognised data_type still yields a usable
    # (empty) mapping instead of a missing attribute.
    model_colors = {}
    diff_colors = {}

    morph_model_colors = {
        "BIN" : kit_cyan,
        "GTR" : kit_orange,
        "MK" : kit_lila
    }

    morph_diff_colors = {
        "BIN_GTR" : kit_maigreen,
        "BIN_MK" : kit_blue,
        "GTR_MK" : kit_red
    }

    lang_model_colors = {
        "BIN" : kit_red,
        "GTR" : kit_maigreen,
        "MK" : kit_blue
    }

    lang_diff_colors = {
        "BIN_GTR" : kit_orange,
        "BIN_MK" : kit_lila,
        "GTR_MK" : kit_cyan
    }

    # No colour is assigned to the "mp" subtype.
    lang_subtype_colors = {
        "cc" : kit_lila,
        "sc" : kit_orange,
        "ms" : kit_maigreen
    }

    def __init__(self, data_type):
        self.data_type = data_type
        if data_type == 'morph':
            self.model_colors = self.morph_model_colors
            self.diff_colors = self.morph_diff_colors
        elif data_type == 'lang':
            self.model_colors = self.lang_model_colors
            self.diff_colors = self.lang_diff_colors

    def get_model_color(self, model):
        """Return the hex colour for ``model``; raises KeyError if unknown."""
        return self.model_colors[model]

    def get_diff_color(self, model1, model2):
        """Return the hex colour assigned to the pair of two models.

        The pair is looked up as "<model1>_<model2>" and, if that key does
        not exist, as "<model2>_<model1>", so the argument order does not
        matter.  Raises KeyError if neither key is known.
        """
        key = model1 + "_" + model2
        if key not in self.diff_colors:
            swapped = model2 + "_" + model1
            if swapped in self.diff_colors:
                return self.diff_colors[swapped]
        return self.diff_colors[key]

    def get_subtype_color(self, subtype):
        """Return the hex colour for a language subtype ("cc", "sc", "ms")."""
        return self.lang_subtype_colors[subtype]

In [4]:
def add_treeness(df):
    """Add a ``treeness`` column to ``df`` in place.

    The value is ``total_internal_brlens / total_all_brlens`` for each row,
    computed as one column operation rather than a Python loop over rows.
    A zero denominator follows pandas division semantics (inf/NaN) instead
    of aborting the whole computation.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``total_internal_brlens`` and
        ``total_all_brlens``.

    Returns
    -------
    None
        ``df`` is modified in place.
    """
    df["treeness"] = df["total_internal_brlens"] / df["total_all_brlens"]
    
def add_pattern_proportion(df):
    """Add a ``pattern_proportion`` column to ``df`` in place.

    The value is ``num_patterns / num_sites`` for each row, computed as one
    column operation rather than a Python loop over rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``num_patterns`` and ``num_sites``.

    Returns
    -------
    None
        ``df`` is modified in place.
    """
    df["pattern_proportion"] = df["num_patterns"] / df["num_sites"]
    
def add_pattern_entropy(df):
    """Add a ``pattern_entropy`` column to ``df`` in place.

    For each row the value is
    ``-1 * (bollback + (num_sites + log(num_sites)))`` with the natural
    logarithm, computed with column arithmetic instead of a row loop.

    NOTE(review): the inner term *adds* ``num_sites`` and
    ``log(num_sites)``; confirm that a sum (rather than a product) is the
    intended formula.  The expression is kept exactly as it was.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``num_sites`` and ``bollback``.

    Returns
    -------
    None
        ``df`` is modified in place.

    Raises
    ------
    ValueError
        If any ``num_sites`` value is not positive (raised by ``math.log``).
    """
    num_sites = df["num_sites"]
    # math.log is applied per element so the numeric results (and the error
    # for non-positive input) are the same as in the per-row computation.
    log_sites = num_sites.map(math.log)
    df["pattern_entropy"] = -1 * (df["bollback"] + (num_sites + log_sites))
    
def add_brlen_ratio(df):
    """Add a ``brlen_ratio`` column to ``df`` in place.

    The value is ``median_external_brlens / median_internal_brlens`` for
    each row, computed as one column operation rather than a Python loop
    over rows.  A zero median internal branch length follows pandas
    division semantics (inf/NaN) instead of aborting the whole computation.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``median_external_brlens`` and
        ``median_internal_brlens``.

    Returns
    -------
    None
        ``df`` is modified in place.
    """
    df["brlen_ratio"] = df["median_external_brlens"] / df["median_internal_brlens"]
    
def add_additional_columns(df):
    """Add all derived columns to ``df`` in place.

    Runs, in this order, the helpers that add ``treeness``,
    ``pattern_proportion``, ``pattern_entropy`` and ``brlen_ratio``.
    """
    column_adders = (
        add_treeness,
        add_pattern_proportion,
        add_pattern_entropy,
        add_brlen_ratio,
    )
    for add_column in column_adders:
        add_column(df)

In [5]:
# Colour lookups used by every plot below.
cc_morph = color_concept("morph")
cc_lang = color_concept("lang")

# Morphological training data, extended with the derived columns.
morph_data = pd.read_parquet("training_data/morph/binarized.parquet")
add_additional_columns(morph_data)

# Language training data, extended with the derived columns.
lang_data = pd.read_parquet("training_data/lang/BIN.parquet")
add_additional_columns(lang_data)


In [13]:
# Full listing of the language datasets, ordered from highest to lowest
# value in the "difficult" column.
print(
    lang_data
    .sort_values("difficult", ascending=False)
    .loc[:, ["verbose_name", "difficult"]]
    .to_string()
)


                             verbose_name  difficult
124            ratcliffearabic.BIN.cc.phy   0.956267
156             chacontukanoan.BIN.cc.phy   0.949573
68                        bdpa.BIN.cc.phy   0.896677
161  bodyobjectcolexifications.BIN.cc.phy   0.832842
122                      sails.BIN.ms.phy   0.807354
31               gravinachadic.BIN.cc.phy   0.793797
36         kesslersignificance.BIN.cc.phy   0.791602
13                    allenbai.BIN.mp.phy   0.788242
19            Nuclear-Macro-Je.BIN.cc.phy   0.778534
72          diacl_southamerica.BIN.ms.phy   0.755549
94            lionnetyotonahua.BIN.cc.phy   0.754266
211               Austronesian.BIN.cc.phy   0.747397
219            North_Halmahera.BIN.cc.phy   0.744239
91               hsiuhmongmien.BIN.cc.phy   0.743258
236               beidasinitic.BIN.cc.phy   0.734884
123              diacl_eurasia.BIN.ms.phy   0.723450
239                Lakes_Plain.BIN.cc.phy   0.716046
242                Otomanguean.BIN.cc.phy   0.

In [6]:
#for col_name in morph_data.columns: 
#    print(col_name)

In [7]:
# The subtype tag is the second-to-last dot-separated token of verbose_name
# (e.g. "sails.BIN.ms.phy" -> "ms").  Iterating the column directly avoids
# building a full row object per record just to read one field.
language_data_type = [
    name.split(".")[-2] for name in lang_data["verbose_name"]
]
lang_data["language_data_type"] = language_data_type

# Split the language data by subtype; get_group raises KeyError when a
# subtype does not occur in the data.
grouped = lang_data.groupby(lang_data.language_data_type)
cc_lang_data = grouped.get_group("cc")
sc_lang_data = grouped.get_group("sc")
# "ms" and "mp" are plotted together as one group.
ms_lang_data = pd.concat([grouped.get_group("ms"), grouped.get_group("mp")])

In [8]:
def statistics(col):
    """Print mean, median and standard deviation of column ``col``.

    The three values are reported first for the module-level ``lang_data``
    frame and then for ``morph_data``, one value per line.
    """
    report = []
    for label, frame in (("Lang", lang_data), ("Morph", morph_data)):
        values = frame[col]
        report.append((label, "mean", values.mean()))
        report.append((label, "median", values.median()))
        report.append((label, "std", values.std()))

    # All values are computed before anything is printed.
    for label, stat_name, value in report:
        print(label + ": " + stat_name + ": " + str(value))
    
def means_data_types(col):
    """Print the mean of column ``col`` for each data set.

    Reported in order: morphological data, all language data, then the
    language subsets "cc", "sc" and "ms" (module-level frames).
    """
    labelled_frames = (
        ("morph", morph_data),
        ("lang", lang_data),
        ("cc", cc_lang_data),
        ("sc", sc_lang_data),
        ("ms", ms_lang_data),
    )
    for label, frame in labelled_frames:
        print(label + ": " + str(frame[col].mean()))
    

# Summary statistics for alignment length, and mean taxon counts per data set.
statistics(col="num_sites")
means_data_types(col="num_taxa")


Lang: mean: 1410.8287937743191
Lang: median: 634.0
Lang: std: 3449.636745180415
Morph: mean: 345.287598944591
Morph: median: 229.0
Morph: std: 340.738560005143
morph: 40.11609498680739
lang: 86.50194552529183
cc: 84.7182320441989
sc: 93.78461538461538
ms: 72.81818181818181


In [None]:
def plots_for_column(col, title, eliminate_outlier=False):
    """Build a two-row histogram figure for one column.

    Row 1 overlays the morphological ("Biological") and all language
    ("Linguistic") data; row 2 shows the language subsets "Cognate",
    "Soundclass" and "Morphological".  Reads the module-level frames
    ``morph_data``, ``lang_data``, ``cc_lang_data``, ``sc_lang_data`` and
    ``ms_lang_data``.

    Note: a later cell in this notebook defines another function with the
    same name, which replaces this one once that cell has been run.

    Parameters
    ----------
    col : str
        Column to plot.
    title : str
        Title of the first subplot.
    eliminate_outlier : bool, default False
        If True, only values between the 5% and 95% quantiles of the
        combined morphological and language data are plotted; otherwise the
        full min/max range is used.

    Returns
    -------
    plotly.graph_objects.Figure
    """
    fig = make_subplots(
        rows=2,
        cols=1,
        subplot_titles=[title, ""]
    )

    # The bounds are derived from both data sets together so that every
    # histogram is clipped to the same range.
    all_data = pd.concat([morph_data, lang_data], join="inner")
    if eliminate_outlier:
        lower = all_data[col].quantile(0.05)
        upper = all_data[col].quantile(0.95)
    else:
        lower = all_data[col].min()
        upper = all_data[col].max()

    def add_histogram(frame, name, color, row):
        """Add one range-clipped histogram of ``frame[col]`` to ``fig``."""
        in_range = frame[col].between(lower, upper)
        # add_trace with row/col replaces the deprecated append_trace.
        fig.add_trace(
            go.Histogram(
                x=frame.loc[in_range, col],
                name=name,
                marker_color=color,
                showlegend=True
            ),
            row=row,
            col=1
        )

    add_histogram(morph_data, "Biological", cc_morph.get_model_color("BIN"), 1)
    add_histogram(lang_data, "Linguistic", cc_lang.get_model_color("BIN"), 1)

    add_histogram(cc_lang_data, "Cognate", cc_lang.get_subtype_color("cc"), 2)
    add_histogram(sc_lang_data, "Soundclass", cc_lang.get_subtype_color("sc"), 2)
    add_histogram(ms_lang_data, "Morphological", cc_lang.get_subtype_color("ms"), 2)
    return fig

# Entropy histograms, restricted to the 5%-95% quantile range.
column = "entropy"
plots_for_column(column, title="Entropy", eliminate_outlier=True).update_layout(
    height=700,
    template="plotly_white",
)
# Disabled image export (would also need the title argument):
#plots_for_column(column).write_image(column + ".png")

In [14]:
def plots_for_column(col, title, eliminate_outlier=False):
    """Build a single-panel, percent-normalised histogram for one column.

    Overlays the morphological ("Biological") and language ("Linguistic")
    data, each normalised to percent of its own rows that fall inside the
    plotted range.  Reads the module-level frames ``morph_data`` and
    ``lang_data``.

    Note: this definition reuses the name of a function defined in an
    earlier cell and replaces it once this cell has been run.

    Parameters
    ----------
    col : str
        Column to plot.
    title : str
        Label of the x axis.
    eliminate_outlier : bool, default False
        If True, only values between the 5% and 95% quantiles of the
        combined data are plotted; otherwise the full min/max range is used.

    Returns
    -------
    plotly.graph_objects.Figure
    """
    fig = make_subplots(
        rows=1,
        cols=1,
        x_title=title,
        y_title="Proportion of MSAs"
    )

    # The bounds are derived from both data sets together so that both
    # histograms are clipped to the same range.
    all_data = pd.concat([morph_data, lang_data], join="inner")
    if eliminate_outlier:
        lower = all_data[col].quantile(0.05)
        upper = all_data[col].quantile(0.95)
    else:
        lower = all_data[col].min()
        upper = all_data[col].max()

    def add_histogram(frame, name, color):
        """Add one range-clipped, percent-normalised histogram to ``fig``."""
        in_range = frame[col].between(lower, upper)
        # add_trace with row/col replaces the deprecated append_trace.
        fig.add_trace(
            go.Histogram(
                x=frame.loc[in_range, col],
                name=name,
                marker_color=color,
                showlegend=True,
                histnorm="percent"
            ),
            row=1,
            col=1
        )

    add_histogram(morph_data, "Biological", cc_morph.get_model_color("BIN"))
    add_histogram(lang_data, "Linguistic", cc_lang.get_model_color("BIN"))

    # The y values are percentages because of histnorm="percent".
    fig.update_yaxes(dict(ticksuffix="%"))
    return fig

# Difficulty-score histogram, restricted to the 5%-95% quantile range.
column = "difficult"

plots_for_column(column, title="Difficulty score", eliminate_outlier=True).update_layout(
    height=600,
    template="plotly_white",
)
# Disabled image export (would also need the title argument):
#plots_for_column(column).write_image(column + ".png")

In [None]:
import numpy as np
from scipy import stats

def scatter(col1, col2):
    """Scatter ``col2`` against ``col1`` for each data set.

    One marker trace is drawn for the morphological data and one for each
    language subset ("cc", "sc", "ms" frames).  Two lines are also printed:
    the two column names with their correlation over the combined
    morphological and language data (tab separated), and the p-value
    returned by ``scipy.stats.ttest_ind`` for the two columns.

    NOTE(review): the t-test compares the values of two different columns;
    confirm that this is the intended test.

    Returns
    -------
    plotly.graph_objects.Figure
    """
    # (frame, legend name, marker colour) for every trace, in drawing order.
    trace_specs = (
        (morph_data, "Biological", cc_morph.get_model_color("BIN")),
        (cc_lang_data, "Cognate", cc_lang.get_subtype_color("cc")),
        (sc_lang_data, "Sound-class", cc_lang.get_subtype_color("sc")),
        (ms_lang_data, "Morphological", cc_lang.get_subtype_color("ms")),
    )
    traces = [
        go.Scatter(
            mode='markers',
            x=frame[col1],
            y=frame[col2],
            name=label,
            marker_color=color,
            showlegend=True
        )
        for frame, label, color in trace_specs
    ]

    combined = pd.concat([morph_data, lang_data])
    correlation = combined[col1].corr(combined[col2])
    print("\t".join([col1, col2, str(correlation)]))
    p_value = stats.ttest_ind(combined[col1], combined[col2])[1]
    print(p_value)
    return go.Figure(traces)


# Median external branch length against entropy, across all data sets.
scatter(col1="median_external_brlens", col2="entropy")
