In [1]:
import altair as alt
import pandas as pd
import os
from os.path import join
# Activate Altair's renderer named 'notebook'.
# NOTE(review): presumably required for inline chart display in the classic
# Jupyter Notebook front end — confirm against the installed Altair version.
alt.renderers.enable('notebook')

In [None]:
# Load every table the plots below depend on.
path = './plot_data'     # folder holding the pre-computed CSV inputs
out_path = './plot_out'  # folder reserved for exported figures
# exist_ok=True creates the folder only when it is missing, without a separate
# existence check that could race with another process creating it.
os.makedirs(out_path, exist_ok=True)

# Dataset statistics: samples per type ('type', 'count') and the pairwise
# co-occurrence table ('attr1', 'attr2', 'log_count').
df_dist = pd.read_csv(join(path,'type-dist.csv'))
df_co_matrix = pd.read_csv(join(path,'log_co-occur.csv'))

# Per-type results of the four model variants; the comparison plots read the
# 'type' and 'f1-score' columns.
result_sherlock = pd.read_csv(join(path,'result_sherlock_multi-col.csv'))
result_LDA = pd.read_csv(join(path,'result_LDA_multi-col.csv'))
result_CRF = pd.read_csv(join(path,'result_CRF_multi-col.csv'))
result_CRF_LDA = pd.read_csv(join(path,'result_CRF_LDA_multi-col.csv'))

# Feature-importance tables of the same four variants; the plots read the
# 'Metric', 'Score' and 'Feature_group' columns.
FI_sherlock = pd.read_csv(join(path,'feature_importance_single_None.csv'))
FI_LDA = pd.read_csv(join(path,'feature_importance_single_num-directstr_thr-0_tn-400.csv'))
FI_CRF = pd.read_csv(join(path,'feature_importance_CRF_None.csv'))
FI_CRF_LDA = pd.read_csv(join(path,'feature_importance_CRF_num-directstr_thr-0_tn-400.csv'))

# Display labels shown in chart titles and legends.
naming = {
    'SATO':'Sato',          # label for the CRF + LDA results
    'SATO_LDA':'Sato-TV',   # Sato with only the LDA features (Sherlock + LDA)
    'SATO_CRF':'Sato-ST',   # label for the CRF results
    'Base':'Base'           # label for the Sherlock results
}

# Data statistics

In [None]:
# Bar chart of the number of samples per semantic type, with the types ordered
# by descending count. To export it, chain .save('chart.svg') onto the
# expression below.
(
    alt.Chart(df_dist)
    .mark_bar(size=8)
    .encode(
        x=alt.X(
            'type:O',
            title='Semantic Types',
            sort=alt.EncodingSortField(field="count", order="descending"),
        ),
        y=alt.Y('count', title='Number of Samples'),
    )
    .properties(width=800, height=200)
)

In [None]:
# Heat map of the co-occurrence table: one cell per (attr1, attr2) pair,
# coloured by log_count. To export it, chain .save('co-occur-matrix.svg') onto
# the expression below.
(
    alt.Chart(df_co_matrix)
    .mark_rect()
    .encode(
        x=alt.X('attr1:O'),
        y=alt.Y('attr2:O'),
        color=alt.Color('log_count:Q', scale=alt.Scale(scheme='greenblue')),
    )
    .properties(width=800, height=800)
)

# Per-type comparisons

In [None]:
def per_type_plot(df_A, df_B, name_A, name_B):
    """Build a chart comparing the per-type F1 scores of two approaches.

    The two result tables are inner-joined on ``type`` (a type missing from
    either table is left out) and the types are split into three groups drawn
    side by side, left to right: types where A scores higher than B, types
    where both score the same, and types where A scores lower. A row whose
    score is NaN in either table satisfies none of the three comparisons and
    is therefore not drawn.

    Parameters
    ----------
    df_A, df_B : pandas.DataFrame
        Result tables that each provide a ``type`` and an ``f1-score`` column.
    name_A, name_B : str
        Display labels of the two approaches; they name the score columns
        after renaming and appear in the colour legend.

    Returns
    -------
    The horizontally concatenated Altair chart (better | equal | worse).
    """
    bar_size = 3
    col_width, col_height = 6, 100
    model_order = ['Sato', 'Sato-TV', 'Sato-ST','Base']
    color_scale = alt.Scale(domain=[name_A, name_B], scheme='category10')

    df_A = df_A.rename({'f1-score':name_A}, axis='columns')
    df_B = df_B.rename({'f1-score':name_B}, axis='columns')
    df = pd.merge(df_A, df_B, on=['type'], suffixes=("_" + name_A, "_" + name_B))

    # Concatenated views resolve their position scales independently, and only
    # the left-most group shows a y axis. Pin one explicit domain on every
    # group so bars in all three groups are comparable against that axis.
    # The upper bound is 1 (assumes F1 is reported in [0, 1] — TODO confirm)
    # and is widened if the data exceeds it so no bar is clipped.
    highest = df[[name_A, name_B]].max().max()
    y_upper = float(highest) if highest > 1 else 1.0  # NaN/empty falls back to 1.0
    y_scale = alt.Scale(domain=[0, y_upper])

    def to_long(wide):
        # One row per (type, model) so Altair can facet by type and colour by model.
        return pd.melt(wide,
                       id_vars=['type'],
                       value_vars=[name_A, name_B],
                       var_name='Model', value_name='F1')

    def facet_bars(wide, sort_op, show_y_axis):
        # One group of the figure: a narrow facet column per type holding the
        # two models' bars; columns are ordered by descending `sort_op` of F1.
        if show_y_axis:
            y_channel = alt.Y("F1:Q", title='F1 Score', scale=y_scale)
        else:
            y_channel = alt.Y("F1:Q", title=None, axis=None, scale=y_scale)
        bars = alt.Chart(to_long(wide)).mark_bar(size=bar_size).encode(
            y = y_channel,
            x = alt.X('Model:O', sort=model_order, title=None, axis=None),
            color = alt.Color('Model:N', sort=model_order, scale=color_scale)
        ).properties(
            width=col_width,
            height=col_height
        )
        return bars.facet(column = alt.Column("type:O",
                                              sort=alt.EncodingSortField('F1',
                                                                         op=sort_op,
                                                                         order='descending'),
                                              title=None,
                                              header=alt.Header(labelAngle=300)
                                             )
        )

    # Only the left-most group (A better) carries the visible y axis.
    better = facet_bars(df[df[name_A] >  df[name_B]], 'min', True)
    equal  = facet_bars(df[df[name_A] == df[name_B]], 'max', False)
    worse  = facet_bars(df[df[name_A] <  df[name_B]], 'max', False)

    return alt.hconcat(better, equal, worse, spacing=0).configure_facet(spacing=0.8)

In [None]:
# Effect of LDA: each pair differs only in whether the LDA results are used.
f1 = per_type_plot(df_A=result_CRF_LDA, df_B=result_CRF,
                   name_A=naming['SATO'], name_B=naming['SATO_CRF'])
f2 = per_type_plot(df_A=result_LDA, df_B=result_sherlock,
                   name_A=naming['SATO_LDA'], name_B=naming['Base'])
# Effect of CRF: each pair differs only in whether the CRF results are used.
f3 = per_type_plot(df_A=result_CRF_LDA, df_B=result_LDA,
                   name_A=naming['SATO'], name_B=naming['SATO_LDA'])
f4 = per_type_plot(df_A=result_CRF, df_B=result_sherlock,
                   name_A=naming['SATO_CRF'], name_B=naming['Base'])

In [None]:
# Show the per-type F1 comparison of Sato vs. Sato-ST.
f1

In [None]:
# Show the per-type F1 comparison of Sato-TV vs. Base.
f2

In [None]:
# Show the per-type F1 comparison of Sato vs. Sato-TV.
f3

In [None]:
# Show the per-type F1 comparison of Sato-ST vs. Base.
f4

# Feature importance

In [None]:
# Size of each facet column in the feature-importance panels.
col_width = 40
col_height = 130


def _importance_panel(fi_table, model_label, y_title):
    """Build one feature-importance panel: bars per metric, faceted by feature group.

    fi_table    -- table providing 'Metric', 'Score' and 'Feature_group' columns
    model_label -- text shown as the title above the facet columns
    y_title     -- y-axis title, or None to draw the axis without a title
    """
    bars = alt.Chart(fi_table).mark_bar().encode(
        x = alt.X('Metric', title = None, axis = None),
        y = alt.Y('Score', title = y_title),
        color = alt.Color('Metric'),
    ).properties(
        width=col_width,
        height=col_height
    )
    return bars.facet(column = alt.Column("Feature_group",
                                          sort=['topic', 'word', 'char', 'par', 'rest'],
                                          title=model_label)
    )


# 2 x 2 grid; only the left-hand panel of each row carries the y-axis title.
chart1 = _importance_panel(FI_sherlock, naming['Base'], 'Importance score')
chart2 = _importance_panel(FI_LDA, naming['SATO_LDA'], None)
chart3 = _importance_panel(FI_CRF, naming['SATO_CRF'], 'Importance score')
chart4 = _importance_panel(FI_CRF_LDA, naming['SATO'], None)

alt.vconcat(
    alt.hconcat(chart1, chart2),
    alt.hconcat(chart3, chart4)
).configure_facet(spacing=0.8).configure_legend(title=None)