# Data Exploration
- Plot the distribution of the labels and target variables in the dataset.

# Setup

In [59]:
!pip install -U kaleido

Collecting kaleido
  Using cached kaleido-0.2.1-py2.py3-none-macosx_10_11_x86_64.whl (85.2 MB)
Installing collected packages: kaleido
Successfully installed kaleido-0.2.1


In [1]:
import pandas as pd
import plotly.express as px
import pathlib

In [2]:
# CONFIGURATION
# Input CSV and the folder that receives the exported figures.
data_filepath = "./data/train_auto_annotations_UPDATED_cleaned.csv"
results_dir = "data/data_exploration_graphs/"

# One row per label column: (column name, task type, number of classes).
# Keeping the three attributes side by side guarantees the derived lists
# below stay index-aligned.
_label_specs = [
    ("svo_dist", "regression", 1),
    ("svo_dist_norm", "regression", 1),
    ("apv", "classification", 3),
    ("scv", "classification", 3),
    ("hv", "classification", 3),
    ("svo_dist_norm_disc5", "classification", 5),
    ("svo_dist_norm_disc10", "classification", 10),
    ("svo_dist_norm_disc20", "classification", 20),
]
label_cols = [name for name, _, _ in _label_specs]
tasks = [task for _, task, _ in _label_specs]
num_classes_list = [n_classes for _, _, n_classes in _label_specs]

# Load Data

In [3]:
# Read the annotation CSV; its first column is used as the row index.
df = pd.read_csv(filepath_or_buffer=data_filepath, index_col=0)
# Display (rows, columns) as the cell output.
df.shape

(31799, 10)

# svo_dist

In [30]:
# Histogram of the "svo_dist" label column, exported as PNG and shown inline.
c = "svo_dist"
fig = px.histogram(df, x=c)
fig = fig.update_layout(
    title="Subject-Verb-Object Distance Label Histogram",
    width=500,
    height=400,
)
# write_image does not create missing folders, so make sure the output
# directory exists before exporting (e.g. on a fresh checkout).
out_path = pathlib.Path(results_dir, f"{c}_hist.png")
out_path.parent.mkdir(parents=True, exist_ok=True)
fig.write_image(str(out_path))
fig

# svo_dist_norm

In [4]:
# Histogram of the "svo_dist_norm" label column, exported as PNG and shown inline.
c = "svo_dist_norm"
fig = px.histogram(df, x=c)
fig = fig.update_layout(
    title="Subject-Verb-Object Distance Normalized by <br>Sentence Length Label Histogram",
    width=500,
    height=400,
)
# write_image does not create missing folders, so make sure the output
# directory exists before exporting (e.g. on a fresh checkout).
out_path = pathlib.Path(results_dir, f"{c}_hist.png")
out_path.parent.mkdir(parents=True, exist_ok=True)
fig.write_image(str(out_path))
fig

# apv

In [28]:
# Bar chart of class counts for the "apv" label, annotated with percentages.
c = "apv"
idx_mapping = {0: "passive voice", 1: "active voice", 2: "both"}

# Name the index and the counts explicitly instead of renaming the columns
# that reset_index() happens to produce: pandas < 2.0 yields ["index", c],
# pandas >= 2.0 yields [c, "count"], so the rename-based form breaks there.
df_fig = df[c].value_counts().rename_axis("Label").reset_index(name="Sample Count")
# Dict lookup (not .map) so an unexpected class id raises KeyError instead of
# silently becoming NaN.
df_fig["Label"] = df_fig["Label"].apply(lambda val: idx_mapping[val])

N = df_fig["Sample Count"].sum()
df_fig["Percentage"] = (100.0 * df_fig["Sample Count"] / N).map("{:.2f}%".format)

fig = px.bar(df_fig, x="Label", y="Sample Count", text="Percentage")
fig = fig.update_layout(
    title="Active/Passive Voice Label Distribution",
    width=500,
    height=400,
)
# write_image does not create missing folders, so make sure the output
# directory exists before exporting (e.g. on a fresh checkout).
out_path = pathlib.Path(results_dir, f"{c}_distribution.png")
out_path.parent.mkdir(parents=True, exist_ok=True)
fig.write_image(str(out_path))
fig

# scv

In [27]:
# Bar chart of class counts for the "scv" label, annotated with percentages.
c = "scv"
idx_mapping = {0: "not simplest <br>form of verb(s)", 1: "simple verb(s)", 2: "both"}

# Name the index and the counts explicitly instead of renaming the columns
# that reset_index() happens to produce: pandas < 2.0 yields ["index", c],
# pandas >= 2.0 yields [c, "count"], so the rename-based form breaks there.
df_fig = df[c].value_counts().rename_axis("Label").reset_index(name="Sample Count")
# Dict lookup (not .map) so an unexpected class id raises KeyError instead of
# silently becoming NaN.
df_fig["Label"] = df_fig["Label"].apply(lambda val: idx_mapping[val])

N = df_fig["Sample Count"].sum()
df_fig["Percentage"] = (100.0 * df_fig["Sample Count"] / N).map("{:.2f}%".format)

fig = px.bar(df_fig, x="Label", y="Sample Count", text="Percentage")
fig = fig.update_layout(
    title="Simplest Form of Verb Label Distribution",
    width=500,
    height=400,
)
# write_image does not create missing folders, so make sure the output
# directory exists before exporting (e.g. on a fresh checkout).
out_path = pathlib.Path(results_dir, f"{c}_distribution.png")
out_path.parent.mkdir(parents=True, exist_ok=True)
fig.write_image(str(out_path))
fig

# hv

In [26]:
# Bar chart of class counts for the "hv" label, annotated with percentages.
c = "hv"
idx_mapping = {0: "hidden verb(s)", 1: "no hidden verbs", 2: "both"}

# Name the index and the counts explicitly instead of renaming the columns
# that reset_index() happens to produce: pandas < 2.0 yields ["index", c],
# pandas >= 2.0 yields [c, "count"], so the rename-based form breaks there.
df_fig = df[c].value_counts().rename_axis("Label").reset_index(name="Sample Count")
# Dict lookup (not .map) so an unexpected class id raises KeyError instead of
# silently becoming NaN.
df_fig["Label"] = df_fig["Label"].apply(lambda val: idx_mapping[val])

N = df_fig["Sample Count"].sum()
df_fig["Percentage"] = (100.0 * df_fig["Sample Count"] / N).map("{:.2f}%".format)

fig = px.bar(df_fig, x="Label", y="Sample Count", text="Percentage")
fig = fig.update_layout(
    title="Hidden Verbs Label Distribution",
    width=500,
    height=400,
)
# write_image does not create missing folders, so make sure the output
# directory exists before exporting (e.g. on a fresh checkout).
out_path = pathlib.Path(results_dir, f"{c}_distribution.png")
out_path.parent.mkdir(parents=True, exist_ok=True)
fig.write_image(str(out_path))
fig