# CATEGORIES PER SEQUENCE

In [None]:
import pandas as pd

# Gzipped batting table from the public demo-data repository (baseball dataset, `dev` branch).
batting_csv_url = "https://github.com/mostly-ai/public-demo-data/raw/refs/heads/dev/baseball/batting.csv.gz"

# Read the remote CSV into a DataFrame, then show its first two rows as the cell output.
df_tgt = pd.read_csv(batting_csv_url)
df_tgt.head(2)

In [None]:
from mostlyai.qa._coherence import pull_data_for_coherence

# Run the loaded table through `pull_data_for_coherence`, with "players_id" as the context key.
# `df_tgt` is rebound to the helper's return value, so the cells below that read `df_tgt`
# operate on that result rather than on the raw CSV frame.
# NOTE(review): presumably this prepares/samples the data for the coherence metrics - the exact
# transformation is not visible in this notebook; confirm in mostlyai.qa._coherence.
df_tgt = pull_data_for_coherence(df_tgt=df_tgt, tgt_context_key="players_id")
# Cell output: first two rows of the returned frame.
df_tgt.head(2)

In [None]:
from mostlyai.qa._coherence import calculate_categories_per_sequence

# Compute the "categories per sequence" table from `df_tgt`, with "players_id" as the context key.
# The result still carries a "players_id" column (the plotting loop further down skips it).
# NOTE(review): presumably one row per players_id with a per-column count of distinct
# categories - inferred from the function name only; confirm against the library.
categories_per_sequence_df = calculate_categories_per_sequence(df=df_tgt, context_key="players_id")
# Cell output: first two rows of the result.
categories_per_sequence_df.head(2)

In [None]:
from mostlyai.qa._accuracy import calculate_numeric_uni_kdes

# Build the per-column numeric KDE inputs from the categories-per-sequence table.
# The result is used as a column-name -> value mapping (indexed here by "team",
# and read with `.get(col)` in the plotting loop below).
# NOTE(review): the type of each mapped value is not visible here - confirm in mostlyai.qa._accuracy.
trn_num_kdes = calculate_numeric_uni_kdes(categories_per_sequence_df)
# Cell output: the entry for the "team" column.
trn_num_kdes["team"]

In [None]:
from mostlyai.qa._accuracy import bin_data


# Bin the categories-per-sequence table with `bins=10`. Only the first element of what
# `bin_data` returns is kept; any further elements are ignored by this notebook.
binned_outputs = bin_data(categories_per_sequence_df, bins=10)
cat_share_per_sequence_binned = binned_outputs[0]

# Cell output: first two rows of the binned table.
cat_share_per_sequence_binned.head(2)

In [None]:
from mostlyai.qa._accuracy import calculate_categorical_uni_counts


# Compute per-column counts over the binned table, with `hash_rare_values` explicitly disabled.
# The result is indexed by column name (here "team", and by `col` in the plotting loop below).
# NOTE(review): presumably value counts per bin label - confirm in mostlyai.qa._accuracy.
trn_bin_col_cnts = calculate_categorical_uni_counts(df=cat_share_per_sequence_binned, hash_rare_values=False)
# Cell output: the counts for the "team" column.
trn_bin_col_cnts["team"]

In [None]:
from mostlyai.qa._accuracy import plot_univariate

# Render one univariate plot per column of the categories-per-sequence table.
# The training-side statistics are passed for both the `trn_*` and `syn_*` arguments,
# so each plot compares the data against itself. `accuracy=0.5` is a fixed literal
# (NOTE(review): presumably a placeholder for the demo - confirm).
for col in categories_per_sequence_df.columns:
    if col == "players_id":
        # The context key is an identifier, not a feature to plot.
        continue

    # `.get` yields None when no KDE entry exists for the column; the binned counts
    # are looked up with `[]` and therefore must exist for every plotted column.
    col_kde = trn_num_kdes.get(col)
    col_bin_cnts = trn_bin_col_cnts[col]

    figure = plot_univariate(
        col_name=col,
        trn_num_kde=col_kde,
        syn_num_kde=col_kde,
        trn_cat_col_cnts=None,
        syn_cat_col_cnts=None,
        trn_bin_col_cnts=col_bin_cnts,
        syn_bin_col_cnts=col_bin_cnts,
        accuracy=0.5,
    )
    display(figure)

# SEQUENCES PER CATEGORY

In [None]:
from mostlyai.qa._coherence import calculate_sequences_per_category

# Compute the "sequences per category" statistics from `df_tgt`, with "players_id" as the
# context key. The call returns three values: a per-column dict, a per-column dict of
# binned values, and a count total that is later passed to the plots as `*_cnt_sum`.
seq_per_cat_outputs = calculate_sequences_per_category(df=df_tgt, context_key="players_id")
sequences_per_category_dict, sequences_per_category_binned_dict, cnt_sum = seq_per_cat_outputs

# Show the "team" entries: the first two rows of the raw one, then the full binned one.
team_preview = sequences_per_category_dict["team"].head(2)
display(team_preview)
team_binned = sequences_per_category_binned_dict["team"]
display(team_binned)

In [None]:
from mostlyai.qa._accuracy import plot_univariate

# Render one univariate plot per column from the sequences-per-category statistics.
# As in the earlier plotting cell, the same training-side inputs are passed for both the
# `trn_*` and `syn_*` arguments, and `accuracy=0.5` is a fixed literal.
# NOTE(review): the loop walks the columns of `categories_per_sequence_df` but indexes
# `sequences_per_category_dict` / `sequences_per_category_binned_dict`; this assumes both
# dicts have an entry for every non-key column of that frame - confirm.
for col in categories_per_sequence_df.columns:
    if col == "players_id":
        # The context key is an identifier, not a feature to plot.
        continue

    col_cat_cnts = sequences_per_category_dict[col]
    col_binned_cnts = sequences_per_category_binned_dict[col]

    figure = plot_univariate(
        col_name=col,
        trn_num_kde=None,
        syn_num_kde=None,
        trn_cat_col_cnts=col_cat_cnts,
        syn_cat_col_cnts=col_cat_cnts,
        trn_bin_col_cnts=col_binned_cnts,
        syn_bin_col_cnts=col_binned_cnts,
        accuracy=0.5,
        trn_cnt_sum=cnt_sum,
        syn_cnt_sum=cnt_sum,
    )
    display(figure)