### Setup

In [None]:
from pathlib import Path
from collections import Counter
import itertools
import logging

import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
import joblib
import hydra
import os


from matplotlib import font_manager


from keypoint_extraction_pipeline.schemas.annotation import AnnotationRecord
from keypoint_extraction_pipeline.savers.json_saver import JSONSaver
from fingerspelling_trainer.training.utils.alphabets import Alphabet

# Notebook-wide defaults: 8x4 figures with the axes grid switched on.
plt.rcParams.update({
    "figure.figsize": (8, 4),
    "axes.grid": True,
})
# Let INFO-level (and more severe) log records through the root logger.
logging.basicConfig(level=logging.INFO)

In [None]:
DATA_DIR = Path("/home/gts/projects/lruanova/projects/signamed/data/LSE/transformed/")

# Scaler stats: both pickled scalers sit directly inside DATA_DIR.
# NOTE(review): joblib.load unpickles its input - only point it at trusted files.
vel_scaler_file = str(DATA_DIR / "vel.pkl")
kp_scaler_file = str(DATA_DIR / "kp.pkl")
scaler_vel = joblib.load(vel_scaler_file)
scaler_kp = joblib.load(kp_scaler_file)

# Report the fitted mean / scale of each scaler.
print("Scaler (wrist-vel):")
print("Mean (vx, vy, vz):", scaler_vel.mean_)
print("Std (vx, vy, vz):", scaler_vel.scale_)
print("\nScaler (kps):")
print("Mean (x, y, z):", scaler_kp.mean_)
print("Std (x, y, z):", scaler_kp.scale_)

# Alphabet: compose the Hydra config, then instantiate whatever it declares
# under `dataset.alphabet`.
with hydra.initialize(version_base="1.3", config_path="../config"):
    cfg = hydra.compose(config_name="config")

alphabet_cfg = cfg.dataset.alphabet
ALPHABET: Alphabet = hydra.utils.instantiate(alphabet_cfg)

**Helpers**

In [None]:
def load_annotation(path: Path) -> AnnotationRecord:
    """Load the annotation record stored at ``path`` via ``JSONSaver.load_record``."""
    record = JSONSaver.load_record(path)
    return record

def duration_frames(record: AnnotationRecord) -> int:
    """Return the length of ``record.frames``, i.e. the sample duration in frames."""
    frames = record.frames
    return len(frames)

def raw_label(record: AnnotationRecord) -> str:
    """Return ``record.metadata.label``, or an empty string when it is missing/falsy."""
    label = record.metadata.label
    return label if label else ""

def signing_hand(record: AnnotationRecord) -> str:
    """Return the lower-cased ``record.metadata.handness``.

    A missing/falsy value yields ``""``; otherwise the expected values are
    presumably "left" or "right" (verify against the annotation schema).
    """
    hand = record.metadata.handness
    if not hand:
        return ""
    return hand.lower()

**Load all samples**

In [None]:
splits = ["train", "validation", "test"]

# Index the annotation JSON files of every split, sorted by path.
file_lists = {}
for split_name in splits:
    file_lists[split_name] = sorted((DATA_DIR / split_name).glob("*.json"))

# One summary line per split: number of samples and total size on disk.
for split_name, paths in file_lists.items():
    total_mb = sum(p.stat().st_size for p in paths) / 1e6
    print(f"{split_name:<10}: {len(paths):5d} muestras – {total_mb:.1f} MB")

### Duration of annotations (in frames)

**Plot - frames per sample distribution**

In [None]:
# Build one row per annotation file: path, split, duration in frames, raw label.
# The `file_lists` index and the helper accessors defined earlier in the notebook
# are reused here instead of re-globbing DATA_DIR and repeating the same lookups.
records = []
for split, files in file_lists.items():
    # Loading every JSON is slow, so show progress per split.
    for json_path in tqdm(files, desc=f"Durations {split}"):
        rec: AnnotationRecord = load_annotation(json_path)
        records.append({
            "file": str(json_path),
            "split": split,
            "frames": duration_frames(rec),
            "label": raw_label(rec),
        })

# Explicit columns keep the schema stable even when no files were found, so the
# cells below fail on missing data rather than on a missing column.
dur_df = pd.DataFrame(records, columns=["file", "split", "frames", "label"])

In [None]:
# Save the per-sample duration table to a parquet file in the working directory.
dur_cache_file = "dur_df.parquet"
dur_df.to_parquet(dur_cache_file)

In [None]:
# Load the per-sample duration table back from the parquet file in the working directory.
dur_cache_file = "dur_df.parquet"
dur_df = pd.read_parquet(dur_cache_file)

In [None]:
# Register every font file found under font_dir and make Times New Roman the
# default family for all figures.
font_dir = "/home/gts/projects/lruanova/misc/fonts/times_new_roman"
font_files = font_manager.findSystemFonts(fontpaths=font_dir)
for font_file in font_files:
    font_manager.fontManager.addfont(font_file)
mpl.rcParams.update({
    'font.family': 'Times New Roman',
    # fonttype 42 selects Type 42 (TrueType) fonts in PDF / PS output.
    'pdf.fonttype': 42,
    'ps.fonttype': 42,
})

# prepare data
label_len = dur_df["label"].str.len()
# Empty labels get NaN length so the ratio below is NaN instead of a division by zero.
dur_df["len_label"] = label_len.replace(0, np.nan)
dur_df["frames_per_char"] = dur_df["frames"].div(dur_df["len_label"])

def extract_person_id(path_str: str) -> str:
    """Return the second "_"-separated token of the file name in ``path_str``.

    That token presumably holds the signer id (p1, p2, ...). When the file
    name has fewer than two tokens the string "unknown" is returned.
    """
    tokens = Path(path_str).name.split("_")
    return tokens[1] if len(tokens) > 1 else "unknown"

# Tag every sample with its signer id, then keep only rows where one was found.
dur_df["person_id"] = [extract_person_id(file_path) for file_path in dur_df["file"]]
has_known_signer = dur_df["person_id"].ne("unknown")
dur_df = dur_df.loc[has_known_signer].copy()

# plot func
def plot_speed_by_person_ieee(split_name: str, save_path: str):
    """Draw, save and show a per-signer box plot of frames-per-character.

    Reads the notebook-level ``dur_df``, keeps the rows whose ``split`` equals
    ``split_name`` and orders the signers on the x-axis by ascending median
    ``frames_per_char``. The y-axis is logarithmic with ticks fixed at 1 and 10.

    Args:
        split_name: value of the ``split`` column to plot.
        save_path: file the figure is written to (600 dpi, white background).
    """
    fig, ax = plt.subplots(figsize=(7.1, 3))

    # Rows of the requested split, with signers ranked by median speed.
    split_rows = dur_df.loc[dur_df["split"] == split_name].copy()
    median_speed = split_rows.groupby("person_id")["frames_per_char"].median()
    signer_order = median_speed.sort_values().index.tolist()

    box_kwargs = dict(
        data=split_rows,
        x="person_id",
        y="frames_per_char",
        order=signer_order,
        palette="tab10",
        fliersize=2,
        ax=ax,
    )
    sns.boxplot(**box_kwargs)

    # Log-scaled y axis with two hand-labelled decades.
    ax.set_yscale("log")
    ax.set_yticks([1, 10])
    ax.set_yticklabels([r'$10^0$', r'$10^1$'], fontsize=10)

    # Axis labels, no title, rotated signer ids.
    ax.set_xlabel("Signer ID", fontsize=12, labelpad=4)
    ax.set_ylabel("Frames per character", fontsize=12, labelpad=4)
    ax.set_title("")
    ax.tick_params(axis='x', labelrotation=45, labelsize=10)
    ax.tick_params(axis='y', labelsize=10)

    fig.tight_layout()
    # Force a white background on both the figure and the axes before saving.
    for surface in (fig.patch, ax):
        surface.set_facecolor('white')
    fig.savefig(save_path, dpi=600, bbox_inches='tight', facecolor='white')
    plt.show()

# One figure per split, each written to its own PNG.
split_figures = (
    ("train", "fig1.png"),
    ("validation", "fig2.png"),
    ("test", "fig3.png"),
)
for split_name, out_file in split_figures:
    plot_speed_by_person_ieee(split_name, out_file)


**Annotations per signer**

In [None]:
def extract_person_number(path_str):
    """Return the numeric signer id encoded in an annotation file name.

    The file name is split on "_" and the second token is expected to look
    like ``p<digits>`` (e.g. ``p1`` -> ``1``).

    Args:
        path_str: path (or bare file name) of the annotation file.

    Returns:
        The signer number as an ``int``, or ``np.nan`` when the name has fewer
        than two "_"-separated tokens, the second token does not start with
        "p", or the text after the "p" is not an integer.
    """
    parts = Path(path_str).name.split("_")
    if len(parts) < 2 or not parts[1].startswith("p"):
        return np.nan
    try:
        return int(parts[1][1:])  # p1 -> 1
    except ValueError:
        # Tokens such as "p", "pilot" or "p3.json" previously raised and
        # aborted the whole DataFrame.apply; treat them as "no signer number".
        return np.nan

# Attach the numeric signer id and keep only the rows where one could be parsed.
dur_df["person_number"] = dur_df["file"].apply(extract_person_number)
has_number = dur_df["person_number"].notna()
dur_df_clean = dur_df.loc[has_number].copy()
dur_df_clean["person_number"] = dur_df_clean["person_number"].astype(int)

# Number of annotations per signer, ordered by signer number.
signer_counts = dur_df_clean["person_number"].value_counts()
annots_per_signer = signer_counts.sort_index()

fig, ax = plt.subplots(figsize=(15, 5))
sns.barplot(
    x=annots_per_signer.index.astype(str),
    y=annots_per_signer.values,
    color="skyblue",
    ax=ax,
)
ax.set_xlabel("Signer ID", fontsize=16, labelpad=10)
ax.set_ylabel("Number of annotations", fontsize=16, labelpad=10)
ax.set_title("Annotations per signer", fontsize=18, pad=15)
# `ax` is the current axes, so the pyplot tick helpers act on it.
plt.xticks(fontsize=12, rotation=90)
plt.yticks(fontsize=14)
fig.tight_layout()
plt.show()

--------------

**Stats**

In [None]:
# Descriptive statistics of the sample duration (in frames), one row per split.
frames_by_split = dur_df.groupby("split")["frames"]
stats = frames_by_split.describe()
stats

**Suggested outlier thresholds (1.5 × IQR below Q1 / above Q3)**

In [None]:
# Outlier fences for the duration: 1.5 * IQR below Q1 and above Q3.
frame_quartiles = dur_df["frames"].quantile([0.25, 0.75])
q1, q3 = frame_quartiles
iqr = q3 - q1
low_lim = q1 - 1.5*iqr
up_lim = q3 + 1.5*iqr
print(f"Suggested intervals: <{low_lim:.1f} | >{up_lim:.1f} frames")

**Top 50 shortest and longest sequences**

In [None]:
# Lift the column-width limit so long file paths are not truncated in the tables.
pd.set_option('display.max_colwidth', None)

shown_cols = ["file", "frames"]
shortest = dur_df.nsmallest(50, "frames")[shown_cols]
longest = dur_df.nlargest(50, "frames")[shown_cols]

# Print each heading followed by its table.
for heading, table in (("\nTop 50 shortest:", shortest), ("\nTop 50 longest:", longest)):
    print(heading)
    display(table)

### Label analysis

**Load labels into dataframe**

In [None]:
# One row per annotation file with its split and upper-cased, stripped label.
labels = []
for split, files in file_lists.items():
    progress = tqdm(files, desc=f"Labels {split}")
    for json_path in progress:
        record = load_annotation(json_path)
        normalized = raw_label(record).upper().strip()
        labels.append({"split": split, "file": json_path, "label": normalized})
label_df = pd.DataFrame(labels)

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib as mpl
from matplotlib import font_manager
from fingerspelling_trainer.data.transformations.encode_label import EncodeLabel

# Register every font file found under font_dir and make Times New Roman the
# default family for all figures.
font_dir = "/home/gts/projects/lruanova/misc/fonts/times_new_roman"
font_files = font_manager.findSystemFonts(fontpaths=font_dir)
for font_file in font_files:
    font_manager.fontManager.addfont(font_file)
mpl.rcParams.update({
    'font.family': 'Times New Roman',
    # fonttype 42 selects Type 42 (TrueType) fonts in PDF / PS output.
    'pdf.fonttype': 42,
    'ps.fonttype': 42,
})

# Prepare data
label_formatter = EncodeLabel(
    alphabet=ALPHABET,
    remove_non_alphabetic=False,
    collapse_repeated=False,
    include_spaces=False,
    validate=False,
)

# label -> formatted label -> encoded tokens -> symbol names.
df_sym_processed = label_df.copy()
df_sym_processed['formatted_label'] = df_sym_processed['label'].apply(label_formatter._format_label)
df_sym_processed['encoded_tokens'] = df_sym_processed['formatted_label'].apply(ALPHABET.encode_label)
df_sym_processed['symbol_list'] = df_sym_processed['encoded_tokens'].apply(
    # Tokens missing from NUM_TO_LETTER are kept visible under a placeholder name.
    lambda token_ids: [
        ALPHABET.NUM_TO_LETTER.get(token_id, f"UNKNOWN_TOKEN_{token_id}")
        for token_id in token_ids
    ]
)

# One row per (sample, symbol), then occurrences per symbol and split.
split_and_symbols = df_sym_processed[['split', 'symbol_list']]
df_exploded_symbols = split_and_symbols.explode('symbol_list')
valid_symbols = df_exploded_symbols.dropna(subset=['symbol_list'])
sym_counts_processed = (
    valid_symbols
    .groupby(['symbol_list', 'split'])
    .size()
    .reset_index(name='count')
)

# Wide table: symbols as rows, splits as columns, missing combinations as 0.
sym_pivot_processed = (
    sym_counts_processed
    .pivot(index='symbol_list', columns='split', values='count')
    .fillna(0)
    .astype(int)
)

# order (desc): sort by a temporary row total, then drop it again.
sym_pivot_processed = (
    sym_pivot_processed
    .assign(total=sym_pivot_processed.sum(axis=1))
    .sort_values(by='total', ascending=False)
    .drop(columns='total')
)

# Visualize: stacked bars, one bar per symbol and one colour per split column.
split_colors = ['#3B5BA4', '#F59C3F', '#B4B4B4']
fig, ax = plt.subplots(figsize=(11, 5))
sym_pivot_processed.plot(kind='bar', stacked=True, ax=ax, color=split_colors)

# Axis labels, no title.
ax.set_xlabel('Symbol', fontsize=16, labelpad=6)
ax.set_ylabel('Number of occurrences', fontsize=16, labelpad=6)
ax.set_title('')

# Tick styling and legend.
ax.tick_params(axis='x', labelrotation=60, labelsize=13)
ax.tick_params(axis='y', labelsize=13)
ax.legend(title='Split', fontsize=12, title_fontsize=13)

fig.tight_layout()
# Force a white background on both the figure and the axes before saving.
for surface in (fig.patch, ax):
    surface.set_facecolor('white')
fig.savefig('test.png', dpi=600, bbox_inches='tight', facecolor='white')
plt.show()


**Empty labels, single-symbol labels, and the 50 shortest labels**

In [None]:
# Label length in characters, then the empty and single-character subsets.
label_df['len'] = label_df['label'].str.len()
label_lengths = label_df['len']
empty = label_df[label_lengths == 0]
single = label_df[label_lengths == 1]

print(f'Empty: {len(empty)}')
print(f'Unique symbol: {len(single)}')
print('Top 50 shortest:')
shortest_labels = label_df.nsmallest(50, 'len')
display(shortest_labels[['file', 'label', 'len']])

**Symbols ordered by frequency**

In [None]:
# Count every character across all labels.
sym_counter = Counter()
for label in label_df.label:
    sym_counter.update(label)

# Frequency table, most common first, with each symbol's share of the total.
sym_df = pd.DataFrame(sym_counter.items(), columns=['symbol', 'count'])
sym_df = sym_df.sort_values('count', ascending=False)
sym_df = sym_df.assign(freq=sym_df['count'] / sym_df['count'].sum())

# Horizontal bars for (at most) the 100 most frequent symbols.
top_symbols = sym_df.head(100)
tick_step = 250
plt.figure(figsize=(10, 12))
sns.barplot(data=top_symbols, y='symbol', x='count')
plt.title('Symbols order by frequency')
plt.yticks(fontsize=14)
plt.xticks(np.arange(0, sym_df['count'].max() + tick_step, tick_step))
plt.tight_layout()
plt.show()

**Pairs ordered by frequency**

In [None]:
# Count every adjacent character pair across all labels.
pairs = [label[i:i+2] for label in label_df.label for i in range(len(label)-1)]
bi_counter = Counter(pairs)
bi_df = (
    pd.DataFrame(bi_counter.items(), columns=["digraph", "count"])
    .sort_values("count", ascending=False)
)

# Horizontal bars for (at most) the 100 most frequent pairs.
plt.figure(figsize=(10, 20))
sns.barplot(data=bi_df.head(100), y='digraph', x='count')
# The title previously read "Symbols ...", copied from the single-symbol plot.
plt.title('Pairs ordered by frequency')
plt.yticks(fontsize=14)
# Size the x ticks from the pair counts themselves. Using `sym_df` (the
# single-symbol counts of another cell) stretched the axis far past the data
# and made this cell depend on that cell having been run.
max_pair_count = int(bi_df['count'].max()) if not bi_df.empty else 0
plt.xticks(np.arange(0, max_pair_count + 250, 250))
plt.tight_layout()
plt.show()


**Label distribution per partition**

In [None]:
# One row per (sample, character) of the raw labels.
labels_as_chars = label_df.assign(symbol_list=label_df['label'].apply(list))
df_sym = labels_as_chars.explode('symbol_list')

# Occurrences of each character per split, then pivoted to one column per split.
sym_counts = (
    df_sym
    .groupby(['symbol_list', 'split'])
    .size()
    .reset_index(name='count')
)
sym_pivot = (
    sym_counts
    .pivot(index='symbol_list', columns='split', values='count')
    .fillna(0)
    .astype(int)
)

display(sym_pivot)

### Left/Right handed distribution

**Load data**

In [None]:
# One row per annotation file with the signing hand; a missing value is "unknown".
hand_stats = []
for split, files in file_lists.items():
    for json_path in files:
        hand = signing_hand(load_annotation(json_path))
        hand_stats.append({
            "split": split,
            "file": json_path,
            "hand": hand if hand else "unknown",
        })
hand_df = pd.DataFrame(hand_stats)

**Count left vs right-handed**

In [None]:
# Samples per signing hand, with one bar colour per split.
hand_ax = sns.countplot(data=hand_df, x="hand", hue="split")
hand_ax.set_title("Dist left / right handed")
plt.show()