In [None]:
import sys
from pathlib import Path
# Make the local `srs_utils.py` importable: look in the working directory and
# its two nearest ancestors, and put the first directory that contains the
# file at the front of `sys.path`.
nb_dir = Path.cwd()
parent_dir = nb_dir.parent
candidates = [nb_dir, parent_dir, parent_dir.parent]
for cand in candidates:
    if not (cand / 'srs_utils.py').exists():
        continue
    # First match wins; stop searching once the module location is known.
    sys.path.insert(0, str(cand))
    break


In [None]:
from pathlib import Path
import numpy as np
import matplotlib.pyplot as plt
from srs_utils import (
    open_collection,
    build_deck_and_model_maps,
    load_reviews,
    available_decks,
    filter_by_decks,
    plot_population_distributions,
)


In [None]:
# Copy your collection to the tmp path here. Usually you can find it at:
# - macOS: ~/Library/Application Support/Anki2/User 1/collection.anki2
db_path = Path("../tmp/collection.anki2")

db_path = db_path.expanduser().resolve()
print(f'User provided: {db_path}')

# Fail fast with an actionable message instead of handing a non-existent path
# to the loader in the next cell.
if not db_path.exists():
    raise FileNotFoundError(
        f'Collection path does not exist: {db_path}. '
        'Copy your Anki collection there or edit `db_path` above.'
    )

# Resolve directory → collection file if needed
candidates = ('collection.anki21', 'collection.anki2', 'collection.sqlite')
src_path = db_path
if src_path.is_dir():
    for cand in candidates:
        p = src_path / cand
        if p.exists():
            src_path = p
            print(f'Resolved directory to: {src_path}')
            break
    else:
        # None of the known file names matched; the directory itself is
        # passed on unchanged, so make that visible to the reader.
        print(f'Warning: none of {candidates} found in {src_path}; passing the directory through')

to_open = src_path

print(f'Opening: {to_open}')


In [None]:
# Open the collection, build the deck/model lookup maps, and load the reviews.
# `load_reviews` hands back two frames, `df_all` and `df_valid`
# (presumably `df_valid` is the subset that passed srs_utils' validity
# filter — confirm in srs_utils).
con = open_collection(to_open)
deck_map, model_map = build_deck_and_model_maps(con)
df_all, df_valid = load_reviews(con, deck_map, model_map)
print(f'Rows: all={len(df_all)}, valid={len(df_valid)}')
deck_count = len(available_decks(df_valid))
print('Decks:', deck_count)


In [None]:
# Default selection: every valid review. The copy keeps `df_valid` untouched
# if `df_sel` is later modified.
df_sel = df_valid.copy()

# List the decks that can be chosen in the selector widget.
decks_sorted = available_decks(df_valid)
print(f'Available decks ({len(decks_sorted)}):')
for deck_label in decks_sorted:
    print(' -', deck_label)
import ipywidgets as widgets
from IPython.display import display
# Widgets: an output area for feedback, a button to commit the choice, and a
# multi-select list showing at most 12 decks at a time.
out = widgets.Output()
apply_btn = widgets.Button(description='Apply selection', button_style='primary')
deck_selector = widgets.SelectMultiple(
    options=decks_sorted,
    description='Decks',
    rows=min(12, len(decks_sorted)),
)


def _apply(_):
    """Button callback: rebind the global `df_sel` to the chosen decks.

    The click event argument is ignored. When nothing is selected in
    `deck_selector`, every deck in `decks_sorted` is used. The resulting row
    count is printed into the `out` widget, replacing its previous content.
    """
    global df_sel
    chosen = list(deck_selector.value)
    if not chosen:
        chosen = decks_sorted
    df_sel = filter_by_decks(df_valid, chosen)
    with out:
        out.clear_output()
        print('Selected rows:', len(df_sel))


apply_btn.on_click(_apply)
display(deck_selector, apply_btn, out)


In [None]:
# Population-level distributions (review time, cue length, and time by correctness)
# for the current selection `df_sel`: all valid reviews by default, or the
# decks chosen with the "Apply selection" button above.
plot_population_distributions(df_sel)

In [None]:
# Per-card summary: median time, variability, and counts
def _most_common_or_none(values):
    """Return the most frequent non-null entry of `values`, or None if there is none.

    `Series.mode()` ignores nulls by default and returns an empty Series when
    every entry is null, so the emptiness check must be made on the mode
    result rather than on the input group (which is never empty in a groupby).
    """
    modes = values.mode()
    return modes.iloc[0] if len(modes) else None


by_card = (
    df_sel.groupby('cid')
    .agg(n_reviews=('cid', 'size'),
         med_time_s=('review_s', 'median'),
         mean_time_s=('review_s', 'mean'),
         std_time_s=('review_s', 'std'),
         cue_len=('cue_length', 'median'),
         deck=('deck_name', _most_common_or_none))
)
# Coefficient of variation (std / mean). It is NaN where std is NaN and can be
# +/-inf where the mean time is 0; the histogram cell filters those out.
by_card['cv_time'] = by_card['std_time_s'] / by_card['mean_time_s']
# Most-reviewed cards first; ties broken by the faster median time.
by_card = by_card.sort_values(['n_reviews', 'med_time_s'], ascending=[False, True])
by_card.head(5)


In [None]:
# Distributions over cards: one histogram per per-card statistic.
# Infinite coefficients of variation are mapped to NaN and then dropped so
# they do not reach the histogram.
cv_finite = by_card['cv_time'].replace([np.inf, -np.inf], np.nan).dropna()
panels = [
    # (values, bins, colour, title, x-axis label)
    (by_card['n_reviews'], 50, '#72b7b2', 'Per-card review counts', 'n reviews'),
    (by_card['med_time_s'].dropna(), 60, '#e45756', 'Per-card median time (s)', 'seconds'),
    (cv_finite, 60, '#54a24b', 'Per-card coefficient of variation', 'std / mean'),
]

fig, axes = plt.subplots(1, 3, figsize=(18,5))
for ax, (values, n_bins, color, title, xlabel) in zip(axes, panels):
    ax.hist(values, bins=n_bins, color=color)
    ax.set_title(title)
    ax.set_xlabel(xlabel)
plt.tight_layout()
plt.show()


In [None]:
# Statistic used to collapse each card's review times into one value.
AGG = 'mean'
assert AGG in {'median', 'mean'}

def aggregate_by_card_correctness(df, agg: str = 'median'):
    """Summarise reviews per card, separately for correct and incorrect answers.

    Parameters
    ----------
    df : DataFrame
        Must provide the columns 'cid', 'is_correct', 'cue_length' and
        'review_s'. 'is_correct' is used directly as a boolean mask and
        negated with `~`.
    agg : str
        Aggregation applied to 'review_s' within each card.

    Returns
    -------
    (by_correct, by_incorrect) : tuple of DataFrame
        Each is indexed by 'cid' with columns 'cue_len' (per-card median of
        'cue_length') and 'time_s' (per-card `agg` of 'review_s'); rows
        containing NaN are dropped.
    """
    correct_mask = df['is_correct']
    # Cue length always uses the median (stable across reviews); only the
    # review-time statistic is configurable.
    named_aggs = {
        'cue_len': ('cue_length', 'median'),
        'time_s': ('review_s', agg),
    }

    def _per_card(subset):
        return subset.groupby('cid').agg(**named_aggs).dropna()

    return _per_card(df[correct_mask]), _per_card(df[~correct_mask])

# Card-level aggregates for the current selection, using the statistic chosen in AGG.
by_card_correct, by_card_incorrect = aggregate_by_card_correctness(df_sel, agg=AGG)
print(f'Cards with correct reviews: {len(by_card_correct)}; with incorrect reviews: {len(by_card_incorrect)}')


In [None]:
# Plot aggregated: time vs cue length, segmented by correctness (card-level)
def _scatter_with_fit(x, y, label, color, ax=None):
    """Scatter `y` against `x` and overlay a least-squares straight line.

    Parameters
    ----------
    x, y : array-like
        Paired coordinates; converted with `np.asarray`.
    label : str
        Legend label attached to the scatter points.
    color : matplotlib colour
        Used for both the points and the fitted line.
    ax : matplotlib Axes, optional
        Axes to draw on. Defaults to the current pyplot axes.

    Returns
    -------
    (a, b) : tuple of float
        Intercept and slope of the fitted line ``y = a + b * x``, or
        ``(0.0, 0.0)`` when no line was fitted.
    """
    x = np.asarray(x)
    y = np.asarray(y)
    if ax is None:
        ax = plt.gca()
    ax.scatter(x, y, s=10, alpha=0.25, label=label, color=color, edgecolors='none')
    # A straight line needs at least two distinct x values. With fewer points,
    # or with every x identical, the fit is rank-deficient (np.polyfit warns
    # and the slope is meaningless), so no line is drawn. A NaN in x also
    # fails the comparison and lands here.
    if x.size < 2 or not (x.max() > x.min()):
        return 0.0, 0.0
    # np.polyfit returns coefficients highest degree first: slope, then intercept.
    b, a = np.polyfit(x, y, 1)
    xx = np.linspace(float(x.min()), float(x.max()), 100)
    ax.plot(xx, a + b * xx, color=color, linewidth=2)
    return a, b

plt.figure(figsize=(8,6))
# Each call draws one correctness group on the current axes and returns the
# (intercept, slope) of its fitted line.
fit_correct = _scatter_with_fit(by_card_correct['cue_len'], by_card_correct['time_s'], 'Correct', '#54a24b')
fit_incorrect = _scatter_with_fit(by_card_incorrect['cue_len'], by_card_incorrect['time_s'], 'Incorrect', '#e45756')
a1, b1 = fit_correct
a2, b2 = fit_incorrect
print(a1, b1)
print(a2, b2)
plt.title(f'Time vs cue length by correctness (card-level, {AGG})')
plt.xlabel('Cue length (chars)')
plt.ylabel(f'{AGG.title()} review time (s) per card')
plt.legend()
plt.show()
