# Dataset Overview

In [None]:
from collections import Counter
import matplotlib.pyplot as plt
import pandas as pd

from surgical_phase_tool.config_loader import load_config
from surgical_phase_tool.dataset import PHASE_TO_ID, TOOL_COLUMNS

# Load the project configuration and pull both manifest entries out of its
# 'paths' section in one pass.
cfg = load_config()
train_manifest, test_manifest = (
    cfg['paths'][key]
    for key in ('train_manifest_resolved', 'test_manifest_resolved')
)
# Echo the two resolved values as the cell output.
train_manifest, test_manifest

In [None]:
# Parse each manifest CSV into its own DataFrame (train first, then test).
train_df, test_df = (
    pd.read_csv(manifest) for manifest in (train_manifest, test_manifest)
)
# Show the (rows, columns) shape of both tables as the cell output.
train_df.shape, test_df.shape

In [None]:
# Phase distribution (train)
# Count train rows per phase and lay the result out in PHASE_TO_ID key order.
# Phases that never appear in the manifest are reported as 0 rather than
# being dropped, and the counts stay integer-typed.
phase_counts = (
    train_df['phase']
    .value_counts()
    .reindex(list(PHASE_TO_ID), fill_value=0)
    .astype(int)
)
phase_counts

In [None]:
# Bar chart of the per-phase counts, one bar per PHASE_TO_ID key.
phases = list(PHASE_TO_ID)
values = [phase_counts.get(name, 0) for name in phases]
positions = range(len(phases))

fig, ax = plt.subplots(figsize=(8, 4))
ax.bar(positions, values)
ax.set_xticks(positions)
# Slanted, right-anchored labels so the phase names do not overlap.
ax.set_xticklabels(phases, rotation=45, ha='right')
ax.set_ylabel('Frame count')
ax.set_title('Phase distribution (train)')
fig.tight_layout()
plt.show()

In [None]:
# Tool distribution (train)
# For every tool column, count the train rows whose value equals 1.
tool_counts = {
    tool: int(train_df[tool].eq(1).sum())
    for tool in TOOL_COLUMNS
}
tool_counts

In [None]:
# Bar chart of the per-tool counts, one bar per entry of TOOL_COLUMNS.
tools = list(TOOL_COLUMNS)
values = [tool_counts.get(name, 0) for name in tools]
positions = range(len(tools))

fig, ax = plt.subplots(figsize=(8, 4))
ax.bar(positions, values)
ax.set_xticks(positions)
# Slanted, right-anchored labels so the tool names do not overlap.
ax.set_xticklabels(tools, rotation=45, ha='right')
ax.set_ylabel('Frame count (tool present)')
ax.set_title('Tool distribution (train)')
fig.tight_layout()
plt.show()

In [None]:
# Phase–tool co-occurrence (train)
# Build a phase-by-tool table: each cell is the sum of one tool column over the
# train rows of one phase. Rows follow PHASE_TO_ID key order.
grouped = (
    train_df
    # list(...) keeps the column selection valid even if TOOL_COLUMNS is a
    # tuple; pandas does not accept a tuple as a multi-column selector.
    .groupby('phase')[list(TOOL_COLUMNS)]
    .sum()
    # Phases missing from the train manifest get explicit all-zero rows instead
    # of NaN, matching the zero-fill applied to phase_counts, so the table can
    # be plotted and summed without NaN handling.
    .reindex(list(PHASE_TO_ID), fill_value=0)
)
grouped

In [None]:
# Heatmap of the phase-by-tool table: one row per phase, one column per tool,
# with colour intensity given by the cell value.
fig, ax = plt.subplots(figsize=(8, 4))
heatmap = ax.imshow(grouped.values, aspect='auto', cmap='Blues')

# Label rows with the table's index and columns with the tool names.
ax.set_yticks(range(len(grouped.index)))
ax.set_yticklabels(grouped.index)
ax.set_xticks(range(len(TOOL_COLUMNS)))
ax.set_xticklabels(TOOL_COLUMNS, rotation=45, ha='right')

ax.set_xlabel('Tools')
ax.set_ylabel('Phases')
ax.set_title('Phase–tool co-occurrence (train)')
fig.colorbar(heatmap, ax=ax, label='count')
fig.tight_layout()
plt.show()