# Sampling Demo Notebook

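This notebook demonstrates the proportional random sampling logic used in the 
LabelCheck Validation pipeline: within each test, every tagging company is sampled 
in proportion to its share of that test's rows.  
No real annotation data is used; the dataset below is **synthetic** (fake) and generated with a fixed random seed.  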


In [1]:
import pandas as pd
import numpy as np

# Synthetic tagger annotations. The global NumPy seed is fixed so that the
# random columns below come out the same on every fresh run.
np.random.seed(42)

n_rows = 200
company_choices = ["Company_A", "Company_B", "Company_C"]
test_choices = ["Test 1", "Test 2", "Test 3"]

# Draw the random columns in a fixed order (company, test, item id); the
# order matters because all three consume the same seeded global RNG.
company_col = np.random.choice(company_choices, size=n_rows)
test_col = np.random.choice(test_choices, size=n_rows)
item_id_col = np.random.randint(1000, 1100, size=n_rows)

# Sequential 1-based patch ids; image ids are derived from the same counter.
patch_ids = range(1, n_rows + 1)

data = pd.DataFrame(
    {
        "taggers_company": company_col,
        "test_name": test_col,
        "patch_id": patch_ids,
        "image_id": [f"IMG_{i}" for i in patch_ids],
        "annotation_item_id": item_id_col,
    }
)

data.head()


Unnamed: 0,taggers_company,test_name,patch_id,image_id,annotation_item_id
0,Company_C,Test 1,1,IMG_1,1091
1,Company_A,Test 2,2,IMG_2,1097
2,Company_C,Test 1,3,IMG_3,1065
3,Company_C,Test 2,4,IMG_4,1031
4,Company_A,Test 2,5,IMG_5,1086


In [18]:
# Number of rows for every (company, test) pair
group_counts = (
    data
    .groupby(["taggers_company", "test_name"])
    .size()
    .reset_index(name="group_size")
)

# Number of rows for every test, summed over all companies
test_totals = (
    group_counts
    .groupby("test_name")["group_size"]
    .sum()
    .reset_index(name="total_test_count")
)

# Sample budget for EACH test; it is split across companies in proportion to
# each company's share of that test's rows.
target_n = 20

# Each group's target is rounded independently, so the targets of one test
# are not guaranteed to add up to exactly `target_n`.
merged = (
    group_counts
    .merge(test_totals, on="test_name")
    .assign(
        sample_target=lambda df: (
            df["group_size"] / df["total_test_count"] * target_n
        ).round().astype(int)
    )
)

merged

Unnamed: 0,taggers_company,test_name,group_size,total_test_count,sample_target
0,Company_A,Test 1,22,69,6
1,Company_A,Test 2,26,65,8
2,Company_A,Test 3,18,66,5
3,Company_B,Test 1,14,69,4
4,Company_B,Test 2,22,65,7
5,Company_B,Test 3,25,66,8
6,Company_C,Test 1,33,69,10
7,Company_C,Test 2,17,65,5
8,Company_C,Test 3,23,66,7


In [8]:
# Draw `sample_target` random rows from every (company, test) group.
#
# The groups are iterated directly instead of going through
# `groupby(...).apply(...)`: newer pandas versions deprecate handing the
# grouping columns to the applied function, which the previous lambda relied
# on to find its target (it read `x["taggers_company"]` / `x["test_name"]`).
# Iteration yields the group key alongside the full group frame, so the
# lookup needs no access to those columns.
group_cols = ["taggers_company", "test_name"]

# (company, test) -> number of rows to draw, built once rather than
# re-filtering `merged` with a boolean mask for every group.
targets_by_group = merged.set_index(group_cols)["sample_target"].to_dict()

# `sample` is called without `random_state`, so it draws from NumPy's global
# RNG; groups are visited in sorted key order. A group is never asked for
# more rows than it contains.
sampled_parts = [
    group.sample(n=min(len(group), targets_by_group[key]))
    for key, group in data.groupby(group_cols)
]

# `pd.concat` rejects an empty list, so fall back to an empty frame with the
# same columns when `data` has no rows.
sampled = pd.concat(sampled_parts) if sampled_parts else data.iloc[:0]

sampled.head()

  .apply(lambda x: x.sample(


Unnamed: 0,taggers_company,test_name,patch_id,image_id,annotation_item_id
40,Company_A,Test 1,41,IMG_41,1018
103,Company_A,Test 1,104,IMG_104,1095
131,Company_A,Test 1,132,IMG_132,1043
151,Company_A,Test 1,152,IMG_152,1074
144,Company_A,Test 1,145,IMG_145,1025


In [15]:
# Rows actually drawn per (company, test) pair. Each count equals that
# group's `sample_target` whenever the group holds at least that many rows.
(
    sampled
    .groupby(["taggers_company", "test_name"])["patch_id"]
    .count()
)

taggers_company  test_name
Company_A        Test 1        6
                 Test 2        8
                 Test 3        5
Company_B        Test 1        4
                 Test 2        7
                 Test 3        8
Company_C        Test 1       10
                 Test 2        5
                 Test 3        7
Name: patch_id, dtype: int64

## Notes
- The notebook replicates the SQL sampling logic in Python.
- In the real pipeline, the query is executed in BigQuery (see `queries/weekly_sample.sql`).
- Here, we demonstrate the same logic with pandas on fake data for portfolio purposes.
