In [None]:
import datasets
import numpy as np
import pandas as pd
from tqdm.auto import tqdm
import os

In [None]:
# Location of the dataset on S3. This value is overwritten by the next line,
# so it only documents where the data lives remotely.
ds_uri = 's3://sagemaker-production-eu-central-1-kl-biometric-datasets/raw_datasets/face_biometrics/hackathon_2025-07_camera_fingerprint/'
# Effective value: a local directory relative to the notebook's working directory.
ds_uri = './hf_ds'

In [None]:
%%time
# Load the dataset stored at `ds_uri` and display the length it reports.
ds = datasets.load_from_disk(ds_uri)
len(ds)

In [None]:
# Peek at the `img_raw` field of the first record.
ds[0]["img_raw"]

In [None]:
%%time
# Convert the dataset to a pandas DataFrame, report its shape and preview
# the first three rows.
df = ds.to_pandas()
print(df.shape)
df.head(3)

In [None]:
# Columns worth knowing about, listed one per line and grouped by theme.
important_columns = [
    # user / session / frame ids
    'subject_name',
    'session_folder',
    'photo_name',
    # frame info
    'gif_link',
    'file_timestamp_within_session_ms',
    'frame_number',
    # json string with all the device metadata info
    'device_metadata',
    # camera info parsed from device metadata
    'camera_device_id',
    'camera_label',
    'user_device_camera',
    # test / train split
    'split',
    # os info
    'os_name',
    'os_version',
    # img info
    'img_w',
    'img_h',
    'img_wh',
    # jpg bytes of image
    'img_raw',
]

# Display only the selected columns.
df[important_columns]

## Split into train/test

In [None]:
# Number of rows per split label; rows without a label are counted too.
df['split'].value_counts(dropna=False)

In [None]:
# For every distinct split label, list the set of users whose rows carry it.
for split_name in df['split'].unique():
    in_split = df['split'] == split_name
    print(f'split=="{split_name}" users: {set(df.loc[in_split, "subject_name"])}')

#### Split is done per user:

In [None]:
# Partition the frame by its `split` label.
test_df = df[df['split'] == 'test']
train_df = df[df['split'] == 'train']

# Report shape, number of sessions and number of users for each part.
test_sessions = test_df['session_folder'].nunique()
test_user_count = test_df['subject_name'].nunique()
print(f'test: {test_df.shape}, sessions: {test_sessions}, users: {test_user_count}')

train_sessions = train_df['session_folder'].nunique()
train_user_count = train_df['subject_name'].nunique()
print(f'train: {train_df.shape}, sessions: {train_sessions}, users: {train_user_count}')

# The two parts must not share any user.
shared_users = set(test_df['subject_name']) & set(train_df['subject_name'])
assert not shared_users

## Most efficient way to get a session

In [None]:
%%time
selected_session_folder = df.iloc[100].session_folder
idx_from_df = df[df.session_folder == selected_session_folder].index.tolist()
sess_ds = ds.select(idx_from_df)
sess_ds[0]['img_raw']

## Calculate scores matrix

In [None]:
def calculate_scores(sessions_df=None):
    """Build one record per ordered (enroll session, auth session) pair.

    Every session is paired with every session, itself included, so S
    sessions produce S * S records. The similarity stored in each record is
    a random placeholder that is meant to be replaced by a real score.

    Args:
        sessions_df: Frame providing the sessions. It must contain the
            ``session_folder``, ``subject_name``, ``camera_label`` and
            ``user_device_camera`` columns. Defaults to the notebook-level
            ``test_df``.

    Returns:
        list[dict]: One dict per session pair with the two session ids, the
        user / camera attributes of both sides, the ``is_same_*`` flags and
        the ``similarity`` score.
    """
    if sessions_df is None:
        sessions_df = test_df

    # Group once up front. Re-running the groupby and the first-row lookup
    # inside the inner loop would repeat identical work for every pair.
    # This keeps all per-session frames referenced at the same time.
    # User / camera attributes are read from the first row of each session
    # (assumes they are constant within a session - TODO confirm).
    sessions = [
        (sess, sess_df, sess_df.iloc[0])
        for sess, sess_df in sessions_df.groupby('session_folder')
    ]

    rows = []
    # enrl_df / auth_df are not used by the placeholder score; they are kept
    # in scope so a real scoring method can use the frames of both sessions.
    for sess_enroll, enrl_df, enrl_row in tqdm(sessions):
        for sess_auth, auth_df, auth_row in sessions:

            sim_score = np.random.rand() # this is your score for session similarity between sess_enroll vs sess_auth
            # if your method is symmetric (i.e. enroll <> auth is the same as auth <> enroll), please cache the results, as this loops through every ordered pair

            rows.append({
                'enroll_sess': sess_enroll,
                'auth_sess': sess_auth,

                'enroll_user': enrl_row.subject_name,
                'enroll_camera_label': enrl_row.camera_label,
                'enroll_user_device_camera': enrl_row.user_device_camera,

                'auth_user': auth_row.subject_name,
                'auth_camera_label': auth_row.camera_label,
                'auth_user_device_camera': auth_row.user_device_camera,

                'is_same_user': enrl_row.subject_name == auth_row.subject_name,
                'is_same_camera_label': enrl_row.camera_label == auth_row.camera_label,
                'is_same_user_device_camera': enrl_row.user_device_camera == auth_row.user_device_camera,

                'similarity': sim_score, # This is your score
            })

    return rows

# Turn the per-pair records returned by calculate_scores() into a DataFrame
# and preview its first three rows.
pred_df = pd.DataFrame(calculate_scores())
pred_df.head(3)

