In [None]:
import datasets
import json

import os
import pandas as pd

In [None]:
%env HF_HOME ~/hf_datasets

In [None]:
# Candidate input datasets on S3 (read with `datasets.load_from_disk`).
ds_uri_raw = 's3://sagemaker-production-eu-central-1-kl-biometric-datasets/raw_datasets/face_biometrics/web_internal_dc_2025-06-12/hf_dataset/'
# 4fps rotated
ds_uri_4fps_rotated = 's3://sagemaker-production-eu-central-1-kl-biometric-experiments/pipeline_data/main/lr_eval_single_model/20250703-071904/filter_fps/web_internal_dc_2025-06-12_4fps/hf_dataset/'

# Input processed by this notebook. Only one dataset is used per run; point this
# at `ds_uri_raw` to process the other one.
ds_uri = ds_uri_4fps_rotated

# Destination the enriched dataset is written to with `ds.save_to_disk`.
output_uri = 's3://sagemaker-production-eu-central-1-kl-biometric-datasets/raw_datasets/face_biometrics/hackathon_2025-07_camera_fingerprint/'

In [None]:
%%time
# Load the dataset stored at `ds_uri`. The URI points at S3, so remote
# filesystem support and credentials are presumably required — confirm setup.
ds = datasets.load_from_disk(ds_uri)

In [None]:
# Display the `img_raw` field of the first record as a quick visual check.
ds[0]["img_raw"]

In [None]:
# Number of rows in the loaded dataset.
len(ds)

## Add device info

In [None]:
# Subjects whose rows are labelled "test" by `process_row`; every other subject
# is labelled "train".
test_users = [
    'ahmed.atef@keyless.io',
    'milan.sekulic@keyless.io',
    'douglas.fairchild@keyless.io',
    'jacopo.barto@keyless.io',
    'vojtech.orgon@keyless.io',
    'dario@keyless.io',
]


In [None]:


def _get_nested(mapping, *keys):
    """Follow ``keys`` through nested dicts; return ``None`` if the path breaks.

    Unlike chained ``.get(key, {})`` calls, this also tolerates intermediate
    values that are present but are not dicts (for example JSON ``null``).
    """
    current = mapping
    for key in keys:
        if not isinstance(current, dict):
            return None
        current = current.get(key)
    return current


def process_row(row):
    """Add camera, browser/OS, split and image-size fields to one dataset row.

    Reads the module-level ``test_users`` collection to decide the split.

    Args:
        row: Mutable mapping with at least ``device_metadata`` (a JSON string),
            ``subject_name`` and ``img_raw`` (an object exposing ``width`` and
            ``height``).

    Returns:
        The same ``row`` object, updated in place. Fields that are absent from
        the metadata are set to ``None``.

    Raises:
        json.JSONDecodeError: If ``device_metadata`` is non-empty but is not
            valid JSON.
    """
    raw_metadata = row['device_metadata']
    # Treat a missing/empty metadata string like metadata without any keys.
    dm = json.loads(raw_metadata) if raw_metadata else {}

    row["camera_label"] = _get_nested(dm, 'media', 'device', 'info', 'label')
    row["camera_device_id"] = _get_nested(dm, 'media', 'device', 'info', 'deviceId')

    row["user_agent"] = _get_nested(dm, 'navigator', 'userAgent')

    parsed_ua = _get_nested(dm, 'navigator', 'userAgentParsed')
    row["browser_name"] = _get_nested(parsed_ua, 'browser', 'name')
    row["device_model"] = _get_nested(parsed_ua, 'device', 'model')
    row["device_vendor"] = _get_nested(parsed_ua, 'device', 'vendor')
    row["os_name"] = _get_nested(parsed_ua, 'os', 'name')
    row["os_version"] = _get_nested(parsed_ua, 'os', 'version')
    # Key identifying one camera of one user; a missing device id yields the
    # literal suffix "None".
    row["user_device_camera"] = f'{row["subject_name"]}___{row["camera_device_id"]}'

    # NOTE(review): rotation of iOS frames is disabled; presumably the input
    # dataset is already rotated — confirm before re-enabling.
    #if row["os_name"].lower() == 'ios':
    #    row["img_raw"] = row["img_raw"].rotate(270)

    row["split"] = "test" if row["subject_name"] in test_users else "train"
    row["img_w"] = row["img_raw"].width
    row["img_h"] = row["img_raw"].height
    row["img_wh"] = f'{row["img_w"]}x{row["img_h"]}'
    return row


# Spot check: parse the first record's device metadata and show its camera label.
d = json.loads(ds[0]['device_metadata'])
d.get('media', {}).get('device', {}).get('info', {}).get('label')


In [None]:
# Use all cores but one, and never fewer than one worker: os.cpu_count() may
# return None, and Dataset.map rejects num_proc <= 0 (the single-core case).
ds = ds.map(process_row, num_proc=max(1, (os.cpu_count() or 1) - 1))

## Analysis

In [None]:
%%time
# Convert the whole dataset to a pandas DataFrame for the analysis cells.
# NOTE(review): this materialises every column (including `img_raw`) in memory;
# consider dropping the image column first if memory becomes a problem.
df = ds.to_pandas()
print(df.shape)
df.head(3)

In [None]:
# Cardinality of the main identifiers. Counting goes through `set()`, so a
# missing value stored as None is counted as one entry of its own.
for label, column in [
    ('users', 'subject_name'),
    ('sessions', 'session_folder'),
    ('camera labels', 'camera_label'),
    ('camera device ids', 'camera_device_id'),
    ('camera user devices', 'user_device_camera'),
]:
    print(f'unique {label}: {len(set(df[column]))}')


In [None]:
# Per user: number of distinct sessions, frames and camera device ids,
# sorted so the users with the most sessions (then frames) come first.
(df
    .groupby('subject_name')
    .agg(
        session_folder=('session_folder', 'nunique'),
        photo_name=('photo_name', 'nunique'),
        camera_device_id=('camera_device_id', 'nunique'),
    )
    .sort_values(by=['session_folder', 'photo_name'], ascending=False)
)

## Dataset splits

In [None]:
# Columns worth inspecting, grouped by what they describe.
important_columns = [
    # user / session / frame ids
    'subject_name',
    'session_folder',
    'photo_name',
    # frame info
    'gif_link',
    'file_timestamp_within_session_ms',
    'frame_number',
    # json string with all the device metadata info
    'device_metadata',
    # camera info parsed from device metadata
    'camera_device_id',
    'camera_label',
    'user_device_camera',
    # os info
    'os_name',
    'os_version',
    # img info
    'img_w',
    'img_h',
    'img_wh',
]

In [None]:
# Preview the first rows restricted to the columns listed in `important_columns`.
df[important_columns].head(3)

In [None]:
%%time
# Persist the dataset, including the fields added by `process_row`, to
# `output_uri` and report where it was written.
ds.save_to_disk(output_uri)
print(f'saved to "{output_uri}"')