In [1]:
import os
import sys
from pathlib import Path
import numpy as np



# Treat the parent of the current working directory as the project root.
PROJECT_ROOT = Path("..").resolve()
print("PROJECT_ROOT:", PROJECT_ROOT)

# Put the root on the import path so `src` can be imported; the guard keeps
# re-runs of this cell from appending the same entry again.
if sys.path.count(str(PROJECT_ROOT)) == 0:
    sys.path.append(str(PROJECT_ROOT))

from src.data.dataloader import ForgeryDataset


PROJECT_ROOT: C:\Users\piiop\Desktop\Portfolio\Projects\RecodAI_LUC


In [2]:
# Cell 2: data locations under the project root (match your cv command);
# adjust any of the three below if your layout differs.
train_authentic = PROJECT_ROOT.joinpath("data", "train_images", "authentic")
train_forged = PROJECT_ROOT.joinpath("data", "train_images", "forged")
train_masks = PROJECT_ROOT.joinpath("data", "train_masks")

# Report each directory together with whether it is present on disk.
for _label, _folder in (
    ("authentic:", train_authentic),
    ("forged   :", train_forged),
    ("masks    :", train_masks),
):
    print(_label, _folder, "exists:", _folder.exists())
del _label, _folder  # keep the loop helpers out of the notebook namespace


authentic: C:\Users\piiop\Desktop\Portfolio\Projects\RecodAI_LUC\data\train_images\authentic exists: True
forged   : C:\Users\piiop\Desktop\Portfolio\Projects\RecodAI_LUC\data\train_images\forged exists: True
masks    : C:\Users\piiop\Desktop\Portfolio\Projects\RecodAI_LUC\data\train_masks exists: True


In [4]:
# Cell 3: instantiate the dataset and take a first look at it
# NOTE(review): the train_* directories defined earlier are not passed in here, so the
# dataset presumably resolves its own data locations — confirm they point to the same place.
ds = ForgeryDataset(transform=None, is_train=True)

print("Total samples:", len(ds))

if len(ds) > 0:
    # Non-empty dataset: show the leading entries.
    print("First 3 samples:")
    for sample in ds.samples[:3]:
        print(sample)
else:
    # Empty dataset: list what each directory actually holds to help explain why.
    # A missing directory is reported as an empty listing instead of raising.
    authentic_files = []
    if train_authentic.exists():
        authentic_files = sorted(os.listdir(train_authentic))

    forged_files = []
    if train_forged.exists():
        forged_files = sorted(os.listdir(train_forged))

    mask_files = []
    if train_masks.exists():
        mask_files = sorted(os.listdir(train_masks))

    print(f"#authentic files: {len(authentic_files)}")
    print(f"#forged files   : {len(forged_files)}")
    print(f"#mask files     : {len(mask_files)}")

    print("First few authentic:", authentic_files[:5])
    print("First few forged   :", forged_files[:5])
    print("First few masks    :", mask_files[:5])


Total samples: 5176
First 3 samples:
{'image_path': 'C:\\Users\\piiop\\Desktop\\Portfolio\\Projects\\RecodAI_LUC\\data\\train_images\\authentic\\10.png', 'mask_path': 'C:\\Users\\piiop\\Desktop\\Portfolio\\Projects\\RecodAI_LUC\\data\\train_masks\\10.npy', 'is_forged': False}
{'image_path': 'C:\\Users\\piiop\\Desktop\\Portfolio\\Projects\\RecodAI_LUC\\data\\train_images\\authentic\\10015.png', 'mask_path': 'C:\\Users\\piiop\\Desktop\\Portfolio\\Projects\\RecodAI_LUC\\data\\train_masks\\10015.npy', 'is_forged': False}
{'image_path': 'C:\\Users\\piiop\\Desktop\\Portfolio\\Projects\\RecodAI_LUC\\data\\train_images\\authentic\\10017.png', 'mask_path': 'C:\\Users\\piiop\\Desktop\\Portfolio\\Projects\\RecodAI_LUC\\data\\train_masks\\10017.npy', 'is_forged': False}


In [6]:
# Count forged samples whose mask file exists and whose loaded array sums to more than zero.
# The filter short-circuits in order, so np.load only runs for forged entries with a mask on disk.
pos = sum(
    bool(np.load(entry["mask_path"]).sum() > 0)
    for entry in ds.samples
    if entry["is_forged"] and os.path.exists(entry["mask_path"])
)
print("forged samples:", sum(entry["is_forged"] for entry in ds.samples))
print("forged-with-positive-mask:", pos)


forged samples: 2799
forged-with-positive-mask: 2799


In [None]:
# Check CV score with dummy submission

In [7]:
# Cell 5: dummy "all authentic" submission scored exactly like CV

import pandas as pd

from src.training.train_cv import build_solution_df
from src.models.kaggle_metric import score as kaggle_score

# Ground truth in the same layout CV uses (row_id, annotation, shape).
# ds is the ForgeryDataset(transform=None) instance built in an earlier cell.
solution_df, _ = build_solution_df(ds)

# Baseline submission: one row per ground-truth row_id, each labelled "authentic"
# (the scalar label is broadcast across all rows).
dummy_submission = pd.DataFrame(
    {"row_id": solution_df["row_id"].values, "annotation": "authentic"}
)

# Copies are passed so both frames stay untouched here whatever the metric does with its inputs.
dummy_score = kaggle_score(
    solution_df.copy(), dummy_submission.copy(), row_id_column_name="row_id"
)

print("Dummy all-authentic score (CV-aligned):", float(dummy_score))


Dummy all-authentic score (CV-aligned): 0.45923493044822256
