In [2]:
# Install the Kaggle CLI with the same interpreter that backs this kernel,
# so the package lands in the notebook's own environment.
import subprocess
import sys

pip_install_cmd = [sys.executable, "-m", "pip", "install", "-q", "kaggle"]
subprocess.run(pip_install_cmd, check=True)


[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
sagemaker-studio 1.1.4 requires pydynamodb>=0.7.4, which is not installed.
sparkmagic 0.21.0 requires pandas<2.0.0,>=0.17.1, but you have pandas 2.3.3 which is incompatible.[0m[31m
[0m

CompletedProcess(args=['/opt/conda/bin/python', '-m', 'pip', 'install', '-q', 'kaggle'], returncode=0)

In [3]:
# Resolve the kaggle executable on PATH (prints None when it is not found)
import shutil

kaggle_path = shutil.which("kaggle")
print(kaggle_path)


/opt/conda/bin/kaggle


In [4]:
# Download + unzip dataset using kaggle CLI
from pathlib import Path
import subprocess

# Local landing folder for everything downloaded in this notebook.
raw_dir = Path("data") / "raw"
raw_dir.mkdir(parents=True, exist_ok=True)

# `--unzip` makes the CLI extract the archive into raw_dir after downloading;
# check=True raises CalledProcessError if the CLI exits non-zero.
kaggle_download_cmd = [
    "kaggle", "datasets", "download",
    "-d", "crawford/cat-dataset",
    "-p", str(raw_dir),
    "--unzip",
]
subprocess.run(kaggle_download_cmd, check=True)

sample_entries = list(raw_dir.iterdir())[:10]
print("Done. Sample files:", sample_entries)


Dataset URL: https://www.kaggle.com/datasets/crawford/cat-dataset
License(s): CC0-1.0
Downloading cat-dataset.zip to data/raw


100%|██████████| 4.04G/4.04G [00:28<00:00, 151MB/s] 



Done. Sample files: [PosixPath('data/raw/cats'), PosixPath('data/raw/CAT_00'), PosixPath('data/raw/CAT_01'), PosixPath('data/raw/CAT_02'), PosixPath('data/raw/CAT_03'), PosixPath('data/raw/CAT_04'), PosixPath('data/raw/CAT_05'), PosixPath('data/raw/CAT_06'), PosixPath('data/raw/cifar10')]


In [5]:
# 2) Parse the per-image landmark files (<image>.jpg.cat) into a single annotations CSV
#    Each file typically contains 9 landmark points (x,y) in a fixed order.

import pandas as pd
from pathlib import Path

processed_dir = Path("data/processed/cats")
processed_dir.mkdir(parents=True, exist_ok=True)

# Same folder the Kaggle download cell unzips into; adjust if your layout differs.
cats_images_dir = Path("data/raw")

# Each annotation file is expected to hold this many (x, y) landmark points.
N_LANDMARKS = 9
# Output columns x1,y1,...,x9,y9 (same names and order as before).
pts_cols = [f"{axis}{i}" for i in range(1, N_LANDMARKS + 1) for axis in ("x", "y")]


def parse_landmark_text(text, n_points=N_LANDMARKS):
    """Parse the content of one landmark file.

    Expected layout: "<count> x1 y1 x2 y2 ..." separated by whitespace.

    Args:
        text: Raw file content.
        n_points: Number of landmark points the file must declare and contain.

    Returns:
        A flat list of 2 * n_points floats, or None when the content is empty,
        non-numeric, or does not hold exactly n_points points.
    """
    tokens = text.split()
    if not tokens:
        return None
    try:
        count = int(tokens[0])
        coords = [float(tok) for tok in tokens[1:]]
    except ValueError:
        return None
    if count != n_points or len(coords) != 2 * n_points:
        return None
    return coords


def is_mirror_copy(img_path, root):
    """Return True when img_path lives under root/cats/ and the same relative
    path also exists directly under root.

    The download listing shows both CAT_xx/ and cats/ at the top level of the
    raw folder, so a recursive scan would otherwise record each image twice.
    """
    rel = img_path.relative_to(root)
    if len(rel.parts) < 2 or rel.parts[0] != "cats":
        return False
    return (root / Path(*rel.parts[1:])).exists()


rows = []
skipped = 0

# sorted() gives a stable row order regardless of filesystem listing order.
for img_path in sorted(cats_images_dir.rglob("*.jpg")):
    if is_mirror_copy(img_path, cats_images_dir):
        continue

    # Annotation file sits next to the image: "<image>.jpg.cat"
    ann_path = img_path.with_suffix(img_path.suffix + ".cat")
    if not ann_path.exists():
        continue

    pts = parse_landmark_text(ann_path.read_text())
    if pts is None:
        # Malformed file: skip it instead of breaking the column expansion below.
        skipped += 1
        continue

    rows.append({"image": str(img_path), **dict(zip(pts_cols, pts))})

# Passing explicit columns keeps the schema even when no annotation was found.
cats_ann = pd.DataFrame(rows, columns=["image", *pts_cols])

cats_ann.to_csv(processed_dir / "annotations.csv", index=False)
print(f"Parsed {len(cats_ann)} annotated images; skipped {skipped} malformed annotation files")


In [6]:
# Verify the file exists locally
from pathlib import Path

# Bare last expression so the notebook displays True/False for the check.
annotations_csv = Path("data/processed/cats/annotations.csv")
annotations_csv.exists()


True

In [8]:
# Upload the locally generated annotations file to S3


import boto3

# Destination of the annotations CSV inside the project bucket.
bucket = "sagemaker-us-east-1-549206572067"
s3_key = "cat-landmarks-project/raw/cats/annotations.csv"

s3 = boto3.client("s3")
s3.upload_file(
    Filename="data/processed/cats/annotations.csv",
    Bucket=bucket,
    Key=s3_key,
)


In [10]:
# 3) Download and extract CIFAR-10 python dataset
#    This produces cifar-10-batches-py with pickled batches.

import tarfile
import urllib.request
from pathlib import Path

cifar_url = "https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz"  # official
cifar_tar = Path("data/raw/cifar10/cifar-10-python.tar.gz")
cifar_tar.parent.mkdir(parents=True, exist_ok=True)

# Re-running the cell reuses a previously downloaded archive. The download goes
# to a temporary name first so an interrupted transfer never leaves a truncated
# file under the final name.
if not cifar_tar.exists():
    partial_tar = cifar_tar.with_name(cifar_tar.name + ".part")
    urllib.request.urlretrieve(cifar_url, partial_tar)
    partial_tar.replace(cifar_tar)

with tarfile.open(cifar_tar, "r:gz") as tar:
    if hasattr(tarfile, "data_filter"):
        # The "data" filter rejects absolute paths, path traversal and special
        # files, and silences the warning raised when no filter is given.
        tar.extractall(path=cifar_tar.parent, filter="data")
    else:
        # Older interpreters without extraction filters.
        tar.extractall(path=cifar_tar.parent)


  tar.extractall(path=cifar_tar.parent)


In [11]:
# 4) Convert CIFAR batches into PNG files and keep only non-cat classes
#    CIFAR-10 python batches store images in arrays, labels in a list.

import pickle
import numpy as np
from PIL import Image

# Output folder for the extracted non-cat images (created if missing).
out_dir = Path("data/processed/noncats/images")
out_dir.mkdir(parents=True, exist_ok=True)

# Folder holding the pickled CIFAR-10 batches read below.
cifar_base = Path("data/raw/cifar10/cifar-10-batches-py")

def unpickle(path):
    """Load and return the object stored in the pickle file at `path`.

    `encoding="bytes"` makes 8-bit strings pickled by Python 2 come back as
    `bytes`, which is why the callers index the result with b"..." keys.
    """
    # NOTE(review): pickle can execute arbitrary code while loading; only use
    # this on archives from a trusted source.
    with open(path, "rb") as handle:
        payload = pickle.load(handle, encoding="bytes")
    return payload

# Class names come from the metadata file; they are stored as bytes.
meta = unpickle(cifar_base / "batches.meta")
label_names = [raw_name.decode("utf-8") for raw_name in meta[b"label_names"]]

# Index of the "cat" class, which must not end up among the negatives.
cat_label_index = label_names.index("cat")

batch_files = [cifar_base / f"data_batch_{i}" for i in range(1, 6)]
img_count = 0

for bf in batch_files:
    batch = unpickle(bf)
    data = batch[b"data"]          # shape: (10000, 3072)
    labels = batch[b"labels"]      # list of ints

    for i, y in enumerate(labels):
        if y != cat_label_index:
            # Each row is reshaped to (3, 32, 32) and moved to (32, 32, 3) for PIL.
            channels_first = data[i].reshape(3, 32, 32)
            img = np.transpose(channels_first, (1, 2, 0))
            target_path = out_dir / f"cifar_{img_count:06d}.png"
            Image.fromarray(img).save(target_path)
            img_count += 1


In [18]:
# Create zip archives for faster S3 upload 
from pathlib import Path
import zipfile
import random
import boto3

# S3 client plus the bucket/prefix used by the upload helper in this cell.
s3 = boto3.client("s3")
bucket = "sagemaker-us-east-1-549206572067"
prefix = "cat-landmarks-project"

def zip_folder(src_dir: str, zip_path: str, glob_pattern: str = "*") -> None:
    """Write every file under `src_dir` matching `glob_pattern` into a zip.

    The search is recursive. Entries are stored with paths relative to
    `src_dir` (POSIX separators). The parent folder of `zip_path` is created
    when missing, and an existing archive is overwritten.
    """
    source_root = Path(src_dir)
    archive_path = Path(zip_path)
    archive_path.parent.mkdir(parents=True, exist_ok=True)

    with zipfile.ZipFile(archive_path, "w", compression=zipfile.ZIP_DEFLATED) as archive:
        # Lazy filter: directories matched by the pattern are skipped.
        matching_files = (entry for entry in source_root.rglob(glob_pattern) if entry.is_file())
        for entry in matching_files:
            archive.write(entry, arcname=entry.relative_to(source_root).as_posix())

def zip_sample(files, src_root: Path, zip_path: str, seed: int = 42, k: int = 15000) -> None:
    """Zip a random subset of `files` (used for the CIFAR non-cat images).

    Args:
        files: Iterable of file paths located under `src_root`.
        src_root: Folder the archive entry names are made relative to.
        zip_path: Destination archive; its parent folder is created if missing.
        seed: Seed for the sampling; the same seed and the same input order
            select the same files.
        k: Maximum number of files to include. When fewer files are given,
            all of them are zipped.
    """
    # A private generator gives the same sample as seeding the module-level
    # one, without resetting global random state for the rest of the notebook.
    rng = random.Random(seed)
    out = Path(zip_path)
    out.parent.mkdir(parents=True, exist_ok=True)

    # Materialize so that generators and other non-sequence iterables work too.
    candidates = list(files)
    sampled = rng.sample(candidates, k=min(k, len(candidates)))

    with zipfile.ZipFile(out, "w", compression=zipfile.ZIP_DEFLATED) as z:
        for p in sampled:
            z.write(p, arcname=p.relative_to(src_root).as_posix())

def upload_to_s3(local_path: str, s3_key: str) -> None:
    """Upload one local file to `s3_key` in the notebook-level `bucket`,
    using the notebook-level `s3` client."""
    s3.upload_file(Filename=local_path, Bucket=bucket, Key=s3_key)

# 1) Zip ALL Kaggle cat images (adjust folder to where your cat .jpg files actually are)
cats_images_dir = "data/raw/cats"  # change if your unzip created a subfolder
cats_zip = "data/zips/cats.zip"
zip_folder(cats_images_dir, cats_zip, glob_pattern="*.jpg")

# 2) Zip ONLY 15,000 CIFAR non-cat images (from the folder you saved them into)
noncats_dir = Path("data/processed/noncats/images")  # where you saved CIFAR non-cats
# glob() yields files in arbitrary (filesystem) order; sorting first makes the
# seeded sample pick the same files on every run and on every machine.
noncat_files = sorted(p for p in noncats_dir.glob("*.png") if p.is_file())
cifar_zip = "data/zips/cifar_noncats_15000.zip"
zip_sample(noncat_files, noncats_dir, cifar_zip)

# 3) Upload zip files; keys are built once so the report below cannot drift
#    from what was actually uploaded.
cats_zip_key = f"{prefix}/raw/cats.zip"
cifar_zip_key = f"{prefix}/raw/cifar_noncats_15000.zip"
upload_to_s3(cats_zip, cats_zip_key)
upload_to_s3(cifar_zip, cifar_zip_key)

print("Uploaded:")
print(f"  s3://{bucket}/{cats_zip_key}")
print(f"  s3://{bucket}/{cifar_zip_key}")


Uploaded:
  s3://sagemaker-us-east-1-549206572067/cat-landmarks-project/raw/cats.zip
  s3://sagemaker-us-east-1-549206572067/cat-landmarks-project/raw/cifar_noncats_15000.zip


In [1]:
# List buckets you can access
import boto3
s3 = boto3.client("s3")

# One bucket name per line; prints nothing when the response has no buckets.
resp = s3.list_buckets()
bucket_names = [entry["Name"] for entry in resp.get("Buckets", [])]
for bucket_name in bucket_names:
    print(bucket_name)


projectwa
sagemaker-studio-549206572067-xrklu309ilg
sagemaker-studio-549206572067-zyour2f075q
sagemaker-us-east-1-549206572067


In [1]:
# Check if cats.zip actually contains images
# - If count is 0 or very small, the source folder was wrong

import zipfile
from pathlib import Path

cats_zip = Path("data/zips/cats.zip")

# Collect archive entries whose name ends in .jpg (case-insensitive).
with zipfile.ZipFile(cats_zip, "r") as z:
    jpgs = list(filter(lambda entry_name: entry_name.lower().endswith(".jpg"), z.namelist()))

print("JPG files inside cats.zip:", len(jpgs))
print("Example entries:", jpgs[:5])


JPG files inside cats.zip: 9997
Example entries: ['CAT_00/00000001_000.jpg', 'CAT_00/00000001_005.jpg', 'CAT_00/00000001_008.jpg', 'CAT_00/00000001_011.jpg', 'CAT_00/00000001_012.jpg']


In [2]:
# Unzip the local zip archives so the individual images can be uploaded to S3 under raw/
from pathlib import Path
import zipfile

# Each archive is extracted into its own sub-folder named after the archive.
local_out = Path("data/unzipped_from_s3")
local_out.mkdir(parents=True, exist_ok=True)

zips_dir = Path("data/zips")
for archive_stem in ("cats", "cifar_noncats_15000"):
    zp = zips_dir / f"{archive_stem}.zip"
    with zipfile.ZipFile(zp, "r") as z:
        z.extractall(local_out / archive_stem)

print("Unzipped to:", local_out)
print("Top folders:", list(local_out.iterdir()))


Unzipped to: data/unzipped_from_s3
Top folders: [PosixPath('data/unzipped_from_s3/cats'), PosixPath('data/unzipped_from_s3/cifar_noncats_15000')]


In [5]:
# Uploading unzipped images into raw/
# - Cats: all jpg
# - Noncats: png

import boto3
from pathlib import Path

# Destination bucket and key prefix for the individual image uploads.
bucket = "sagemaker-us-east-1-549206572067"
base_prefix = "cat-landmarks-project/raw"
s3 = boto3.client("s3")

def upload_images(src_dir: Path, s3_prefix: str, exts: tuple[str, ...]) -> int:
    """Upload every file under `src_dir` whose lower-cased suffix is in `exts`.

    Files are found recursively. Each object key is
    "<base_prefix>/<s3_prefix>/<path relative to src_dir>" in the
    notebook-level `bucket`.

    Returns:
        The number of files uploaded.
    """
    root = Path(src_dir)
    uploaded = 0

    for candidate in root.rglob("*"):
        if not candidate.is_file():
            continue
        if candidate.suffix.lower() not in exts:
            continue
        relative_name = candidate.relative_to(root).as_posix()
        object_key = "/".join((base_prefix, s3_prefix, relative_name))
        s3.upload_file(str(candidate), bucket, object_key)
        uploaded += 1

    return uploaded

# Cats are uploaded first, then the CIFAR negatives; counts are reported afterwards.
cats_count = upload_images(local_out / "cats", "cats/images", (".jpg",))
noncats_count = upload_images(local_out / "cifar_noncats_15000", "noncats/images", (".png",))

for group_name, uploaded_total in (("cats", cats_count), ("noncats", noncats_count)):
    print(f"Uploaded {group_name}:", uploaded_total)

Uploaded cats: 9997
Uploaded noncats: 15000


In [None]:
### Combining cat and non-cat images with labels

In [6]:
# Build a combined manifest from S3 raw images (cats + noncats)
# - Writes a single "processed manifest" you can use for Athena/EDA/training
# - Does not copy images (faster, cleaner, no extra storage)

import boto3
import pandas as pd
from datetime import datetime

s3 = boto3.client("s3")

# Bucket and key prefixes under which the raw images were uploaded.
bucket = "sagemaker-us-east-1-549206572067"
cats_prefix = "cat-landmarks-project/raw/cats/images/"
noncats_prefix = "cat-landmarks-project/raw/noncats/images/"

def list_all_keys(prefix: str) -> list[str]:
    """Return the keys of all .jpg/.png objects under `prefix`.

    Lists the notebook-level `bucket` with `list_objects_v2`, following
    continuation tokens until the response is no longer truncated. The suffix
    check is case-insensitive.
    """
    image_suffixes = (".jpg", ".png")
    request = {"Bucket": bucket, "Prefix": prefix, "MaxKeys": 1000}
    found: list[str] = []

    while True:
        page = s3.list_objects_v2(**request)
        found.extend(
            entry["Key"]
            for entry in page.get("Contents", [])
            if entry["Key"].lower().endswith(image_suffixes)
        )
        if not page.get("IsTruncated"):
            return found

        # The token is only sent when the previous page supplied one.
        next_token = page.get("NextContinuationToken")
        if next_token:
            request["ContinuationToken"] = next_token
        else:
            request.pop("ContinuationToken", None)

from datetime import timezone  # needed for the timezone-aware clock below

cat_keys = list_all_keys(cats_prefix)
noncat_keys = list_all_keys(noncats_prefix)

# datetime.utcnow() is deprecated. Take the aware UTC time and drop tzinfo so
# the stored string keeps the same naive ISO format as before (no "+00:00").
now = datetime.now(timezone.utc).replace(tzinfo=None).isoformat()

# label 1 = cat, label 0 = non-cat; cats come first, as before.
rows = [
    {"s3_uri": f"s3://{bucket}/{k}", "label": label, "ingest_time": now}
    for label, keys in ((1, cat_keys), (0, noncat_keys))
    for k in keys
]

# Explicit columns keep the schema (and the lookup below) valid even when the
# listing returned no images.
df = pd.DataFrame(rows, columns=["s3_uri", "label", "ingest_time"])
print(df["label"].value_counts())
df.head()


label
0    15000
1     9997
Name: count, dtype: int64


  now = datetime.utcnow().isoformat()


Unnamed: 0,s3_uri,label,ingest_time
0,s3://sagemaker-us-east-1-549206572067/cat-land...,1,2026-01-30T07:16:06.552664
1,s3://sagemaker-us-east-1-549206572067/cat-land...,1,2026-01-30T07:16:06.552664
2,s3://sagemaker-us-east-1-549206572067/cat-land...,1,2026-01-30T07:16:06.552664
3,s3://sagemaker-us-east-1-549206572067/cat-land...,1,2026-01-30T07:16:06.552664
4,s3://sagemaker-us-east-1-549206572067/cat-land...,1,2026-01-30T07:16:06.552664


In [8]:
# Add per-image metadata (file size, width, height), then save the combined manifest to S3 (Parquet)


from pathlib import Path
import boto3
from PIL import Image
from io import BytesIO

# Bucket holding the project data, and the client used for the S3 calls below.
bucket = "sagemaker-us-east-1-549206572067"
s3 = boto3.client("s3")

def get_image_metadata(s3_uri: str) -> tuple[int | None, int | None, int | None]:
    # Parse bucket/key from s3:// URI
    parts = s3_uri.replace("s3://", "").split("/", 1)
    bucket_name, key = parts[0], parts[1]

    try:
        # Get file size without downloading the object
        head = s3.head_object(Bucket=bucket_name, Key=key)
        file_size = head["ContentLength"]

        # Download image only to read dimensions
        obj = s3.get_object(Bucket=bucket_name, Key=key)
        with Image.open(BytesIO(obj["Body"].read())) as im:
            width, height = im.size

        return file_size, width, height
    except Exception:
        return None, None, None

# One (file_size, width, height) tuple per manifest row.
meta = df["s3_uri"].apply(get_image_metadata)

# Spread the tuple positions into their own columns.
for position, column_name in enumerate(("file_size", "width", "height")):
    df[column_name] = meta.apply(lambda item, position=position: item[position])

# Write the manifest locally first, then push the same file to S3.
out_dir = Path("data/processed")
out_dir.mkdir(parents=True, exist_ok=True)
combined_path = out_dir / "image_combined.parquet"
df.to_parquet(combined_path, index=False)

processed_manifest_s3_prefix = "cat-landmarks-project/processed/combined/"
combined_key = f"{processed_manifest_s3_prefix}image_combined.parquet"
s3.upload_file(Filename=str(combined_path), Bucket=bucket, Key=combined_key)

print(" Combined cat and non cat files  saved to:")
print(f"s3://{bucket}/{processed_manifest_s3_prefix}image_combined.parquet")


 Combined cat and non cat files  saved to:
s3://sagemaker-us-east-1-549206572067/cat-landmarks-project/processed/combined/image_combined.parquet


In [9]:
# Preview the first five manifest rows, now including the metadata columns.
df.head(n=5)

Unnamed: 0,s3_uri,label,ingest_time,file_size,width,height
0,s3://sagemaker-us-east-1-549206572067/cat-land...,1,2026-01-30T07:16:06.552664,469801,375,500
1,s3://sagemaker-us-east-1-549206572067/cat-land...,1,2026-01-30T07:16:06.552664,469801,500,375
2,s3://sagemaker-us-east-1-549206572067/cat-land...,1,2026-01-30T07:16:06.552664,469801,500,375
3,s3://sagemaker-us-east-1-549206572067/cat-land...,1,2026-01-30T07:16:06.552664,310372,500,375
4,s3://sagemaker-us-east-1-549206572067/cat-land...,1,2026-01-30T07:16:06.552664,310372,500,333


In [22]:
# Make landmarks.s3_uri match image_combined.s3_uri (full S3 URI)
# 1) Rename image -> s3_uri (if needed)
# 2) Convert 'data/raw/...' into 's3://bucket/cat-landmarks-project/raw/...'
# 3) Write parquet and upload to the annotations prefix

import pandas as pd

bucket = "sagemaker-us-east-1-549206572067"
# Prefix the cat images were uploaded under; image_combined.s3_uri uses the same one.
s3_cats_images_prefix = f"s3://{bucket}/cat-landmarks-project/raw/cats/images/"

ann = pd.read_csv("data/processed/cats/annotations.csv")

# If your CSV column is still called "image", map it
if "image" in ann.columns and "s3_uri" not in ann.columns:
    ann = ann.rename(columns={"image": "s3_uri"})

# Convert to full S3 URI so it matches image_combined.s3_uri exactly.
# The uploaded keys are relative to data/raw/cats/ (e.g. ".../images/CAT_00/x.jpg"),
# so an optional leading "cats/" must be dropped as well; otherwise rows parsed
# from data/raw/cats/... become ".../images/cats/CAT_00/x.jpg" and never join.
ann["s3_uri"] = ann["s3_uri"].str.replace(
    r"^data/raw/(?:cats/)?",
    s3_cats_images_prefix,
    regex=True
)

# The raw folder holds the same images both at the top level and under cats/,
# so after normalization each image can appear twice: keep one row per URI.
rows_before = len(ann)
ann = ann.drop_duplicates(subset="s3_uri").reset_index(drop=True)
print(f"Dropped {rows_before - len(ann)} duplicate landmark rows")

# Surface paths the pattern above did not recognize instead of silently
# writing URIs that cannot match the manifest.
unmapped = int((~ann["s3_uri"].astype(str).str.startswith("s3://")).sum())
if unmapped:
    print(f"WARNING: {unmapped} rows were not converted to an S3 URI")

# Rename landmark columns (same mapping you already used)
rename_map = {
    "x1": "left_eye_x", "y1": "left_eye_y",
    "x2": "right_eye_x","y2": "right_eye_y",
    "x3": "mouth_x",    "y3": "mouth_y",
    "x4": "left_ear_1_x","y4": "left_ear_1_y",
    "x5": "left_ear_2_x","y5": "left_ear_2_y",
    "x6": "left_ear_3_x","y6": "left_ear_3_y",
    "x7": "right_ear_1_x","y7": "right_ear_1_y",
    "x8": "right_ear_2_x","y8": "right_ear_2_y",
    "x9": "right_ear_3_x","y9": "right_ear_3_y",
}
ann = ann.rename(columns=rename_map)

# Force numeric for Athena friendliness (unparseable values become NaN)
landmark_cols = [c for c in ann.columns if c != "s3_uri"]
ann[landmark_cols] = ann[landmark_cols].apply(pd.to_numeric, errors="coerce").astype("float64")

# Write parquet
local_parquet = "data/processed/cats/landmarks.parquet"
ann.to_parquet(local_parquet, index=False)


In [23]:
# Upload the annotations Parquet file to S3 for Athena
import boto3

s3 = boto3.client("s3")

# Local landmarks file -> annotations folder under processed/combined/.
s3.upload_file(
    Filename="data/processed/cats/landmarks.parquet",
    Bucket="sagemaker-us-east-1-549206572067",
    Key="cat-landmarks-project/processed/combined/annotations/landmarks.parquet",
)


In [21]:
# Copy image_combined.parquet into a dedicated metadata/ folder (recommended layout)

import boto3

bucket = "sagemaker-us-east-1-549206572067"

# Same bucket on both sides; only the key changes (adds the metadata/ folder).
src_key = "cat-landmarks-project/processed/combined/image_combined.parquet"
dst_key = "cat-landmarks-project/processed/combined/metadata/image_combined.parquet"

s3 = boto3.client("s3", region_name="us-east-1")

# Copy within S3; the source object is left in place.
copy_source = {"Bucket": bucket, "Key": src_key}
s3.copy_object(Bucket=bucket, CopySource=copy_source, Key=dst_key)

# Optional: delete original after confirming copy worked
# s3.delete_object(Bucket=bucket, Key=src_key)

print("Copied to:", f"s3://{bucket}/{dst_key}")


Copied to: s3://sagemaker-us-east-1-549206572067/cat-landmarks-project/processed/combined/metadata/image_combined.parquet


In [15]:
# Project-level S3 locations (store once)
# The values below are persisted with the IPython %store magic so that other
# notebooks/kernels can restore them with `%store -r` instead of redefining them.

s3_bucket = "sagemaker-us-east-1-549206572067"
project_prefix = "cat-landmarks-project"

# Full s3:// URIs (with trailing slash) built from the bucket and project prefix.
s3_raw_cats_prefix = f"s3://{s3_bucket}/{project_prefix}/raw/cats/images/"
s3_raw_noncats_prefix = f"s3://{s3_bucket}/{project_prefix}/raw/noncats/images/"
s3_processed_combined_prefix = f"s3://{s3_bucket}/{project_prefix}/processed/combined/"

# Flag stored alongside the locations.
# NOTE(review): presumably read by later notebooks to confirm ingestion ran — confirm.
ingestion_completed = True

# Persist each variable; %store echoes "Stored '<name>' (<type>)" per variable.
%store s3_bucket
%store project_prefix
%store s3_raw_cats_prefix
%store s3_raw_noncats_prefix
%store s3_processed_combined_prefix
%store ingestion_completed


Stored 's3_bucket' (str)
Stored 'project_prefix' (str)
Stored 's3_raw_cats_prefix' (str)
Stored 's3_raw_noncats_prefix' (str)
Stored 's3_processed_combined_prefix' (str)
Stored 'ingestion_completed' (bool)
