## Feature Engineering

#### Adding following derived features:
#### aspect ratio Aspect ratio to get shape information
#### overall area of image
#### log file size for reducing skewness
#### resolution_bucket -bucketing  images based on their largest dimension to capture resolution differences without resizing

In [19]:
import boto3
import sagemaker

sess = sagemaker.Session()
bucket = sess.default_bucket()
role = sagemaker.get_execution_role()
region = boto3.Session().region_name

s3 = boto3.client("s3")

In [20]:

database_name = "cat_image_analysis"
table_name = "image_landmarks_features"

bucket = "sagemaker-us-east-1-549206572067"
region = "us-east-1"

s3_combined_location = (
    f"s3://{bucket}/cat-landmarks-project/processed/combined/image_landmarks_features/"
)

s3_staging_dir = f"s3://{bucket}/athena/staging/"


In [21]:
from pyathena import connect
conn = connect(region_name=region, s3_staging_dir=s3_staging_dir)

In [22]:
from pyathena import connect

conn = connect(
    region_name=region,
    s3_staging_dir=s3_staging_dir
)


# Create cursor
cursor = conn.cursor()



In [23]:
import pandas as pd
statement = f"""
SELECT *
FROM {database_name}.{table_name}
"""
print(statement)

df = pd.read_sql(statement, conn)
df.head(5)



SELECT *
FROM cat_image_analysis.image_landmarks_features



  df = pd.read_sql(statement, conn)


Unnamed: 0,image_id,label,file_size,width,height,aspect_ratio,area,log_file_size,eye_center_x_norm,eye_center_y_norm,eye_dist_norm,eye_y_diff_norm,eye_angle,mouth_x_norm,mouth_y_norm,mouth_eye_y_norm
0,s3://sagemaker-us-east-1-549206572067/cat-land...,1,469801,375,500,0.75,187500,13.060067,0.552,0.322,0.170667,0.004,0.03124,0.530667,0.398,0.678
1,s3://sagemaker-us-east-1-549206572067/cat-land...,1,469801,500,375,1.333333,187500,13.060067,0.249,0.297333,0.114,0.082667,0.498117,0.206,0.389333,0.702667
2,s3://sagemaker-us-east-1-549206572067/cat-land...,1,469801,500,375,1.333333,187500,13.060067,0.658,0.594667,0.044,0.005333,0.09066,0.654,0.634667,0.405333
3,s3://sagemaker-us-east-1-549206572067/cat-land...,1,310372,500,375,1.333333,187500,12.64553,0.409,0.477333,0.15,0.032,0.158655,0.382,0.658667,0.522667
4,s3://sagemaker-us-east-1-549206572067/cat-land...,1,310372,500,333,1.501502,166500,12.64553,0.294,0.364865,0.128,0.003003,-0.015624,0.266,0.507508,0.635135


In [24]:
import numpy as np
from pathlib import Path
import hashlib

df_features = df.copy()


out_dir = Path("data/processed")
out_dir.mkdir(parents=True, exist_ok=True)

fe_path = out_dir / "image_features.parquet"
df_features.to_parquet(fe_path, index=False)

In [25]:
import numpy as np


def resolution_bucket(row):
    max_dim = max(row["width"], row["height"])
    if max_dim < 128:
        return "small"
    elif max_dim <= 512:
        return "medium"
    else:
        return "large"




In [26]:
from datetime import datetime, timezone

df_features["event_time"] = datetime.now(timezone.utc).timestamp()

np.random.seed(42)
df_features["dataset_split"] = np.random.choice(
    ["train", "val", "test", "prod"],
    size=len(df_features),
    p=[0.4, 0.1, 0.1, 0.4]
)
# Make event_time a string before writing parquet
df_features["event_time"] = df_features["event_time"].astype("string")


In [27]:
df_features.head(5)

Unnamed: 0,image_id,label,file_size,width,height,aspect_ratio,area,log_file_size,eye_center_x_norm,eye_center_y_norm,eye_dist_norm,eye_y_diff_norm,eye_angle,mouth_x_norm,mouth_y_norm,mouth_eye_y_norm,event_time,dataset_split
0,s3://sagemaker-us-east-1-549206572067/cat-land...,1,469801,375,500,0.75,187500,13.060067,0.552,0.322,0.170667,0.004,0.03124,0.530667,0.398,0.678,1769984872.178756,train
1,s3://sagemaker-us-east-1-549206572067/cat-land...,1,469801,500,375,1.333333,187500,13.060067,0.249,0.297333,0.114,0.082667,0.498117,0.206,0.389333,0.702667,1769984872.178756,prod
2,s3://sagemaker-us-east-1-549206572067/cat-land...,1,469801,500,375,1.333333,187500,13.060067,0.658,0.594667,0.044,0.005333,0.09066,0.654,0.634667,0.405333,1769984872.178756,prod
3,s3://sagemaker-us-east-1-549206572067/cat-land...,1,310372,500,375,1.333333,187500,12.64553,0.409,0.477333,0.15,0.032,0.158655,0.382,0.658667,0.522667,1769984872.178756,test
4,s3://sagemaker-us-east-1-549206572067/cat-land...,1,310372,500,333,1.501502,166500,12.64553,0.294,0.364865,0.128,0.003003,-0.015624,0.266,0.507508,0.635135,1769984872.178756,train


In [28]:
df_features.to_parquet(fe_path, index=False)

check = pd.read_parquet(fe_path)
print(check.columns.tolist())   # You should see aspect_ratio, area, log_file_size, resolution_bucket, event_time, dataset_split
check.head()

['image_id', 'label', 'file_size', 'width', 'height', 'aspect_ratio', 'area', 'log_file_size', 'eye_center_x_norm', 'eye_center_y_norm', 'eye_dist_norm', 'eye_y_diff_norm', 'eye_angle', 'mouth_x_norm', 'mouth_y_norm', 'mouth_eye_y_norm', 'event_time', 'dataset_split']


Unnamed: 0,image_id,label,file_size,width,height,aspect_ratio,area,log_file_size,eye_center_x_norm,eye_center_y_norm,eye_dist_norm,eye_y_diff_norm,eye_angle,mouth_x_norm,mouth_y_norm,mouth_eye_y_norm,event_time,dataset_split
0,s3://sagemaker-us-east-1-549206572067/cat-land...,1,469801,375,500,0.75,187500,13.060067,0.552,0.322,0.170667,0.004,0.03124,0.530667,0.398,0.678,1769984872.178756,train
1,s3://sagemaker-us-east-1-549206572067/cat-land...,1,469801,500,375,1.333333,187500,13.060067,0.249,0.297333,0.114,0.082667,0.498117,0.206,0.389333,0.702667,1769984872.178756,prod
2,s3://sagemaker-us-east-1-549206572067/cat-land...,1,469801,500,375,1.333333,187500,13.060067,0.658,0.594667,0.044,0.005333,0.09066,0.654,0.634667,0.405333,1769984872.178756,prod
3,s3://sagemaker-us-east-1-549206572067/cat-land...,1,310372,500,375,1.333333,187500,12.64553,0.409,0.477333,0.15,0.032,0.158655,0.382,0.658667,0.522667,1769984872.178756,test
4,s3://sagemaker-us-east-1-549206572067/cat-land...,1,310372,500,333,1.501502,166500,12.64553,0.294,0.364865,0.128,0.003003,-0.015624,0.266,0.507508,0.635135,1769984872.178756,train


In [29]:
# Using a versioned prefix so Athena reads ONLY one schema-consistent dataset
fe_s3_prefix = "cat-landmarks-project/processed/features/v2/"
s3_key = f"{fe_s3_prefix}image_features.parquet"

s3.upload_file(str(fe_path), bucket, s3_key)
features_location = f"s3://{bucket}/{fe_s3_prefix}"


In [30]:
# Comment: Reset table to avoid schema/location confusion
cursor.execute(f"DROP TABLE IF EXISTS {database_name}.image_landmarks_features")


<pyathena.cursor.Cursor at 0x7f86c5bb9100>

In [32]:
# Registering Athena table for engineered features

features_location = f"s3://{bucket}/{fe_s3_prefix}"

create_features_table = f"""
CREATE EXTERNAL TABLE IF NOT EXISTS {database_name}.image_landmarks_features (
    image_id          STRING,
    label             BIGINT,
    file_size         BIGINT,
    width             INT,
    height            INT,
    aspect_ratio      DOUBLE,
    area              BIGINT,
    log_file_size     DOUBLE,
    eye_center_x_norm DOUBLE,
    eye_center_y_norm DOUBLE,
    eye_dist_norm     DOUBLE,
    eye_y_diff_norm   DOUBLE,
    eye_angle         DOUBLE,
    mouth_x_norm      DOUBLE,
    mouth_y_norm      DOUBLE,
    mouth_eye_y_norm  DOUBLE,
    resolution_bucket STRING,
    event_time        STRING,
    dataset_split     STRING
    
    
)
STORED AS PARQUET
LOCATION '{features_location}'
"""

cursor.execute(create_features_table)

<pyathena.cursor.Cursor at 0x7f86c5bb9100>

In [33]:
statement = "SHOW TABLES in {}".format(database_name)

df_show = pd.read_sql(statement, conn)
df_show.head(5)

  df_show = pd.read_sql(statement, conn)


Unnamed: 0,tab_name
0,image_combined
1,image_features
2,image_landmarks_features
3,landmarks


## Feature Store

In [34]:
import boto3
import sagemaker

original_boto3_version = boto3.__version__
%pip install 'boto3>1.17.21'

Note: you may need to restart the kernel to use updated packages.


In [35]:
from sagemaker.session import Session

region = boto3.Session().region_name

boto_session = boto3.Session(region_name=region)

sagemaker_client = boto_session.client(service_name="sagemaker", region_name=region)
featurestore_runtime = boto_session.client(
    service_name="sagemaker-featurestore-runtime", region_name=region
)

feature_store_session = Session(
    boto_session=boto_session,
    sagemaker_client=sagemaker_client,
    sagemaker_featurestore_runtime_client=featurestore_runtime,
)

In [36]:
default_s3_bucket_name = feature_store_session.default_bucket()
prefix = "cat-landmarks-project/feature-store"
print(default_s3_bucket_name)


sagemaker-us-east-1-549206572067


In [37]:
from sagemaker import get_execution_role

# You can modify the following to use a role of your choosing. See the documentation for how to create this.
role = get_execution_role()
print(role)

arn:aws:iam::549206572067:role/service-role/AmazonSageMaker-ExecutionRole-20260128T205128


In [38]:
database_name = "cat_image_analysis"
table_name = "image_landmarks_features"

bucket = "sagemaker-us-east-1-549206572067"
region = "us-east-1"

In [39]:
# -----------------------------------------
# Load engineered features from Athena table
# -----------------------------------------
import pandas as pd

query = f"""
SELECT
image_id,
label,
file_size,
width,
height,
aspect_ratio,
area,
log_file_size,
eye_center_x_norm,
eye_center_y_norm,
eye_dist_norm,
eye_y_diff_norm,
eye_angle,
mouth_x_norm,
mouth_y_norm,
mouth_eye_y_norm,
resolution_bucket,
event_time,
dataset_split
FROM {database_name}.{table_name}
"""

df_catlm_fs = pd.read_sql(query, conn)

df_catlm_fs["event_time"] = df_catlm_fs["event_time"].astype("float64")

df_catlm_fs.head(5)


  df_catlm_fs = pd.read_sql(query, conn)


Unnamed: 0,image_id,label,file_size,width,height,aspect_ratio,area,log_file_size,eye_center_x_norm,eye_center_y_norm,eye_dist_norm,eye_y_diff_norm,eye_angle,mouth_x_norm,mouth_y_norm,mouth_eye_y_norm,resolution_bucket,event_time,dataset_split
0,s3://sagemaker-us-east-1-549206572067/cat-land...,1,469801,375,500,0.75,187500,13.060067,0.552,0.322,0.170667,0.004,0.03124,0.530667,0.398,0.678,,1769985000.0,train
1,s3://sagemaker-us-east-1-549206572067/cat-land...,1,469801,500,375,1.333333,187500,13.060067,0.249,0.297333,0.114,0.082667,0.498117,0.206,0.389333,0.702667,,1769985000.0,prod
2,s3://sagemaker-us-east-1-549206572067/cat-land...,1,469801,500,375,1.333333,187500,13.060067,0.658,0.594667,0.044,0.005333,0.09066,0.654,0.634667,0.405333,,1769985000.0,prod
3,s3://sagemaker-us-east-1-549206572067/cat-land...,1,310372,500,375,1.333333,187500,12.64553,0.409,0.477333,0.15,0.032,0.158655,0.382,0.658667,0.522667,,1769985000.0,test
4,s3://sagemaker-us-east-1-549206572067/cat-land...,1,310372,500,333,1.501502,166500,12.64553,0.294,0.364865,0.128,0.003003,-0.015624,0.266,0.507508,0.635135,,1769985000.0,train


## Ingest Data into FeatureStore

In [62]:
df_catlm_fs.count()

image_id             24997
label                24997
file_size            24997
width                24997
height               24997
aspect_ratio         24997
area                 24997
log_file_size        24997
eye_center_x_norm     9997
eye_center_y_norm     9997
eye_dist_norm         9997
eye_y_diff_norm       9997
eye_angle             9997
mouth_x_norm          9997
mouth_y_norm          9997
mouth_eye_y_norm      9997
resolution_bucket    24997
event_time           24997
dataset_split        24997
dtype: int64

In [72]:
from time import gmtime, strftime, sleep

combined_landmark_feature_group_name = "combined-landmark-feature-group-" + strftime("%d-%H-%M-%S", gmtime())

In [73]:
from sagemaker.feature_store.feature_group import FeatureGroup

combined_landmark_feature_group = FeatureGroup(
    name=combined_landmark_feature_group_name, sagemaker_session=feature_store_session
)

In [74]:
import time

current_time_sec = int(round(time.time()))


def cast_object_to_string(data_frame):
    for label in data_frame.columns:
        if data_frame.dtypes[label] == "object":
            data_frame[label] = data_frame[label].astype("str").astype("string")


record_identifier_feature_name = "image_id"
event_time_feature_name = "event_time"

# ---- sanity checks (do these once)
if record_identifier_feature_name not in df_catlm_fs.columns:
    raise ValueError("image_id is missing. Create it before Feature Store ingestion.")

if df_catlm_fs[record_identifier_feature_name].isna().any():
    raise ValueError("image_id contains nulls. Feature Store record identifier cannot be null.")

if not df_catlm_fs[record_identifier_feature_name].is_unique:
    raise ValueError("image_id must be unique. You have duplicates (would overwrite records).")

# ---- cast object dtype -> pandas string dtype
cast_object_to_string(df_catlm_fs)

# ---- load feature definitions (schema inference)
combined_landmark_feature_group.load_feature_definitions(data_frame=df_catlm_fs)




[FeatureDefinition(feature_name='image_id', feature_type=<FeatureTypeEnum.STRING: 'String'>, collection_type=None),
 FeatureDefinition(feature_name='label', feature_type=<FeatureTypeEnum.INTEGRAL: 'Integral'>, collection_type=None),
 FeatureDefinition(feature_name='file_size', feature_type=<FeatureTypeEnum.INTEGRAL: 'Integral'>, collection_type=None),
 FeatureDefinition(feature_name='width', feature_type=<FeatureTypeEnum.INTEGRAL: 'Integral'>, collection_type=None),
 FeatureDefinition(feature_name='height', feature_type=<FeatureTypeEnum.INTEGRAL: 'Integral'>, collection_type=None),
 FeatureDefinition(feature_name='aspect_ratio', feature_type=<FeatureTypeEnum.FRACTIONAL: 'Fractional'>, collection_type=None),
 FeatureDefinition(feature_name='area', feature_type=<FeatureTypeEnum.INTEGRAL: 'Integral'>, collection_type=None),
 FeatureDefinition(feature_name='log_file_size', feature_type=<FeatureTypeEnum.FRACTIONAL: 'Fractional'>, collection_type=None),
 FeatureDefinition(feature_name='eye_c

In [66]:
df_catlm_fs["event_time"].dtype
df_catlm_fs["event_time"].head()


0    1.769985e+09
1    1.769985e+09
2    1.769985e+09
3    1.769985e+09
4    1.769985e+09
Name: event_time, dtype: float64

In [67]:
df_catlm_fs.count()

image_id             24997
label                24997
file_size            24997
width                24997
height               24997
aspect_ratio         24997
area                 24997
log_file_size        24997
eye_center_x_norm     9997
eye_center_y_norm     9997
eye_dist_norm         9997
eye_y_diff_norm       9997
eye_angle             9997
mouth_x_norm          9997
mouth_y_norm          9997
mouth_eye_y_norm      9997
resolution_bucket    24997
event_time           24997
dataset_split        24997
dtype: int64

In [75]:
def wait_for_feature_group_creation_complete(feature_group):
    status = feature_group.describe().get("FeatureGroupStatus")
    while status == "Creating":
        print("Waiting for Feature Group Creation")
        time.sleep(5)
        status = feature_group.describe().get("FeatureGroupStatus")
    if status != "Created":
        raise RuntimeError(f"Failed to create feature group {feature_group.name}")
    print(f"FeatureGroup {feature_group.name} successfully created.")


combined_landmark_feature_group.create(
    s3_uri=f"s3://{default_s3_bucket_name}/{prefix}",
    record_identifier_name=record_identifier_feature_name,
    event_time_feature_name=event_time_feature_name,
    role_arn=role,
    enable_online_store=True,
)


wait_for_feature_group_creation_complete(feature_group=combined_landmark_feature_group)

Waiting for Feature Group Creation
Waiting for Feature Group Creation
Waiting for Feature Group Creation
Waiting for Feature Group Creation
Waiting for Feature Group Creation
FeatureGroup combined-landmark-feature-group-01-23-17-25 successfully created.


In [76]:
combined_landmark_feature_group.describe()

{'FeatureGroupArn': 'arn:aws:sagemaker:us-east-1:549206572067:feature-group/combined-landmark-feature-group-01-23-17-25',
 'FeatureGroupName': 'combined-landmark-feature-group-01-23-17-25',
 'RecordIdentifierFeatureName': 'image_id',
 'EventTimeFeatureName': 'event_time',
 'FeatureDefinitions': [{'FeatureName': 'image_id', 'FeatureType': 'String'},
  {'FeatureName': 'label', 'FeatureType': 'Integral'},
  {'FeatureName': 'file_size', 'FeatureType': 'Integral'},
  {'FeatureName': 'width', 'FeatureType': 'Integral'},
  {'FeatureName': 'height', 'FeatureType': 'Integral'},
  {'FeatureName': 'aspect_ratio', 'FeatureType': 'Fractional'},
  {'FeatureName': 'area', 'FeatureType': 'Integral'},
  {'FeatureName': 'log_file_size', 'FeatureType': 'Fractional'},
  {'FeatureName': 'eye_center_x_norm', 'FeatureType': 'Fractional'},
  {'FeatureName': 'eye_center_y_norm', 'FeatureType': 'Fractional'},
  {'FeatureName': 'eye_dist_norm', 'FeatureType': 'Fractional'},
  {'FeatureName': 'eye_y_diff_norm', '

In [77]:
combined_landmark_feature_group.ingest(data_frame=df_catlm_fs, max_workers=3, wait=True)

IngestionManagerPandas(feature_group_name='combined-landmark-feature-group-01-23-17-25', feature_definitions={'image_id': {'FeatureName': 'image_id', 'FeatureType': 'String'}, 'label': {'FeatureName': 'label', 'FeatureType': 'Integral'}, 'file_size': {'FeatureName': 'file_size', 'FeatureType': 'Integral'}, 'width': {'FeatureName': 'width', 'FeatureType': 'Integral'}, 'height': {'FeatureName': 'height', 'FeatureType': 'Integral'}, 'aspect_ratio': {'FeatureName': 'aspect_ratio', 'FeatureType': 'Fractional'}, 'area': {'FeatureName': 'area', 'FeatureType': 'Integral'}, 'log_file_size': {'FeatureName': 'log_file_size', 'FeatureType': 'Fractional'}, 'eye_center_x_norm': {'FeatureName': 'eye_center_x_norm', 'FeatureType': 'Fractional'}, 'eye_center_y_norm': {'FeatureName': 'eye_center_y_norm', 'FeatureType': 'Fractional'}, 'eye_dist_norm': {'FeatureName': 'eye_dist_norm', 'FeatureType': 'Fractional'}, 'eye_y_diff_norm': {'FeatureName': 'eye_y_diff_norm', 'FeatureType': 'Fractional'}, 'eye_ang

In [78]:
#  Verifying

record_identifier_value = "s3://sagemaker-us-east-1-549206572067/cat-landmarks-project/raw/cats/images/CAT_00/00000001_000.jpg"

fg_name = combined_landmark_feature_group.name  

response = featurestore_runtime.get_record(
    FeatureGroupName=fg_name,
    RecordIdentifierValueAsString=record_identifier_value,
)

print(response["Record"])



[{'FeatureName': 'image_id', 'ValueAsString': 's3://sagemaker-us-east-1-549206572067/cat-landmarks-project/raw/cats/images/CAT_00/00000001_000.jpg'}, {'FeatureName': 'label', 'ValueAsString': '1'}, {'FeatureName': 'file_size', 'ValueAsString': '469801'}, {'FeatureName': 'width', 'ValueAsString': '375'}, {'FeatureName': 'height', 'ValueAsString': '500'}, {'FeatureName': 'aspect_ratio', 'ValueAsString': '0.75'}, {'FeatureName': 'area', 'ValueAsString': '187500'}, {'FeatureName': 'log_file_size', 'ValueAsString': '13.060066608328581'}, {'FeatureName': 'eye_center_x_norm', 'ValueAsString': '0.552'}, {'FeatureName': 'eye_center_y_norm', 'ValueAsString': '0.322'}, {'FeatureName': 'eye_dist_norm', 'ValueAsString': '0.17066666666666666'}, {'FeatureName': 'eye_y_diff_norm', 'ValueAsString': '0.0040000000000000036'}, {'FeatureName': 'eye_angle', 'ValueAsString': '0.031239833430268277'}, {'FeatureName': 'mouth_x_norm', 'ValueAsString': '0.5306666666666666'}, {'FeatureName': 'mouth_y_norm', 'Value

In [79]:
# Inspect the actual distinct values (and NULLs)
query_string = f"""
SELECT dataset_split, count(*) AS n
FROM "{combined_query.database}"."{combined_query.table_name}"
GROUP BY dataset_split
ORDER BY n DESC
"""
combined_query.run(query_string=query_string, output_location="s3://sagemaker-us-east-1-549206572067/athena-results/")
combined_query.wait()
combined_query.as_dataframe()


Unnamed: 0,dataset_split,n
0,prod,20030
1,train,19826
2,test,5088
3,val,5050


## Data Prepocessing

### Data  Resizing,  Augmentation and Normalization
#### Augmentation is applied on-the-fly only to positive samples (label=1) during training, while resizing and normalization are applied to all samples to ensure consistent input shape and scale

In [80]:
import boto3
from PIL import Image
import io

s3 = boto3.client("s3")

def read_image_from_s3(s3_uri: str) -> Image.Image:
    bucket, key = s3_uri.replace("s3://", "").split("/", 1)
    obj = s3.get_object(Bucket=bucket, Key=key)
    return Image.open(io.BytesIO(obj["Body"].read())).convert("RGB")

In [81]:
# Comments:
# - This must be run at the start of every new notebook/session
# - Creates the SageMaker session object used by Feature Store

import sagemaker
import boto3

sagemaker_session = sagemaker.Session()
region = boto3.Session().region_name

# Get execution role (works inside SageMaker notebooks)
role = sagemaker.get_execution_role()

print("Region:", region)
print("Role:", role)


Region: us-east-1
Role: arn:aws:iam::549206572067:role/service-role/AmazonSageMaker-ExecutionRole-20260128T205128


In [87]:
from sagemaker.feature_store.feature_group import FeatureGroup

combined_landmark_feature_group = FeatureGroup(
    name="combined_landmark_feature_group",
    sagemaker_session=sagemaker_session
)


In [88]:

from sagemaker.feature_store.feature_group import FeatureGroup
import sagemaker

sagemaker_session = sagemaker.Session()

combined_landmark_feature_group = FeatureGroup(
    name= combined_landmark_feature_group_name,
    sagemaker_session=sagemaker_session
)

desc = combined_landmark_feature_group.describe()
print(desc["FeatureGroupStatus"])



Created


In [89]:
# Comments:
# - Lists existing Feature Groups in your account/region
# - Helps you find the exact FeatureGroupName string to use

import boto3

sm = boto3.client("sagemaker")

resp = sm.list_feature_groups(MaxResults=50)
for fg in resp["FeatureGroupSummaries"]:
    print(fg["FeatureGroupName"])
    

combined-landmark-feature-group-01-23-17-25
combined-feature-group-01-23-09-39
combined-feature-group-01-22-33-03
combined-feature-group-01-07-36-12
combined-feature-group-01-05-14-57
catlm-feature-group-01-05-06-17
catlm-feature-group-01-04-44-17


In [90]:
offline_cfg = desc.get("OfflineStoreConfig", {})
print(offline_cfg)


{'S3StorageConfig': {'S3Uri': 's3://sagemaker-us-east-1-549206572067/cat-landmarks-project/feature-store', 'ResolvedOutputS3Uri': 's3://sagemaker-us-east-1-549206572067/cat-landmarks-project/feature-store/549206572067/sagemaker/us-east-1/offline-store/combined-landmark-feature-group-01-23-17-25-1769987970/data'}, 'DisableGlueTableCreation': False, 'DataCatalogConfig': {'TableName': 'combined_landmark_feature_group_01_23_17_25_1769987970', 'Catalog': 'AwsDataCatalog', 'Database': 'sagemaker_featurestore'}}


In [93]:
combined_query = combined_landmark_feature_group.athena_query()

query_string = """
SELECT *
FROM "{db}"."{table}"
WHERE dataset_split = 'test'
""".format(
    db=combined_query.database,
    table=combined_query.table_name
)


print("Running " + query_string)


combined_query.run(
    query_string=query_string,
    output_location = "s3://sagemaker-us-east-1-549206572067/athena-results/"

)

print("Running:\n", query_string)

# run Athena query. The output is loaded to a Pandas dataframe.
# dataset = pd.DataFrame()
combined_query.run(
    query_string=query_string,
    output_location="s3://" + default_s3_bucket_name + "/" + prefix + "/query_results/",
)
combined_query.wait()
dataset_test= combined_query.as_dataframe()

dataset_test.head(3)

Running 
SELECT *
FROM "sagemaker_featurestore"."combined_landmark_feature_group_01_23_17_25_1769987970"
WHERE dataset_split = 'test'

Running:
 
SELECT *
FROM "sagemaker_featurestore"."combined_landmark_feature_group_01_23_17_25_1769987970"
WHERE dataset_split = 'test'



Unnamed: 0,image_id,label,file_size,width,height,aspect_ratio,area,log_file_size,eye_center_x_norm,eye_center_y_norm,...,eye_angle,mouth_x_norm,mouth_y_norm,mouth_eye_y_norm,resolution_bucket,event_time,dataset_split,write_time,api_invocation_time,is_deleted
0,s3://sagemaker-us-east-1-549206572067/cat-land...,0,1634,32,32,1.0,1024,7.399398,,,...,,,,,,1769985000.0,test,2026-02-01 23:25:11.405,2026-02-01 23:20:14.000,False
1,s3://sagemaker-us-east-1-549206572067/cat-land...,1,113623,500,375,1.333333,187500,11.64065,0.462,0.549333,...,-0.496423,0.556,0.765333,0.450667,,1769985000.0,test,2026-02-01 23:25:11.405,2026-02-01 23:20:19.000,False
2,s3://sagemaker-us-east-1-549206572067/cat-land...,0,2596,32,32,1.0,1024,7.862112,,,...,,,,,,1769985000.0,test,2026-02-01 23:25:11.405,2026-02-01 23:20:20.000,False


In [94]:
combined_query = combined_landmark_feature_group.athena_query()

query_string = """
SELECT *
FROM "{db}"."{table}"
WHERE dataset_split = 'val'
""".format(
    db=combined_query.database,
    table=combined_query.table_name
)


print("Running " + query_string)


combined_query.run(
    query_string=query_string,
    output_location = "s3://sagemaker-us-east-1-549206572067/athena-results/"

)

print("Running:\n", query_string)

# run Athena query. The output is loaded to a Pandas dataframe.
# dataset = pd.DataFrame()
combined_query.run(
    query_string=query_string,
    output_location="s3://" + default_s3_bucket_name + "/" + prefix + "/query_results/",
)
combined_query.wait()
dataset_val = combined_query.as_dataframe()

dataset_val.head(3)

Running 
SELECT *
FROM "sagemaker_featurestore"."combined_landmark_feature_group_01_23_17_25_1769987970"
WHERE dataset_split = 'val'

Running:
 
SELECT *
FROM "sagemaker_featurestore"."combined_landmark_feature_group_01_23_17_25_1769987970"
WHERE dataset_split = 'val'



Unnamed: 0,image_id,label,file_size,width,height,aspect_ratio,area,log_file_size,eye_center_x_norm,eye_center_y_norm,...,eye_angle,mouth_x_norm,mouth_y_norm,mouth_eye_y_norm,resolution_bucket,event_time,dataset_split,write_time,api_invocation_time,is_deleted
0,s3://sagemaker-us-east-1-549206572067/cat-land...,1,126317,375,500,0.75,187500,11.746558,0.488,0.332,...,0.463648,0.450667,0.394,0.668,,1769985000.0,val,2026-02-01 23:25:11.562,2026-02-01 23:20:15.000,False
1,s3://sagemaker-us-east-1-549206572067/cat-land...,1,338261,1024,735,1.393197,752640,12.731576,0.383301,0.284354,...,0.067693,0.375,0.356463,0.715646,,1769985000.0,val,2026-02-01 23:25:11.562,2026-02-01 23:20:19.000,False
2,s3://sagemaker-us-east-1-549206572067/cat-land...,0,2132,32,32,1.0,1024,7.665285,,,...,,,,,,1769985000.0,val,2026-02-01 23:25:11.562,2026-02-01 23:20:20.000,False


In [95]:
combined_query = combined_landmark_feature_group.athena_query()

query_string = """
SELECT *
FROM "{db}"."{table}"
WHERE dataset_split = 'train'
""".format(
    db=combined_query.database,
    table=combined_query.table_name
)


print("Running " + query_string)


combined_query.run(
    query_string=query_string,
    output_location = "s3://sagemaker-us-east-1-549206572067/athena-results/"

)

print("Running:\n", query_string)

# run Athena query. The output is loaded to a Pandas dataframe.
# dataset = pd.DataFrame()
combined_query.run(
    query_string=query_string,
    output_location="s3://" + default_s3_bucket_name + "/" + prefix + "/query_results/",
)
combined_query.wait()
dataset_train = combined_query.as_dataframe()

dataset_train.head(3)

Running 
SELECT *
FROM "sagemaker_featurestore"."combined_landmark_feature_group_01_23_17_25_1769987970"
WHERE dataset_split = 'train'

Running:
 
SELECT *
FROM "sagemaker_featurestore"."combined_landmark_feature_group_01_23_17_25_1769987970"
WHERE dataset_split = 'train'



Unnamed: 0,image_id,label,file_size,width,height,aspect_ratio,area,log_file_size,eye_center_x_norm,eye_center_y_norm,...,eye_angle,mouth_x_norm,mouth_y_norm,mouth_eye_y_norm,resolution_bucket,event_time,dataset_split,write_time,api_invocation_time,is_deleted
0,s3://sagemaker-us-east-1-549206572067/cat-land...,1,222491,1024,683,1.499268,699392,12.312646,0.564453,0.545388,...,-0.156927,0.583008,0.740849,0.454612,,1769985000.0,train,2026-02-01 23:25:12.813,2026-02-01 23:20:14.000,False
1,s3://sagemaker-us-east-1-549206572067/cat-land...,1,130062,379,500,0.758,189500,11.775774,0.385224,0.32,...,0.068076,0.353562,0.438,0.68,,1769985000.0,train,2026-02-01 23:25:12.813,2026-02-01 23:20:15.000,False
2,s3://sagemaker-us-east-1-549206572067/cat-land...,0,2269,32,32,1.0,1024,7.727535,,,...,,,,,,1769985000.0,train,2026-02-01 23:25:12.813,2026-02-01 23:20:15.000,False


In [101]:
# Comments:
# - Uses s3_uri ie image_id to load image
# - Augment only cat images (label==1) during training
# - Resize for ALL images so shapes match
# - Normalize for ALL images

import numpy as np
from PIL import Image

def resize_image(image_np, size=(224, 224)):
    return np.array(Image.fromarray(image_np).resize(size, Image.BILINEAR))

class CatClsDataset:
    def __init__(self, df, augment=False, aug=None, image_size=(224, 224)):
        self.df = df.reset_index(drop=True)
        self.augment = augment
        self.aug = aug
        self.image_size = image_size

    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        label = int(row["label"])

        img_np = np.array(read_image_from_s3(row["image_id"]))

        # augment only for cat images
        if self.augment and label == 1 and self.aug is not None:
            img_np = self.aug(image=img_np)["image"]

        # resize for all images (label 0 and 1)
        img_np = resize_image(img_np, size=self.image_size)

        # normalize for all images
        img_np = img_np.astype(np.float32) / 255.0

        return img_np, label



In [97]:
# Install albumentations 
%pip install albumentations --quiet


[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
autogluon-multimodal 1.4.0 requires nvidia-ml-py3<8.0,>=7.352.0, which is not installed.
sagemaker-studio 1.1.4 requires pydynamodb>=0.7.4, which is not installed.
amazon-sagemaker-jupyter-ai-q-developer 1.2.8 requires numpy<=2.0.1, but you have numpy 2.4.2 which is incompatible.
amazon-sagemaker-sql-magic 0.1.4 requires numpy<2, but you have numpy 2.4.2 which is incompatible.
autogluon-common 1.4.0 requires numpy<2.4.0,>=1.25.0, but you have numpy 2.4.2 which is incompatible.
autogluon-core 1.4.0 requires numpy<2.4.0,>=1.25.0, but you have numpy 2.4.2 which is incompatible.
autogluon-features 1.4.0 requires numpy<2.4.0,>=1.25.0, but you have numpy 2.4.2 which is incompatible.
autogluon-multimodal 1.4.0 requires numpy<2.4.0,>=1.25.0, but you have numpy 2.4.2 which is incompatible.
autogluon-multimodal 1.4.0 r

In [102]:
# Augmentation applies ONLY to cat images (label == 1)
import albumentations as A

cat_aug = A.Compose(
    [
        A.HorizontalFlip(p=0.5),
        A.ShiftScaleRotate(
            shift_limit=0.05,
            scale_limit=0.10,
            rotate_limit=15,
            p=0.7
        ),
        A.RandomBrightnessContrast(p=0.4),
    ],
    keypoint_params=A.KeypointParams(format="xy", remove_invisible=True)
)

def augment_cat_only(image_np, keypoints_xy):
    out = cat_aug(image=image_np, keypoints=keypoints_xy)
    return out["image"], out["keypoints"]


In [103]:
train_ds = CatClsDataset(dataset_train, augment=True,  aug=cat_aug, image_size=(224, 224))
val_ds   = CatClsDataset(dataset_val,   augment=False, aug=None,   image_size=(224, 224))
test_ds  = CatClsDataset(dataset_test,  augment=False, aug=None,   image_size=(224, 224))


###  Verifying if Resizing worked

In [104]:
img_cat, y_cat = train_ds[dataset_train[dataset_train["label"]==1].index[0]]
img_nc,  y_nc  = train_ds[dataset_train[dataset_train["label"]==0].index[0]]

print(y_cat, img_cat.shape)
print(y_nc,  img_nc.shape)


1 (224, 224, 3)
0 (224, 224, 3)


### Verifying if Augemntation worked

In [105]:
cat_idx = dataset_train[dataset_train["label"]==1].index[0]
img1, y1 = train_ds[cat_idx]
img2, y2 = train_ds[cat_idx]
print("cat diff:", np.mean(np.abs(img1 - img2)))

noncat_idx = dataset_train[dataset_train["label"]==0].index[0]
img1, y1 = train_ds[noncat_idx]
img2, y2 = train_ds[noncat_idx]
print("noncat diff:", np.mean(np.abs(img1 - img2)))


cat diff: 0.27106896
noncat diff: 0.0


In [106]:
# Comments:
# - Creates small manifest files that point to image locations + labels
# - These CSVs are NOT the images; they just index them

import pandas as pd
import os

local_dir = "/tmp/cat_manifests"
os.makedirs(local_dir, exist_ok=True)

def write_manifest(df, out_path):
    df[["image_id", "label"]].to_csv(out_path, index=False)

write_manifest(dataset_train, f"{local_dir}/train.csv")
write_manifest(dataset_val,   f"{local_dir}/val.csv")
write_manifest(dataset_test,  f"{local_dir}/test.csv")

print("Wrote:", os.listdir(local_dir))


Wrote: ['train.csv', 'val.csv', 'test.csv']


In [107]:

#  Uploading manifest CSVs to S3 so SageMaker training job can download them

s3_base = "cat-landmarks-project/feature-store/manifests"

s3.upload_file(f"{local_dir}/train.csv", bucket, f"{s3_base}/train/train.csv")
s3.upload_file(f"{local_dir}/val.csv",   bucket, f"{s3_base}/val/val.csv")
s3.upload_file(f"{local_dir}/test.csv",  bucket, f"{s3_base}/test/test.csv")

print("Uploaded manifests.")


Uploaded manifests.


## Modelling

In [110]:

import sagemaker
from sagemaker.pytorch import PyTorch

sess = sagemaker.Session()
role = sagemaker.get_execution_role()

training_output_path = f"s3://{default_s3_bucket_name}/{prefix}/training_output"


from sagemaker.pytorch import PyTorch

training_model = PyTorch(
    entry_point="train.py",        
    source_dir="src",              
    role=role,
    framework_version="2.1",
    py_version="py310",
    instance_count=1,
    instance_type="ml.m5.large",  
    output_path=training_output_path,
    hyperparameters={
        "epochs": 2,
        "batch_size": 32,
        "lr": 1e-3,
        "image_size": 224
    },
)






In [113]:
inputs = {
    "train": f"s3://{bucket}/{s3_base}/train/",
    "val":   f"s3://{bucket}/{s3_base}/val/",
}
training_model.fit(inputs=inputs, logs=True)


INFO:sagemaker.telemetry.telemetry_logging:SageMaker Python SDK will collect telemetry to help us better understand our user's needs, diagnose issues, and deliver additional features.
To opt out of telemetry, please disable via TelemetryOptOut parameter in SDK defaults config. For more information, refer to https://sagemaker.readthedocs.io/en/stable/overview.html#configuring-and-using-defaults-with-the-sagemaker-python-sdk.
INFO:sagemaker.image_uris:image_uri is not presented, retrieving image_uri based on instance_type, framework etc.
INFO:sagemaker:Creating training-job with name: pytorch-training-2026-02-02-00-01-00-584


2026-02-02 00:01:05 Starting - Starting the training job...
2026-02-02 00:01:22 Starting - Preparing the instances for training...
2026-02-02 00:01:43 Downloading - Downloading input data...
2026-02-02 00:02:29 Downloading - Downloading the training image.........
2026-02-02 00:03:55 Training - Training image download completed. Training in progress..[34mbash: cannot set terminal process group (-1): Inappropriate ioctl for device[0m
[34mbash: no job control in this shell[0m
  "cipher": algorithms.TripleDES,[0m
  "class": algorithms.TripleDES,[0m
[34m2026-02-02 00:03:59,061 sagemaker-training-toolkit INFO     Imported framework sagemaker_pytorch_container.training[0m
[34m2026-02-02 00:03:59,062 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2026-02-02 00:03:59,062 sagemaker-training-toolkit INFO     No Neurons detected (normal if no neurons installed)[0m
[34m2026-02-02 00:03:59,073 sagemaker_pytorch_container.training INFO     Bloc

In [124]:

# Loading  train/val manifests from the exact S3 locations

import pandas as pd

bucket = "sagemaker-us-east-1-549206572067"  # use the same bucket variable you used for upload
s3_base = "cat-landmarks-project/feature-store/manifests"

train_df = pd.read_csv(f"s3://{bucket}/{s3_base}/train/train.csv")
val_df   = pd.read_csv(f"s3://{bucket}/{s3_base}/val/val.csv")
test_df  = pd.read_csv(f"s3://{bucket}/{s3_base}/test/test.csv")

print("Val class balance:")
print(val_df["label"].value_counts(dropna=False))
print(val_df["label"].value_counts(normalize=True, dropna=False))

#  checking for leakage (train vs val)
overlap_tv = set(train_df["image_id"]).intersection(set(val_df["image_id"]))
print("Train/Val overlap:", len(overlap_tv))

# checking for any overlap also ensure test is clean
overlap_tt = set(train_df["image_id"]).intersection(set(test_df["image_id"]))
overlap_vt = set(val_df["image_id"]).intersection(set(test_df["image_id"]))
print("Train/Test overlap:", len(overlap_tt))
print("Val/Test overlap:", len(overlap_vt))


Val class balance:
label
0    1507
1    1018
Name: count, dtype: int64
label
0    0.596832
1    0.403168
Name: proportion, dtype: float64
Train/Val overlap: 0
Train/Test overlap: 0
Val/Test overlap: 0
