In [None]:
# Step 1: Split Data and Create a Training Dataset

In [95]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sagemaker.s3 import S3Downloader
from sagemaker.inputs import TrainingInput
from sagemaker import image_uris
from sagemaker.estimator import Estimator
import os

# === Re-Train and Evaluate - Definitive Final Script (No Data Leakage) ===
# Trains a multi-class XGBoost model that predicts 'area name' and scores it on a
# held-out 20% split through a SageMaker Batch Transform job.
# Names expected from earlier cells: df_cleaned, sagemaker_session, project_prefix,
# bucket, region, role.

# --- Step 1: Prepare Final DataFrame ---
print("--- Step 1: Preparing Final DataFrame for Training ---")
df_for_training = df_cleaned.copy()

# A. Create the integer-encoded target column
# Class ids follow the order in which area names first appear in df_cleaned, so the
# mapping is only stable for the same df_cleaned row order.
area_name_map = {name: i for i, name in enumerate(df_cleaned['area name'].unique())}
df_for_training['area_name_target'] = df_for_training['area name'].map(area_name_map)
reverse_area_name_map = {v: k for k, v in area_name_map.items()}

# B. Create engineered features
# 'time occ' is integer-divided by 100 to get the hour (assumes an HHMM-style integer - TODO confirm).
df_for_training['occ_hour'] = df_for_training['time occ'] // 100
# errors='coerce' turns unparseable dates into NaT; the derived day_of_week/month are then NaN
# and is_weekend evaluates to 0 for those rows.
df_for_training['date_occ_dt'] = pd.to_datetime(df_for_training['date occ'], errors='coerce')
df_for_training['day_of_week'] = df_for_training['date_occ_dt'].dt.dayofweek
df_for_training['month'] = df_for_training['date_occ_dt'].dt.month
df_for_training['is_weekend'] = (df_for_training['day_of_week'] >= 5).astype(int)
df_for_training = pd.get_dummies(df_for_training, columns=['vict sex'], drop_first=True, dtype=int)

# C. Define X and y with NO leaky features
y = df_for_training['area_name_target']
# THIS IS THE CRITICAL FIX: 'area' and 'rpt dist no' are both removed.
# The column ORDER below is part of the model contract: the CSVs are written without a header,
# so any later inference input must use exactly these columns in exactly this order.
final_feature_columns = [
    'part 1-2', 'crm cd', 'vict age', 'premis cd', 'lat', 'lon',
    'occ_hour', 'day_of_week', 'month', 'is_weekend', 'vict sex_M'
]
X = df_for_training[final_feature_columns]
print(f"✅ Final features (X) and target (y) created with {X.shape[1]} non-leaky features.")


# --- Step 2: Split Data, Create & Upload Clean CSVs ---
print("\n--- Step 2: Splitting Data and Uploading to S3 ---")
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
# Prep for training: target goes in the first column, no header and no index
train_df = pd.concat([y_train, X_train], axis=1)
train_df.to_csv("train_final_v3.csv", header=False, index=False)
s3_train_path = sagemaker_session.upload_data("train_final_v3.csv", key_prefix=f"{project_prefix}/datasets/final_train_v3")
# Prep for batch transform (features only)
X_test.to_csv("test_features_final_v3.csv", header=False, index=False)
s3_test_path = sagemaker_session.upload_data("test_features_final_v3.csv", key_prefix=f"{project_prefix}/datasets/final_test_features_v3")
print("✅ Training and Test CSVs created and uploaded.")


# --- Step 3: Train the XGBoost Model (v3) ---
print("\n--- Step 3: Configuring and Starting Training Job ---")
train_input = TrainingInput(s3_train_path, content_type="csv")
num_location_classes = y.nunique()
xgboost_container = image_uris.retrieve(framework="xgboost", region=region, version="1.7-1")
hyperparameters = {
    "objective": "multi:softmax", "num_class": num_location_classes, "max_depth": 5, "eta": 0.2,
    "gamma": 4, "min_child_weight": 6, "subsample": 0.8, "eval_metric": "merror", "num_round": 150
}
xgb_estimator_location_v3 = Estimator(
    image_uri=xgboost_container, hyperparameters=hyperparameters, role=role, instance_count=1, instance_type="ml.m5.xlarge",
    output_path=f"s3://{bucket}/{project_prefix}/training-output/location-model-v3", sagemaker_session=sagemaker_session
)
xgb_estimator_location_v3.fit({"train": train_input})
print("\n✅ Training job complete.")


# --- Step 4: Run Batch Transform Job ---
print("\n--- Step 4: Starting Batch Transform Job for Evaluation ---")
transformer_loc_v3 = xgb_estimator_location_v3.transformer(instance_count=1, instance_type='ml.m5.xlarge')
transformer_loc_v3.transform(data=s3_test_path, content_type='text/csv', split_type='Line')
print("Waiting for Batch Transform job to complete...")
transformer_loc_v3.wait()
print("\n✅ Batch Transform job complete.")


# --- Step 5: Download Predictions and Show Final Report ---
print("\n--- Step 5: Downloading Predictions and Evaluating ---")
prediction_s3_path = transformer_loc_v3.output_path
local_predictions_path = "test_predictions_final.csv"
output_filename = "test_features_final_v3.csv.out"
# Hand the full S3 URI to S3Downloader so bucket and key come from the job's real output
# location, instead of assuming the output lives in `bucket` and stripping it by string replace.
S3Downloader.download(
    s3_uri=f"{prediction_s3_path}/{output_filename}",
    local_path=".",
    sagemaker_session=sagemaker_session,
)
# os.replace overwrites a predictions file left behind by a previous run on every platform.
os.replace(output_filename, local_predictions_path)

y_pred_loc_final = pd.read_csv(local_predictions_path, header=None).values.flatten()
# Expect exactly one predicted class id per test row; anything else means the output file
# is not what this cell assumes and the report would be meaningless.
if len(y_pred_loc_final) != len(y_test):
    raise ValueError(
        f"Batch Transform returned {len(y_pred_loc_final)} predictions for {len(y_test)} test rows."
    )
y_pred_loc_final = y_pred_loc_final.astype(int)

# Pass labels explicitly so each report row is tied to its area name by class id rather than
# by whichever labels happen to occur in y_test / y_pred.
class_ids = sorted(reverse_area_name_map)
target_names = [reverse_area_name_map[i] for i in class_ids]
print("\n--- Final, Realistic Model Performance ---")
print("Classification Report:")
print(classification_report(y_test, y_pred_loc_final, labels=class_ids, target_names=target_names, zero_division=0))

--- Step 1: Preparing Final DataFrame for Training ---
✅ Final features (X) and target (y) created with 11 non-leaky features.

--- Step 2: Splitting Data and Uploading to S3 ---


  df_for_training['date_occ_dt'] = pd.to_datetime(df_for_training['date occ'], errors='coerce')
INFO:sagemaker.image_uris:Ignoring unnecessary instance type: None.
INFO:sagemaker.telemetry.telemetry_logging:SageMaker Python SDK will collect telemetry to help us better understand our user's needs, diagnose issues, and deliver additional features.
To opt out of telemetry, please disable via TelemetryOptOut parameter in SDK defaults config. For more information, refer to https://sagemaker.readthedocs.io/en/stable/overview.html#configuring-and-using-defaults-with-the-sagemaker-python-sdk.
INFO:sagemaker:Creating training-job with name: sagemaker-xgboost-2025-06-24-04-58-32-936


✅ Training and Test CSVs created and uploaded.

--- Step 3: Configuring and Starting Training Job ---
2025-06-24 04:58:38 Starting - Starting the training job...
2025-06-24 04:58:53 Starting - Preparing the instances for training...
2025-06-24 04:59:37 Downloading - Downloading the training image......
2025-06-24 05:00:38 Training - Training image download completed. Training in progress...[34m[2025-06-24 05:00:40.220 ip-10-0-81-137.ec2.internal:7 INFO utils.py:28] RULE_JOB_STOP_SIGNAL_FILENAME: None[0m
[34m[2025-06-24 05:00:40.243 ip-10-0-81-137.ec2.internal:7 INFO profiler_config_parser.py:111] User has disabled profiler.[0m
[34m[2025-06-24:05:00:40:INFO] Imported framework sagemaker_xgboost_container.training[0m
[34m[2025-06-24:05:00:40:INFO] Failed to parse hyperparameter eval_metric value merror to Json.[0m
[34mReturning the value itself[0m
[34m[2025-06-24:05:00:40:INFO] Failed to parse hyperparameter objective value multi:softmax to Json.[0m
[34mReturning the value i

INFO:sagemaker:Creating model with name: sagemaker-xgboost-2025-06-24-05-01-20-020


Training seconds: 106
Billable seconds: 106

✅ Training job complete.

--- Step 4: Starting Batch Transform Job for Evaluation ---


INFO:sagemaker:Creating transform job with name: sagemaker-xgboost-2025-06-24-05-01-20-712


................................[34m[2025-06-24:05:06:39:INFO] No GPUs detected (normal if no gpus installed)[0m
[34m[2025-06-24:05:06:39:INFO] No GPUs detected (normal if no gpus installed)[0m
[34m[2025-06-24:05:06:39:INFO] nginx config: [0m
[34mworker_processes auto;[0m
[34mdaemon off;[0m
[34mpid /tmp/nginx.pid;[0m
[34merror_log  /dev/stderr;[0m
[34mworker_rlimit_nofile 4096;[0m
[34mevents {
  worker_connections 2048;[0m
[34m}[0m
[35m[2025-06-24:05:06:39:INFO] No GPUs detected (normal if no gpus installed)[0m
[35m[2025-06-24:05:06:39:INFO] No GPUs detected (normal if no gpus installed)[0m
[35m[2025-06-24:05:06:39:INFO] nginx config: [0m
[35mworker_processes auto;[0m
[35mdaemon off;[0m
[35mpid /tmp/nginx.pid;[0m
[35merror_log  /dev/stderr;[0m
[35mworker_rlimit_nofile 4096;[0m
[35mevents {
  worker_connections 2048;[0m
[35m}[0m
[34mhttp {
  include /etc/nginx/mime.types;
  default_type application/octet-stream;
  access_log /dev/stdout combined;

In [None]:
# Model Evaluation

In [96]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sagemaker.s3 import S3Downloader
import os

# === Evaluate Final Model - All-in-One Script ===
# Re-scores the already-trained `xgb_estimator_location_v3` on the held-out test split.
# Names expected from earlier cells: df_cleaned, sagemaker_session, project_prefix,
# xgb_estimator_location_v3.
#
# IMPORTANT: the feature set and column order here must be identical to the ones the model
# was trained on. The CSV sent to Batch Transform has no header, so columns are matched purely
# by position; a different list (e.g. one that re-introduces 'rpt dist no' or drops
# occ_hour/month/is_weekend) makes the report describe garbage rather than the model.

# --- Step 1: Re-create the Test Set to Ensure All Variables are Defined ---
print("--- Step 1: Preparing Test Data ---")
# Start from the cleaned dataframe
df_for_eval = df_cleaned.copy()

# A. Create the integer-encoded target column (same construction as in the training cell)
area_name_map = {name: i for i, name in enumerate(df_cleaned['area name'].unique())}
df_for_eval['area_name_target'] = df_for_eval['area name'].map(area_name_map)
reverse_area_name_map = {v: k for k, v in area_name_map.items()}

# B. Create engineered features - exactly the ones built for training
df_for_eval['occ_hour'] = df_for_eval['time occ'] // 100
df_for_eval['date_occ_dt'] = pd.to_datetime(df_for_eval['date occ'], errors='coerce')
df_for_eval['day_of_week'] = df_for_eval['date_occ_dt'].dt.dayofweek
df_for_eval['month'] = df_for_eval['date_occ_dt'].dt.month
df_for_eval['is_weekend'] = (df_for_eval['day_of_week'] >= 5).astype(int)
df_for_eval = pd.get_dummies(df_for_eval, columns=['vict sex'], drop_first=True, dtype=int)


# C. Define X and y with ONLY the final, non-leaky features
y = df_for_eval['area_name_target']
# Same 11 columns, same order as the v3 training data ('area' and 'rpt dist no' stay excluded).
final_feature_columns = [
    'part 1-2', 'crm cd', 'vict age', 'premis cd', 'lat', 'lon',
    'occ_hour', 'day_of_week', 'month', 'is_weekend', 'vict sex_M'
]
X = df_for_eval[final_feature_columns]

# D. Create the train/test split to get the exact same test set
# Same test_size / random_state / stratify as training, applied to the same rows.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
print("✅ Test set created successfully.")


# --- Step 2: Upload Test Features for Batch Job ---
print("\n--- Step 2: Preparing and Uploading Test Data ---")
X_test.to_csv("test_features_final.csv", header=False, index=False)
s3_test_path = sagemaker_session.upload_data(
    "test_features_final.csv",
    key_prefix=f"{project_prefix}/datasets/final_test_features_v8" # New version
)
print(f"✅ Test features uploaded to S3: {s3_test_path}")


# --- Step 3: Run the Batch Transform Job ---
print("\n--- Step 3: Creating Transformer and Starting Job ---")
# Use the xgb_estimator_location_v3 object from your last successful training job
transformer = xgb_estimator_location_v3.transformer(
    instance_count=1,
    instance_type='ml.m5.xlarge'
)
transformer.transform(
    data=s3_test_path,
    content_type='text/csv',
    split_type='Line'
)
print("Waiting for Batch Transform job to complete (this may take 5-10 minutes)...")
transformer.wait()
print("\n✅ Batch Transform job complete. Predictions are ready in S3.")


# --- Step 4: Download Predictions and Show Report ---
print("\n--- Step 4: Downloading Predictions and Evaluating ---")
prediction_s3_path = transformer.output_path
local_predictions_path = "test_predictions_final.csv"
output_filename = "test_features_final.csv.out"
# Use the full S3 URI so bucket and key are taken from the job's actual output location.
S3Downloader.download(
    s3_uri=f"{prediction_s3_path}/{output_filename}",
    local_path=".",
    sagemaker_session=sagemaker_session,
)
# os.replace overwrites a predictions file left behind by a previous run on every platform.
os.replace(output_filename, local_predictions_path)

y_pred_final = pd.read_csv(local_predictions_path, header=None).values.flatten()
# One predicted class id per test row is expected; fail loudly otherwise.
if len(y_pred_final) != len(y_test):
    raise ValueError(
        f"Batch Transform returned {len(y_pred_final)} predictions for {len(y_test)} test rows."
    )
y_pred_final = y_pred_final.astype(int)

# Explicit labels keep each report row tied to its area name by class id.
class_ids = sorted(reverse_area_name_map)
target_names = [reverse_area_name_map[i] for i in class_ids]

print("\n--- Final, Realistic Model Performance ---")
print("Classification Report:")
print(classification_report(y_test, y_pred_final, labels=class_ids, target_names=target_names, zero_division=0))

--- Step 1: Preparing Test Data ---
✅ Test set created successfully.

--- Step 2: Preparing and Uploading Test Data ---
✅ Test features uploaded to S3: s3://sagemaker-us-east-1-564543410445/crime-prediction-datalake/datasets/final_test_features_v8/test_features_final.csv

--- Step 3: Creating Transformer and Starting Job ---


  df_for_eval['date_occ_dt'] = pd.to_datetime(df_for_eval['date occ'], errors='coerce')
INFO:sagemaker:Creating model with name: sagemaker-xgboost-2025-06-24-05-20-25-100
INFO:sagemaker:Creating transform job with name: sagemaker-xgboost-2025-06-24-05-20-25-909


...............................[34m[2025-06-24:05:25:41:INFO] No GPUs detected (normal if no gpus installed)[0m
[34m[2025-06-24:05:25:41:INFO] No GPUs detected (normal if no gpus installed)[0m
[34m[2025-06-24:05:25:41:INFO] nginx config: [0m
[34mworker_processes auto;[0m
[34mdaemon off;[0m
[34mpid /tmp/nginx.pid;[0m
[34merror_log  /dev/stderr;[0m
[34mworker_rlimit_nofile 4096;[0m
[34mevents {
  worker_connections 2048;[0m
[34m}[0m
[34mhttp {
  include /etc/nginx/mime.types;
  default_type application/octet-stream;
  access_log /dev/stdout combined;
  upstream gunicorn {
    server unix:/tmp/gunicorn.sock;
  }
  server {
    listen 8080 deferred;
    client_max_body_size 0;
    keepalive_timeout 3;
    location ~ ^/(ping|invocations|execution-parameters) {
      proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
      proxy_set_header Host $http_host;
      proxy_redirect off;
      proxy_read_timeout 60s;
      proxy_pass http://gunicorn;
    }
    locati

In [99]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sagemaker.s3 import S3Downloader
from sagemaker.inputs import TrainingInput
from sagemaker import image_uris
from sagemaker.estimator import Estimator
import os
import numpy as np # Ensure numpy is imported

# === Train and Evaluate Model 1: Crime Category (Logistic Regression) - Final Version ===
# Trains a SageMaker Linear Learner binary classifier on the 'part 1-2' column and scores it
# on a held-out 20% split through Batch Transform.
# Names expected from earlier cells: df_cleaned, sagemaker_session, project_prefix,
# bucket, region, role.

# --- Step 1: Prepare Final DataFrame ---
print("--- Step 1: Preparing Data for Model 1 ---")
df_for_training = df_cleaned.copy()
# Recode the target to 0/1: value 2 becomes 0, every other value is left unchanged.
df_for_training['part_1_2_target'] = df_for_training['part 1-2'].replace({2: 0})
y = df_for_training['part_1_2_target']
df_for_training = pd.get_dummies(df_for_training, columns=['vict sex'], drop_first=True, dtype=int)
# errors='coerce' turns unparseable dates into NaT, which makes day_of_week NaN for those rows.
df_for_training['date_occ_dt'] = pd.to_datetime(df_for_training['date occ'], errors='coerce')
df_for_training['day_of_week'] = df_for_training['date_occ_dt'].dt.dayofweek
final_feature_columns = ['vict age', 'lat', 'lon', 'day_of_week', 'vict sex_M']
X = df_for_training[final_feature_columns]
print("✅ Final features (X) and target (y) created for Model 1.")

# --- Step 2: Split Data, Create & Upload Clean CSVs ---
print("\n--- Step 2: Splitting Data and Uploading to S3 ---")
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
# Training CSV: target in the first column, no header and no index
train_df = pd.concat([y_train, X_train], axis=1)
train_df.to_csv("train_model1.csv", header=False, index=False)
s3_train_path = sagemaker_session.upload_data("train_model1.csv", key_prefix=f"{project_prefix}/datasets/model1_train_v2")
# Batch transform CSV: features only, same column order as training
X_test.to_csv("test_features_model1.csv", header=False, index=False)
s3_test_path = sagemaker_session.upload_data("test_features_model1.csv", key_prefix=f"{project_prefix}/datasets/model1_test_features_v2")
print("✅ Training and Test CSVs for Model 1 created and uploaded.")

# --- Step 3: Train the Linear Learner Model ---
print("\n--- Step 3: Configuring and Starting Linear Learner Training Job ---")
linear_learner_container = image_uris.retrieve(framework="linear-learner", region=region, version="1")
linear_estimator = Estimator(
    image_uri=linear_learner_container, role=role, instance_count=1, instance_type="ml.m5.xlarge",
    output_path=f"s3://{bucket}/{project_prefix}/training-output/linear-model", sagemaker_session=sagemaker_session
)
linear_estimator.set_hyperparameters(predictor_type='binary_classifier', feature_dim=X_train.shape[1], mini_batch_size=100)
train_input = TrainingInput(s3_train_path, content_type="text/csv")
linear_estimator.fit({"train": train_input})
print("\n✅ Training job for Linear Learner complete.")

# --- Step 4: Evaluate the Model with Batch Transform ---
print("\n--- Step 4: Starting Batch Transform Job for Evaluation ---")
transformer = linear_estimator.transformer(instance_count=1, instance_type='ml.m5.xlarge', accept="text/csv")
transformer.transform(data=s3_test_path, content_type='text/csv', split_type='Line')
print("Waiting for Batch Transform job to complete...")
transformer.wait()
print("\n✅ Batch Transform job complete.")

# --- Step 5: Download Predictions and Show Report ---
print("\n--- Step 5: Downloading Predictions and Evaluating ---")
prediction_s3_path = transformer.output_path
local_predictions_path = "test_predictions_model1.csv"
output_filename = "test_features_model1.csv.out"
# Use the full S3 URI so bucket and key are taken from the job's actual output location,
# instead of assuming the output lives in `bucket` and stripping it by string replace.
S3Downloader.download(
    s3_uri=f"{prediction_s3_path}/{output_filename}",
    local_path=".",
    sagemaker_session=sagemaker_session,
)
# os.replace overwrites a predictions file left behind by a previous run on every platform.
os.replace(output_filename, local_predictions_path)

# Load the CSV output directly with pandas and take its first column as the predicted label.
# NOTE(review): confirm the Linear Learner text/csv output puts the label in column 0.
y_pred_df = pd.read_csv(local_predictions_path, header=None)
y_pred = y_pred_df[0].values
# One prediction per test row is expected; fail loudly otherwise.
if len(y_pred) != len(y_test):
    raise ValueError(
        f"Batch Transform returned {len(y_pred)} predictions for {len(y_test)} test rows."
    )

print("\n--- Final Model Performance (Model 1: Logistic Regression) ---")
print("Classification Report:")
print(classification_report(y_test, y_pred, zero_division=0))

  df_for_training['date_occ_dt'] = pd.to_datetime(df_for_training['date occ'], errors='coerce')
INFO:sagemaker.image_uris:Same images used for training and inference. Defaulting to image scope: inference.


--- Step 1: Preparing Data for Model 1 ---
✅ Final features (X) and target (y) created for Model 1.

--- Step 2: Splitting Data and Uploading to S3 ---
✅ Training and Test CSVs for Model 1 created and uploaded.

--- Step 3: Configuring and Starting Linear Learner Training Job ---


INFO:sagemaker.image_uris:Ignoring unnecessary instance type: None.
INFO:sagemaker.telemetry.telemetry_logging:SageMaker Python SDK will collect telemetry to help us better understand our user's needs, diagnose issues, and deliver additional features.
To opt out of telemetry, please disable via TelemetryOptOut parameter in SDK defaults config. For more information, refer to https://sagemaker.readthedocs.io/en/stable/overview.html#configuring-and-using-defaults-with-the-sagemaker-python-sdk.
INFO:sagemaker:Creating training-job with name: linear-learner-2025-06-24-05-48-16-777


2025-06-24 05:48:17 Starting - Starting the training job...
2025-06-24 05:48:43 Starting - Preparing the instances for training...
2025-06-24 05:49:22 Downloading - Downloading the training image.........
2025-06-24 05:50:33 Training - Training image download completed. Training in progress.[34mDocker entrypoint called with argument(s): train[0m
[34mRunning default environment configuration script[0m
[34m[06/24/2025 05:50:42 INFO 139641928038208] Reading default configuration from /opt/amazon/lib/python3.8/site-packages/algorithm/resources/default-input.json: {'mini_batch_size': '1000', 'epochs': '15', 'feature_dim': 'auto', 'use_bias': 'true', 'binary_classifier_model_selection_criteria': 'accuracy', 'f_beta': '1.0', 'target_recall': '0.8', 'target_precision': '0.8', 'num_models': 'auto', 'num_calibration_samples': '10000000', 'init_method': 'uniform', 'init_scale': '0.07', 'init_sigma': '0.01', 'init_bias': '0.0', 'optimizer': 'auto', 'loss': 'auto', 'margin': '1.0', 'quantile':

INFO:sagemaker:Creating model with name: linear-learner-2025-06-24-05-51-34-156


Training seconds: 124
Billable seconds: 124

✅ Training job for Linear Learner complete.

--- Step 4: Starting Batch Transform Job for Evaluation ---


INFO:sagemaker:Creating transform job with name: linear-learner-2025-06-24-05-51-34-850


....................................
.[34mDocker entrypoint called with argument(s): serve[0m
[34mRunning default environment configuration script[0m
[34m[06/24/2025 05:57:38 INFO 140344041035584] Memory profiler is not enabled by the environment variable ENABLE_PROFILER.[0m
  if num_device is 1 and 'dist' not in kvstore:[0m
  if cons['type'] is 'ineq':[0m
  if len(self.X_min) is not 0:[0m
[35mDocker entrypoint called with argument(s): serve[0m
[35mRunning default environment configuration script[0m
[35m[06/24/2025 05:57:38 INFO 140344041035584] Memory profiler is not enabled by the environment variable ENABLE_PROFILER.[0m
  if num_device is 1 and 'dist' not in kvstore:[0m
  if cons['type'] is 'ineq':[0m
  if len(self.X_min) is not 0:[0m
[34m[06/24/2025 05:57:41 INFO 140344041035584] loaded entry point class algorithm.serve.server_config:config_api[0m
[34m[06/24/2025 05:57:41 INFO 140344041035584] loading entry points[0m
[34m[06/24/2025 05:57:41 INFO 140344041035