In [1]:
pip install pytorch-lightning torch optuna scikit-learn pandas matplotlib seaborn pytorch-forecasting


Note: you may need to restart the kernel to use updated packages.


In [7]:
pip install tensorflow==2.17.0 pandas numpy scikit-learn matplotlib seaborn keras-tuner


Collecting tensorflow==2.17.0
  Using cached tensorflow-2.17.0-cp312-cp312-macosx_12_0_arm64.whl.metadata (4.1 kB)
Collecting astunparse>=1.6.0 (from tensorflow==2.17.0)
  Using cached astunparse-1.6.3-py2.py3-none-any.whl.metadata (4.4 kB)
Collecting flatbuffers>=24.3.25 (from tensorflow==2.17.0)
  Using cached flatbuffers-25.2.10-py2.py3-none-any.whl.metadata (875 bytes)
Collecting gast!=0.5.0,!=0.5.1,!=0.5.2,>=0.2.1 (from tensorflow==2.17.0)
  Using cached gast-0.6.0-py3-none-any.whl.metadata (1.3 kB)
Collecting google-pasta>=0.1.1 (from tensorflow==2.17.0)
  Using cached google_pasta-0.2.0-py3-none-any.whl.metadata (814 bytes)
Collecting libclang>=13.0.0 (from tensorflow==2.17.0)
  Using cached libclang-18.1.1-1-py2.py3-none-macosx_11_0_arm64.whl.metadata (5.2 kB)
Collecting ml-dtypes<0.5.0,>=0.3.1 (from tensorflow==2.17.0)
  Downloading ml_dtypes-0.4.1-cp312-cp312-macosx_10_9_universal2.whl.metadata (20 kB)
Collecting opt-einsum>=2.3.2 (from tensorflow==2.17.0)
  Using cached opt_

In [35]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder, StandardScaler

# Load the cleaned dataset
df = pd.read_csv("/Users/Intel/Desktop/spring 2025/CAPSTONE/cleaned_data.csv")

# Display basic information
print(df.info())

# Display the first few rows
print(df.head())

# Check for missing values
print(df.isnull().sum())


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 389895 entries, 0 to 389894
Data columns (total 20 columns):
 #   Column                  Non-Null Count   Dtype  
---  ------                  --------------   -----  
 0   OBJECTID                389895 non-null  int64  
 1   INCIDENT_ID             389895 non-null  int64  
 2   OFFENSE_ID              389895 non-null  int64  
 3   OFFENSE_CODE            389895 non-null  int64  
 4   OFFENSE_CODE_EXTENSION  389895 non-null  int64  
 5   OFFENSE_TYPE_ID         389895 non-null  object 
 6   OFFENSE_CATEGORY_ID     389895 non-null  object 
 7   FIRST_OCCURRENCE_DATE   389895 non-null  object 
 8   REPORTED_DATE           389895 non-null  object 
 9   INCIDENT_ADDRESS        389895 non-null  object 
 10  GEO_X                   389895 non-null  int64  
 11  GEO_Y                   389895 non-null  int64  
 12  GEO_LON                 389895 non-null  float64
 13  GEO_LAT                 389895 non-null  float64
 14  DISTRICT_ID         

In [36]:
# Ensure FIRST_OCCURRENCE_DATE is in datetime format
df["FIRST_OCCURRENCE_DATE"] = pd.to_datetime(df["FIRST_OCCURRENCE_DATE"])

# Sort by date (important for sequential modeling)
df = df.sort_values("FIRST_OCCURRENCE_DATE").reset_index(drop=True)

# Create a time index (days since the first record)
df["time_idx"] = (df["FIRST_OCCURRENCE_DATE"] - df["FIRST_OCCURRENCE_DATE"].min()).dt.days

# Verify that time_idx exists
print(df[["FIRST_OCCURRENCE_DATE", "time_idx"]].head())


  FIRST_OCCURRENCE_DATE  time_idx
0   2019-01-02 00:01:00         0
1   2019-01-02 00:10:00         0
2   2019-01-02 01:11:00         0
3   2019-01-02 01:30:00         0
4   2019-01-02 01:30:00         0


In [38]:
import torch
from pytorch_forecasting import TimeSeriesDataSet

# Define forecasting parameters
max_encoder_length = 90  # Past 90 days used for training
max_prediction_length = 30  # Predict the next 30 days

# Define the TimeSeriesDataSet
dataset = TimeSeriesDataSet(
    df,
    time_idx="time_idx",  # Time index for sequential data
    target="VICTIM_COUNT",  # Predicting crime count
    group_ids=["DISTRICT_ID"],  # Grouping by districts
    max_encoder_length=max_encoder_length,
    max_prediction_length=max_prediction_length,
    time_varying_known_reals=["time_idx"],  # Time index as known future values
    time_varying_unknown_reals=["VICTIM_COUNT", "GEO_LON", "GEO_LAT"],  # Unknown values we predict
    static_categoricals=["DISTRICT_ID", "NEIGHBORHOOD_ID", "OFFENSE_CATEGORY_ID"],  # Static categorical data
    allow_missing_timesteps=True,  # Allow for missing days
)

print("✅ TimeSeriesDataSet created with Sliding Window Approach!")


✅ TimeSeriesDataSet created with Sliding Window Approach!


In [40]:
#Train Set: Use older dates for training.
#Validation Set: Use a recent portion for tuning.
#Test Set: Use the most recent unseen data.
# Split dataset into training (70%), validation (20%), and test (10%) based on time
train_idx = int(len(df) * 0.7)
val_idx = int(len(df) * 0.9)

# Training, validation, and testing sets
train_data = TimeSeriesDataSet.from_dataset(dataset, df.iloc[:train_idx])
val_data = TimeSeriesDataSet.from_dataset(dataset, df.iloc[train_idx:val_idx])
test_data = TimeSeriesDataSet.from_dataset(dataset, df.iloc[val_idx:])

# Convert to PyTorch DataLoaders
batch_size = 64

train_dataloader = train_data.to_dataloader(train=True, batch_size=batch_size, shuffle=True)
val_dataloader = val_data.to_dataloader(train=False, batch_size=batch_size, shuffle=False)
test_dataloader = test_data.to_dataloader(train=False, batch_size=batch_size, shuffle=False)

print("✅ Data successfully split for training, validation, and testing!")


✅ Data successfully split for training, validation, and testing!


In [43]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder, StandardScaler

# ------------------- Feature Engineering -------------------

# Convert FIRST_OCCURRENCE_DATE to datetime
df["FIRST_OCCURRENCE_DATE"] = pd.to_datetime(df["FIRST_OCCURRENCE_DATE"])

# Sort by date and create time_idx
df = df.sort_values("FIRST_OCCURRENCE_DATE").reset_index(drop=True)
df["time_idx"] = (df["FIRST_OCCURRENCE_DATE"] - df["FIRST_OCCURRENCE_DATE"].min()).dt.days

# Extract useful time-based features
df["year"] = df["FIRST_OCCURRENCE_DATE"].dt.year
df["month"] = df["FIRST_OCCURRENCE_DATE"].dt.month
df["day"] = df["FIRST_OCCURRENCE_DATE"].dt.day
df["weekday"] = df["FIRST_OCCURRENCE_DATE"].dt.weekday

# ------------------- Encoding and Scaling -------------------

# Encode categorical columns
for col in ["DISTRICT_ID", "NEIGHBORHOOD_ID", "OFFENSE_CATEGORY_ID"]:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col].astype(str))

# Scale continuous columns
scaler = StandardScaler()
df[["GEO_LON", "GEO_LAT", "VICTIM_COUNT"]] = scaler.fit_transform(df[["GEO_LON", "GEO_LAT", "VICTIM_COUNT"]])

# ------------------- Data Split -------------------

train_df = df[df["FIRST_OCCURRENCE_DATE"] < "2022-01-01"]
val_df = df[(df["FIRST_OCCURRENCE_DATE"] >= "2022-01-01") & (df["FIRST_OCCURRENCE_DATE"] < "2022-07-01")]
test_df = df[df["FIRST_OCCURRENCE_DATE"] >= "2022-07-01"]

# ------------------- TFT Model (Keras Custom Implementation) -------------------

def build_tft_model():
    input_seq = keras.Input(shape=(90, 7))

    # Static embedding input (optional in this simplified version)
    lstm_out = layers.LSTM(32, return_sequences=True)(input_seq)
    attention = layers.Attention()([lstm_out, lstm_out])
    flatten = layers.GlobalAveragePooling1D()(attention)
    
    output = layers.Dense(1)(flatten)

    model = keras.Model(inputs=input_seq, outputs=output)
    model.compile(optimizer='adam', loss='mse')
    return model

model = build_tft_model()

print("✅ TensorFlow TFT model built and compiled!")


✅ TensorFlow TFT model built and compiled!


In [45]:
import tensorflow as tf
import numpy as np

# Parameters
encoder_length = 90  # past days
decoder_length = 30  # predict next days
window_size = encoder_length + decoder_length

# Create the feature array (we'll focus on "VICTIM_COUNT" for now)
data = df[['VICTIM_COUNT']].values.astype(np.float32)

# Create tf.data.Dataset with sliding window
dataset = tf.data.Dataset.from_tensor_slices(data)
dataset = dataset.window(window_size, shift=1, drop_remainder=True)
dataset = dataset.flat_map(lambda window: window.batch(window_size))

# Split into encoder (input) and decoder (target)
dataset = dataset.map(lambda window: (
    window[:encoder_length],  # encoder input
    window[encoder_length:]   # decoder target
))

# Shuffle and batch
dataset = dataset.shuffle(1000).batch(32).prefetch(tf.data.AUTOTUNE)

print("✅ Sliding window dataset ready for TFT model!")

✅ Sliding window dataset ready for TFT model!


In [47]:
# Split your dataframe based on time
train_df = df[df['FIRST_OCCURRENCE_DATE'] < "2022-01-01"]
val_df = df[(df['FIRST_OCCURRENCE_DATE'] >= "2022-01-01") & (df['FIRST_OCCURRENCE_DATE'] < "2022-07-01")]


In [49]:
# Define target and feature columns
target = "VICTIM_COUNT"
features = ['time_idx', 'GEO_LON', 'GEO_LAT', 'VICTIM_COUNT', 'DISTRICT_ID', 'NEIGHBORHOOD_ID', 'OFFENSE_CATEGORY_ID']
 # Add more features if needed like district/other static vars


In [51]:
# Prepare datasets
train_dataset = tf.data.Dataset.from_tensor_slices((train_df[features].values, train_df[target].values))
val_dataset = tf.data.Dataset.from_tensor_slices((val_df[features].values, val_df[target].values))

# Apply batching and prefetching
tf_dataset = train_dataset.shuffle(1024).batch(64).prefetch(tf.data.AUTOTUNE)
tf_val_dataset = val_dataset.batch(64).prefetch(tf.data.AUTOTUNE)


In [53]:
# Prepare tf.data.Dataset objects for training and validation
tf_dataset = dataset.shuffle(1024).batch(64).prefetch(tf.data.AUTOTUNE)
tf_val_dataset = val_dataset.batch(64).prefetch(tf.data.AUTOTUNE)


In [55]:
# Reshape before batching: Add feature dimension (assuming 1 feature for now)
train_dataset = tf.data.Dataset.from_tensor_slices((train_df[features].values[..., np.newaxis], train_df[target].values))
val_dataset = tf.data.Dataset.from_tensor_slices((val_df[features].values[..., np.newaxis], val_df[target].values))

# Batch and prefetch
tf_dataset = train_dataset.shuffle(1024).batch(64).prefetch(tf.data.AUTOTUNE)
tf_val_dataset = val_dataset.batch(64).prefetch(tf.data.AUTOTUNE)


In [57]:
features = ['time_idx', 'GEO_LON', 'GEO_LAT', 'VICTIM_COUNT', 'DISTRICT_ID', 'NEIGHBORHOOD_ID', 'OFFENSE_CATEGORY_ID']


In [63]:
# Sliding window on features and target
def create_sliding_windows(df, seq_length, target_col):
    X, y = [], []
    for i in range(len(df) - seq_length):
        X.append(df[features].iloc[i:i+seq_length].values)
        y.append(df[target_col].iloc[i+seq_length])
    return np.array(X), np.array(y)

# Apply to train and validation
X_train, y_train = create_sliding_windows(train_df, 90, target)
X_val, y_val = create_sliding_windows(val_df, 90, target)

print(X_train.shape)  # Should be (num_samples, 90, 7)
print(y_train.shape)  # Should be (num_samples,)

# TensorFlow dataset
tf_dataset = tf.data.Dataset.from_tensor_slices((X_train, y_train)).batch(64).prefetch(tf.data.AUTOTUNE)
tf_val_dataset = tf.data.Dataset.from_tensor_slices((X_val, y_val)).batch(64).prefetch(tf.data.AUTOTUNE)


(186018, 90, 7)
(186018,)


In [60]:
import tensorflow as tf
from tensorflow.keras.callbacks import EarlyStopping

# Define early stopping to prevent overfitting
early_stopping = EarlyStopping(
    monitor='val_loss',
    patience=5,  # Stop if no improvement for 5 epochs
    restore_best_weights=True
)

# Train the model
history = model.fit(
    tf_dataset,
    validation_data=tf_val_dataset,
    epochs=30,
    callbacks=[early_stopping],
    verbose=1
)

print("✅ TFT model training complete!")


Epoch 1/30
[1m2907/2907[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m39s[0m 13ms/step - loss: 0.8834 - val_loss: 0.9307
Epoch 2/30
[1m2907/2907[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m38s[0m 13ms/step - loss: 0.8812 - val_loss: 0.9307
Epoch 3/30
[1m2907/2907[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m39s[0m 13ms/step - loss: 0.8796 - val_loss: 0.9307
Epoch 4/30
[1m2907/2907[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m42s[0m 14ms/step - loss: 0.8789 - val_loss: 0.9307
Epoch 5/30
[1m2907/2907[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m39s[0m 13ms/step - loss: 0.8789 - val_loss: 0.9307
Epoch 6/30
[1m2907/2907[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m44s[0m 15ms/step - loss: 0.8787 - val_loss: 0.9307
✅ TFT model training complete!


In [61]:
hidden_size = 64  # increase from 16
attention_head_size = 8  # increase from 4
dropout = 0.2  # slight increase for regularization


In [62]:
df['day_of_week'] = df['FIRST_OCCURRENCE_DATE'].dt.dayofweek
df['month'] = df['FIRST_OCCURRENCE_DATE'].dt.month
df['week_of_year'] = df['FIRST_OCCURRENCE_DATE'].dt.isocalendar().week


In [19]:
from tensorflow.keras.callbacks import ReduceLROnPlateau

lr_scheduler = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=3)


In [21]:
EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)


<keras.src.callbacks.early_stopping.EarlyStopping at 0x303cafa40>

In [23]:
# Feature engineering BEFORE splitting
df['day_of_week'] = df['FIRST_OCCURRENCE_DATE'].dt.dayofweek
df['week_of_year'] = df['FIRST_OCCURRENCE_DATE'].dt.isocalendar().week

# Scaling features
features = ["GEO_LON", "GEO_LAT", "time_idx", "day_of_week", "week_of_year"]
target = "VICTIM_COUNT"

from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
df[features] = scaler.fit_transform(df[features])

# Now split into train and validation
train_size = int(len(df) * 0.7)
train_df = df.iloc[:train_size]
val_df = df.iloc[train_size:]

# Prepare TensorFlow datasets
train_dataset = tf.data.Dataset.from_tensor_slices((train_df[features].values, train_df[target].values))
val_dataset = tf.data.Dataset.from_tensor_slices((val_df[features].values, val_df[target].values))

# Batch and prefetch
train_dataset = train_dataset.batch(64).prefetch(tf.data.AUTOTUNE)
val_dataset = val_dataset.batch(64).prefetch(tf.data.AUTOTUNE)


In [33]:
# Step 1: Increase sequence length to use more time context
sequence_length = 30  # instead of 1
target = "VICTIM_COUNT"

# Step 2: Prepare sliding window datasets correctly (no extra wrapper needed)
train_dataset = tf.keras.utils.timeseries_dataset_from_array(
    data=train_df[features].values,
    targets=train_df[target].values,
    sequence_length=sequence_length,
    batch_size=64
)

val_dataset = tf.keras.utils.timeseries_dataset_from_array(
    data=val_df[features].values,
    targets=val_df[target].values,
    sequence_length=sequence_length,
    batch_size=64
)

# Prefetch to improve performance
train_dataset = train_dataset.prefetch(tf.data.AUTOTUNE)
val_dataset = val_dataset.prefetch(tf.data.AUTOTUNE)

# Step 3: Rebuild model with larger hidden size and attention heads
model = build_tft_model(
    input_shape=(sequence_length, len(features)),
    hidden_size=128,  # increase model capacity
    attention_heads=8,
    dropout=0.1  # reduce regularization slightly
)

model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.001), loss='mse')

# Step 4: Retrain with updated config
early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)
history = model.fit(
    train_dataset,
    validation_data=val_dataset,
    epochs=30,
    callbacks=[early_stopping],
    verbose=1
)

print("✅ Model retrained with updated hyperparameters!")


ValueError: Failed to convert a NumPy array to a Tensor (Unsupported object type int).