In [2]:
import pandas as pd
from tensorflow.keras.callbacks import ModelCheckpoint
from sklearn.preprocessing import LabelEncoder, StandardScaler
import numpy as np
from imblearn.over_sampling import SMOTE
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, Dropout, concatenate
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.optimizers import Adam

df = pd.read_csv("combined_data.csv")

spatial_features = ['city', 'location_area']
temporal_features = ['year', 'month', 'day', 'hour', 'dayofweek']
environmental_features = ['population', 'crime_rate_per_1000_people']
target_col = 'offense_category_name'

X_spatial = df[spatial_features].copy()
X_temporal = df[temporal_features].copy()
X_env = df[environmental_features].copy()
y = df[target_col].copy()

X_temporal['hour_sin'] = np.sin(2 * np.pi * X_temporal['hour'] / 24)
X_temporal['hour_cos'] = np.cos(2 * np.pi * X_temporal['hour'] / 24)
X_temporal['dayofweek_sin'] = np.sin(2 * np.pi * X_temporal['dayofweek'] / 7)
X_temporal['dayofweek_cos'] = np.cos(2 * np.pi * X_temporal['dayofweek'] / 7)

# Drop the original 'hour' and 'dayofweek' columns
X_temporal = X_temporal.drop(['hour', 'dayofweek'], axis=1)

# Encode spatial categoricals
X_spatial = pd.get_dummies(X_spatial)

# Encode target
le = LabelEncoder()
y_encoded = le.fit_transform(y)

print("Classes:", list(le.classes_))


X_full = np.hstack([X_spatial.values, X_temporal.values, X_env.values])

# Impute missing values before applying SMOTE
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
X_full_imputed = imputer.fit_transform(X_full)

# Identify and remove classes with only one sample
unique_classes, class_counts = np.unique(y_encoded, return_counts=True)
classes_to_keep = unique_classes[class_counts > 1]

# Filter data to keep only samples from classes with more than one instance
filtered_indices = np.isin(y_encoded, classes_to_keep)
X_filtered = X_full_imputed[filtered_indices]
y_filtered = y_encoded[filtered_indices]

# Apply SMOTE on the filtered data
sm = SMOTE(random_state=42)
X_resampled, y_resampled = sm.fit_resample(X_filtered, y_filtered)
print("After SMOTE:", np.bincount(y_resampled))

# Recover shapes
spatial_dim = X_spatial.shape[1]
temporal_dim = X_temporal.shape[1]
env_dim = X_env.shape[1]

X_spatial_resampled = X_resampled[:, :spatial_dim]
X_temporal_resampled = X_resampled[:, spatial_dim:spatial_dim+temporal_dim]
X_env_resampled = X_resampled[:, spatial_dim+temporal_dim:]


X_s_train, X_s_test, X_t_train, X_t_test, X_e_train, X_e_test, y_train, y_test = train_test_split(
    X_spatial_resampled, X_temporal_resampled, X_env_resampled, y_resampled, test_size=0.2, random_state=42)

X_s_train, X_s_val, X_t_train, X_t_val, X_e_train, X_e_val, y_train, y_val = train_test_split(
    X_s_train, X_t_train, X_e_train, y_train, test_size=0.1, random_state=42)

# Scale environmental features after splitting
scaler = StandardScaler()
X_e_train = scaler.fit_transform(X_e_train)
X_e_val = scaler.transform(X_e_val)
X_e_test = scaler.transform(X_e_test)


n_classes = len(le.classes_)

# Spatial
input_s = Input(shape=(X_s_train.shape[1],), name='spatial_input')
x1 = Dense(256, activation='relu')(input_s)
x1 = Dropout(0.5)(x1)
x1 = Dense(256, activation='relu')(x1)
x1 = Dropout(0.5)(x1)
x1 = Dense(128, activation='relu')(x1)

# Temporal
input_t = Input(shape=(X_t_train.shape[1],), name='temporal_input')
x2 = Dense(256, activation='relu')(input_t)
x2 = Dropout(0.5)(x2)
x2 = Dense(256, activation='relu')(x2)
x2 = Dropout(0.5)(x2)
x2 = Dense(128, activation='relu')(x2)

# Environmental
input_e = Input(shape=(X_e_train.shape[1],), name='environmental_input')
x3 = Dense(256, activation='relu')(input_e)
x3 = Dropout(0.5)(x3)
x3 = Dense(256, activation='relu')(x3)
x3 = Dropout(0.5)(x3)
x3 = Dense(128, activation='relu')(x3)

# Merge
merged = concatenate([x1, x2, x3])
z = Dense(1024, activation='relu')(merged)
z = Dropout(0.5)(z)
z = Dense(1024, activation='relu')(z)
z = Dropout(0.5)(z)
output = Dense(n_classes, activation='softmax')(z)

# Model
model = Model(inputs=[input_s, input_t, input_e], outputs=output)
model.compile(optimizer=Adam(0.001), loss='sparse_categorical_crossentropy', metrics=['accuracy'])
model.summary()

Classes: ['Animal Cruelty', 'Arson', 'Assault Offenses', 'Bribery', 'Burglary/Breaking & Entering', 'Counterfeiting/Forgery', 'Destruction/Damage/Vandalism of Property', 'Drug/Narcotic Offenses', 'Embezzlement', 'Extortion/Blackmail', 'Fraud Offenses', 'Gambling Offenses', 'Homicide Offenses', 'Human Trafficking', 'Kidnapping/Abduction', 'Larceny/Theft Offenses', 'Motor Vehicle Theft', 'Pornography/Obscene Material', 'Prostitution Offenses', 'Robbery', 'Sex Offenses', 'Sex Offenses, Non-forcible', 'Stolen Property Offenses', 'Weapon Law Violations']
After SMOTE: [121477 121477 121477 121477 121477 121477 121477 121477 121477 121477
 121477 121477 121477 121477 121477 121477 121477 121477 121477 121477
 121477 121477 121477 121477]
After SMOTE: [121477 121477 121477 121477 121477 121477 121477 121477 121477 121477
 121477 121477 121477 121477 121477 121477 121477 121477 121477 121477
 121477 121477 121477 121477]


In [5]:
# Define callbacks
checkpoint_callback = ModelCheckpoint(
    filepath='best_model.keras',  # Save in the new Keras format
    save_best_only=True,
    monitor='val_loss',
    verbose=1
)

early_stopping_callback = EarlyStopping(
    monitor='val_loss',
    patience=15,  # Increased patience
    verbose=1,
    restore_best_weights=True
)

# Recompile model with a different optimizer and learning rate
model.compile(optimizer=Adam(learning_rate=0.0005),  # Smaller learning rate
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])

# Train the model with the updated callbacks and optimizer
history = model.fit(
    [X_s_train, X_t_train, X_e_train], y_train,
    validation_data=([X_s_val, X_t_val, X_e_val], y_val),
    epochs=50,  # Shortened epochs
    batch_size=256,
    callbacks=[checkpoint_callback, early_stopping_callback]
)

Epoch 1/50
[1m8198/8200[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 25ms/step - accuracy: 0.0419 - loss: 3.1782
Epoch 1: val_loss improved from inf to 3.17810, saving model to best_model.keras
[1m8200/8200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m222s[0m 26ms/step - accuracy: 0.0419 - loss: 3.1782 - val_accuracy: 0.0423 - val_loss: 3.1781
Epoch 2/50
[1m8198/8200[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 25ms/step - accuracy: 0.0414 - loss: 3.1781
Epoch 2: val_loss improved from 3.17810 to 3.17806, saving model to best_model.keras
[1m8200/8200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m214s[0m 26ms/step - accuracy: 0.0414 - loss: 3.1781 - val_accuracy: 0.0428 - val_loss: 3.1781
Epoch 3/50
[1m8198/8200[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 25ms/step - accuracy: 0.0517 - loss: 3.1397
Epoch 3: val_loss improved from 3.17806 to 1.97910, saving model to best_model.keras
[1m8200/8200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m216s[0

In [3]:
import joblib
joblib.dump(le, 'label_encoder.pkl')

['label_encoder.pkl']

In [4]:
import joblib

# Save the StandardScaler
joblib.dump(scaler, 'scaler.pkl')

['scaler.pkl']

In [5]:
# Persist imputer and spatial dummy columns
import joblib
joblib.dump(imputer, 'imputer.pkl')
ohe_columns = X_spatial.columns.tolist()
joblib.dump(ohe_columns, 'ohe_columns.pkl')

print("Saved imputer.pkl and ohe_columns.pkl")

Saved imputer.pkl and ohe_columns.pkl


In [6]:
# Save model in compatibility format for TensorFlow 2.13.1
import tensorflow as tf

# Use the legacy format for compatibility
model.save('crime_prediction_model.keras', save_format='keras')
print("Model saved in compatible format as crime_prediction_model.keras")

# Also check current versions
print(f"Current TensorFlow version: {tf.__version__}")
print(f"Current numpy version: {np.__version__}")

import sklearn, joblib
print(f"Current scikit-learn version: {sklearn.__version__}")
print(f"Current joblib version: {joblib.__version__}")



Model saved in compatible format as crime_prediction_model.keras
Current TensorFlow version: 2.18.1
Current numpy version: 2.0.1
Current scikit-learn version: 1.6.1
Current joblib version: 1.5.1
