# Model Benchmarks
---
This notebook establishes a baseline by implementing and benchmarking basic models, providing a foundation for further model development and tuning.

In [1]:
# import libraries
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.metrics import mean_squared_error, r2_score,mean_absolute_error
from sklearn.preprocessing import StandardScaler, OrdinalEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from itertools import combinations

import numpy as np
import random
# Set seed
np.random.seed(42)
random.seed(42)

## Load Data

In [2]:
df = pd.read_csv('../data/data.csv')
#df=pd.read_csv('../data/data_engineered.csv')
df.head()

Unnamed: 0,state,year,total_pop,white_pop,black_pop,hispanic_pop,asian_pop,native_pop,islander_pop,multi_race_pop,...,rate:__crimes_against_society,rate:__fraud_and_other_financial_crimes,rate:__property_crime,rate:__violent_crime,youth_not_in_school,youth_in_foster_care,youth_living_in_poverty,total_crime_count,total_crime_rate,log_total_crime_rate
0,AL,2022,4916000,0.645,0.252,0.048,0.014,0.002,0.0,0.039,...,189.36,36.01,253.48,298.96,73000.0,819.25,240000.0,39465.0,777.81,6.657767
1,AL,2021,4900800,0.644,0.255,0.047,0.013,0.004,0.0,0.037,...,128.17,24.48,219.86,218.9,86000.0,819.25,245000.0,29868.0,591.41,6.384199
2,AL,2020,4833950,0.649,0.26,0.0455,0.0135,0.004,0.0,0.028,...,13.0,1.79,12.91,16.67,82000.0,819.25,249833.333333,2233.0,44.37,3.814851
3,AL,2019,4767100,0.654,0.265,0.044,0.014,0.004,0.0,0.019,...,3.74,1.21,6.8,4.32,74000.0,800.0,228000.0,808.0,16.07,2.837323
4,AL,2018,4752600,0.656,0.265,0.043,0.013,0.01,0.0,0.019,...,1.31,0.62,1.47,2.09,86000.0,829.0,255000.0,276.0,5.49,1.870263


In [3]:
df.shape

(312, 34)

### Features and Target Variable

In [4]:
# Define features (X) and target variable (y)

# columns to exclude: population-related and crime-related features
crime_related_columns = [
    'count:__crimes_against_society', 
    'count:__fraud_and_other_financial_crimes', 
    'count:__property_crime', 
    'count:__violent_crime',
    'rate:__crimes_against_society', 
    'rate:__fraud_and_other_financial_crimes', 
    'rate:__property_crime', 
    'rate:__violent_crime',
    'total_crime_count',
    'total_crime_rate',
    'log_total_crime_rate'
]

population_related_columns = [
    'total_pop',
    'white_pop',
    'black_pop',
    'hispanic_pop',
    'asian_pop',
    'native_pop',
    'islander_pop',
    'multi_race_pop','state'
]



#features to keep: exclude population-related and crime-related features
X = df.drop(columns=crime_related_columns + population_related_columns)

# Display the list of features used for modeling
filtered_features = X.columns.tolist()
print(filtered_features)


y = df['log_total_crime_rate']



['year', 'median_income', 'poverty_rate', 'unemployment_rate', 'unemployed_15_weeks', 'labor_force_participation_rate', 'hs_grad_rate', 'bachelors_grad_rate', 'zhvi', 'crude_rate_suicide', 'crude_rate_od', 'youth_not_in_school', 'youth_in_foster_care', 'youth_living_in_poverty']


### Train-Test Split

In [5]:
# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

## Create Models

### Baseline Model:

In [16]:
# Calculate the mean and median baselines for `total_crime_count`
mean_baseline = y.mean()


# Generate predictions using mean and median baselines
y_mean_baseline_pred = [mean_baseline] * len(y)


# Calculate MSE and R² for mean baseline
mse_mean_baseline = mean_squared_error(y, y_mean_baseline_pred)
mae_mean_baseline = mean_absolute_error(y, y_mean_baseline_pred)
r2_mean_baseline = r2_score(y, y_mean_baseline_pred)

print("Baseline Model:", mean_baseline)
print("MSE Baseline Score:", mse_mean_baseline)
print("MAE Baseline Score ", mae_mean_baseline)
print("R² Baseline Score :", r2_mean_baseline)


Baseline Model: 5.996803128350241
MSE Baseline Score: 1.2580082053817105
MAE Baseline Score  0.7864282445041606
R² Baseline Score : 0.0


--------------------------------------

### Ridge Regression Model

In [7]:
# Create Ridge Regression model
model = Pipeline([
   # ('preprocessor', preprocessor),
    ('sc',StandardScaler()),
    ('ridge', Ridge(alpha=1.0))
])
# Train the model
model.fit(X_train, y_train)
# Train the model
model.fit(X_train, y_train)
# Make predictions
y_train_pred = model.predict(X_train)
y_test_pred = model.predict(X_test)
# Evaluate Model
mse_train =mean_squared_error(y_train, y_train_pred)
mae_train=mean_absolute_error(y_train, y_train_pred)
r2_train = r2_score(y_train, y_train_pred)
mse_test = mean_squared_error(y_test, y_test_pred)
mae_test = mean_absolute_error(y_test, y_test_pred)
r2_test = r2_score(y_test, y_test_pred)
# Output scores
print("Ridge Regression Model:")
print("MSE Score (Training):", mse_train)
print("MAE Score (Training):", mae_train)
print("R² Score (Training):", r2_train)
print("MSE Score (Testing):", mse_test)
print("MAE Score (Testing):", mae_test)
print("R² Score (Testing):", r2_test)

Ridge Regression Model:
MSE Score (Training): 0.979583555688467
MAE Score (Training): 0.6761169476110025
R² Score (Training): 0.19271666797949016
MSE Score (Testing): 1.2881751590827175
MAE Score (Testing): 0.849438624241648
R² Score (Testing): 0.08844831548973642


- The testing MSE is slightly higher than the training MSE, indicating that the model generalizes reasonably well, with only a modest increase in error on new data.
- The testing MAE is slightly higher, which means the model’s predictions for unseen data have slightly more error compared to training data, though the difference is not large.
- The R² score on both training and testing data is low, suggesting that the model only captures a small portion of the variance in the data. The positive R² score on the test set (0.08) indicates that the model performs slightly better than a mean baseline, but the low values mean it struggles to explain the patterns in the data.

---------

### Linear Regression Model

In [8]:
model = Pipeline([
    #('preprocessor', preprocessor),
    ('linear', LinearRegression())
])
model.fit(X_train, y_train)
# Make predictions
y_train_pred = model.predict(X_train)
y_test_pred = model.predict(X_test)
# Evaluate Model
mse_train =mean_squared_error(y_train, y_train_pred)
mae_train=mean_absolute_error(y_train, y_train_pred)
r2_train = r2_score(y_train, y_train_pred)
mse_test = mean_squared_error(y_test, y_test_pred)
mae_test = mean_absolute_error(y_test, y_test_pred)
r2_test = r2_score(y_test, y_test_pred)
# Output the R² scores
print("Linear  Regression Model:")
print("MSE Score (Training):", mse_train)
print("MAE Score (Training):", mae_train)
print("R² Score (Training):", r2_train)
print("MSE Score (Testing):", mse_test)
print("MAE Score (Testing):", mae_test)
print("R² Score (Testing):", r2_test)

Linear  Regression Model:
MSE Score (Training): 0.9795457991580537
MAE Score (Training): 0.6760030586939583
R² Score (Training): 0.19274778346473953
MSE Score (Testing): 1.2830250483571015
MAE Score (Testing): 0.8473487971196395
R² Score (Testing): 0.0920926895286619


- The MSE is slightly higher on the test set than on the training set, indicating that while the model performs reasonably on the training data, it has some difficulty generalizing to new data.
- The MAE shows a similar pattern, with a moderate increase on the test set, indicating that predictions for unseen data are somewhat less accurate but still relatively close to the training performance.
- Both R² scores are low, meaning the model only explains 19% of the variance on training data and 9% on testing data. While the positive R² score on the test set indicates the model does marginally better than a mean prediction, the low values suggest that the model is not capturing much of the underlying pattern in the data.


### Test 2

In [10]:
# Define all features
features = [
    'white_pop', 'black_pop', 'hispanic_pop', 'asian_pop',
    'poverty_rate', 'unemployment_rate', 'labor_force_participation_rate',
    'hs_grad_rate', 'bachelors_grad_rate', 'zhvi', 'crude_rate_suicide',
    'crude_rate_od', 'youth_not_in_school', 'youth_in_foster_care',
    'youth_living_in_poverty' 
]
target = 'log_total_crime_rate'

X = df[features]
y = df[target]

In [11]:
# Standardize the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [12]:
# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

In [13]:
# Function to evaluate a Ridge Regression model given a set of features
def evaluate_features(feature_subset):
    X_subset = df[list(feature_subset)]
    X_subset_scaled = scaler.fit_transform(X_subset)
    X_train_subset, X_test_subset, y_train_subset, y_test_subset = train_test_split(X_subset_scaled, y, test_size=0.2, random_state=42)
    
    ridge_model = Ridge(alpha=1.0)
    ridge_model.fit(X_train_subset, y_train_subset)

    y_train_pred = ridge_model.predict(X_train_subset)
    y_test_pred = ridge_model.predict(X_test_subset)

    r2_train = r2_score(y_train_subset, y_train_pred)
    r2_test = r2_score(y_test_subset, y_test_pred)

    return r2_train, r2_test

In [14]:
# Iterate over different combinations of features to find the best set
best_r2_score = -float('inf')
best_feature_set = None

# Iterate through feature subsets (testing all possible subsets would be computationally expensive, so we use subsets of 3-5 features)
for i in range(3, 6):
    for feature_combination in combinations(features, i):
        r2_train, r2_test = evaluate_features(feature_combination)
        if r2_test > best_r2_score:
            best_r2_score = r2_test
            best_feature_set = feature_combination

# Output the best feature set and its R² score
print("Best Feature Set:", best_feature_set)
print("Best R² Score (Testing):", best_r2_score)

Best Feature Set: ('black_pop', 'asian_pop', 'bachelors_grad_rate', 'zhvi', 'youth_in_foster_care')
Best R² Score (Testing): 0.2194374100188795
