In [8]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import time
import random
import pickle
# Seed both global random number generators (the standard-library `random`
# module and NumPy's legacy global state) with one shared value so that
# repeated runs draw the same random numbers.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
from sklearn.model_selection import train_test_split,cross_val_score,GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.multioutput import MultiOutputRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score


In [3]:
# Load the dataset into a DataFrame
data = pd.read_csv('../data/data.csv')

# Only the final expression of a cell is rendered automatically, so a bare
# `data.head()` in the middle of the cell was silently discarded. Render the
# preview explicitly with display() (provided by the IPython kernel) and keep
# the shape as the cell's output value.
display(data.head())
data.shape

(312, 34)

In [4]:
# Split the loaded frame into the regression target (y) and the predictors (X).

# Crime measures: per-category counts and rates plus the totals. The last
# entry is the target itself, so all of these must stay out of the predictors.
crime_related_columns = [
    'count:__crimes_against_society',
    'count:__fraud_and_other_financial_crimes',
    'count:__property_crime',
    'count:__violent_crime',
    'rate:__crimes_against_society',
    'rate:__fraud_and_other_financial_crimes',
    'rate:__property_crime',
    'rate:__violent_crime',
    'total_crime_count',
    'total_crime_rate',
    'log_total_crime_rate',
]

# Population counts. The 'state' identifier is not a population figure, but it
# is dropped together with this group.
population_related_columns = [
    'total_pop',
    'white_pop',
    'black_pop',
    'hispanic_pop',
    'asian_pop',
    'native_pop',
    'islander_pop',
    'multi_race_pop',
    'state',
]

# Target: the log of the total crime rate.
y = data['log_total_crime_rate']

# Predictors: every column that is in neither of the two groups above.
columns_to_drop = [*crime_related_columns, *population_related_columns]
X = data.drop(columns=columns_to_drop)

In [12]:
# Preview the first five rows of the feature matrix.
X.head(n=5)

Unnamed: 0,year,median_income,poverty_rate,unemployment_rate,unemployed_15_weeks,labor_force_participation_rate,hs_grad_rate,bachelors_grad_rate,zhvi,crude_rate_suicide,crude_rate_od,youth_not_in_school,youth_in_foster_care,youth_living_in_poverty
0,2022,59910,0.136,0.025,0.012,0.57,0.888,0.288,217335.1989,11.9,12.5,73000.0,819.25,240000.0
1,2021,56930,0.159,0.034,0.02,0.566,0.879,0.274,193148.7979,14.2,14.6,86000.0,819.25,245000.0
2,2020,54690,0.149,0.064,0.014,0.572,0.88,0.278,169855.5743,15.2,12.8,82000.0,819.25,249833.333333
3,2019,56200,0.129,0.032,0.014,0.577,0.871,0.263,157202.1806,14.8,8.3,74000.0,800.0,228000.0
4,2018,49940,0.16,0.039,0.016,0.573,0.866,0.255,148927.5094,17.0,6.7,86000.0,829.0,255000.0


In [5]:
# Hold out 20% of the rows for evaluation. The fixed random_state keeps the
# partition identical from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Report the size of each partition as a quick sanity check on the split.
print(f'X_train shape: {X_train.shape}')
print(f'X_test shape: {X_test.shape}')
print(f'y_train shape: {y_train.shape}')
print(f'y_test shape: {y_test.shape}')

X_train shape: (249, 14)
X_test shape: (63, 14)
y_train shape: (249,)
y_test shape: (63,)


In [6]:
# Reference: https://scikit-learn.org/1.5/auto_examples/ensemble/plot_gradient_boosting_early_stopping.html
# NOTE(review): the linked example enables early stopping through
# `n_iter_no_change` / `validation_fraction`. This model sets neither, so
# early stopping is off and all 1000 boosting stages are fitted. The recorded
# scores (training R2 1.0 vs. testing R2 0.56) point to overfitting --
# consider enabling early stopping and re-evaluating.


def _regression_scores(y_true, y_pred):
    """Return ``(mse, mae, r2)`` for one set of predictions.

    Parameters
    ----------
    y_true : array-like
        Observed target values.
    y_pred : array-like
        Model predictions aligned with ``y_true``.

    Returns
    -------
    tuple
        Mean squared error, mean absolute error and R^2, in that order.
    """
    return (
        mean_squared_error(y_true, y_pred),
        mean_absolute_error(y_true, y_pred),
        r2_score(y_true, y_pred),
    )


# Pipeline setup: a single step wrapping the gradient boosting regressor.
gb_pipeline = Pipeline(steps=[
    ('gb', GradientBoostingRegressor(n_estimators=1000, max_depth=5, learning_rate=0.1, random_state=42))
])

# Fit the pipeline to the training data only; the test split is used purely
# for scoring below.
gb_pipeline.fit(X_train, y_train)

# Predictions for both splits
y_train_pred = gb_pipeline.predict(X_train)
y_test_pred = gb_pipeline.predict(X_test)

# Calculate training and testing scores with the same helper so both rows of
# the summary are computed identically.
train_mse, train_mae, train_r2 = _regression_scores(y_train, y_train_pred)
test_mse, test_mae, test_r2 = _regression_scores(y_test, y_test_pred)

# Scores are rounded to two decimals for display only; the unrounded values
# stay available in the variables above.
results = {
    "Dataset": ["Training", "Testing"],
    "MSE": [round(train_mse, 2), round(test_mse, 2)],
    "MAE": [round(train_mae, 2), round(test_mae, 2)],
    "R2": [round(train_r2, 2), round(test_r2, 2)]
}

# Create a DataFrame summarising both splits and show it as the cell output
results_df = pd.DataFrame(results)
results_df

Unnamed: 0,Dataset,MSE,MAE,R2
0,Training,0.0,0.0,1.0
1,Testing,0.61,0.53,0.56


In [11]:
# Serialise the fitted pipeline with pickle. The relative path means the file
# is written to the kernel's current working directory.
# Only unpickle this file from a trusted location: loading a pickle can
# execute arbitrary code.
with open('gradient_boosting_model.pkl', 'wb') as pkl_out:
    pickle.Pickler(pkl_out).dump(gb_pipeline)