In [38]:
# import libraries

import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler

In [18]:
df = pd.read_csv('../data/data.csv')
df.head()

Unnamed: 0,state,year,total_pop,white_pop,black_pop,hispanic_pop,asian_pop,native_pop,islander_pop,multi_race_pop,...,crimes_against_society,fraud_and_other_financial_crimes,property_crime,violent_crime,youth_not_in_school,youth_in_foster_care,youth_living_in_poverty,total_crime_count,log_total_crime_count,youth_school_poverty_interaction
0,AL,2022,4916000,0.645,0.252,0.048,0.014,0.002,0.0,0.039,...,4804.0,913.5,6430.5,7584.5,73000.0,819.25,240000.0,19732.5,9.890073,17520000000.0
1,AL,2021,4900800,0.644,0.255,0.047,0.013,0.004,0.0,0.037,...,3236.5,618.0,5552.0,5527.5,86000.0,819.25,245000.0,14934.0,9.611463,21070000000.0
2,AL,2020,4833950,0.649,0.26,0.0455,0.0135,0.004,0.0,0.028,...,327.0,45.0,325.0,419.5,82000.0,819.25,249833.333333,1116.5,7.018849,20486330000.0
3,AL,2019,4767100,0.654,0.265,0.044,0.014,0.004,0.0,0.019,...,94.0,30.5,171.0,108.5,74000.0,800.0,228000.0,404.0,6.003887,16872000000.0
4,AL,2018,4752600,0.656,0.265,0.043,0.013,0.01,0.0,0.019,...,33.0,15.5,37.0,52.5,86000.0,829.0,255000.0,138.0,4.934474,21930000000.0


In [19]:
df.shape

(357, 30)

In [20]:
df.isna().sum()

state                                0
year                                 0
total_pop                            0
white_pop                            0
black_pop                            0
hispanic_pop                         0
asian_pop                            0
native_pop                           0
islander_pop                         0
multi_race_pop                       0
median_income                        0
poverty_rate                         0
unemployment_rate                    0
unemployed_15_weeks                  0
labor_force_participation_rate       0
hs_grad_rate                         0
bachelors_grad_rate                  0
zhvi                                 0
crude_rate_suicide                  12
crude_rate_od                       28
crimes_against_society              45
fraud_and_other_financial_crimes    45
property_crime                      45
violent_crime                       45
youth_not_in_school                  0
youth_in_foster_care     

In [60]:
#filling all null with 0 temporary
df.fillna(0, inplace=True)

In [71]:
df.isna().sum()

state                               0
year                                0
total_pop                           0
white_pop                           0
black_pop                           0
hispanic_pop                        0
asian_pop                           0
native_pop                          0
islander_pop                        0
multi_race_pop                      0
median_income                       0
poverty_rate                        0
unemployment_rate                   0
unemployed_15_weeks                 0
labor_force_participation_rate      0
hs_grad_rate                        0
bachelors_grad_rate                 0
zhvi                                0
crude_rate_suicide                  0
crude_rate_od                       0
crimes_against_society              0
fraud_and_other_financial_crimes    0
property_crime                      0
violent_crime                       0
youth_not_in_school                 0
youth_in_foster_care                0
youth_living

In [61]:
# Define features (X) and target variable (y)
features = [
    'total_pop', 'white_pop', 'black_pop', 'hispanic_pop', 'asian_pop',
    'native_pop', 'youth_not_in_school', 'youth_in_foster_care', 'youth_living_in_poverty'
]
target = 'log_total_crime_count'

X = df[features]
y = df[target]

In [62]:
# Standardize the features to address scaling and stability issues
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [63]:
# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

In [64]:
# Baseline Model: Predicting the Mean of the Target Variable
mean_y_train = y_train.mean()

# Make predictions using the mean value for both training and testing sets
y_train_pred_baseline = [mean_y_train] * len(y_train)
y_test_pred_baseline = [mean_y_train] * len(y_test)

In [66]:
# Linear Regression
linear_model = LinearRegression()
linear_model.fit(X_train, y_train)

In [67]:
# Predict on training and test sets for Linear Regression
y_train_pred_linear = linear_model.predict(X_train)
y_test_pred_linear = linear_model.predict(X_test)

# Evaluation for Linear Regression
r2_train_linear = r2_score(y_train, y_train_pred_linear)
r2_test_linear = r2_score(y_test, y_test_pred_linear)

In [68]:
# Ridge Regression
ridge_model = Ridge()

# Define the grid of alpha values to test
param_grid = {'alpha': [0.01, 0.1, 1, 10, 100, 1000]}

# Setup GridSearchCV
grid_search = GridSearchCV(ridge_model, param_grid=param_grid, cv=5, n_jobs=-1)
grid_search.fit(X_train, y_train)

In [69]:
# Get the best alpha value
best_alpha = grid_search.best_params_['alpha']
print(f"Best alpha from GridSearchCV: {best_alpha}")

# Evaluate the model with the best hyperparameter
best_ridge_model = grid_search.best_estimator_
y_train_pred = best_ridge_model.predict(X_train)
y_test_pred = best_ridge_model.predict(X_test)

r2_train = r2_score(y_train, y_train_pred)
r2_test = r2_score(y_test, y_test_pred)

Best alpha from GridSearchCV: 0.1


In [70]:
# Output the results
print("Linear Regression:")
print(f"R² Score (Training): {r2_train_linear}")
print(f"R² Score (Testing): {r2_test_linear}\n")

print("Ridge Regression:")
print(f"R² Score (Training): {r2_train}")
print(f"R² Score (Testing): {r2_test}")

Linear Regression:
R² Score (Training): 0.27934608938971395
R² Score (Testing): 0.27079436682565505

Ridge Regression:
R² Score (Training): 0.2790231964336568
R² Score (Testing): 0.27128549685876513
