In [57]:
# import libraries
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.metrics import mean_squared_error, r2_score,mean_absolute_error
from sklearn.preprocessing import StandardScaler, OrdinalEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from itertools import combinations

### Load Data

In [59]:
#df = pd.read_csv('../data/data.csv')
df=pd.read_csv('../data/data_engineered.csv')
df.head()

Unnamed: 0,state,year,total_pop,white_pop,black_pop,hispanic_pop,asian_pop,native_pop,islander_pop,multi_race_pop,...,total_crime_count,total_crime_rate,log_total_crime_rate,youth_school_poverty_interaction,poverty_income_interaction,unemployment_youth_interaction,poverty_rate_squared,median_income_squared,poverty_to_income_ratio,youth_to_total_pop_ratio
0,AL,2022,4916000,0.645,0.252,0.048,0.014,0.002,0.0,0.039,...,39465.0,777.81,6.657767,9928.0,8147.76,1825.0,0.018496,3589208100,2e-06,0.014849
1,AL,2021,4900800,0.644,0.255,0.047,0.013,0.004,0.0,0.037,...,29868.0,591.41,6.384199,13674.0,9051.87,2924.0,0.025281,3241024900,3e-06,0.017548
2,AL,2020,4833950,0.649,0.26,0.0455,0.0135,0.004,0.0,0.028,...,2233.0,44.37,3.814851,12218.0,8148.81,5248.0,0.022201,2990996100,3e-06,0.016963
3,AL,2019,4767100,0.654,0.265,0.044,0.014,0.004,0.0,0.019,...,808.0,16.07,2.837323,9546.0,7249.8,2368.0,0.016641,3158440000,2e-06,0.015523
4,AL,2018,4752600,0.656,0.265,0.043,0.013,0.01,0.0,0.019,...,276.0,5.49,1.870263,13760.0,7990.4,3354.0,0.0256,2494003600,3e-06,0.018095


In [60]:
df.shape

(312, 41)

In [61]:
df.isna().sum()

state                                       0
year                                        0
total_pop                                   0
white_pop                                   0
black_pop                                   0
hispanic_pop                                0
asian_pop                                   0
native_pop                                  0
islander_pop                                0
multi_race_pop                              0
median_income                               0
poverty_rate                                0
unemployment_rate                           0
unemployed_15_weeks                         0
labor_force_participation_rate              0
hs_grad_rate                                0
bachelors_grad_rate                         0
zhvi                                        0
crude_rate_suicide                          0
crude_rate_od                               0
count:__crimes_against_society              0
count:__fraud_and_other_financial_

In [62]:
df.columns

Index(['state', 'year', 'total_pop', 'white_pop', 'black_pop', 'hispanic_pop',
       'asian_pop', 'native_pop', 'islander_pop', 'multi_race_pop',
       'median_income', 'poverty_rate', 'unemployment_rate',
       'unemployed_15_weeks', 'labor_force_participation_rate', 'hs_grad_rate',
       'bachelors_grad_rate', 'zhvi', 'crude_rate_suicide', 'crude_rate_od',
       'count:__crimes_against_society',
       'count:__fraud_and_other_financial_crimes', 'count:__property_crime',
       'count:__violent_crime', 'rate:__crimes_against_society',
       'rate:__fraud_and_other_financial_crimes', 'rate:__property_crime',
       'rate:__violent_crime', 'youth_not_in_school', 'youth_in_foster_care',
       'youth_living_in_poverty', 'total_crime_count', 'total_crime_rate',
       'log_total_crime_rate', 'youth_school_poverty_interaction',
       'poverty_income_interaction', 'unemployment_youth_interaction',
       'poverty_rate_squared', 'median_income_squared',
       'poverty_to_income_rati

### Features and Target Variable

In [64]:
# Define features (X) and target variable (y)

# columns to exclude: population-related and crime-related features
crime_related_columns = [
    'count:__crimes_against_society', 
    'count:__fraud_and_other_financial_crimes', 
    'count:__property_crime', 
    'count:__violent_crime',
    'rate:__crimes_against_society', 
    'rate:__fraud_and_other_financial_crimes', 
    'rate:__property_crime', 
    'rate:__violent_crime',
    'total_crime_count',
    'total_crime_rate',
    'log_total_crime_rate'
]

population_related_columns = [
    'total_pop',
    'white_pop',
    'black_pop',
    'hispanic_pop',
    'asian_pop',
    'native_pop',
    'islander_pop',
    'multi_race_pop','state'
]



#features to keep: exclude population-related and crime-related features
X = df.drop(columns=crime_related_columns + population_related_columns)

# Display the list of features used for modeling
filtered_features = X.columns.tolist()
print(filtered_features)


y = df['log_total_crime_rate']



['year', 'median_income', 'poverty_rate', 'unemployment_rate', 'unemployed_15_weeks', 'labor_force_participation_rate', 'hs_grad_rate', 'bachelors_grad_rate', 'zhvi', 'crude_rate_suicide', 'crude_rate_od', 'youth_not_in_school', 'youth_in_foster_care', 'youth_living_in_poverty', 'youth_school_poverty_interaction', 'poverty_income_interaction', 'unemployment_youth_interaction', 'poverty_rate_squared', 'median_income_squared', 'poverty_to_income_ratio', 'youth_to_total_pop_ratio']


### Train-Test Split

In [66]:
# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

## Baseline Model:

In [68]:
# Calculate the mean and median baselines for `total_crime_count`
mean_baseline = y.mean()


# Generate predictions using mean and median baselines
y_mean_baseline_pred = [mean_baseline] * len(y)


# Calculate MSE and R² for mean baseline
mse_mean_baseline = mean_squared_error(y, y_mean_baseline_pred)
mae_mean_baseline = mean_squared_error(y, y_mean_baseline_pred)
r2_mean_baseline = r2_score(y, y_mean_baseline_pred)


(mse_mean_baseline, mae_mean_baseline,r2_mean_baseline)

(1.2580082053817108, 1.2580082053817108, 0.0)

### Ridge Regression Model

In [70]:
# Create Ridge Regression model
model = Pipeline([
   # ('preprocessor', preprocessor),
    ('sc',StandardScaler()),
    ('ridge', Ridge(alpha=1.0))
])
# Train the model
model.fit(X_train, y_train)
# Train the model
model.fit(X_train, y_train)
# Make predictions
y_train_pred = model.predict(X_train)
y_test_pred = model.predict(X_test)
# Evaluate Model
mse_train =mean_squared_error(y_train, y_train_pred)
mae_train=mean_absolute_error(y_train, y_train_pred)
r2_train = r2_score(y_train, y_train_pred)
mse_test = mean_squared_error(y_test, y_test_pred)
mae_test = mean_absolute_error(y_test, y_test_pred)
r2_test = r2_score(y_test, y_test_pred)
# Output scores
print("Ridge Regression Model")
print("MSE Score (Training):", mse_train)
print("MAE Score (Training):", mae_train)
print("R² Score (Training):", r2_train)
print("MSE Score (Testing):", mse_test)
print("MAE Score (Testing):", mae_test)
print("R² Score (Testing):", r2_test)

Ridge Regression Model
MSE Score (Training): 0.9251439258908855
MAE Score (Training): 0.6463641563309233
R² Score (Training): 0.23758084059829976
MSE Score (Testing): 1.7756197044340927
MAE Score (Testing): 0.9373586653868435
R² Score (Testing): -0.25648218040399273


### Linear Regression Model

In [73]:
model = Pipeline([
    #('preprocessor', preprocessor),
    ('linear', LinearRegression())
])
model.fit(X_train, y_train)
# Make predictions
y_train_pred = model.predict(X_train)
y_test_pred = model.predict(X_test)
# Evaluate Model
mse_train =mean_squared_error(y_train, y_train_pred)
mae_train=mean_absolute_error(y_train, y_train_pred)
r2_train = r2_score(y_train, y_train_pred)
mse_test = mean_squared_error(y_test, y_test_pred)
mae_test = mean_absolute_error(y_test, y_test_pred)
r2_test = r2_score(y_test, y_test_pred)
# Output the R² scores
print("Linear  Regression Model")
print("MSE Score (Training):", mse_train)
print("MAE Score (Training):", mae_train)
print("R² Score (Training):", r2_train)
print("MSE Score (Testing):", mse_test)
print("MAE Score (Testing):", mae_test)
print("R² Score (Testing):", r2_test)

Linear  Regression Model
MSE Score (Training): 0.9231307883692299
MAE Score (Training): 0.6445172990972622
R² Score (Training): 0.23923988474707114
MSE Score (Testing): 1.8085450355594965
MAE Score (Testing): 0.9387987605777858
R² Score (Testing): -0.27978114005152377


### Test 2

In [76]:
# Define all features
features = [
    'white_pop', 'black_pop', 'hispanic_pop', 'asian_pop',
    'poverty_rate', 'unemployment_rate', 'labor_force_participation_rate',
    'hs_grad_rate', 'bachelors_grad_rate', 'zhvi', 'crude_rate_suicide',
    'crude_rate_od', 'youth_not_in_school', 'youth_in_foster_care',
    'youth_living_in_poverty' 
]
target = 'total_crime_rate'

X = df[features]
y = df[target]

In [77]:
# Standardize the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)


In [78]:
# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

In [79]:
# Function to evaluate a Ridge Regression model given a set of features
def evaluate_features(feature_subset):
    X_subset = df[list(feature_subset)]
    X_subset_scaled = scaler.fit_transform(X_subset)
    X_train_subset, X_test_subset, y_train_subset, y_test_subset = train_test_split(X_subset_scaled, y, test_size=0.2, random_state=42)
    
    ridge_model = Ridge(alpha=1.0)
    ridge_model.fit(X_train_subset, y_train_subset)

    y_train_pred = ridge_model.predict(X_train_subset)
    y_test_pred = ridge_model.predict(X_test_subset)

    r2_train = r2_score(y_train_subset, y_train_pred)
    r2_test = r2_score(y_test_subset, y_test_pred)

    return r2_train, r2_test

In [80]:
# Iterate over different combinations of features to find the best set
best_r2_score = -float('inf')
best_feature_set = None

# Iterate through feature subsets (testing all possible subsets would be computationally expensive, so we use subsets of 3-5 features)
for i in range(3, 6):
    for feature_combination in combinations(all_features, i):
        r2_train, r2_test = evaluate_features(feature_combination)
        if r2_test > best_r2_score:
            best_r2_score = r2_test
            best_feature_set = feature_combination

# Output the best feature set and its R² score
print("Best Feature Set:", best_feature_set)
print("Best R² Score (Testing):", best_r2_score)

NameError: name 'all_features' is not defined