In [19]:
# import libraries
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler, OrdinalEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from itertools import combinations

In [11]:
df = pd.read_csv('../data/data.csv')
df.head()

Unnamed: 0,state,year,total_pop,white_pop,black_pop,hispanic_pop,asian_pop,native_pop,islander_pop,multi_race_pop,...,crimes_against_society,fraud_and_other_financial_crimes,property_crime,violent_crime,youth_not_in_school,youth_in_foster_care,youth_living_in_poverty,total_crime_count,log_total_crime_count,youth_school_poverty_interaction
0,AL,2022,4916000,0.645,0.252,0.048,0.014,0.002,0.0,0.039,...,4804.0,913.5,6430.5,7584.5,73000.0,819.25,240000.0,19732.5,9.890073,17520000000.0
1,AL,2021,4900800,0.644,0.255,0.047,0.013,0.004,0.0,0.037,...,3236.5,618.0,5552.0,5527.5,86000.0,819.25,245000.0,14934.0,9.611463,21070000000.0
2,AL,2020,4833950,0.649,0.26,0.0455,0.0135,0.004,0.0,0.028,...,327.0,45.0,325.0,419.5,82000.0,819.25,249833.333333,1116.5,7.018849,20486330000.0
3,AL,2019,4767100,0.654,0.265,0.044,0.014,0.004,0.0,0.019,...,94.0,30.5,171.0,108.5,74000.0,800.0,228000.0,404.0,6.003887,16872000000.0
4,AL,2018,4752600,0.656,0.265,0.043,0.013,0.01,0.0,0.019,...,33.0,15.5,37.0,52.5,86000.0,829.0,255000.0,138.0,4.934474,21930000000.0


In [12]:
df.shape

(357, 30)

In [13]:
df.isna().sum()

state                                0
year                                 0
total_pop                            0
white_pop                            0
black_pop                            0
hispanic_pop                         0
asian_pop                            0
native_pop                           0
islander_pop                         0
multi_race_pop                       0
median_income                        0
poverty_rate                         0
unemployment_rate                    0
unemployed_15_weeks                  0
labor_force_participation_rate       0
hs_grad_rate                         0
bachelors_grad_rate                  0
zhvi                                 0
crude_rate_suicide                  12
crude_rate_od                       28
crimes_against_society              45
fraud_and_other_financial_crimes    45
property_crime                      45
violent_crime                       45
youth_not_in_school                  0
youth_in_foster_care     

In [14]:
#filling all null with 0 temporary
df.fillna(0, inplace=True)

In [15]:
df.isna().sum()

state                               0
year                                0
total_pop                           0
white_pop                           0
black_pop                           0
hispanic_pop                        0
asian_pop                           0
native_pop                          0
islander_pop                        0
multi_race_pop                      0
median_income                       0
poverty_rate                        0
unemployment_rate                   0
unemployed_15_weeks                 0
labor_force_participation_rate      0
hs_grad_rate                        0
bachelors_grad_rate                 0
zhvi                                0
crude_rate_suicide                  0
crude_rate_od                       0
crimes_against_society              0
fraud_and_other_financial_crimes    0
property_crime                      0
violent_crime                       0
youth_not_in_school                 0
youth_in_foster_care                0
youth_living

In [24]:
# Define features (X) and target variable (y)
features = [
    'state', 'year', 'total_pop', 'white_pop', 'black_pop', 'hispanic_pop', 'asian_pop',
    'native_pop', 'islander_pop', 'multi_race_pop', 'median_income', 'poverty_rate',
    'unemployment_rate', 'unemployed_15_weeks', 'labor_force_participation_rate',
    'hs_grad_rate', 'bachelors_grad_rate', 'zhvi', 'crude_rate_suicide',
    'crude_rate_od', 'youth_not_in_school', 'youth_in_foster_care',
    'youth_living_in_poverty', 'youth_school_poverty_interaction'
]
target = 'log_total_crime_count'

X = df[features]
y = df[target]

In [25]:
# Preprocessing: Encode year, one-hot encode state, and standardize numeric features
preprocessor = ColumnTransformer(
    transformers=[
        ('onehot', OneHotEncoder(), ['state']),
        ('ordinal', OrdinalEncoder(), ['year']),
        ('num', StandardScaler(), [
            'total_pop', 'white_pop', 'black_pop', 'hispanic_pop', 'asian_pop', 'native_pop', 
            'islander_pop', 'multi_race_pop', 'median_income', 'poverty_rate', 
            'unemployment_rate', 'unemployed_15_weeks', 'labor_force_participation_rate', 
            'hs_grad_rate', 'bachelors_grad_rate', 'zhvi', 'crude_rate_suicide', 
            'crude_rate_od', 'youth_not_in_school', 'youth_in_foster_care', 
            'youth_living_in_poverty', 'youth_school_poverty_interaction'
        ])
    ]
)

In [26]:
# Create Ridge Regression model
model = Pipeline([
    ('preprocessor', preprocessor),
    ('ridge', Ridge(alpha=1.0))
])

In [27]:
# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [28]:
# Train the model
model.fit(X_train, y_train)

In [29]:
# Make predictions
y_train_pred = model.predict(X_train)
y_test_pred = model.predict(X_test)

In [30]:
# Calculate R² scores
r2_train = r2_score(y_train, y_train_pred)
r2_test = r2_score(y_test, y_test_pred)

In [31]:
# Output the R² scores
print("R² Score (Training):", r2_train)
print("R² Score (Testing):", r2_test)

R² Score (Training): 0.6715290742237212
R² Score (Testing): 0.49763273013641085


In [32]:
model = Pipeline([
    ('preprocessor', preprocessor),
    ('linear', LinearRegression())
])

In [33]:
model.fit(X_train, y_train)

In [34]:
y_train_pred = model.predict(X_train)
y_test_pred = model.predict(X_test)

In [35]:
r2_train = r2_score(y_train, y_train_pred)
r2_test = r2_score(y_test, y_test_pred)

print("R² Score (Training):", r2_train)
print("R² Score (Testing):", r2_test)

R² Score (Training): 0.7010730274723567
R² Score (Testing): 0.466820871861632


### Test 2

In [38]:
# Define all features
features = [
    'white_pop', 'black_pop', 'hispanic_pop', 'asian_pop',
    'poverty_rate', 'unemployment_rate', 'labor_force_participation_rate',
    'hs_grad_rate', 'bachelors_grad_rate', 'zhvi', 'crude_rate_suicide',
    'crude_rate_od', 'youth_not_in_school', 'youth_in_foster_care',
    'youth_living_in_poverty', 'youth_school_poverty_interaction'
]
target = 'log_total_crime_count'

X = df[features]
y = df[target]

In [39]:
# Standardize the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [40]:
# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

In [41]:
# Function to evaluate a Ridge Regression model given a set of features
def evaluate_features(feature_subset):
    X_subset = df[list(feature_subset)]
    X_subset_scaled = scaler.fit_transform(X_subset)
    X_train_subset, X_test_subset, y_train_subset, y_test_subset = train_test_split(X_subset_scaled, y, test_size=0.2, random_state=42)
    
    ridge_model = Ridge(alpha=1.0)
    ridge_model.fit(X_train_subset, y_train_subset)

    y_train_pred = ridge_model.predict(X_train_subset)
    y_test_pred = ridge_model.predict(X_test_subset)

    r2_train = r2_score(y_train_subset, y_train_pred)
    r2_test = r2_score(y_test_subset, y_test_pred)

    return r2_train, r2_test

In [42]:
# Iterate over different combinations of features to find the best set
best_r2_score = -float('inf')
best_feature_set = None

# Iterate through feature subsets (testing all possible subsets would be computationally expensive, so we use subsets of 3-5 features)
for i in range(3, 6):
    for feature_combination in combinations(all_features, i):
        r2_train, r2_test = evaluate_features(feature_combination)
        if r2_test > best_r2_score:
            best_r2_score = r2_test
            best_feature_set = feature_combination

# Output the best feature set and its R² score
print("Best Feature Set:", best_feature_set)
print("Best R² Score (Testing):", best_r2_score)

Best Feature Set: ('asian_pop', 'native_pop', 'multi_race_pop', 'median_income', 'zhvi')
Best R² Score (Testing): 0.3460566345487124
