In [1]:
import pandas as pd
import pickle
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler, OrdinalEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import r2_score

In [3]:
# Read the CSV into a DataFrame. The path is relative, so it resolves against
# the directory the notebook kernel is running in.
df = pd.read_csv(filepath_or_buffer='../data/data.csv')

# Preview the first two rows to sanity-check the load
df.head(n=2)

Unnamed: 0,state,year,total_pop,white_pop,black_pop,hispanic_pop,asian_pop,native_pop,islander_pop,multi_race_pop,...,rate:__crimes_against_society,rate:__fraud_and_other_financial_crimes,rate:__property_crime,rate:__violent_crime,youth_not_in_school,youth_in_foster_care,youth_living_in_poverty,total_crime_count,total_crime_rate,log_total_crime_rate
0,AL,2022,4916000,0.645,0.252,0.048,0.014,0.002,0.0,0.039,...,189.36,36.01,253.48,298.96,73000.0,819.25,240000.0,39465.0,777.81,6.657767
1,AL,2021,4900800,0.644,0.255,0.047,0.013,0.004,0.0,0.037,...,128.17,24.48,219.86,218.9,86000.0,819.25,245000.0,29868.0,591.41,6.384199


In [4]:
# Print a structural summary of the frame: index range, column names,
# non-null counts and dtypes
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 312 entries, 0 to 311
Data columns (total 34 columns):
 #   Column                                    Non-Null Count  Dtype  
---  ------                                    --------------  -----  
 0   state                                     312 non-null    object 
 1   year                                      312 non-null    int64  
 2   total_pop                                 312 non-null    int64  
 3   white_pop                                 312 non-null    float64
 4   black_pop                                 312 non-null    float64
 5   hispanic_pop                              312 non-null    float64
 6   asian_pop                                 312 non-null    float64
 7   native_pop                                312 non-null    float64
 8   islander_pop                              312 non-null    float64
 9   multi_race_pop                            312 non-null    float64
 10  median_income                         

In [48]:
# Count rows per state. In the recorded output most states have 7 rows, but
# several have 5 or fewer (down to 2), so coverage is uneven across states.
df['state'].value_counts()

state
AL    7
SC    7
NE    7
NH    7
ND    7
OH    7
OK    7
OR    7
PA    7
RI    7
SD    7
MI    7
TN    7
TX    7
UT    7
VT    7
VA    7
WA    7
WV    7
WI    7
MT    7
MO    7
MA    7
KY    7
AZ    7
AR    7
CO    7
CT    7
DE    7
DC    7
ID    7
IL    7
IN    7
KS    7
IA    7
LA    7
ME    7
MN    5
NM    5
HI    5
GA    5
MD    5
MS    5
NC    4
NV    4
WY    4
NY    3
NJ    2
FL    2
CA    2
AK    2
Name: count, dtype: int64

In [40]:
# Model inputs. Crime-related columns and racial composition columns are
# deliberately left out of the feature set.
features = [
    'state',
    'year',
    'total_pop',
    'median_income',
    'poverty_rate',
    'unemployment_rate',
    'labor_force_participation_rate',
    'zhvi',
    'crude_rate_suicide',
    'crude_rate_od',
    'youth_not_in_school',
    'youth_in_foster_care',
    'youth_living_in_poverty',
]

# Column the model is trained to predict
target = 'log_total_crime_rate'

# Feature matrix (DataFrame) and target vector (Series)
X = df.loc[:, features]
y = df.loc[:, target]

In [41]:
# Preprocessing: one-hot encode state, ordinal-encode year, and standardize
# the numeric features. Columns not listed in any transformer are dropped
# (ColumnTransformer's default remainder behaviour).
preprocessor = ColumnTransformer(
    transformers=[
        # handle_unknown='ignore' encodes a state that was not present in the
        # rows used for fitting as all zeros instead of raising at
        # transform/predict time. Some states have only a few rows, so a
        # different split seed or test size can leave one entirely outside
        # the training set. Output for states seen during fit is unchanged.
        ('onehot', OneHotEncoder(handle_unknown='ignore'), ['state']),
        # NOTE(review): OrdinalEncoder maps the years seen during fit to
        # 0..n-1 and, with its default settings, raises on a year it has not
        # seen, so the fitted model cannot score other years as-is.
        # Confirm this is intended.
        ('ordinal', OrdinalEncoder(), ['year']),
        # Zero-mean / unit-variance scaling for the continuous columns
        ('num', StandardScaler(), [
            'total_pop', 'median_income', 'poverty_rate',
            'unemployment_rate', 'labor_force_participation_rate',
            'zhvi', 'crude_rate_suicide',
            'crude_rate_od', 'youth_not_in_school', 'youth_in_foster_care',
            'youth_living_in_poverty'
        ])
    ]
)

In [42]:
# Assemble the full pipeline: the preprocessing step feeds a linear regression
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('linear', LinearRegression()),
])

# Hold out 20% of the rows for testing; the fixed seed keeps the split
# reproducible across runs
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [43]:
# Train the model: fits the preprocessing step and then the linear regression,
# using the training split only
model.fit(X_train, y_train)

In [44]:
# Predict on both splits with the fitted pipeline (training first, then test)
y_train_pred, y_test_pred = map(model.predict, (X_train, X_test))

In [45]:
# Coefficient of determination on each split, comparing the true target
# values against the pipeline's predictions
r2_train = r2_score(y_true=y_train, y_pred=y_train_pred)
r2_test = r2_score(y_true=y_test, y_pred=y_test_pred)

In [46]:
# Output the R2 scores for both splits. A training score well above the
# testing score means the model fits the training rows better than unseen ones.
print("R2 Score (Training):", r2_train)
print("R2 Score (Testing):", r2_test)

R2 Score (Training): 0.7497131794728654
R2 Score (Testing): 0.548296990970565


In [47]:
# Persist the fitted pipeline (preprocessing + regression) to disk with pickle.
# The file is written to the current working directory. Only unpickle files
# from a trusted source, since loading a pickle can execute arbitrary code.
with open('linear_regression_model.pkl', 'wb') as model_file:
    pickle.Pickler(model_file).dump(model)