In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
import statsmodels.api as sm

# Load the data
master = pd.read_csv('master1.csv')

# Filter relevant crime types
relevant_crime_types = ['HOMICIDE', 'BATTERY', 'ASSAULT', 'ROBBERY', 'CRIMINAL SEXUAL ASSAULT']
master = master[master['Primary Type'].isin(relevant_crime_types)]

# Drop irrelevant columns
columns_to_drop = ['Case Number', 'Time', 'Block', 'IUCR', 'Primary Type', 'Description', 'Location Description', 'Arrest', 'Domestic','Beat','District','Ward','Community Area','FBI Code','X Coordinate',
                'Y Coordinate','Updated On','Latitude','Longitude','Location', 'Holiday Day of Week','precipprob','snowdepth','preciptype', 'windgust','winddir',
                'solarenergy','sunrise', 'sunset','moonphase', 'description', 'icon','stations']
data = master.drop(columns=columns_to_drop)

# Replace NaN values with 0 indicating no holiday and severe risk
data['Holiday'] = data['Holiday'].fillna(0)
data['severerisk'] = data['severerisk'].fillna(0)

# Replace non-NaN values with 1 indicating a holiday
data['Holiday'] = data['Holiday'].apply(lambda x: 1 if x != 0 else 0)

# Filter data for years after 2010
data['Date'] = pd.to_datetime(data['Date'], format='%m/%d/%y')
data = data[data['Date'].dt.year >= 2010]

# Group by date and get crime counts
daily_counts = data.groupby('Date').size().reset_index(name='Crime_Count')

# Merge daily counts with the original DataFrame
merged_data = pd.merge(data, daily_counts, on='Date')

# Drop duplicate rows to keep only one entry per day
final_data = merged_data.drop_duplicates(subset='Date')

# Split conditions column and create dummy variables
final_data['conditions'] = final_data['conditions'].str.split(',')
final_data = final_data.join(final_data['conditions'].str.join('|').str.get_dummies())

# Drop unnecessary columns
final_data.drop(columns=['conditions'], inplace=True)

# Save final data to a CSV file

# Split the data into features (X) and target variable (y)
X = final_data.drop(columns=['Crime_Count', 'ID', 'Date'])
y = final_data['Crime_Count']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = LinearRegression()

# Train the model on the training data
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print("Mean Squared Error:", mse)
print("R^2 Score:", r2)

# Train OLS model
X_with_intercept = sm.add_constant(X_train)  # Add intercept term
sm_model = sm.OLS(y_train, X_with_intercept).fit()

# Access p-values
p_values = sm_model.pvalues
print("P-values:")
print(p_values)

# Print variable names and p-values without scientific notation
for variable, p_value in zip(X.columns, p_values):
    print(f"{variable}: {p_value:.10f}")

  master = pd.read_csv('master1.csv')


Mean Squared Error: 21.922148251273804
R^2 Score: 0.2543093566517385
P-values:
const                              4.104269e-02
FullMoon                           4.583404e-02
Holiday                            3.609208e-01
tempmax                            6.644182e-01
tempmin                            8.609646e-01
temp                               6.197258e-01
feelslikemax                       7.829527e-01
feelslikemin                       3.913791e-01
feelslike                          9.882209e-02
dew                                8.284529e-01
humidity                           9.159977e-01
precip                             1.573460e-02
precipcover                        2.482519e-01
snow                               1.052588e-01
windspeed                          3.640757e-01
sealevelpressure                   1.875899e-01
cloudcover                         4.248900e-02
visibility                         9.513384e-01
solarradiation                     1.144719e-14
uvindex  

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_data['conditions'] = final_data['conditions'].str.split(',')


In [14]:
# Print variable names and p-values less than 0.05
print("Variables with p-values < 0.05:")
for variable, p_value in zip(X.columns, p_values):
    if p_value < 0.05:
        print(f"{variable}: {p_value:.10f}")

Variables with p-values < 0.05:
FullMoon: 0.0410426866
Holiday: 0.0458340438
precipcover: 0.0157345985
visibility: 0.0424889952
uvindex: 0.0000000000
severerisk: 0.0110912312
 Freezing Drizzle/Freezing Rain: 0.0000000000
Overcast: 0.0309655082
Rain: 0.0490352389
