In [83]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, RidgeCV, LassoCV
from sklearn.metrics import mean_squared_error, r2_score
from scipy import stats
import statsmodels.api as sm

In [84]:
master = pd.read_csv('master1.csv')
master.head()

  master = pd.read_csv('master1.csv')


Unnamed: 0,ID,Case Number,Date,Time,Block,IUCR,Primary Type,Description,Location Description,Arrest,...,solarenergy,uvindex,severerisk,sunrise,sunset,moonphase,conditions,description,icon,stations
0,13210072,JG422242,8/11/23,11:00:00 AM,0000X S ALBANY AVE,1790,OFFENSE INVOLVING CHILDREN,CHILD ABDUCTION,RESIDENCE,False,...,5.7,8.0,75.0,2023-08-11T05:54:19,2023-08-11T19:56:35,0.84,"Rain, Partially cloudy",Partly cloudy throughout the day with morning ...,rain,"72534014819,KORD,KMDW,72530094846,F1983,744665..."
1,13278080,JG500620,11/11/23,12:00:00 AM,038XX W MADISON ST,910,MOTOR VEHICLE THEFT,AUTOMOBILE,STREET,False,...,4.6,5.0,10.0,2023-11-11T06:35:05,2023-11-11T16:33:33,0.95,Partially cloudy,Partly cloudy throughout the day.,partly-cloudy-day,"72534014819,KORD,KMDW,72530094846,F1983,744665..."
2,13276884,JG503649,11/10/23,5:30:00 PM,043XX W AUGUSTA BLVD,910,MOTOR VEHICLE THEFT,AUTOMOBILE,STREET,False,...,3.6,5.0,10.0,2023-11-10T06:33:51,2023-11-10T16:34:34,0.91,Partially cloudy,Partly cloudy throughout the day.,partly-cloudy-day,"72534014819,KORD,KMDW,72530094846,F1983,744665..."
3,12990873,JG161829,8/17/19,1:14:00 PM,008XX N KARLOV AVE,1751,OFFENSE INVOLVING CHILDREN,CRIMINAL SEXUAL ABUSE BY FAMILY MEMBER,RESIDENCE,True,...,8.3,3.0,,2019-08-17T06:00:26,2019-08-17T19:48:05,0.57,"Rain, Partially cloudy",Partly cloudy throughout the day with rain in ...,rain,"72534014819,KORD,KMDW,72530094846,74466504838,..."
4,26262,JE366265,9/8/21,4:45:00 PM,047XX W HARRISON ST,110,HOMICIDE,FIRST DEGREE MURDER,CAR WASH,True,...,7.0,6.0,,2021-09-08T06:23:43,2021-09-08T19:11:43,0.06,Partially cloudy,Becoming cloudy in the afternoon.,partly-cloudy-day,"72534014819,C8740,KORD,KMDW,72530094846,F1983,..."


In [85]:
master['Primary Type'].unique()

array(['OFFENSE INVOLVING CHILDREN', 'MOTOR VEHICLE THEFT', 'HOMICIDE',
       'NARCOTICS', 'DECEPTIVE PRACTICE', 'KIDNAPPING', 'BATTERY',
       'ASSAULT', 'PUBLIC PEACE VIOLATION', 'ROBBERY', 'CRIMINAL DAMAGE',
       'THEFT', 'BURGLARY', 'WEAPONS VIOLATION', 'SEX OFFENSE',
       'OTHER OFFENSE', 'CRIMINAL SEXUAL ASSAULT', 'CRIMINAL TRESPASS',
       'INTERFERENCE WITH PUBLIC OFFICER',
       'CONCEALED CARRY LICENSE VIOLATION', 'ARSON', 'STALKING',
       'LIQUOR LAW VIOLATION', 'PROSTITUTION', 'INTIMIDATION', 'GAMBLING',
       'CRIM SEXUAL ASSAULT', 'OBSCENITY', 'RITUALISM',
       'PUBLIC INDECENCY', 'OTHER NARCOTIC VIOLATION',
       'HUMAN TRAFFICKING', 'NON - CRIMINAL', 'NON-CRIMINAL',
       'NON-CRIMINAL (SUBJECT SPECIFIED)'], dtype=object)

In [86]:
# Violent crimes, homicide, batterey, assault, robbery, criminal sexual assault
master = master[master['Primary Type'].isin(['HOMICIDE', 'BATTERY', 'ASSAULT', 'ROBBERY', 'CRIMINAL SEXUAL ASSAULT'])]

In [87]:
# feature changing
columns_drop = ['Case Number', 'Time', 'Block', 'IUCR', 'Primary Type', 'Description', 'Location Description', 'Arrest', 'Domestic','Beat','District','Ward','Community Area','FBI Code','X Coordinate',
                'Y Coordinate','Updated On','Latitude','Longitude','Location', 'Holiday Day of Week','precipprob','snowdepth','preciptype', 'windgust','winddir',
                'solarenergy','sunrise', 'sunset','moonphase', 'description', 'icon','stations']
data = master.drop(columns=columns_drop)
data.head()

Unnamed: 0,ID,Date,FullMoon,Holiday,tempmax,tempmin,temp,feelslikemax,feelslikemin,feelslike,...,precipcover,snow,windspeed,sealevelpressure,cloudcover,visibility,solarradiation,uvindex,severerisk,conditions
4,26262,9/8/21,0,,26.7,15.0,21.0,26.3,15.0,21.0,...,0.0,0.0,30.7,1009.0,32.6,16.0,82.9,6.0,,Partially cloudy
8,13201805,9/6/23,0,,27.1,20.2,23.7,27.5,20.2,23.7,...,29.17,0.0,28.8,1007.8,88.1,15.4,32.5,3.0,30.0,"Rain, Partially cloudy"
9,13201684,9/6/23,0,,27.1,20.2,23.7,27.5,20.2,23.7,...,29.17,0.0,28.8,1007.8,88.1,15.4,32.5,3.0,30.0,"Rain, Partially cloudy"
13,13202316,9/6/23,0,,27.1,20.2,23.7,27.5,20.2,23.7,...,29.17,0.0,28.8,1007.8,88.1,15.4,32.5,3.0,30.0,"Rain, Partially cloudy"
14,13201430,9/6/23,0,,27.1,20.2,23.7,27.5,20.2,23.7,...,29.17,0.0,28.8,1007.8,88.1,15.4,32.5,3.0,30.0,"Rain, Partially cloudy"


In [88]:
# Replace NaN values with 0 indicating no holiday and severe risk
data['Holiday'] = data['Holiday'].fillna(0)
data['severerisk'] = data['severerisk'].fillna(0)

# Replace non-NaN values with 1 indicating a holiday
data.loc[data['Holiday'] != 0, 'Holiday'] = 1

# Optionally, convert the 'Holiday' column to integer type
data['Holiday'] = data['Holiday'].astype(int)

# drop dates before 2010 as weather does not have that data
data['Date'] = pd.to_datetime(data['Date'], format='%m/%d/%y')
data= data[data['Date'].dt.year >= 2010]

In [89]:
#data.to_csv('test.csv', index=False)

In [90]:
daily_counts = data.groupby('Date').size().reset_index(name='Crime_Count')

# Merge daily_counts with the original DataFrame
merged_data = pd.merge(data, daily_counts, on='Date')

# Drop duplicate rows to keep only one entry per day
final_data = merged_data.drop_duplicates(subset='Date')

# Display the final DataFrame
final_data.head()

Unnamed: 0,ID,Date,FullMoon,Holiday,tempmax,tempmin,temp,feelslikemax,feelslikemin,feelslike,...,snow,windspeed,sealevelpressure,cloudcover,visibility,solarradiation,uvindex,severerisk,conditions,Crime_Count
0,26262,2021-09-08,0,0,26.7,15.0,21.0,26.3,15.0,21.0,...,0.0,30.7,1009.0,32.6,16.0,82.9,6.0,0.0,Partially cloudy,10
10,13201805,2023-09-06,0,0,27.1,20.2,23.7,27.5,20.2,23.7,...,0.0,28.8,1007.8,88.1,15.4,32.5,3.0,30.0,"Rain, Partially cloudy",16
26,13039389,2023-04-12,0,0,28.3,15.1,22.1,27.0,15.1,21.9,...,0.0,37.4,1013.8,14.3,16.0,194.3,8.0,10.0,Clear,12
38,13193026,2023-08-30,0,0,21.4,14.5,18.5,21.4,14.5,18.5,...,0.0,27.7,1017.2,49.8,16.0,62.1,4.0,10.0,Partially cloudy,13
51,13273367,2023-11-10,0,0,8.3,2.1,5.6,5.7,-0.1,2.8,...,0.0,23.1,1023.5,50.8,16.0,40.2,5.0,10.0,Partially cloudy,17


In [91]:
#final_data.to_csv('test.csv', index=False)

In [92]:
final_data['conditions'] = final_data['conditions'].str.split(',').apply(lambda x: [c.strip() for c in x])

# Get the set of all unique conditions
unique_conditions = set(condition for sublist in final_data['conditions'] for condition in sublist)

# Create dummy variables for each unique condition
for condition in unique_conditions:
    final_data.loc[:, condition] = final_data['conditions'].apply(lambda x: 1 if condition in x else 0)

# Drop the original 'Conditions' column
final_data.drop(columns=['conditions'], inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_data['conditions'] = final_data['conditions'].str.split(',').apply(lambda x: [c.strip() for c in x])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_data.loc[:, condition] = final_data['conditions'].apply(lambda x: 1 if condition in x else 0)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [93]:
final_data.to_csv('test.csv', index=False)

# Linear Regression

In [94]:
X = final_data.drop(columns=['Crime_Count', 'ID', 'Date'])

# Extract the target variable
y = final_data['Crime_Count']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [95]:
X_train.to_csv('test.csv', index=False)

In [96]:
model = LinearRegression()

# Train the model on the training data
model.fit(X_train, y_train)

In [97]:
model.coef_

array([-0.75412258,  0.33457927,  0.04776313,  0.01857954, -0.13008788,
       -0.0237168 , -0.08073306,  0.26082919,  0.03297904,  0.00654646,
       -0.03846642, -0.00807409, -0.20308776, -0.01104396, -0.01816492,
        0.00943558, -0.00533971,  0.01547728, -0.18739589, -0.08335947,
        0.01145019, -0.84382216, -0.2787239 , -0.14873306, -0.26406881,
        0.38519535,  2.9731204 ])

In [98]:
X_with_intercept = sm.add_constant(X_train)  # Add intercept term
sm_model = sm.OLS(y_train, X_with_intercept).fit()

# Access p-values
p_values = sm_model.pvalues
p_values

const                             3.919691e-02
FullMoon                          4.658717e-02
Holiday                           3.652707e-01
tempmax                           6.615267e-01
tempmin                           8.754946e-01
temp                              6.394888e-01
feelslikemax                      7.641807e-01
feelslikemin                      3.841654e-01
feelslike                         9.355894e-02
dew                               8.596410e-01
humidity                          8.934680e-01
precip                            1.310894e-02
precipcover                       3.167676e-01
snow                              1.025083e-01
windspeed                         3.647905e-01
sealevelpressure                  1.758672e-01
cloudcover                        6.582327e-02
visibility                        9.223093e-01
solarradiation                    1.255327e-14
uvindex                           1.105663e-02
severerisk                        2.015471e-14
Rain         

In [99]:
# Predictions on the testing data
y_pred = model.predict(X_test)

In [100]:
# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print("Mean Squared Error:", mse)
print("R^2 Score:", r2)

Mean Squared Error: 21.902288391023657
R^2 Score: 0.2549848977436512


In [101]:
lasso_cv = LassoCV(cv=1000)
lasso_cv.fit(X_train, y_train)

In [102]:
preds = lasso_cv.predict(X_test)
mean_squared_error(y_test, preds)

21.86900115052326

In [103]:
lasso_cv.coef_

array([-0.        ,  0.        ,  0.        , -0.        ,  0.        ,
        0.0360163 ,  0.        ,  0.0946166 ,  0.        ,  0.01357961,
       -0.02736496, -0.00957127, -0.        , -0.00651399, -0.01347489,
        0.0059287 ,  0.        ,  0.01106318, -0.        , -0.08405413,
        0.        , -0.        , -0.        , -0.        ,  0.        ,
        0.        ,  0.        ])