In [42]:
import pandas as pd
import datetime as dt

#pipeline
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
#model
from sklearn.ensemble import RandomForestClassifier

In [3]:
df = pd.read_csv('Crime_Data_from_2020_to_Present.csv')
pd.options.display.max_columns = None

In [28]:
eda = df.copy()

#dataset with only stolen car rows for EDA
eda = eda[eda['Crm Cd']==510]


premis_codes = [101,108,104,707,123]



In [15]:
csvDF = df[['Crm Cd','Crm Cd Desc']].drop_duplicates().sort_values('Crm Cd')

csvDF.to_csv('crime_codes.csv', index=False)


In [29]:
import folium
from folium.plugins import MarkerCluster, HeatMap

# center map on median location
coords = eda[['LAT', 'LON']].dropna()
center = [coords['LAT'].median(), coords['LON'].median()]

m = folium.Map(location=center, zoom_start=11, tiles='CartoDB positron')

# heatmap for overall density (scales well)
HeatMap(data=coords.values.tolist(), radius=8, blur=15, max_zoom=12).add_to(m)

# marker cluster for a sample (avoid plotting too many individual markers)
max_markers = 5000
sample = eda.sample(n=max_markers, random_state=42) if len(eda) > max_markers else eda
area_counts = eda.groupby('AREA').size().reset_index(name='count').sort_values('count', ascending=False)
area_counts
cluster = MarkerCluster()
for _, row in sample.iterrows():
    if pd.isna(row['LAT']) or pd.isna(row['LON']):
        continue
    popup = f"Desc: {row.get('Crm Cd Desc','')}<br>Date: {row.get('DATE OCC','')}"
    folium.Marker(location=[row['LAT'], row['LON']], popup=popup).add_to(cluster)

m.add_child(cluster)

m

In [45]:
#df filtered by top 5 presmisies for our question
premis_filtered = df[df['Premis Cd'].apply(lambda x: x in premis_codes)]
premis_filtered = pd.DataFrame(premis_filtered)

#### FEATURE ENGINEERING

In [74]:
premis_filtered.drop(columns=['crime_rate'],inplace=True)

KeyError: "['crime_rate'] not found in axis"

In [75]:
premis_filtered['DATE OCC'] = pd.to_datetime(premis_filtered['DATE OCC'], errors='coerce')
premis_filtered['year'] = premis_filtered['DATE OCC'].dt.year
premis_filtered['Month'] = premis_filtered['DATE OCC'].dt.month
premis_filtered['DayOfWeek'] = premis_filtered['DATE OCC'].dt.dayofweek
premis_filtered['hour'] = premis_filtered['TIME OCC'].apply(lambda x: int(str(x).zfill(4)[:2]) if pd.notna(x) else 0)
premis_filtered['IsWeekend'] = premis_filtered['DayOfWeek'].apply(lambda x: 1 if x >=5 else 0)
premis_filtered['IsNight'] = premis_filtered['hour'].apply(lambda x: 1 if x>=20 or x<6 else 0)



premis_filtered['is_stolen'] = premis_filtered['Crm Cd'].apply(lambda x: 1 if x==510 else 0)
premis_filtered

#AVG DAILY CRIME RATE PER AREA
daily = premis_filtered.groupby(['AREA','DATE OCC']).size().reset_index(name='daily_counts')
daily
avg_daily_crime = daily.groupby('AREA')['daily_counts'].mean().reset_index(name='crime_rate')
avg_daily_crime

premis_filtered = premis_filtered.merge(avg_daily_crime, on='AREA', how='inner')

In [None]:
premis_filtered['crime_rate_x'].

array([ 10,   8,  14,   9,  11,   7,  15,  16,  26,   5,  12,  19,  20,
         6,  17,   4,  13,  21,  18,  24,  23,   3,  22,   2,  33,  27,
        28,  30,  25,   1,  31, 147,  70,  35,  40,  29,  32,  37,  36,
        34])

In [50]:
extra = pd.DataFrame(premis_filtered['Premis Desc'].groupby(premis_filtered['AREA NAME']).value_counts())
extra.to_csv('premis_area_counts.csv')

In [80]:
x = len(premis_filtered[premis_filtered['Crm Cd']==510])
y = len(premis_filtered)

ratio = x/y
ratio

premis_filtered

Unnamed: 0,DR_NO,Date Rptd,DATE OCC,TIME OCC,AREA,AREA NAME,Rpt Dist No,Part 1-2,Crm Cd,Crm Cd Desc,Mocodes,Vict Age,Vict Sex,Vict Descent,Premis Cd,Premis Desc,Weapon Used Cd,Weapon Desc,Status,Status Desc,Crm Cd 1,Crm Cd 2,Crm Cd 3,Crm Cd 4,LOCATION,Cross Street,LAT,LON,year,Month,DayOfWeek,hour,IsWeekend,is_stolen,IsNight,month,crime_rate
0,210704711,12/24/2020 12:00:00 AM,2020-12-24,1310,7,Wilshire,782,1,331,THEFT FROM MOTOR VEHICLE - GRAND ($950.01 AND ...,0344,47,F,A,101.0,STREET,,,IC,Invest Cont,331.0,,,,6000 COMEY AV,,34.0339,-118.3747,2020,12,3,13,0,0,0,2020-12,8.652533
1,202113531,09/06/2020 12:00:00 AM,2020-09-05,1500,21,Topanga,2149,1,510,VEHICLE - STOLEN,,0,,,108.0,PARKING LOT,,,AA,Adult Arrest,510.0,,,,19700 VANOWEN ST,,34.1938,-118.5631,2020,9,5,15,1,1,0,2020-09,7.276771
2,201406733,02/16/2020 12:00:00 AM,2020-02-13,2300,14,Pacific,1406,1,330,BURGLARY FROM VEHICLE,0344 1300,32,M,W,707.0,GARAGE/CARPORT,,,IC,Invest Cont,330.0,,,,3300 MOTOR AV,,34.0240,-118.4090,2020,2,3,23,0,0,1,2020-02,11.940241
3,201820230,11/08/2020 12:00:00 AM,2020-11-08,730,18,Southeast,1844,2,626,INTIMATE PARTNER - SIMPLE ASSAULT,2000 1300 0416 1814 0446 1310 0400,26,F,B,101.0,STREET,400.0,"STRONG-ARM (HANDS, FIST, FEET OR BODILY FORCE)",IC,Invest Cont,626.0,,,,108TH ST,AVALON AV,33.9383,-118.2652,2020,11,6,7,1,0,0,2020-11,11.026446
4,202009677,05/13/2020 12:00:00 AM,2020-05-13,900,20,Olympic,2042,1,520,VEHICLE - ATTEMPT STOLEN,1822 1607 0358,36,M,H,101.0,STREET,,,IC,Invest Cont,520.0,,,,900 S WILTON PL,,34.0559,-118.3142,2020,5,2,9,0,0,0,2020-05,10.012623
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
374098,251304103,02/03/2025 12:00:00 AM,2025-01-31,1800,13,Newton,1313,2,740,"VANDALISM - FELONY ($400 & OVER, ALL CHURCH VA...",1402,0,M,O,101.0,STREET,,,IC,Invest Cont,740.0,998.0,,,600 E 15TH ST,,34.0312,-118.2564,2025,1,4,18,0,0,0,2025-01,11.647996
374099,250304234,03/01/2025 12:00:00 AM,2025-02-19,1800,3,Southwest,325,1,522,"VEHICLE, STOLEN - OTHER (MOTORIZED SCOOTERS, B...",,0,,,108.0,PARKING LOT,,,IC,Invest Cont,522.0,,,,2600 ELLENDALE PL,,34.0307,-118.2923,2025,2,2,18,0,0,0,2025-02,11.125205
374100,252104125,02/05/2025 12:00:00 AM,2025-02-05,1555,21,Topanga,2149,2,237,CHILD NEGLECT (SEE 300 W.I.C.),,4,F,O,101.0,STREET,,,IC,Invest Cont,237.0,,,,CORBIN,HATTERAS,34.1958,-118.5666,2025,2,2,15,0,0,0,2025-02,7.276771
374101,252104112,02/02/2025 12:00:00 AM,2025-02-02,130,21,Topanga,2103,2,946,OTHER MISCELLANEOUS CRIME,,35,M,X,101.0,STREET,,,IC,Invest Cont,946.0,,,,22100 ROSCOE BL,,34.2259,-118.6126,2025,2,6,1,1,0,1,2025-02,7.276771


In [22]:
for row in premis_codes:
    x = len(premis_filtered[premis_filtered['Premis Cd']==row])
    y = len(premis_filtered)
    ratio = x/y
    print(f'for premis code {row}, ratio is {ratio}')
    


for premis code 101, ratio is 0.7914385617914033
for premis code 108, ratio is 0.13709250167767456
for premis code 104, ratio is 0.03340338360470455
for premis code 707, ratio is 0.03104580934553032
for premis code 123, ratio is 0.007019743580687317


In [103]:
ModelDF = premis_filtered[['Premis Cd', 'AREA', 'Crm Cd', 'hour', 'IsWeekend', 'is_stolen','crime_rate','IsNight']]
X = premis_filtered[['Premis Cd', 'AREA','crime_rate','IsNight','hour','IsWeekend']]
Y = premis_filtered['is_stolen']

categorical = ['Premis Cd', 'AREA']
numeric = ['hour', 'crime_rate']

preprocess = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical),
        ('num', StandardScaler(), numeric)
    ],
    remainder='passthrough'
)

pr_X = preprocess.fit_transform(X)
features = preprocess.get_feature_names_out()


#X_train, X_test, y_train, y_test = train_test_split(pr_X, pr_Y, test_size=0.3, random_state=42)

In [24]:
print(premis_filtered['Status Desc'].value_counts())

Status Desc
Invest Cont     106085
Adult Arrest      4634
Adult Other       2328
Juv Arrest         180
Juv Other           24
UNK                  1
Name: count, dtype: int64
