In [18]:
import pandas as pd
import geopandas as gpd
from shapely.geometry import Point


In [73]:
crime = pd.read_csv('../01_raw_data/NYPD_Complaint_Data_Historic_BURGLARY.csv')
income = gpd.read_file('../02_intermediate_data/censustract-medianhouseholdincome2022.csv')

In [75]:
import numpy as np

# Clean and preprocess the burglary data
# Convert date and time into standardized format and extract features
crime['CMPLNT_FR_DT'] = pd.to_datetime(crime['CMPLNT_FR_DT'], errors='coerce')
crime['Day_of_Week'] = crime['CMPLNT_FR_DT'].dt.day_of_week
crime['Hour_of_Day'] = pd.to_datetime(crime['CMPLNT_FR_TM'], format='%H:%M:%S', errors='coerce').dt.hour

# Drop rows with invalid dates or times
burglary_data = crime.dropna(subset=['CMPLNT_FR_DT', 'Hour_of_Day'])

# Simplify the dataset to focus on essential columns for merging and analysis
burglary_data_clean = burglary_data[['CMPLNT_NUM', 'CMPLNT_FR_DT', 'Day_of_Week', 'Hour_of_Day', 'Zip Codes','Latitude', 'Longitude']]

In [76]:
burglary_data_clean = burglary_data_clean.dropna(subset=['Latitude', 'Longitude'])

# Create Point geometries from latitude and longitude
geometry = [Point(xy) for xy in zip(burglary_data_clean['Longitude'], burglary_data_clean['Latitude'])]
burglary_gdf = gpd.GeoDataFrame(burglary_data_clean, geometry=geometry)

# Set the coordinate reference system (CRS) to WGS84 (EPSG:4326)
burglary_gdf.crs = "EPSG:4326"

burglary_gdf.head()

Unnamed: 0,CMPLNT_NUM,CMPLNT_FR_DT,Day_of_Week,Hour_of_Day,Zip Codes,Latitude,Longitude,geometry
0,261161452,2022-12-31,5.0,3.0,12426.0,40.796477,-73.947411,POINT (-73.94741 40.79648)
1,261179625,2022-12-31,5.0,19.0,18181.0,40.694256,-73.932807,POINT (-73.93281 40.69426)
2,261179648,2022-12-31,5.0,18.0,14192.0,40.796133,-73.820462,POINT (-73.82046 40.79613)
3,261169002,2022-12-31,5.0,4.0,11270.0,40.837813,-73.826563,POINT (-73.82656 40.83781)
4,261175495,2022-12-31,5.0,21.0,10930.0,40.832601,-73.929564,POINT (-73.92956 40.83260)


In [77]:
cencus = gpd.read_file('../02_intermediate_data/Median Income/gpd_med_income.shp')

In [79]:
census_tracts_gdf = cencus.to_crs("EPSG:4326")  # Change CRS if needed to match burglary_gdf
merged_data = gpd.sjoin(burglary_gdf, census_tracts_gdf, how="left", op='intersects')
merged_data = pd.merge(merged_data,income,left_on='GEOID',right_on='Census Tract',how='left')

  if await self.run_code(code, result, async_=asy):


In [107]:
merged_data.columns

Index(['CMPLNT_NUM', 'CMPLNT_FR_DT', 'Day_of_Week', 'Hour_of_Day', 'Zip Codes',
       'Latitude', 'Longitude', 'geometry_x', 'index_right', 'STATEFP',
       'COUNTYFP', 'TRACTCE', 'GEOID', 'GEOIDFQ', 'NAME', 'NAMELSAD', 'MTFCC',
       'FUNCSTAT', 'ALAND', 'AWATER', 'INTPTLAT', 'INTPTLON', 'short_name',
       'long_name', 'Census Tract', '2009-2013', '2014-2018', 'geometry_y'],
      dtype='object')

In [130]:
df1 = merged_data[['Latitude', 'Longitude','2014-2018']].copy()
df1['2014-2018'] = pd.to_numeric(df1['2014-2018'], errors='coerce')
df1['Burglary_Risk'] = 1
df1 = df1.groupby(['Latitude', 'Longitude']).agg({'Burglary_Risk':'sum','2014-2018':'min'}).reset_index()
df1['label'] =1

In [96]:
no_bugrary = pd.read_csv('../01_raw_data/control_coordinates.csv')
no_bugrary[['Latitude','Longitude']] = no_bugrary['Location'].str.split(',',n=2,expand=True)

In [95]:
geometry1 = [Point(xy) for xy in zip(no_bugrary['Longitude'], no_bugrary['Latitude'])]
noburglary_gdf = gpd.GeoDataFrame(no_bugrary, geometry=geometry1)
noburglary_gdf.crs = "EPSG:4326"

In [104]:
merged_data1 = gpd.sjoin(noburglary_gdf, census_tracts_gdf, how="left", op='intersects')
merged_data1 = merged_data1.dropna(subset=['GEOID'])
merged_data1 = pd.merge(merged_data1,income,left_on='GEOID',right_on='Census Tract',how='left')

  if await self.run_code(code, result, async_=asy):


In [125]:
df2 = merged_data1[['Latitude', 'Longitude','2014-2018']].copy()
df2['Burglary_Risk'] = 0
df2 = df2.drop_duplicates()
df2['label'] = 0

In [139]:
data = pd.concat([df1,df2])
data = data.reset_index().drop('index',axis=1)
data['2014-2018'] = pd.to_numeric(data['2014-2018'], errors='coerce')
data = data.dropna()

In [140]:
data.to_csv('../03_cleaned_data/data.csv')

----

In [23]:
data = pd.read_csv('../03_cleaned_data/data.csv')
data

Unnamed: 0.1,Unnamed: 0,Latitude,Longitude,Burglary_Risk,2014-2018,label
0,0,40.498896,-74.241534,3,107356.630775,1
1,1,40.499017,-74.240599,1,107356.630775,1
2,2,40.499315,-74.239739,3,107356.630775,1
3,3,40.499393,-74.242174,1,107356.630775,1
4,4,40.500554,-74.243048,2,107356.630775,1
...,...,...,...,...,...,...
65743,91736,40.680421,-73.936317,0,54524.955062,0
65744,91739,40.636328,-74.046730,0,80281.162008,0
65745,91741,40.840248,-73.956960,0,56111.696084,0
65746,91742,40.617088,-73.924005,0,122972.429205,0


In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder

In [16]:

# Select features and target
X = data[['2014-2018','Latitude','Longitude']]  # Example features
y = data['label']  # Target variable

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [17]:
from sklearn.ensemble import RandomForestClassifier

# Initialize the Random Forest classifier
model_rf = RandomForestClassifier(n_estimators=100, random_state=42)

# Train the model
model_rf.fit(X_train, y_train)

# Predict on the test set
y_pred_rf = model_rf.predict(X_test)

# Evaluate the model
print("Accuracy:", accuracy_score(y_test, y_pred_rf))
print(classification_report(y_test, y_pred_rf))

Accuracy: 0.73191381495564
              precision    recall  f1-score   support

           0       0.48      0.33      0.39      5175
           1       0.79      0.87      0.83     14550

    accuracy                           0.73     19725
   macro avg       0.64      0.60      0.61     19725
weighted avg       0.71      0.73      0.71     19725



In [32]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler

# Scaling features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Initialize KNN classifier
knn = KNeighborsClassifier(n_neighbors=9)  # Start with k=5

# Train the model
knn.fit(X_train_scaled, y_train)

# Predict on the test set
y_pred_knn = knn.predict(X_test_scaled)

# Evaluate the model
print("Accuracy:", accuracy_score(y_test, y_pred_knn))
print(classification_report(y_test, y_pred_knn))

Accuracy: 0.7734347275031686
              precision    recall  f1-score   support

           0       0.68      0.26      0.38      5175
           1       0.78      0.96      0.86     14550

    accuracy                           0.77     19725
   macro avg       0.73      0.61      0.62     19725
weighted avg       0.76      0.77      0.73     19725



In [19]:
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score

# Scaling features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Initialize Logistic Regression
logreg = LogisticRegression(solver='liblinear', random_state=42)  # Using liblinear as solver for binary classification

# Train the model
logreg.fit(X_train_scaled, y_train)

# Predict on the test set
y_pred_logreg = logreg.predict(X_test_scaled)

# Evaluate the model
print("Accuracy:", accuracy_score(y_test, y_pred_logreg))
print(classification_report(y_test, y_pred_logreg))

Accuracy: 0.7376425855513308
              precision    recall  f1-score   support

           0       0.00      0.00      0.00      5175
           1       0.74      1.00      0.85     14550

    accuracy                           0.74     19725
   macro avg       0.37      0.50      0.42     19725
weighted avg       0.54      0.74      0.63     19725



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [21]:
import xgboost as xgb
import numpy as np
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split

xgb_clf = xgb.XGBClassifier(objective='multi:softmax', num_class=len(np.unique(y_train)), n_estimators=100, seed=42, learning_rate=0.1)
from sklearn.preprocessing import LabelEncoder

# Encoding target variable if it's not already in the proper format
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)
y_test_encoded = label_encoder.transform(y_test)

# Then train your model with the encoded labels
xgb_clf.fit(X_train, y_train_encoded)
y_pred_xgb = xgb_clf.predict(X_test)

accuracy = accuracy_score(y_test, y_pred_xgb)
print("Accuracy:", accuracy)
print(classification_report(y_test, y_pred_xgb))

Accuracy: 0.7783016476552598
              precision    recall  f1-score   support

           0       0.80      0.20      0.33      5175
           1       0.78      0.98      0.87     14550

    accuracy                           0.78     19725
   macro avg       0.79      0.59      0.60     19725
weighted avg       0.78      0.78      0.73     19725

