In [3]:
import pandas as pd
import numpy
import requests
from sklearn.cluster import KMeans
from sklearn.preprocessing import LabelEncoder
from sodapy import Socrata
import warnings
warnings.filterwarnings('ignore')



## API Setup

In [4]:
crashes = pd.read_csv('../data/crashes_crashes.csv')

conn = Socrata("data.cityofchicago.org", None)

results = conn.get("85ca-t3if", limit=2000, where = "crash_date > '2024-01-19T02:02:00.000'")

# Convert to pandas DataFrame
api_df = pd.DataFrame.from_records(results)

api_df.columns = api_df.columns.str.upper()

api_df.head()



Unnamed: 0,CRASH_RECORD_ID,CRASH_DATE,POSTED_SPEED_LIMIT,TRAFFIC_CONTROL_DEVICE,DEVICE_CONDITION,WEATHER_CONDITION,LIGHTING_CONDITION,FIRST_CRASH_TYPE,TRAFFICWAY_TYPE,ALIGNMENT,...,HIT_AND_RUN_I,INTERSECTION_RELATED_I,PRIVATE_PROPERTY_I,CRASH_DATE_EST_I,PHOTOS_TAKEN_I,STATEMENTS_TAKEN_I,WORK_ZONE_I,WORK_ZONE_TYPE,WORKERS_PRESENT_I,DOORING_I
0,a27db268a57869121993d68603580a669ddb3966076a53...,2024-01-26T01:14:00.000,30,TRAFFIC SIGNAL,FUNCTIONING PROPERLY,RAIN,"DARKNESS, LIGHTED ROAD",SIDESWIPE SAME DIRECTION,ONE-WAY,STRAIGHT AND LEVEL,...,,,,,,,,,,
1,c20a8017cf9cf05492216aba01ba822ea7c62b7ecf21b0...,2024-01-26T01:10:00.000,30,NO CONTROLS,NO CONTROLS,RAIN,"DARKNESS, LIGHTED ROAD",PARKED MOTOR VEHICLE,ONE-WAY,STRAIGHT AND LEVEL,...,,,,,,,,,,
2,22d43d89a98a8453861f97410961b73278d3a2ad4a652f...,2024-01-26T00:45:00.000,30,NO CONTROLS,NO CONTROLS,RAIN,"DARKNESS, LIGHTED ROAD",FIXED OBJECT,OTHER,STRAIGHT AND LEVEL,...,,,,,,,,,,
3,48960d0da7aca84a9a8d199c8f3b5b3300d5cf4b1defd0...,2024-01-25T23:32:00.000,30,TRAFFIC SIGNAL,FUNCTIONING PROPERLY,RAIN,"DARKNESS, LIGHTED ROAD",PEDESTRIAN,FOUR WAY,STRAIGHT AND LEVEL,...,Y,,,,,,,,,
4,410bb8db543aec2a7a8f10d66c3f03540352b07b75efcd...,2024-01-25T22:20:00.000,25,TRAFFIC SIGNAL,FUNCTIONING PROPERLY,RAIN,DARKNESS,ANGLE,FOUR WAY,STRAIGHT AND LEVEL,...,,Y,,,,,,,,


In [5]:
api_col_num = len(api_df.columns)
dataset_col_num = len(crashes.columns)
shared_cols = list(set(api_df.columns).intersection(set(crashes.columns)))

print(f'api_columns: {api_col_num}, dataset_columns: {dataset_col_num}')
shared_cols

api_columns: 47, dataset_columns: 48


['LONGITUDE',
 'LOCATION',
 'ROAD_DEFECT',
 'INJURIES_TOTAL',
 'CRASH_RECORD_ID',
 'HIT_AND_RUN_I',
 'CRASH_DATE_EST_I',
 'STREET_DIRECTION',
 'DOORING_I',
 'WORK_ZONE_I',
 'INTERSECTION_RELATED_I',
 'CRASH_TYPE',
 'STREET_NAME',
 'PRIM_CONTRIBUTORY_CAUSE',
 'STATEMENTS_TAKEN_I',
 'WORK_ZONE_TYPE',
 'TRAFFIC_CONTROL_DEVICE',
 'INJURIES_FATAL',
 'CRASH_MONTH',
 'NUM_UNITS',
 'POSTED_SPEED_LIMIT',
 'TRAFFICWAY_TYPE',
 'CRASH_DAY_OF_WEEK',
 'DEVICE_CONDITION',
 'STREET_NO',
 'INJURIES_REPORTED_NOT_EVIDENT',
 'ROADWAY_SURFACE_COND',
 'FIRST_CRASH_TYPE',
 'WEATHER_CONDITION',
 'BEAT_OF_OCCURRENCE',
 'REPORT_TYPE',
 'CRASH_HOUR',
 'INJURIES_NON_INCAPACITATING',
 'INJURIES_UNKNOWN',
 'SEC_CONTRIBUTORY_CAUSE',
 'MOST_SEVERE_INJURY',
 'WORKERS_PRESENT_I',
 'ALIGNMENT',
 'INJURIES_INCAPACITATING',
 'DAMAGE',
 'PHOTOS_TAKEN_I',
 'LIGHTING_CONDITION',
 'LATITUDE',
 'DATE_POLICE_NOTIFIED',
 'INJURIES_NO_INDICATION',
 'CRASH_DATE']

In [8]:
api_cols = set(api_df.columns)
dataset_cols = set(crashes.columns)

print(f"""
        In API, not in dataset: {api_cols - dataset_cols}
        In dataset, not in API: {dataset_cols - api_cols}
       """)


        In API, not in dataset: {'PRIVATE_PROPERTY_I'}
        In dataset, not in API: {'NOT_RIGHT_OF_WAY_I', 'LANE_CNT'}
    


In [6]:
crashes.columns

Index(['CRASH_RECORD_ID', 'CRASH_DATE_EST_I', 'CRASH_DATE',
       'POSTED_SPEED_LIMIT', 'TRAFFIC_CONTROL_DEVICE', 'DEVICE_CONDITION',
       'WEATHER_CONDITION', 'LIGHTING_CONDITION', 'FIRST_CRASH_TYPE',
       'TRAFFICWAY_TYPE', 'LANE_CNT', 'ALIGNMENT', 'ROADWAY_SURFACE_COND',
       'ROAD_DEFECT', 'REPORT_TYPE', 'CRASH_TYPE', 'INTERSECTION_RELATED_I',
       'NOT_RIGHT_OF_WAY_I', 'HIT_AND_RUN_I', 'DAMAGE', 'DATE_POLICE_NOTIFIED',
       'PRIM_CONTRIBUTORY_CAUSE', 'SEC_CONTRIBUTORY_CAUSE', 'STREET_NO',
       'STREET_DIRECTION', 'STREET_NAME', 'BEAT_OF_OCCURRENCE',
       'PHOTOS_TAKEN_I', 'STATEMENTS_TAKEN_I', 'DOORING_I', 'WORK_ZONE_I',
       'WORK_ZONE_TYPE', 'WORKERS_PRESENT_I', 'NUM_UNITS',
       'MOST_SEVERE_INJURY', 'INJURIES_TOTAL', 'INJURIES_FATAL',
       'INJURIES_INCAPACITATING', 'INJURIES_NON_INCAPACITATING',
       'INJURIES_REPORTED_NOT_EVIDENT', 'INJURIES_NO_INDICATION',
       'INJURIES_UNKNOWN', 'CRASH_HOUR', 'CRASH_DAY_OF_WEEK', 'CRASH_MONTH',
       'LATITUDE', 

In [5]:
 api_df = api_df.loc[:, shared_cols]

crashes_prime_cause = api_df[(api_df['PRIM_CONTRIBUTORY_CAUSE'] != 'UNABLE TO DETERMINE')\
                              & (api_df['PRIM_CONTRIBUTORY_CAUSE'] != 'NOT APPLICABLE')]

breaking_laws_list = ['DISREGARDING TRAFFIC SIGNALS', 'DISREGARDING STOP SIGN', 'DISREGARDING ROAD MARKINGS', 
                  'DISREGARDING OTHER TRAFFIC SIGNS', 'DISREGARDING YIELD SIGN', 'FAILING TO YIELD RIGHT-OF-WAY'] 

bad_driving_list = ['DRIVING ON WRONG SIDE/WRONG WAY', 'FOLLOWING TOO CLOSELY', 'IMPROPER OVERTAKING/PASSING', 
                    'FAILING TO REDUCE SPEED TO AVOID CRASH', 'TURNING RIGHT ON RED','EXCEEDING SAFE SPEED FOR CONDITIONS',
                    'EXCEEDING AUTHORIZED SPEED LIMIT', 'IMPROPER LANE USAGE', 'PHYSICAL CONDITION OF DRIVER', 
                  'DRIVING SKILLS/KNOWLEDGE/EXPERIENCE','IMPROPER BACKING', 'IMPROPER TURNING/NO SIGNAL']

distraction_list = ['TEXTING', 'DISTRACTION - OTHER ELECTRONIC DEVICE (NAVIGATION DEVICE, DVD PLAYER, ETC.)', 
                    'DISTRACTION - FROM INSIDE VEHICLE','CELL PHONE USE OTHER THAN TEXTING']

drinking_list = ['OPERATING VEHICLE IN ERRATIC, RECKLESS, CARELESS, NEGLIGENT OR AGGRESSIVE MANNER', 
                 'HAD BEEN DRINKING (USE WHEN ARREST IS NOT MADE)', 'UNDER THE INFLUENCE OF ALCOHOL/DRUGS (USE WHEN ARREST IS EFFECTED)']

road_list = ['DISTRACTION - FROM OUTSIDE VEHICLE', 'ROAD ENGINEERING/SURFACE/MARKING DEFECTS', 'ROAD CONSTRUCTION/MAINTENANCE', 'EQUIPMENT - VEHICLE CONDITION', 
             'VISION OBSCURED (SIGNS, TREE LIMBS, BUILDINGS, ETC.)', 'WEATHER']

other_list= ['PASSING STOPPED SCHOOL BUS', 'OBSTRUCTED CROSSWALKS', 'BICYCLE ADVANCING LEGALLY ON RED LIGHT', 
             'MOTORCYCLE ADVANCING LEGALLY ON RED LIGHT', 'EVASIVE ACTION DUE TO ANIMAL, OBJECT, NONMOTORIST', 'ANIMAL', 'TURNING RIGHT ON RED', 
             'RELATED TO BUS STOP'] 

binning_list = [breaking_laws_list, bad_driving_list, distraction_list, drinking_list, road_list, other_list]
value_list = ['BREAKING LAW', 'BAD DRIVING', 'DISTRACTION INSIDE VEHICLE', 'DRINKING/DRUGS', 'OUTSIDE FACTORS', 'OTHER']



for group, value in zip(binning_list, value_list):
    crashes_prime_cause['PRIM_CONTRIBUTORY_CAUSE'] = crashes_prime_cause['PRIM_CONTRIBUTORY_CAUSE'].replace(to_replace = group, value = value)

crashes_prime_cause['PRIM_CONTRIBUTORY_CAUSE'].value_counts()

PRIM_CONTRIBUTORY_CAUSE
BAD DRIVING                   589
BREAKING LAW                  273
OUTSIDE FACTORS               167
DRINKING/DRUGS                 22
DISTRACTION INSIDE VEHICLE     10
OTHER                           8
Name: count, dtype: int64

In [2]:
crashes_prime_cause_filled = crashes_prime_cause.fillna({'INTERSECTION_RELATED_I': 'N', 'NOT_RIGHT_OF_WAY_I': 
                                                         'N', 'HIT_AND_RUN_I':'N'})

crashes_prime_cause_filled = crashes_prime_cause_filled.dropna(subset=['LATITUDE', 'LONGITUDE', 'INJURIES_TOTAL', 'INJURIES_FATAL', 
                                                                           'MOST_SEVERE_INJURY'])

NameError: name 'crashes_prime_cause' is not defined

In [1]:
n_clusters = 30  # Number of clusters to create
X = crashes_prime_cause_filled[['LONGITUDE', 'LATITUDE']]

# Create a K-Means clustering model
kmeans = KMeans(n_clusters=n_clusters, random_state=0)
kmeans.fit(X)

# Add cluster labels to your data
cluster_labels = kmeans.labels_
crashes_prime_cause_filled['GEO_KMEANS_Cluster'] = cluster_labels

NameError: name 'crashes_prime_cause_filled' is not defined

In [None]:
crashes_prime_cause_filled = crashes_prime_cause_filled.drop(columns = ['CRASH_DATE_EST_I', 'DEVICE_CONDITION', 'REPORT_TYPE', 'DATE_POLICE_NOTIFIED',
                                                          'STREET_NO', 'STREET_DIRECTION', 'STREET_NAME','BEAT_OF_OCCURRENCE', 'PHOTOS_TAKEN_I', 
                                                          'STATEMENTS_TAKEN_I', 'DOORING_I', 'WORK_ZONE_I', 'WORK_ZONE_TYPE', 'WORKERS_PRESENT_I', 
                                                          'INJURIES_INCAPACITATING', 'INJURIES_NON_INCAPACITATING', 'INJURIES_NO_INDICATION', 
                                                          'INJURIES_UNKNOWN', 'LOCATION', 'LANE_CNT', 'CRASH_DATE', 
                                                          'INJURIES_REPORTED_NOT_EVIDENT', 'TRAFFIC_CONTROL_DEVICE', 'INJURIES_TOTAL', 
                                                          'INJURIES_FATAL'])

In [None]:
crashes_cleaned = pd.get_dummies(crashes_prime_cause_filled, columns = ['WEATHER_CONDITION', 'LIGHTING_CONDITION','FIRST_CRASH_TYPE',
                                                                            'TRAFFICWAY_TYPE', 'ALIGNMENT', 'ROADWAY_SURFACE_COND', 
                                                                            'SEC_CONTRIBUTORY_CAUSE', 'ROAD_DEFECT','MOST_SEVERE_INJURY'],
                                                                             drop_first=True, dtype=int)

le = LabelEncoder()

for col in ['CRASH_TYPE','INTERSECTION_RELATED_I', 'NOT_RIGHT_OF_WAY_I', 'HIT_AND_RUN_I', 'DAMAGE']:
    crashes_cleaned[col] = le.fit_transform(crashes_cleaned[col])

In [None]:
# crashes_cleaned.to_csv('../data/latest_data.csv')