In [89]:
#Install dependencies

!pip install -U scikit-learn

import numpy as np
import pandas as pd
import collections
from sklearn.cluster import KMeans

Looking in indexes: https://artifactory.lyft.net/artifactory/api/pypi/virtual-pypi-lyft/simple/


In [90]:
# Read data

df = pd.read_csv("US_Accidents_Dec21_updated.csv")
df.head(3)

Unnamed: 0,ID,Severity,Start_Time,End_Time,Start_Lat,Start_Lng,End_Lat,End_Lng,Distance(mi),Description,...,Roundabout,Station,Stop,Traffic_Calming,Traffic_Signal,Turning_Loop,Sunrise_Sunset,Civil_Twilight,Nautical_Twilight,Astronomical_Twilight
0,A-1,3,2016-02-08 00:37:08,2016-02-08 06:37:08,40.10891,-83.09286,40.11206,-83.03187,3.23,Between Sawmill Rd/Exit 20 and OH-315/Olentang...,...,False,False,False,False,False,False,Night,Night,Night,Night
1,A-2,2,2016-02-08 05:56:20,2016-02-08 11:56:20,39.86542,-84.0628,39.86501,-84.04873,0.747,At OH-4/OH-235/Exit 41 - Accident.,...,False,False,False,False,False,False,Night,Night,Night,Night
2,A-3,2,2016-02-08 06:15:39,2016-02-08 12:15:39,39.10266,-84.52468,39.10209,-84.52396,0.055,At I-71/US-50/Exit 1 - Accident.,...,False,False,False,False,False,False,Night,Night,Night,Day


In [91]:
# Clean data

print("Shape before dropping na: ", df.shape)
df.dropna(inplace = True)
print("Shape after dropping na: ", df.shape)

print(df.columns)

df = df.drop(columns=[
    "ID", 
    "End_Time", 
    "Description", 
    "Start_Lat", 
    "Start_Lng", 
    "End_Lat", 
    "End_Lng", 
    "Distance(mi)", 
    "Number", 
    "Street",
    "State",
    "City",
    "County",
    "Country",
    "Timezone",
    "Airport_Code",
    "Weather_Timestamp",
    "Start_Time",
])

Shape before dropping na:  (2845342, 47)
Shape after dropping na:  (943318, 47)
Index(['ID', 'Severity', 'Start_Time', 'End_Time', 'Start_Lat', 'Start_Lng',
       'End_Lat', 'End_Lng', 'Distance(mi)', 'Description', 'Number', 'Street',
       'Side', 'City', 'County', 'State', 'Zipcode', 'Country', 'Timezone',
       'Airport_Code', 'Weather_Timestamp', 'Temperature(F)', 'Wind_Chill(F)',
       'Humidity(%)', 'Pressure(in)', 'Visibility(mi)', 'Wind_Direction',
       'Wind_Speed(mph)', 'Precipitation(in)', 'Weather_Condition', 'Amenity',
       'Bump', 'Crossing', 'Give_Way', 'Junction', 'No_Exit', 'Railway',
       'Roundabout', 'Station', 'Stop', 'Traffic_Calming', 'Traffic_Signal',
       'Turning_Loop', 'Sunrise_Sunset', 'Civil_Twilight', 'Nautical_Twilight',
       'Astronomical_Twilight'],
      dtype='object')


In [92]:
# Transform data into categorical

categorical_cols = ['Side', 'Zipcode', 'Weather_Condition', 'Wind_Direction', 'Sunrise_Sunset', 'Civil_Twilight', 'Nautical_Twilight', 'Astronomical_Twilight']
for col in categorical_cols:
    df[col] = df[col].astype('category')
    
cat_columns = df.select_dtypes(['category']).columns
df[cat_columns] = df[cat_columns].apply(lambda x: x.cat.codes)

df.dtypes

Severity                   int64
Side                        int8
Zipcode                    int32
Temperature(F)           float64
Wind_Chill(F)            float64
Humidity(%)              float64
Pressure(in)             float64
Visibility(mi)           float64
Wind_Direction              int8
Wind_Speed(mph)          float64
Precipitation(in)        float64
Weather_Condition           int8
Amenity                     bool
Bump                        bool
Crossing                    bool
Give_Way                    bool
Junction                    bool
No_Exit                     bool
Railway                     bool
Roundabout                  bool
Station                     bool
Stop                        bool
Traffic_Calming             bool
Traffic_Signal              bool
Turning_Loop                bool
Sunrise_Sunset              int8
Civil_Twilight              int8
Nautical_Twilight           int8
Astronomical_Twilight       int8
dtype: object

In [93]:
# Split data into training and testing

msk = np.random.rand(len(df)) < 0.8
train_data = df[msk]
test_data = df[~msk]

train_data.drop(columns=["Severity"])
test_data_actuals = test_data[["Severity"]].copy().to_numpy()
test_data.drop(columns=["Severity"])

Unnamed: 0,Side,Zipcode,Temperature(F),Wind_Chill(F),Humidity(%),Pressure(in),Visibility(mi),Wind_Direction,Wind_Speed(mph),Precipitation(in),...,Roundabout,Station,Stop,Traffic_Calming,Traffic_Signal,Turning_Loop,Sunrise_Sunset,Civil_Twilight,Nautical_Twilight,Astronomical_Twilight
90,0,54323,16.0,5.3,59.0,30.22,10.0,22,8.1,0.0,...,False,False,False,False,False,False,1,0,0,0
461,1,164134,33.1,21.2,85.0,29.64,7.0,20,20.7,0.0,...,False,False,False,False,False,False,1,1,1,0
476,0,160169,37.0,28.5,79.0,29.66,5.0,22,13.8,0.0,...,False,False,False,False,False,False,0,0,0,0
3092,1,263242,71.0,71.0,53.0,29.75,10.0,9,16.0,0.0,...,False,False,False,False,False,False,0,0,0,0
8541,1,244812,74.0,74.0,53.0,29.03,10.0,12,13.0,0.0,...,False,False,False,False,True,False,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2845194,0,304945,73.0,73.0,29.0,25.27,10.0,17,5.0,0.0,...,False,False,False,False,False,False,0,0,0,0
2845230,1,300096,78.0,78.0,42.0,29.73,10.0,6,8.0,0.0,...,False,False,False,False,False,False,0,0,0,0
2845240,1,302882,82.0,82.0,33.0,29.50,10.0,5,10.0,0.0,...,False,False,False,False,False,False,0,0,0,0
2845252,0,305226,86.0,86.0,14.0,26.36,10.0,5,6.0,0.0,...,False,False,False,False,True,False,0,0,0,0


In [94]:
# Apply KMeans

kmeans = KMeans(init="k-means++", n_clusters=4, n_init=10, random_state=0).fit(train_data)
kmeans.cluster_centers_

array([[2.06034432e+00, 5.52093459e-01, 2.06088320e+05, 6.10551256e+01,
        5.94786280e+01, 6.09219184e+01, 2.88571825e+01, 9.17095198e+00,
        9.47539966e+00, 7.54942086e+00, 2.98805411e-03, 2.15011946e+01,
        2.24453944e-02, 1.55179481e-03, 1.34672366e-01, 2.62341161e-03,
        4.70808690e-03, 4.47385372e-03, 1.51900217e-02, 5.27024653e-05,
        3.57674065e-02, 4.59975406e-02, 1.72746970e-03, 1.72875798e-01,
        0.00000000e+00, 3.85811325e-01, 3.43461966e-01, 2.93330210e-01,
        2.52936698e-01],
       [2.15509503e+00, 5.76168629e-01, 4.19436713e+04, 5.82297005e+01,
        5.69154336e+01, 6.60594072e+01, 2.95653119e+01, 9.18545609e+00,
        9.41837186e+00, 6.56553471e+00, 4.43123315e-03, 2.33391124e+01,
        2.75216793e-02, 2.80134714e-04, 1.13529262e-01, 4.65023625e-03,
        6.75435921e-03, 1.54385353e-03, 9.61795850e-03, 1.24504317e-05,
        2.34939647e-02, 3.50915418e-02, 7.28350256e-04, 1.63741853e-01,
        0.00000000e+00, 3.69939678e-01,

In [95]:
# Predict on test data

test_data_predictions = kmeans.predict(test_data)

In [99]:
# Analyze results

preds = list(test_data_predictions)
actuals = list(test_data_actuals)

results = collections.defaultdict(collections.Counter)
for i in range(len(actuals)):
    results[preds[i]][actuals[i][0]] += 1
results

defaultdict(collections.Counter,
            {1: Counter({2: 35849, 4: 2901, 3: 843, 1: 275}),
             2: Counter({4: 1064, 2: 56262, 3: 326, 1: 465}),
             3: Counter({2: 46140, 3: 355, 4: 762, 1: 419}),
             0: Counter({4: 1137, 3: 1226, 2: 39175, 1: 980})})

In [101]:
# Compute f1 score

# Match cluster 2 to Sev 2
sev_2_f1_score = 56262 / (56262 + 0.5 * (1064 + 326 + 465))

# Match cluster 1 to Sev 4
sev_4_f1_score = 2901 / (2901 + 0.5 * (35849 + 843 + 275))

# Match cluster 0 to Sev 3
sev_3_f1_score = 1226 / (1226 + 0.5 * (1137 + 39175 + 980))

# Match cluster 3 to Sev 1
sev_1_f1_score = 419 / (419 + 0.5 * (46140 + 355 + 762))

(sev_1_f1_score + sev_2_f1_score + sev_3_f1_score + sev_4_f1_score) / 4

0.2982295608872393