In [1]:
import pandas as pd
import xgboost as xgb

from pandas_profiling import ProfileReport
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.metrics import precision_score, recall_score, accuracy_score, f1_score, cohen_kappa_score


  import pandas.util.testing as tm


In [2]:
# Restore the `crashes` DataFrame that was persisted with IPython's %store magic
# (presumably by the previous notebook in this series; `%store crashes` must have
# been run there first, otherwise `crashes` will be undefined in the cells below).
%store -r crashes

In [3]:
# Sanity-check the restored frame: row count, column dtypes and non-null counts.
crashes.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 309013 entries, 3 to 416197
Data columns (total 48 columns):
 #   Column                         Non-Null Count   Dtype          
---  ------                         --------------   -----          
 0   CRASH_RECORD_ID                309013 non-null  object         
 1   RD_NO                          309013 non-null  object         
 2   CRASH_DATE                     309013 non-null  datetime64[ns] 
 3   POSTED_SPEED_LIMIT             309013 non-null  float64        
 4   TRAFFIC_CONTROL_DEVICE         309013 non-null  object         
 5   DEVICE_CONDITION               309013 non-null  object         
 6   WEATHER_CONDITION              309013 non-null  object         
 7   LIGHTING_CONDITION             309013 non-null  object         
 8   FIRST_CRASH_TYPE               309013 non-null  object         
 9   TRAFFICWAY_TYPE                309013 non-null  object         
 10  LANE_CNT                       151584 non-null  float64 

### Hopefully a refined attempt to predict some of our missing values
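We were able to pull in our modified dataset successfully. `LANE_CNT` is populated for only 151,584 of the 309,013 rows, so we will now trim the data down to the rows with a known lane value and use them to train a model, in hopes that we can predict some of our missing lane values!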

In [4]:
# Keep only the crashes whose LANE_TYPE is known; these rows are the labelled
# examples the classifiers below are trained and evaluated on.
has_lane_type = crashes['LANE_TYPE'].notna()
model_data = crashes.loc[has_lane_type]
model_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 151584 entries, 3 to 416126
Data columns (total 48 columns):
 #   Column                         Non-Null Count   Dtype          
---  ------                         --------------   -----          
 0   CRASH_RECORD_ID                151584 non-null  object         
 1   RD_NO                          151584 non-null  object         
 2   CRASH_DATE                     151584 non-null  datetime64[ns] 
 3   POSTED_SPEED_LIMIT             151584 non-null  float64        
 4   TRAFFIC_CONTROL_DEVICE         151584 non-null  object         
 5   DEVICE_CONDITION               151584 non-null  object         
 6   WEATHER_CONDITION              151584 non-null  object         
 7   LIGHTING_CONDITION             151584 non-null  object         
 8   FIRST_CRASH_TYPE               151584 non-null  object         
 9   TRAFFICWAY_TYPE                151584 non-null  object         
 10  LANE_CNT                       151584 non-null  float64 

### Great!
Next, we'll set our y and pick some features to try for X.

In [5]:
# Predictors: the same features tried in the previous attempt, to see whether a
# simplified list of labels improves the results.
features = [
    'LATITUDE',
    'LONGITUDE',
    'STREET_TYPE',
    'TRAFFIC_CONTROL_PRESENT',
    'STRAIGHT_ROAD',
    'LEVEL_ROAD',
    'POSTED_SPEED_LIMIT',
]
X = model_data[features]
# Target: LANE_TYPE is used as the label to reduce the number of classes to predict.
y = model_data['LANE_TYPE']

In [6]:
# One-hot encode the object/category-typed feature columns (pandas' default
# selection for get_dummies); all other columns pass through unchanged.
X_dummies = pd.get_dummies(X)

In [7]:
# Hold out 25% of the rows for evaluation. The split is seeded so that a
# Restart & Run All reproduces the same partition (and therefore comparable
# metrics in every later cell) instead of drawing a new random split each run.
X_train, X_test, y_train, y_test = train_test_split(X_dummies, y, test_size=0.25, random_state=42)

In [8]:
# Baseline KNN classifier using scikit-learn's default hyperparameters.
KNN = KNeighborsClassifier()

In [9]:
# Train the baseline on the training split and predict labels for the held-out
# split; fit() returns the estimator itself, so the two calls can be chained.
test_preds = KNN.fit(X_train, y_train).predict(X_test)

In [10]:
def print_metrics(labels, preds):
    """Print precision, recall, accuracy and F1 for a set of predictions.

    Precision, recall and F1 are computed with ``average='macro'``; accuracy
    is computed without an averaging argument. Each score is printed on its
    own line, in that order, as soon as it has been computed.

    Parameters
    ----------
    labels : array-like
        Ground-truth labels.
    preds : array-like
        Predicted labels, aligned with ``labels``.
    """
    macro = {'average': 'macro'}
    # (output template, scoring function, extra keyword arguments)
    report = (
        ("Precision Score: {}", precision_score, macro),
        ("Recall Score: {}", recall_score, macro),
        ("Accuracy Score: {}", accuracy_score, {}),
        ("F1 Score: {}", f1_score, macro),
    )
    for template, scorer, extra in report:
        print(template.format(scorer(labels, preds, **extra)))
    
# Score the baseline KNN predictions against the held-out labels.
print_metrics(y_test, test_preds)

Precision Score: 0.4860494505395475
Recall Score: 0.455531037744356
Accuracy Score: 0.6767204982056154
F1 Score: 0.4658982382707024


In [19]:
def find_best_k(X_train, y_train, X_test, y_test, min_k=1, max_k=25, n_jobs=3):
    """Find the KNN neighbour count with the highest macro-averaged F1 score.

    One ``KNeighborsClassifier`` is fitted on the training split for every
    ``k`` in ``[min_k, max_k]`` (inclusive) and scored on the evaluation split
    with ``f1_score(..., average='macro')``. On ties the smallest ``k`` wins.
    The winning ``k`` and its score are printed and also returned.

    NOTE: the evaluation split passed here is used to *select* ``k``. Metrics
    reported afterwards on that same split are therefore optimistically
    biased; pass a separate validation split (or use cross-validation) when an
    unbiased final estimate is needed.

    Parameters
    ----------
    X_train, y_train : array-like
        Features and labels the candidate models are fitted on.
    X_test, y_test : array-like
        Features and labels the candidate models are scored on.
    min_k, max_k : int
        Inclusive range of ``n_neighbors`` values to try.
    n_jobs : int
        Passed through to ``KNeighborsClassifier(n_jobs=...)``.

    Returns
    -------
    tuple
        ``(best_k, best_score)`` where ``best_score`` is the macro F1 score.

    Raises
    ------
    ValueError
        If ``min_k`` is smaller than 1 or ``max_k`` is smaller than ``min_k``.
    """
    if min_k < 1 or max_k < min_k:
        raise ValueError(
            "expected 1 <= min_k <= max_k, got min_k={} and max_k={}".format(min_k, max_k)
        )

    # Start below any attainable F1 so the first candidate is always recorded,
    # even when every score is 0.0 (otherwise best_k would stay at an invalid 0).
    best_k = None
    best_score = float("-inf")
    for k in range(min_k, max_k + 1):
        knn = KNeighborsClassifier(n_neighbors=k, n_jobs=n_jobs)
        knn.fit(X_train, y_train)
        preds = knn.predict(X_test)
        macro_f1 = f1_score(y_test, preds, average='macro')
        # Strict comparison: on a tie the earlier (smaller) k is kept.
        if macro_f1 > best_score:
            best_k = k
            best_score = macro_f1

    print("Best Value for k: {}".format(best_k))
    # The metric is macro F1, not accuracy, so label it accordingly.
    print("Macro F1-Score: {}".format(best_score))
    return best_k, best_score

In [20]:
# Search the function's default range of k values on the current train/test split.
find_best_k(X_train, y_train, X_test, y_test)

Best Value for k: 6
Accuracy-Score: 0.44068930586425037


In [21]:
# Refit KNN with k=6, the value reported by find_best_k above, and predict the
# held-out split again; fit() returns the estimator, so KNN is the fitted model.
KNN = KNeighborsClassifier(n_neighbors=6).fit(X_train, y_train)
test_preds = KNN.predict(X_test)

In [22]:
# Score the k=6 KNN predictions against the held-out labels.
print_metrics(y_test, test_preds)

Precision Score: 0.45806564346567874
Recall Score: 0.4371671081814327
Accuracy Score: 0.6652681021743719
F1 Score: 0.44068930586425037


### Let's Try Something Else
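KNN doesn't seem to be the best fit here: macro F1 stays below 0.47 with both the default and the tuned k. We'll try a gradient-boosted tree model (XGBoost) next, just to see whether we get any improvement and whether this approach is worthwhile.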

In [26]:
# Gradient-boosted classifier with XGBoost's default hyperparameters, trained on
# the same training split as the KNN models.
boost = xgb.XGBClassifier()
# Last expression of the cell, so the fitted estimator's repr is shown as output.
boost.fit(X_train, y_train)

XGBClassifier(objective='multi:softprob')

In [28]:
# Compare accuracy on the training rows with accuracy on the held-out rows;
# a large gap between the two numbers would point to overfitting.
training_preds, val_preds = boost.predict(X_train), boost.predict(X_test)
training_accuracy, val_accuracy = (
    accuracy_score(y_train, training_preds),
    accuracy_score(y_test, val_preds),
)

print(training_accuracy, val_accuracy)

0.6580465836323974 0.6557156428119063
