**Accident Severity Analysis**

In [1]:
import numpy as np  
import pandas as pd
import datetime
import os
from sklearn.decomposition import PCA

import torch
import torch.nn as nn
import torch.nn.functional as F
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import MaxAbsScaler

**Data Import**

In [2]:
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        filepath = os.path.join(dirname, filename)
        print(filepath)

/kaggle/input/us-accidents/US_Accidents_June20.csv


In [3]:
data = pd.read_csv(filepath)
data.columns

Index(['ID', 'Source', 'TMC', 'Severity', 'Start_Time', 'End_Time',
       'Start_Lat', 'Start_Lng', 'End_Lat', 'End_Lng', 'Distance(mi)',
       'Description', 'Number', 'Street', 'Side', 'City', 'County', 'State',
       'Zipcode', 'Country', 'Timezone', 'Airport_Code', 'Weather_Timestamp',
       'Temperature(F)', 'Wind_Chill(F)', 'Humidity(%)', 'Pressure(in)',
       'Visibility(mi)', 'Wind_Direction', 'Wind_Speed(mph)',
       'Precipitation(in)', 'Weather_Condition', 'Amenity', 'Bump', 'Crossing',
       'Give_Way', 'Junction', 'No_Exit', 'Railway', 'Roundabout', 'Station',
       'Stop', 'Traffic_Calming', 'Traffic_Signal', 'Turning_Loop',
       'Sunrise_Sunset', 'Civil_Twilight', 'Nautical_Twilight',
       'Astronomical_Twilight'],
      dtype='object')

In [4]:
np.unique(data['Severity'])

array([1, 2, 3, 4])

**Feature Extraction**

In [5]:
X = pd.DataFrame()

In [6]:
def str_to_datetime(x):
    x = datetime.datetime.strptime(x, '%Y-%m-%d %H:%M:%S')
    return x

def get_acc_len(x):
    x = x.total_seconds() / 60.0
    return x        

In [7]:
##### Feature extraction and clean-up ######

#start time, end time --> length of accident
end_time = data['End_Time'].apply(str_to_datetime)
start_time = data['Start_Time'].apply(str_to_datetime)
data['Accident_Length'] = end_time - start_time 
X['Accident_Length'] = data['Accident_Length'].apply(get_acc_len)
X['Accident_Length']


X['Accident_Distance'] = data['Distance(mi)']
X['Accident_Distance'].fillna(-500.0, inplace = True) 

X['Temp'] = data['Temperature(F)']
X['Temp'].fillna(-500.0, inplace = True) 
X['Temp']

print(X['Temp'].mean())
print(X['Temp'].max())
print(X['Temp'].min())

X['Humidity'] = data['Humidity(%)']
X['Humidity'].fillna(-10.0, inplace = True) 
X['Humidity']

print(X['Humidity'].mean())
print(X['Humidity'].max())
print(X['Humidity'].min())

X['Precipitation'] = data['Precipitation(in)']
X['Precipitation'].fillna(-10.0, inplace = True) 
X['Precipitation']

print(X['Precipitation'].mean())
print(X['Precipitation'].max())
print(X['Precipitation'].min())

X['Wind_Chill'] = data['Wind_Chill(F)']
X['Wind_Chill'].fillna(-500.0, inplace = True) 
X['Wind_Chill']

print(X['Wind_Chill'].mean())
print(X['Wind_Chill'].max())
print(X['Wind_Chill'].min())

X['Pressure'] = data['Pressure(in)']
X['Pressure'].fillna(-500.0, inplace = True) 
X['Pressure']

print(X['Pressure'].mean())
print(X['Pressure'].max())
print(X['Pressure'].min())

X['Visibility'] = data['Visibility(mi)']
X['Visibility'].fillna(-500.0, inplace = True) 
X['Visibility']

print(X['Visibility'].mean())
print(X['Visibility'].max())
print(X['Visibility'].min())

X['Severity'] = data['Severity']

51.42255624332331
170.6
-500.0
63.624498059976375
100.0
-10.0
-5.759011884333611
25.0
-10.0
-240.77838879416709
115.0
-500.0
21.319353845340995
57.74
-500.0
-1.8688801995208821
140.0
-500.0


In [8]:
X.isna().any()

Accident_Length      False
Accident_Distance    False
Temp                 False
Humidity             False
Precipitation        False
Wind_Chill           False
Pressure             False
Visibility           False
Severity             False
dtype: bool

In [9]:
unique, counts = np.unique(X['Severity'].values, return_counts=True)
dict(zip(unique, counts))

{1: 29174, 2: 2373210, 3: 998913, 4: 112320}

In [10]:
#
X = X[X['Severity'] < 4] 
X = X[X['Severity'] > 1]
X.shape

(3372123, 9)

In [11]:
unique, counts = np.unique(X['Severity'].values, return_counts=True)
dict(zip(unique, counts))

{2: 2373210, 3: 998913}

In [12]:
### Normalization #####
transformer = MaxAbsScaler().fit(X.iloc[:,:8].values)
X_norm = transformer.transform(X.iloc[:,:8])
X_norm

array([[ 2.20822665e-04,  2.99733233e-05,  7.38000000e-02, ...,
        -1.00000000e+00,  5.93600000e-02,  2.00000000e-02],
       [ 2.10977069e-05,  2.99733233e-05,  7.58000000e-02, ...,
        -1.00000000e+00,  5.93000000e-02,  2.00000000e-02],
       [ 2.10977069e-05,  2.99733233e-05,  7.20000000e-02, ...,
         6.66000000e-02,  5.93400000e-02,  2.00000000e-02],
       ...,
       [ 2.00193797e-05,  1.68150344e-03,  1.46000000e-01, ...,
         1.46000000e-01,  5.94800000e-02,  2.00000000e-02],
       [ 2.06405899e-05,  2.31394056e-03,  1.42000000e-01, ...,
         1.42000000e-01,  5.92400000e-02,  2.00000000e-02],
       [ 2.06874737e-05,  1.60956746e-03,  1.58000000e-01, ...,
         1.58000000e-01,  5.72600000e-02,  1.40000000e-02]])

In [13]:
print(X_norm.shape)
print(X['Severity'].values.reshape(len(X['Severity']),1).shape)
np.concatenate((X_norm, X['Severity'].values.reshape(len(X['Severity']),1)), axis=1)

(3372123, 8)
(3372123, 1)


array([[2.20822665e-04, 2.99733233e-05, 7.38000000e-02, ...,
        5.93600000e-02, 2.00000000e-02, 3.00000000e+00],
       [2.10977069e-05, 2.99733233e-05, 7.58000000e-02, ...,
        5.93000000e-02, 2.00000000e-02, 2.00000000e+00],
       [2.10977069e-05, 2.99733233e-05, 7.20000000e-02, ...,
        5.93400000e-02, 2.00000000e-02, 2.00000000e+00],
       ...,
       [2.00193797e-05, 1.68150344e-03, 1.46000000e-01, ...,
        5.94800000e-02, 2.00000000e-02, 2.00000000e+00],
       [2.06405899e-05, 2.31394056e-03, 1.42000000e-01, ...,
        5.92400000e-02, 2.00000000e-02, 2.00000000e+00],
       [2.06874737e-05, 1.60956746e-03, 1.58000000e-01, ...,
        5.72600000e-02, 1.40000000e-02, 2.00000000e+00]])

In [14]:
#x values normalized --> X_norm
#y values --> X['Severity']
import pandas as pd
all_data = pd.DataFrame(np.concatenate((X_norm, X['Severity'].values.reshape(len(X['Severity']),1)), axis=1), columns = X.columns)
#all_data

In [15]:
###shuffle data###
all_data.sample(frac=1)

Unnamed: 0,Accident_Length,Accident_Distance,Temp,Humidity,Precipitation,Wind_Chill,Pressure,Visibility,Severity
2499490,0.000253,0.00018,0.1648,0.84,-0.4,-1.000,0.06000,0.010,2.0
959224,0.000065,0.00000,0.1920,0.44,0.0,-1.000,0.05986,0.020,2.0
2256157,0.000021,0.00000,0.1202,0.62,-0.4,-1.000,0.06014,0.020,2.0
1535523,0.000031,0.00000,0.1202,0.80,-0.4,-1.000,0.06002,0.020,2.0
2124945,0.000031,0.00000,0.1540,0.17,-0.4,-1.000,0.06030,0.020,3.0
...,...,...,...,...,...,...,...,...,...
466524,0.000033,0.00000,0.1518,1.00,-0.4,-1.000,0.06002,0.020,2.0
1714128,0.000021,0.00000,0.1422,0.90,-0.4,-1.000,0.06004,0.020,2.0
869025,0.000042,0.00000,0.0580,0.67,0.0,0.044,0.05900,0.020,3.0
1829492,0.000021,0.00000,0.1580,0.77,0.0,-1.000,0.05956,0.016,2.0


In [16]:
####
print(len(all_data))
print(len(all_data))
train_frac = int(len(all_data) * 0.9)
print(train_frac)

3372123
3372123
3034910


In [22]:
import torch
from torch.utils import data

class Dataset(data.Dataset):
    'Characterizes a dataset for PyTorch'
    def __init__(self, x, labels):
        'Initialization'
        self.labels = labels
        self.x = x
        
    def __len__(self):
        'Denotes the total number of samples'
        return len(self.x)

    def __getitem__(self, index):
        'Generates one sample of data'
        # Select sample
        #ID = self.x[index]

        # Load data and get label
        
        curr_x = self.x[index]

        y = self.labels[index]
        out_y = np.zeros(2)
        out_y[int(y)] = 1    

        return curr_x, out_y

In [31]:
### Dataloader ###
training_set = Dataset(all_data.iloc[:train_frac,:8].values, all_data.iloc[:train_frac, 8].values - 2)
training_generator = data.DataLoader(training_set, batch_size=20000, shuffle=True)

test_set = Dataset(all_data.iloc[train_frac:,:8].values, all_data.iloc[train_frac:, 8].values - 2)
test_generator = data.DataLoader(test_set, batch_size=len(all_data[train_frac:]), shuffle=False)

In [32]:

class Accident_Model(nn.Module):
    def __init__(self):
        super(Accident_Model, self).__init__()
        self.w1 = nn.Linear(8,8,bias=True)
        nn.init.xavier_uniform_(self.w1.weight)
        nn.init.zeros_(self.w1.bias)
        self.bn1 = torch.nn.BatchNorm1d(8)
        
        
        self.w2 = nn.Linear(8,4,bias=True)
        nn.init.xavier_uniform_(self.w2.weight)
        nn.init.zeros_(self.w2.bias)
        self.bn2 = torch.nn.BatchNorm1d(4)
        
        
        self.w3 = nn.Linear(4,2,bias=True)
        nn.init.xavier_uniform_(self.w3.weight)
        nn.init.zeros_(self.w3.bias)
        
    
    
    def forward(self, x):
        #print(x)
        x = x.float()
        x = self.bn1(F.relu(self.w1(x)))
        #x = self.w2(x)
        
        x = self.bn2(F.relu(self.w2(x)))
        x = self.w3(x)
        x_layer_pred.append(x)
        '''
        x = self.w1(x.float())
        x = F.relu(self.w1(x.float()))
        x = self.w2(x)
        
        
        '''
        return x



In [33]:
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import accuracy_score

class_weights = np.asarray([0.8,1.2])

def sigmoid(x):
    return np.divide(1.0, np.add(1.0, np.exp(-1.0*x)))

In [34]:
net = Accident_Model()

optimizer = torch.optim.Adam(net.parameters(), lr=0.0001,weight_decay=0.95)
criterion = nn.BCEWithLogitsLoss(weight=torch.Tensor(class_weights))

for epoch in np.arange(5):
    x_layer_pred = []
    for batch_idx,(batch, labels) in enumerate(training_generator):

        optimizer.zero_grad()
        outputs = net(batch).float()

        loss = criterion(outputs,np.squeeze(labels))
        loss.backward()
        optimizer.step()

        if batch_idx % 100 == 0:
            for test_batch, test_labels in test_generator:

                test_outputs = net(test_batch)
                #test_outputs_ = F.sigmoid(test_outputs.detach())
                test_loss = criterion(test_outputs.float(),np.squeeze(test_labels))
                
                print("Step:",epoch)
                print("Train Loss: ", loss.item())
                print("Test Loss: ", test_loss.item())
                
                #print("Error train:",mean_absolute_error(outputs.detach().max(1).values,labels.detach().argmax(1)))
                #print("Error test:",mean_absolute_error(test_outputs.detach().max(1).values,test_labels.detach().argmax(1)))
                
                print(confusion_matrix(sigmoid(outputs.detach()).argmax(1), labels.detach().argmax(1)))
                print(precision_recall_fscore_support(labels.detach().argmax(1), sigmoid(outputs.detach()).argmax(1), average=None))
                print("\n")
                print(confusion_matrix(sigmoid(test_outputs.detach()).argmax(1), test_labels.detach().argmax(1)))
                print(precision_recall_fscore_support(test_labels.detach().argmax(1), sigmoid(test_outputs.detach()).argmax(1), average=None))
                
                
                accuracy = accuracy_score(labels.detach().argmax(1), sigmoid(outputs.detach()).argmax(1))
                print("Accuracy Train: ", accuracy)
                accuracy = accuracy_score(test_labels.detach().argmax(1), sigmoid(test_outputs.detach()).argmax(1))
                print("Accuracy Test: ", accuracy)
                break;


Step: 0
Train Loss:  0.8205369296149365
Test Loss:  0.8243731295144832
[[7832 3121]
 [5994 3053]]
(array([0.71505524, 0.33745993]), array([0.56646897, 0.49449304]), array([0.63214819, 0.4011563 ]), array([13826,  6174]))


[[148444  34286]
 [120399  34084]]
(array([0.81236797, 0.22063269]), array([0.55215869, 0.49852274]), array([0.65745295, 0.30588774]), array([268843,  68370]))
Accuracy Train:  0.54425
Accuracy Test:  0.5412839955754968
Step: 0
Train Loss:  0.7973486997977018
Test Loss:  0.7976784298342782
[[8445 3507]
 [5339 2709]]
(array([0.70657631, 0.33660537]), array([0.61266686, 0.43581081]), array([0.65627914, 0.37983735]), array([13784,  6216]))


[[160799  37539]
 [108044  30831]]
(array([0.81073218, 0.2220054 ]), array([0.59811488, 0.4509434 ]), array([0.68837988, 0.29753191]), array([268843,  68370]))
Accuracy Train:  0.5577
Accuracy Test:  0.5682758375270229
Step: 1
Train Loss:  0.7904413044520825
Test Loss:  0.7871426690583545
[[8628 3560]
 [5188 2624]]
(array([0.7079094

In [35]:
len(x_layer_pred)

154

In [36]:
train_pred = pd.DataFrame(net(torch.Tensor(all_data.iloc[:train_frac,:8].values)).detach().numpy())
test_pred = pd.DataFrame(net(torch.Tensor(all_data.iloc[train_frac:,:8].values)).detach().numpy())
train_pred

Unnamed: 0,0,1
0,0.796704,-0.054295
1,0.822240,0.308871
2,-1.598832,-0.856095
3,-1.523629,-0.917408
4,-1.352438,-1.026290
...,...,...
3034905,-0.363081,-0.960096
3034906,-0.363081,-0.960096
3034907,-0.126688,-0.928808
3034908,-0.628960,-0.995287


In [37]:
#train_pred.loc[np.where(all_data.iloc[:train_frac,8].values == 3)[0]]
len(x_layer_pred)

156

In [38]:
from imblearn.over_sampling import SMOTE
sm = SMOTE(random_state=42,sampling_strategy='minority')
X_res, y_res = sm.fit_resample(x_layer_pred[154].detach().numpy(), all_data.iloc[:train_frac,8].values - 2)
X_res_test, y_res_test = sm.fit_resample(x_layer_pred[155].detach().numpy(), all_data.iloc[train_frac:,8].values - 2)

Using TensorFlow backend.


In [None]:
from xgboost import XGBClassifier

param_dist = {'n_estimators':150, 'max_depth':4}

xgboost_model = XGBClassifier(**param_dist)
xgboost_model.fit(X_res, y_res, verbose=False)



In [40]:
from sklearn.metrics import precision_recall_fscore_support

x_train_xgboost = xgboost_model.predict(x_layer_pred[154].detach().numpy())
print("Mean Absolute Error Train: " + str(mean_absolute_error(x_train_xgboost, all_data.iloc[:train_frac,8].values - 2)))
accuracy = accuracy_score(all_data.iloc[:train_frac,8].values - 2, np.round(x_train_xgboost))
print("Accuracy: ", accuracy)
print(precision_recall_fscore_support(all_data.iloc[:train_frac,8].values - 2, np.round(x_train_xgboost)))


# make predictions
x_test_xgboost = xgboost_model.predict(x_layer_pred[155].detach().numpy())
print("Mean Absolute Error Test : " + str(mean_absolute_error(x_test_xgboost, all_data.iloc[train_frac:,8].values - 2)))
accuracy = accuracy_score(all_data.iloc[train_frac:,8].values - 2, np.round(x_test_xgboost))
print("Accuracy: ", accuracy)
print(precision_recall_fscore_support(all_data.iloc[train_frac:,8].values - 2, np.round(x_test_xgboost)))

Mean Absolute Error Train: 0.4880507164957116
Accuracy:  0.5119492835042885
(array([0.73780934, 0.34036117]), array([0.45938327, 0.63082415]), array([0.56622012, 0.4421567 ]), array([2104367,  930543]))
Mean Absolute Error Test : 0.6705939569352308
Accuracy:  0.32940604306476917
(array([0.81622982, 0.20750288]), array([0.20502673, 0.81848764]), array([0.32773142, 0.33107235]), array([268843,  68370]))
