**Accident Severity Analysis**

In [1]:
import numpy as np  
import pandas as pd
import datetime
import os
from sklearn.decomposition import PCA

import torch
import torch.nn as nn
import torch.nn.functional as F
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import MaxAbsScaler

**Data Import**

In [3]:
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        filepath = os.path.join(dirname, filename)
        print(filepath)

/kaggle/input/us-accidents/US_Accidents_June20.csv


In [4]:
data = pd.read_csv(filepath)

In [5]:
data.columns

Index(['ID', 'Source', 'TMC', 'Severity', 'Start_Time', 'End_Time',
       'Start_Lat', 'Start_Lng', 'End_Lat', 'End_Lng', 'Distance(mi)',
       'Description', 'Number', 'Street', 'Side', 'City', 'County', 'State',
       'Zipcode', 'Country', 'Timezone', 'Airport_Code', 'Weather_Timestamp',
       'Temperature(F)', 'Wind_Chill(F)', 'Humidity(%)', 'Pressure(in)',
       'Visibility(mi)', 'Wind_Direction', 'Wind_Speed(mph)',
       'Precipitation(in)', 'Weather_Condition', 'Amenity', 'Bump', 'Crossing',
       'Give_Way', 'Junction', 'No_Exit', 'Railway', 'Roundabout', 'Station',
       'Stop', 'Traffic_Calming', 'Traffic_Signal', 'Turning_Loop',
       'Sunrise_Sunset', 'Civil_Twilight', 'Nautical_Twilight',
       'Astronomical_Twilight'],
      dtype='object')

In [6]:
np.unique(data['Severity'])

array([1, 2, 3, 4])

**Feature Extraction**

In [7]:
X = pd.DataFrame()

In [8]:
def str_to_datetime(x):
    x = datetime.datetime.strptime(x, '%Y-%m-%d %H:%M:%S')
    return x

def get_acc_len(x):
    x = x.total_seconds() / 60.0
    return x        

In [9]:
#start time, end time --> length of accident
end_time = data['End_Time'].apply(str_to_datetime)
start_time = data['Start_Time'].apply(str_to_datetime)
data['Accident_Length'] = end_time - start_time 
X['Accident_Length'] = data['Accident_Length'].apply(get_acc_len)
X['Accident_Length']

0          314.000000
1           30.000000
2           30.000000
3           30.000000
4           30.000000
              ...    
3513612     28.600000
3513613     26.883333
3513614     28.466667
3513615     29.350000
3513616     29.416667
Name: Accident_Length, Length: 3513617, dtype: float64

In [10]:
X['Accident_Distance'] = data['Distance(mi)']
X['Accident_Distance'].fillna(-500.0, inplace = True) 

In [11]:
print(X['Accident_Distance'].mean())
print(X['Accident_Distance'].max())
print(X['Accident_Distance'].min())

0.2816166867384315
333.63000488299997
0.0


In [12]:
X['Temp'] = data['Temperature(F)']
X['Temp'].fillna(-500.0, inplace = True) 
X['Temp']

0          36.9
1          37.9
2          36.0
3          35.1
4          36.0
           ... 
3513612    86.0
3513613    70.0
3513614    73.0
3513615    71.0
3513616    79.0
Name: Temp, Length: 3513617, dtype: float64

In [13]:
print(X['Temp'].mean())
print(X['Temp'].max())
print(X['Temp'].min())

51.42255624332331
170.6
-500.0


In [14]:
X['Humidity'] = data['Humidity(%)']
X['Humidity'].fillna(-10.0, inplace = True) 
X['Humidity']

0           91.0
1          100.0
2          100.0
3           96.0
4           89.0
           ...  
3513612     40.0
3513613     73.0
3513614     64.0
3513615     81.0
3513616     47.0
Name: Humidity, Length: 3513617, dtype: float64

In [15]:
print(X['Humidity'].mean())
print(X['Humidity'].max())
print(X['Humidity'].min())

63.624498059976375
100.0
-10.0


In [16]:
X['Precipitation'] = data['Precipitation(in)']
X['Precipitation'].fillna(-10.0, inplace = True) 
X['Precipitation']

0           0.02
1           0.00
2         -10.00
3         -10.00
4         -10.00
           ...  
3513612     0.00
3513613     0.00
3513614     0.00
3513615     0.00
3513616     0.00
Name: Precipitation, Length: 3513617, dtype: float64

In [17]:
print(X['Precipitation'].mean())
print(X['Precipitation'].max())
print(X['Precipitation'].min())

-5.759011884333611
25.0
-10.0


In [18]:
X['Wind_Chill'] = data['Wind_Chill(F)']
X['Wind_Chill'].fillna(-500.0, inplace = True) 
X['Wind_Chill']

0         -500.0
1         -500.0
2           33.3
3           31.0
4           33.3
           ...  
3513612     86.0
3513613     70.0
3513614     73.0
3513615     71.0
3513616     79.0
Name: Wind_Chill, Length: 3513617, dtype: float64

In [19]:
print(X['Wind_Chill'].mean())
print(X['Wind_Chill'].max())
print(X['Wind_Chill'].min())

-240.77838879416709
115.0
-500.0


In [20]:
X['Pressure'] = data['Pressure(in)']
X['Pressure'].fillna(-500.0, inplace = True) 
X['Pressure']

0          29.68
1          29.65
2          29.67
3          29.64
4          29.65
           ...  
3513612    28.92
3513613    29.39
3513614    29.74
3513615    29.62
3513616    28.63
Name: Pressure, Length: 3513617, dtype: float64

In [21]:
print(X['Pressure'].mean())
print(X['Pressure'].max())
print(X['Pressure'].min())

21.319353845340995
57.74
-500.0


In [22]:
X['Visibility'] = data['Visibility(mi)']
X['Visibility'].fillna(-500.0, inplace = True) 
X['Visibility']

0          10.0
1          10.0
2          10.0
3           9.0
4           6.0
           ... 
3513612    10.0
3513613    10.0
3513614    10.0
3513615    10.0
3513616     7.0
Name: Visibility, Length: 3513617, dtype: float64

In [23]:
print(X['Visibility'].mean())
print(X['Visibility'].max())
print(X['Visibility'].min())

-1.8688801995208821
140.0
-500.0


In [24]:
X['Severity'] = data['Severity']

In [25]:
X.isna().any()

Accident_Length      False
Accident_Distance    False
Temp                 False
Humidity             False
Precipitation        False
Wind_Chill           False
Pressure             False
Visibility           False
Severity             False
dtype: bool

In [26]:
unique, counts = np.unique(X['Severity'].values, return_counts=True)
dict(zip(unique, counts))

{1: 29174, 2: 2373210, 3: 998913, 4: 112320}

In [28]:
X = X[X['Severity'] < 4] # extract 2 and 3 only
X

Unnamed: 0,Accident_Length,Accident_Distance,Temp,Humidity,Precipitation,Wind_Chill,Pressure,Visibility,Severity
0,314.000000,0.010,36.9,91.0,0.02,-500.0,29.68,10.0,3
1,30.000000,0.010,37.9,100.0,0.00,-500.0,29.65,10.0,2
2,30.000000,0.010,36.0,100.0,-10.00,33.3,29.67,10.0,2
3,30.000000,0.010,35.1,96.0,-10.00,31.0,29.64,9.0,3
4,30.000000,0.010,36.0,89.0,-10.00,33.3,29.65,6.0,2
...,...,...,...,...,...,...,...,...,...
3513612,28.600000,0.543,86.0,40.0,0.00,86.0,28.92,10.0,2
3513613,26.883333,0.338,70.0,73.0,0.00,70.0,29.39,10.0,2
3513614,28.466667,0.561,73.0,64.0,0.00,73.0,29.74,10.0,2
3513615,29.350000,0.772,71.0,81.0,0.00,71.0,29.62,10.0,2


In [29]:
X = X[X['Severity'] > 1]
X

Unnamed: 0,Accident_Length,Accident_Distance,Temp,Humidity,Precipitation,Wind_Chill,Pressure,Visibility,Severity
0,314.000000,0.010,36.9,91.0,0.02,-500.0,29.68,10.0,3
1,30.000000,0.010,37.9,100.0,0.00,-500.0,29.65,10.0,2
2,30.000000,0.010,36.0,100.0,-10.00,33.3,29.67,10.0,2
3,30.000000,0.010,35.1,96.0,-10.00,31.0,29.64,9.0,3
4,30.000000,0.010,36.0,89.0,-10.00,33.3,29.65,6.0,2
...,...,...,...,...,...,...,...,...,...
3513612,28.600000,0.543,86.0,40.0,0.00,86.0,28.92,10.0,2
3513613,26.883333,0.338,70.0,73.0,0.00,70.0,29.39,10.0,2
3513614,28.466667,0.561,73.0,64.0,0.00,73.0,29.74,10.0,2
3513615,29.350000,0.772,71.0,81.0,0.00,71.0,29.62,10.0,2


In [30]:
X.shape

(3372123, 9)

In [31]:
unique, counts = np.unique(X['Severity'].values, return_counts=True)
dict(zip(unique, counts))

{2: 2373210, 3: 998913}

In [33]:
X_samp = X[X['Severity'] == 3].sample(n=499456, replace=True) # class 3 * 1.5
X_over = pd.concat([X, X_samp], axis=0)
X_over = X

In [34]:
num_examples = int(len(X_over)*.8)
num_examples


2697698

In [35]:
X_over.iloc[:,:8]

Unnamed: 0,Accident_Length,Accident_Distance,Temp,Humidity,Precipitation,Wind_Chill,Pressure,Visibility
0,314.000000,0.010,36.9,91.0,0.02,-500.0,29.68,10.0
1,30.000000,0.010,37.9,100.0,0.00,-500.0,29.65,10.0
2,30.000000,0.010,36.0,100.0,-10.00,33.3,29.67,10.0
3,30.000000,0.010,35.1,96.0,-10.00,31.0,29.64,9.0
4,30.000000,0.010,36.0,89.0,-10.00,33.3,29.65,6.0
...,...,...,...,...,...,...,...,...
3513612,28.600000,0.543,86.0,40.0,0.00,86.0,28.92,10.0
3513613,26.883333,0.338,70.0,73.0,0.00,70.0,29.39,10.0
3513614,28.466667,0.561,73.0,64.0,0.00,73.0,29.74,10.0
3513615,29.350000,0.772,71.0,81.0,0.00,71.0,29.62,10.0


In [36]:
transformer = MaxAbsScaler().fit(X_over.iloc[:,:8].values)
X_norm = transformer.transform(X_over.iloc[:,:8])

In [37]:
X_norm

array([[ 2.20822665e-04,  2.99733233e-05,  7.38000000e-02, ...,
        -1.00000000e+00,  5.93600000e-02,  2.00000000e-02],
       [ 2.10977069e-05,  2.99733233e-05,  7.58000000e-02, ...,
        -1.00000000e+00,  5.93000000e-02,  2.00000000e-02],
       [ 2.10977069e-05,  2.99733233e-05,  7.20000000e-02, ...,
         6.66000000e-02,  5.93400000e-02,  2.00000000e-02],
       ...,
       [ 2.00193797e-05,  1.68150344e-03,  1.46000000e-01, ...,
         1.46000000e-01,  5.94800000e-02,  2.00000000e-02],
       [ 2.06405899e-05,  2.31394056e-03,  1.42000000e-01, ...,
         1.42000000e-01,  5.92400000e-02,  2.00000000e-02],
       [ 2.06874737e-05,  1.60956746e-03,  1.58000000e-01, ...,
         1.58000000e-01,  5.72600000e-02,  1.40000000e-02]])

In [38]:
import torch
from torch.utils import data

class Dataset(data.Dataset):
    'Characterizes a dataset for PyTorch'
    def __init__(self, x, labels):
        'Initialization'
        self.labels = labels
        self.x = x
        
    def __len__(self):
        'Denotes the total number of samples'
        return len(self.x)

    def __getitem__(self, index):
        'Generates one sample of data'
        # Select sample
        #ID = self.x[index]

        # Load data and get label
        
        curr_x = self.x[index]

        y = self.labels[index]
        out_y = np.zeros(2)
        out_y[int(y)] = 1    

        return curr_x, out_y
    
    


In [39]:
X_over['Severity'].values

array([3, 2, 2, ..., 2, 2, 2])

In [40]:
from imblearn.over_sampling import SMOTE
sm = SMOTE(random_state=42,sampling_strategy='minority')
X_res, y_res = sm.fit_resample(X_norm[:,:8], X_over['Severity'].values - 2)

Using TensorFlow backend.


In [41]:
y_res.reshape(len(y_res),1)

array([[1],
       [0],
       [0],
       ...,
       [1],
       [1],
       [1]])

In [42]:
X_y_smoted = np.concatenate((X_res,y_res.reshape(len(y_res),1)), axis=1)
X_y_smoted[0]

array([ 2.20822665e-04,  2.99733233e-05,  7.38000000e-02,  9.10000000e-01,
        8.00000000e-04, -1.00000000e+00,  5.93600000e-02,  2.00000000e-02,
        1.00000000e+00])

In [43]:
#shuffle data
np.random.shuffle(X_y_smoted)

In [44]:
X_y_smoted[0]

array([ 3.12246062e-05,  3.29706556e-04,  1.78200000e-01,  5.20000000e-01,
       -4.00000000e-01, -1.00000000e+00,  5.98600000e-02,  2.00000000e-02,
        0.00000000e+00])

In [45]:
num_examples = int(len(X_norm)*.8)
num_examples

2697698

In [46]:
X_res = X_y_smoted[:,:8]
y_res = X_y_smoted[:,8]

In [47]:
unique, counts = np.unique(y_res, return_counts=True)
dict(zip(unique, counts))

{0.0: 2373210, 1.0: 2373210}

In [48]:
#class_weights = np.asarray([1.0,1.0,1.0,1.0])
class_weights = np.asarray([1.0,1.0])
class_weights

array([1., 1.])

In [50]:
from xgboost import XGBClassifier

param_dist = {'n_estimators':100, 'max_depth':4}

xgboost_model = XGBClassifier(**param_dist)
xgboost_model.fit(X_res[:(num_examples)], y_res[:num_examples], verbose=False)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0,
              learning_rate=0.1, max_delta_step=0, max_depth=4,
              min_child_weight=1, missing=None, n_estimators=100, n_jobs=1,
              nthread=None, objective='binary:logistic', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=None, subsample=1, verbosity=1)

In [None]:
#from xgboost import XGBRegressor

#xgboost_model = XGBRegressor(n_estimators=100,max_depth=4)
#xgboost_model.fit(X_res[:(num_examples)], y_res[:num_examples], verbose=False)

In [51]:
from sklearn.metrics import precision_recall_fscore_support

from sklearn.metrics import accuracy_score
# make predictions
x_test_xgboost = xgboost_model.predict(X_res[(num_examples):])

from sklearn.metrics import mean_absolute_error
print("Mean Absolute Error Test : " + str(mean_absolute_error(x_test_xgboost, y_res[num_examples:])))

accuracy = accuracy_score(y_res[num_examples:], np.round(x_test_xgboost))
print("Accuracy: ", accuracy)
print(precision_recall_fscore_support(y_res[num_examples:], np.round(x_test_xgboost)))

Mean Absolute Error Test : 0.3105989978142471
Accuracy:  0.6894010021857528
(array([0.67543761, 0.70579266]), array([0.72936615, 0.64942454]), array([0.70136676, 0.67643633]), array([1024506, 1024216]))


In [52]:

x_train_xgboost = xgboost_model.predict(X_res[:(num_examples)])

from sklearn.metrics import mean_absolute_error
print("Mean Absolute Error Train: " + str(mean_absolute_error(x_train_xgboost, y_res[:num_examples])))

accuracy = accuracy_score(y_res[:num_examples], np.round(x_train_xgboost))
print("Accuracy: ", accuracy)
print(precision_recall_fscore_support(y_res[:num_examples], np.round(x_train_xgboost)))

Mean Absolute Error Train: 0.3105039926633745
Accuracy:  0.6894960073366255
(array([0.67545417, 0.70597078]), array([0.72938317, 0.64961742]), array([0.70138356, 0.67662276]), array([1348704, 1348994]))


In [53]:
x_train_xgboost

array([1., 1., 0., ..., 1., 0., 1.])

In [54]:
np.unique(np.round(x_train_xgboost))

array([0., 1.])

In [55]:
print(confusion_matrix(np.round(x_train_xgboost),y_res[:num_examples]))
print(confusion_matrix(np.round(x_test_xgboost),y_res[num_examples:]))

[[983722 472664]
 [364982 876330]]
[[747240 359065]
 [277266 665151]]


In [None]:
X_over['Severity'][:num_examples]

In [56]:
x_norm_train_xgb = xgboost_model.predict(X_norm[:(num_examples)])
x_norm_test_xgb = xgboost_model.predict(X_norm[(num_examples):])

In [57]:
X_norm.shape

(3372123, 8)

In [58]:
x_norm_train_xgb.shape

(2697698,)

In [61]:
x_all = np.concatenate((X_norm[:(num_examples)], x_norm_train_xgb.reshape(len(x_norm_train_xgb),1)),axis=1)

In [65]:
from sklearn.metrics import mean_absolute_error
print("Mean Absolute Error Train: " + str(mean_absolute_error(x_norm_train_xgb, X_over['Severity'][:(num_examples)].values - 2)))


accuracy = accuracy_score(X_over['Severity'][:(num_examples)].values - 2, np.round(x_norm_train_xgb))
print("Accuracy: ", accuracy)
print(precision_recall_fscore_support(X_over['Severity'][:(num_examples)].values - 2, np.round(x_norm_train_xgb)))

Mean Absolute Error Train: 0.3493504461952376
Accuracy:  0.6506495538047624
(array([0.75418801, 0.47055305]), array([0.71245932, 0.5239297 ]), array([0.73273004, 0.49580894]), array([1813253,  884445]))


In [68]:
training_set = Dataset(x_norm_train_xgb, X_over['Severity'][:(num_examples)].values - 2)
training_generator = data.DataLoader(training_set, batch_size=8192, shuffle=True)

test_set = Dataset(x_norm_test_xgb, X_over['Severity'][(num_examples):].values - 2)
test_generator = data.DataLoader(test_set, batch_size=len(x_norm_test_xgb), shuffle=False)

In [69]:
len(training_generator)

330

In [70]:
len(test_generator)

1

In [71]:
len(x_test_xgboost)

2048722

In [72]:
class Accident_Model(nn.Module):
    def __init__(self):
        super(Accident_Model, self).__init__()
        self.w1 = nn.Linear(1,4,bias=True)
        nn.init.xavier_uniform_(self.w1.weight)
        nn.init.zeros_(self.w1.bias)
        self.bn1 = torch.nn.BatchNorm1d(4)
        
        
        self.w2 = nn.Linear(4,2,bias=True)
        nn.init.xavier_uniform_(self.w2.weight)
        nn.init.zeros_(self.w2.bias)
        #self.bn2 = torch.nn.BatchNorm1d(4)
        
        '''
        self.w3 = nn.Linear(4,2,bias=True)
        nn.init.xavier_uniform_(self.w3.weight)
        nn.init.zeros_(self.w3.bias)
        '''
    
    
    def forward(self, x):
        #print(x)
        x = x.float()
        x = self.bn1(F.relu(self.w1(x)))
        
        x = self.w2(x)
        '''
        x = self.bn2(F.relu(self.w2(x)))
        x = self.w3(x)
        
        x = self.w1(x.float())
        x = F.relu(self.w1(x.float()))
        x = self.w2(x)
        
        
        '''
        return x



In [73]:
from sklearn.metrics import precision_recall_fscore_support
class_weights

array([1., 1.])

In [74]:
print(x_train_xgboost.shape)
print(X.shape)
print(len(X['Severity']))
print(np.unique(X['Severity']))

(2697698,)
(3372123, 9)
3372123
[2 3]


In [75]:
def sigmoid(x):
    return np.divide(1.0, np.add(1.0, np.exp(-1.0*x)))

In [78]:
net = Accident_Model()

optimizer = torch.optim.Adam(net.parameters(), lr=0.001,weight_decay=0.95)
criterion = nn.BCEWithLogitsLoss(weight=torch.Tensor(class_weights))

for epoch in np.arange(1):
    for batch_idx,(batch, labels) in enumerate(training_generator):

        optimizer.zero_grad()
        outputs = net(batch.reshape(len(batch),1)).float()

        loss = criterion(outputs,np.squeeze(labels))
        loss.backward()
        optimizer.step()

        if batch_idx % 50 == 0:
            for test_batch, test_labels in test_generator:
                test_outputs = net(test_batch.reshape(len(test_batch),1))
                #test_outputs_ = F.sigmoid(test_outputs.detach())
                test_loss = criterion(test_outputs.float(),np.squeeze(test_labels))
                
                print("Step:",epoch)
                print("Train Loss: ", loss.item())
                print("Test Loss: ", test_loss.item())

                print(mean_absolute_error(test_outputs.detach().max(1).values,test_labels.detach().argmax(1)))
                
                print(confusion_matrix(sigmoid(outputs.detach()).argmax(1), labels.detach().argmax(1)))
                print(precision_recall_fscore_support(labels.detach().argmax(1), sigmoid(outputs.detach()).argmax(1), average=None))
                print("\n")
                print(confusion_matrix(sigmoid(test_outputs.detach()).argmax(1), test_labels.detach().argmax(1)))
                print(precision_recall_fscore_support(test_labels.detach().argmax(1), sigmoid(test_outputs.detach()).argmax(1), average=None))
                
                break;


Step: 0
Train Loss:  0.8317409870142001
Test Loss:  0.7989938374416283
0.5434053076290593
[[1608 1410]
 [3912 1262]]
(array([0.53280318, 0.24391187]), array([0.29130435, 0.47230539]), array([0.37666901, 0.32169258]), array([5520, 2672]))


[[120864  46399]
 [439093  68069]]
(array([0.72259854, 0.1342155 ]), array([0.21584515, 0.59465527]), array([0.3324001 , 0.21900166]), array([559957, 114468]))
Step: 0
Train Loss:  0.791226764466046
Test Loss:  0.7554513340736541
0.4588830592597207
[[1635 1382]
 [3901 1274]]
(array([0.54192907, 0.24618357]), array([0.2953396 , 0.47966867]), array([0.38232199, 0.32537352]), array([5536, 2656]))


[[120864  46399]
 [439093  68069]]
(array([0.72259854, 0.1342155 ]), array([0.21584515, 0.59465527]), array([0.3324001 , 0.21900166]), array([559957, 114468]))
Step: 0
Train Loss:  0.751645148753596
Test Loss:  0.7153671492026116
0.34987266622516044
[[1617 1409]
 [3872 1294]]
(array([0.5343688 , 0.25048393]), array([0.29458918, 0.47872734]), array([0.37980035