## Traffic Violations Decision Tree Model

In [1]:
# Import libraries for reading and processing the data
import numpy as np
import pandas as pd

In [63]:
# Read in the traffic violations dataset.
# low_memory=False makes pandas infer each column's dtype from the whole file
# instead of chunk by chunk, which avoids mixed-type columns (and the
# DtypeWarning pandas emits for them) on a file of this size.
df = pd.read_csv("Traffic_Violations.csv", low_memory=False)

# Preview the first 5 rows
df.head(5)

  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0,Date Of Stop,Time Of Stop,Agency,SubAgency,Description,Location,Latitude,Longitude,Accident,Belts,...,Charge,Article,Contributed To Accident,Race,Gender,Driver City,Driver State,DL State,Arrest Type,Geolocation
0,09/24/2013,17:11:00,MCP,"3rd district, Silver Spring",DRIVING VEHICLE ON HIGHWAY WITH SUSPENDED REGI...,8804 FLOWER AVE,,,No,No,...,13-401(h),Transportation Article,No,BLACK,M,TAKOMA PARK,MD,MD,A - Marked Patrol,
1,08/29/2017,10:19:00,MCP,"2nd district, Bethesda",DRIVER FAILURE TO OBEY PROPERLY PLACED TRAFFIC...,WISCONSIN AVE@ ELM ST,38.981725,-77.092757,No,No,...,21-201(a1),Transportation Article,No,WHITE,F,FAIRFAX STATION,VA,VA,A - Marked Patrol,"(38.981725, -77.0927566666667)"
2,12/01/2014,12:52:00,MCP,"6th district, Gaithersburg / Montgomery Village",FAILURE STOP AND YIELD AT THRU HWY,CHRISTOPHER AVE/MONTGOMERY VILLAGE AVE,39.162888,-77.229088,No,No,...,21-403(b),Transportation Article,No,BLACK,F,UPPER MARLBORO,MD,MD,A - Marked Patrol,"(39.1628883333333, -77.2290883333333)"
3,08/29/2017,09:22:00,MCP,"3rd district, Silver Spring",FAILURE YIELD RIGHT OF WAY ON U TURN,CHERRY HILL RD./CALVERTON BLVD.,39.056975,-76.954633,No,No,...,21-402(b),Transportation Article,No,BLACK,M,FORT WASHINGTON,MD,MD,A - Marked Patrol,"(39.056975, -76.9546333333333)"
4,08/28/2017,23:41:00,MCP,"6th district, Gaithersburg / Montgomery Village",FAILURE OF DR. TO MAKE LANE CHANGE TO AVAIL. L...,355 @ SOUTH WESTLAND DRIVE,,,No,No,...,21-405(e1),Transportation Article,No,WHITE,M,GAITHERSBURG,MD,MD,A - Marked Patrol,


In [64]:
# General information about the dataset: index range, column names,
# non-null counts and dtypes
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1292399 entries, 0 to 1292398
Data columns (total 35 columns):
 #   Column                   Non-Null Count    Dtype  
---  ------                   --------------    -----  
 0   Date Of Stop             1292399 non-null  object 
 1   Time Of Stop             1292399 non-null  object 
 2   Agency                   1292399 non-null  object 
 3   SubAgency                1292389 non-null  object 
 4   Description              1292390 non-null  object 
 5   Location                 1292397 non-null  object 
 6   Latitude                 1197045 non-null  float64
 7   Longitude                1197045 non-null  float64
 8   Accident                 1292399 non-null  object 
 9   Belts                    1292399 non-null  object 
 10  Personal Injury          1292399 non-null  object 
 11  Property Damage          1292399 non-null  object 
 12  Fatal                    1292399 non-null  object 
 13  Commercial License       1292399 non-null 

### Data preprocessing

In [65]:
# Count the missing values in every column
df.isna().sum()

Date Of Stop                   0
Time Of Stop                   0
Agency                         0
SubAgency                     10
Description                    9
Location                       2
Latitude                   95354
Longitude                  95354
Accident                       0
Belts                          0
Personal Injury                0
Property Damage                0
Fatal                          0
Commercial License             0
HAZMAT                         0
Commercial Vehicle             0
Alcohol                        0
Work Zone                      0
State                         59
VehicleType                    0
Year                        8074
Make                          57
Model                        187
Color                      16127
Violation Type                 0
Charge                         0
Article                    65169
Contributed To Accident        0
Race                           0
Gender                         0
Driver Cit

In [66]:
# Remove every column that is not used for modelling, in a single call
unused_columns = [
    'Date Of Stop', 'Time Of Stop', 'Agency', 'SubAgency', 'Description', 'Location', 'Latitude',
    'Longitude', 'Accident', 'HAZMAT', 'Work Zone', 'State', 'VehicleType', 'Year', 'Make', 'Model',
    'Driver City', 'Driver State', 'DL State', 'Arrest Type', 'Geolocation', 'Article', 'Charge',
]
df.drop(columns=unused_columns, inplace=True)

In [67]:
# Preview the first 3 rows to confirm which columns remain after the drop
df.head(3)

Unnamed: 0,Belts,Personal Injury,Property Damage,Fatal,Commercial License,Commercial Vehicle,Alcohol,Color,Violation Type,Contributed To Accident,Race,Gender
0,No,No,No,No,No,No,No,BLACK,Citation,No,BLACK,M
1,No,No,No,No,No,No,No,GREEN,Citation,No,WHITE,F
2,No,No,Yes,No,No,No,No,SILVER,Citation,No,BLACK,F


In [68]:
# Remove the rows whose Violation Type is ESERO or SERO
esero_sero_rows = df.index[df['Violation Type'].isin(['ESERO', 'SERO'])]
df.drop(index=esero_sero_rows, inplace=True)

In [69]:
# Confirm that ESERO and SERO no longer appear among the Violation Type values
df['Violation Type'].value_counts()

Citation    607150
Name: Violation Type, dtype: int64

In [71]:
# Frequency of each value in the Color column, most common first
df['Color'].value_counts()

BLACK          247365
SILVER         224780
WHITE          186788
GRAY           135241
RED             96335
BLUE            89976
GREEN           46426
GOLD            39024
BLUE, DARK      25681
TAN             25444
MAROON          21386
BLUE, LIGHT     16321
BEIGE           14086
GREEN, DK       13530
GREEN, LGT       7001
BROWN            5766
YELLOW           4623
ORANGE           4304
BRONZE           2809
PURPLE           2342
MULTICOLOR        911
CREAM             800
COPPER            388
PINK              175
CAMOUFLAGE         24
CHROME             21
Name: Color, dtype: int64

In [72]:
# Fill the missing Color values with the most frequent color (BLACK in this dataset).
# The result is assigned back to the column rather than calling
# fillna(..., inplace=True) on df['Color']: an in-place call on a selected
# column is chained assignment, which pandas warns about and which no longer
# updates df once Copy-on-Write is enabled.
most_common_color = df['Color'].value_counts().index[0]
df['Color'] = df['Color'].fillna(most_common_color)

In [79]:
# Drop the Belts, Personal Injury, Property Damage, Commercial License,
# Commercial Vehicle, Alcohol and Contributed To Accident columns
indicator_columns = [
    'Belts', 'Personal Injury', 'Property Damage', 'Commercial License',
    'Commercial Vehicle', 'Alcohol', 'Contributed To Accident',
]
df.drop(columns=indicator_columns, inplace=True)

In [81]:
# Remove the Fatal column as well
df.drop(columns='Fatal', inplace=True)

## Declaring the target variable and the feature variables

In [121]:
# Features: every remaining column except the label.
# Target: the Violation Type label, shaped as a single-column 2-D array.
target_column = 'Violation Type'
X = df.drop(columns=[target_column])
y = df[target_column].to_numpy().reshape(-1, 1)

In [122]:
# Dimensions of the feature table and of the target array
for arr in (X, y):
    print(arr.shape)

(1227253, 3)
(1227253, 1)


In [123]:
# Hold out 30% of the rows as the test set; the other 70% are used for training
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.30,
    random_state=42,  # fixed seed so the split is reproducible
)

In [124]:
# Dimensions of the training and test feature sets
for split in (X_train, X_test):
    print(split.shape)

(859077, 3)
(368176, 3)


In [125]:
# Encode the categorical features; keep the target as plain class labels
from sklearn.preprocessing import OneHotEncoder

# One-hot encode the input variables. The encoder is fitted on the training
# set only and then reused for the test set. handle_unknown='ignore' encodes a
# category that appears only in the test set as all zeros instead of raising.
try:
    onehot_encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
except TypeError:
    # scikit-learn < 1.2 only knows the older `sparse` keyword
    onehot_encoder = OneHotEncoder(sparse=False, handle_unknown='ignore')
X_train = onehot_encoder.fit_transform(X_train)
X_test = onehot_encoder.transform(X_test)

# The target is deliberately NOT one-hot encoded. A one-hot target turns the
# task into multi-output classification (accuracy then becomes an all-columns
# subset accuracy), and refitting `onehot_encoder` on the target would throw
# away the fitted feature encoder. scikit-learn classifiers accept string
# class labels directly, so the (n, 1) target arrays are only flattened to 1-D.
y_train = y_train.ravel()
y_test = y_test.ravel()

## Decision Tree Model

In [130]:
# Decision tree classifier from scikit-learn
from sklearn.tree import DecisionTreeClassifier

# Shallow tree (depth limited to 3) using the Gini criterion;
# random_state is fixed so repeated runs build the same tree
mdl = DecisionTreeClassifier(
    criterion='gini',
    max_depth=3,
    random_state=0,
)

# Train the classifier on the training features and labels
mdl.fit(X_train, y_train)

DecisionTreeClassifier(max_depth=3, random_state=0)

In [131]:
# Use the fitted decision tree to predict the target for the test features
ypred = mdl.predict(X_test)

In [132]:
# Score the test-set predictions and report the accuracy as a percentage
from sklearn.metrics import accuracy_score

accuracy = accuracy_score(y_true=y_test, y_pred=ypred)
accuracy_pct = accuracy * 100
print('Accuracy: %.2f' % accuracy_pct)

Accuracy: 55.02
