In [26]:
# data setup imports
import pandas as pd
from datetime import date
import time
import calendar
import matplotlib.pyplot as plt
import numpy as np

file_name = "../Traffic_Violations.csv"
df = pd.read_csv(file_name)

In [27]:
# Data cleaning/setup - general filtering of data

# Let's drop the columns we deem irrelevant!
filtered_cols_df = df.drop(['Driver City', 'Driver State','DL State', 'Arrest Type','Agency', 'SubAgency', 'Location', 'HAZMAT', 'Work Zone', 'Geolocation', 'Charge', 'Article', 
                           'Accident', 'Personal Injury', 'Property Damage', 'Fatal', 'Alcohol'], axis=1)

# Change all Date Of Stop dates to weekdays. Hopefully I can find patterns regarding when violations occur during the week
filtered_cols_df['Date Of Stop'] = filtered_cols_df['Date Of Stop'].map(lambda a: time.strftime("%A", time.strptime(a, "%m/%d/%Y")))

# Time Of Stop is now modified with just the hour of the day in military time
filtered_cols_df['Time Of Stop'] = filtered_cols_df['Time Of Stop'].map(lambda a: time.strftime("%H", time.strptime(a, "%H:%M:%S")))


In [28]:
# Data cleaning/setup for drunk driving incidents

# Take out only the drunk driving related incidents
drunk_driving_df = filtered_cols_df[(filtered_cols_df['Description'] == 'DRIVING VEHICLE WHILE UNDER THE INFLUENCE OF ALCOHOL')]
drunk_driving_df_2 = filtered_cols_df[(filtered_cols_df['Description'] == 'DRIVING WHILE IMPAIRED BY ALCOHOL')]
drunk_driving_df.append(drunk_driving_df_2)
drunk_driving_df = drunk_driving_df.drop(['Description', 'Model'], axis=1)

print("Shape of dataframe: ", drunk_driving_df.shape)

drunk_driving_df.head(3)

Shape of dataframe:  (17974, 16)


Unnamed: 0,Date Of Stop,Time Of Stop,Latitude,Longitude,Belts,Commercial License,Commercial Vehicle,State,VehicleType,Year,Make,Color,Violation Type,Contributed To Accident,Race,Gender
51,Tuesday,0,39.082775,-76.948403,No,No,No,MD,02 - Automobile,2000.0,FORD,TAN,Citation,No,BLACK,F
98,Tuesday,1,39.083948,-77.153498,No,No,No,MD,02 - Automobile,2016.0,TOYT,WHITE,Citation,No,WHITE,M
204,Saturday,22,39.058375,-77.048198,No,No,No,MD,02 - Automobile,1998.0,NISSAN,BLACK,Citation,No,BLACK,M


In [29]:
# Cleaning redundancies, renaming values and correcting mispellings for car make

car_make = {'ACCORD':'HONDA', 'ACCURA':'ACURA', 'ACUR':'ACURA', 'ACURA':'ACURA', \
            'ALTIMA':'NISSAN', 'AUDI':'AUDI', 'BENT':'MERCEDES BENZ', \
            'BENTLEY':'BENTLEY', 'BENZE':'MERCEDES BENZ', 'BLW':'BMW', \
            'BMW':'BMW', 'BMX':'BMW', 'BNW':'BMW', 'BUELL':'BUELL', \
            'BUIC':'BUICK', 'BUICK':'BUICK', 'BUIK':'BUICK', 'BWM':'BMW', \
            'CAD':'CADILLAC', 'CADALIC':'CADILLAC', 'CADALLIC':'CADILLAC', \
            'CADI':'CADILLAC', 'CADILAC':'CADILLAC', 'CADILLAC':'CADILLAC', \
            'CADILLAS':'CADILLAC', 'CADILLIAC':'CADILLAC', 'CADL':'CADILLAC', \
            'CADO':'CADILLAC', 'CAIDILAC':'CADILLAC', 'CEHV':'CHEVROLET', \
            'CEHVORLET':'CHEVROLET', 'CEHVY':'CHEVROLET', 'CHEV':'CHEVROLET', \
            'CHEVERLOT':'CHEVROLET', 'CHEVEROLET':'CHEVROLET', 'CHEVEY':'CHEVROLET', \
            'CHEVORLET':'CHEVROLET', 'CHEVOROLET':'CHEVROLET', 'CHEVR':'CHEVROLET', \
            'CHEVRLET':'CHEVROLET', 'CHEVRLETE':'CHEVROLET', 'CHEVROLERT':'CHEVROLET', \
            'CHEVROLET':'CHEVROLET', 'CHEVROLETY':'CHEVROLET', 'CHEVROLEY':'CHEVROLET', \
            'CHEVY':'CHEVROLET', 'CHEVYC':'CHEVROLET', 'CHRY':'CHRYSLER', 'CHRYL':'CHRYSLER', \
            'CHRYLSER':'CHRYSLER', 'CHRYS':'CHRYSLER', 'CHRYSL':'CHRYSLER', 'CHRYSLER':'CHRYSLER', \
            'CHRYSTLER':'CHRYSLER', 'CHYRSLER':'CHRYSLER', 'CHYSLER':'CHRYSLER', 'COGAR':'FORD', \
            'COOPER':'COOPER', 'CRYSLER':'CHRYSLER', 'DAEWOO':'DAEWOO', 'DAEWOOD':'DAEWOO', 'DIAMONDBACK':'DIAMONDBACK', \
            'DODG':'DODGE', 'DODGE':'DODGE', 'DODGVAL2006':'DODGE', 'DOGDE':'DODGE', 'ELANTRA':'HYUNDAI', 'FERR':'FERRARI', \
            'FERRARI':'FERRARI', 'FIAT':'FIAT', 'FORD':'FORD', 'GEO':'CHEVROLET', 'GMC':'GMC', 'HARL':'HARLEY DAVIDSON', \
            'HARLEY':'HARLEY DAVIDSON', 'HARLEY DAVIDSON':'HARLEY DAVIDSON', 'HENSIM':'HENSIM', 'HIND':'HONDA', \
            'HINDA':'HONDA', 'HINO':'HONDA', 'HIONDA':'HONDA', 'HODNA':'HONDA', 'HOMD':'HONDA', 'HOMDA':'HONDA', \
            'HONA':'HONDA', 'HONAD':'HONDA', 'HOND':'HONDA', 'HONDA':'HONDA', 'HONDAI':'HYUNDAI', 'HUANDAI':'HYUNDAI', \
            'HUDSON':'HUDSON', 'HUMM':'HUMMER', 'HUND':'HYUNDAI', 'HUNDAI':'HYUNDAI', 'HUYN':'HYUNDAI', 'HUYUNDAI':'HYUNDAI', \
            'HYAN':'HYUNDAI', 'HYND':'HYUNDAI', 'HYNDAI':'HYUNDAI', 'HYUANDI':'HYUNDAI', 'HYUDAI':'HYUNDAI', \
            'HYUMDAI':'HYUNDAI', 'HYUN':'HYUNDAI', 'HYUNADAI':'HYUNDAI', 'HYUND':'HYUNDAI', 'HYUNDA':'HYUNDAI', \
            'HYUNDAI':'HYUNDAI', 'HYUNDAIR':'HYUNDAI', 'HYUNDAY':'HYUNDAI', 'HYUNDI':'HYUNDAI', 'HYUNDIA':'HYUNDAI', \
            'INF':'INFINITI', 'INFI':'INFINITI', 'INFIITI':'INFINITI', 'INFIMITI':'INFINITI','INFIN':'INFINITI', \
            'INFINIT':'INFINITI', 'INFINITI':'INFINITI', 'INFINITY':'INFINITI', 'INFINTI':'INFINITI', 'INFINTY':'INFINITI', \
            'INFINTI':'INFINITI', 'INFINTY':'INFINITI', 'INFNITY':'INFINITI', 'INIFINITI':'INFINITI', 'INT':'INFINITI', \
            'ISU':'ISUZU', 'ISUZ':'ISUZU', 'ISUZU':'ISUZU', 'ISZU':'ISUZU', 'ISZUZU':'ISUZU', 'IZUZU':'ISUZU', \
            'JAG':'JAGUAR', 'JAGU':'JAGUAR', 'JAQUAR':'JAGUAR', 'JEEF':'JEEP', 'JEEP':'JEEP', 'JOHN DEERE':'JOHN DEERE', \
            'KAWASAKI':'KAWASAKI', 'KAWK':'KAWASAKI', 'KIA':'KIA', 'KUBOTA':'KUBOTA', 'KYMCO':'KYMCO', 'L300':'MITSUBISHI', \
            'LAND':'LAND ROVER', 'LAND ROVER':'LAND ROVER', 'LANDROVER':'LAND ROVER', 'LEX':'LEXUS', 'LEXS':'LEXUS', \
            'LEXU':'LEXUS', 'LEXUS':'LEXUS', 'LEXUZ':'LEXUS', 'LIN':'LINCOLN', 'LINC':'LINCOLN', 'LINCLN':'LINCOLN', \
            'LINCOLN':'LINCOLN', 'LUXUS':'LEXUS', 'MADA':'MAZDA', 'MADZA':'MAZDA', 'MASERSTI':'MASERATI', 'MAZ':'MAZDA', \
            'MAZA':'MAZDA', 'MAZADA':'MAZDA', 'MAZD':'MAZDA', 'MAZD SW':'MAZDA', 'MAZDA':'MAZDA', 'MECURY':'FORD', \
            'MER':'FORD', 'MERCDES':'MERCEDES BENZ', 'MERCEDES BENZ':'MERCEDES BENZ', 'MERCEDES-BENZ':'MERCEDES BENZ', \
            'MERCEDESE':'MERCEDES BENZ', 'MERCEDEZ':'MERCEDES BENZ', 'MERCEDEZ BENZ':'MERCEDES BENZ', \
            'MERCEDES-BENZ':'MERCEDES BENZ','MERCEDESE':'MERCEDES BENZ','MERCEDEZ':'MERCEDES BENZ', \
            'MERCEDEZ BENZ':'MERCEDES BENZ','MERCEDEZ-BENZ':'MERCEDES BENZ','MERCEDS':'MERCEDES BENZ','MERCURY':'FORD',\
            'MERECEDES':'MERCEDES BENZ','MERZ':'MERZ', 'MERZ B':'MERZ','MERZEDES':'MERCEDES BENZ', 'MINI':'MINI', \
            'MINI COOPER':'MINI','MITIS':'MITSUBISHI','MITISIBISHI':'MITSUBISHI','MITISUBISHI':'MITSUBISHI', \
            'MITS':'MITSUBISHI','MITSABUSHI':'MITSUBISHI', 'MITSBISHI':'MITSUBISHI','MITSIBISH':'MITSUBISHI', \
            'MITSIBISHI':'MITSUBISHI','MITSU':'MITSUBISHI','MITSUBIHI':'MITSUBISHI','MITSUBISH':'MITSUBISHI', \
            'MITSUBISHI':'MITSUBISHI','MITSUBISHUI':'MITSUBISHI','MITTSUBISHI':'MITSUBISHI','MITUBISHI':'MITSUBISHI', \
            'MITZ':'MITSUBISHI','MNNI':'MINI','MONGOOSE':'MONGOOSE','MUSTANG':'FORD','NISS':'NISSAN','NISSA':'NISSAN', \
            'NISSAB':'NISSAN','NISSAM':'NISSAN','NISSAN':'NISSAN','NISSIAN':'NISSAN','NISSON':'NISSAN', 'NISSSN':'NISSAN', \
            'OLDS':'OLDSMOBILE','OLDSMOBILE':'OLDSMOBILE','PLYM':'PLYMOUTH','PLYMOTH':'PLYMOUTH','PLYMOUTH':'PLYMOUTH' ,\
            'PNT':'PONTIAC','PONT':'PONTIAC','PONTAIC':'PONTIAC','PONTIAC':'PONTIAC','PORCHE':'PORSCHE','PORCSCHE':'PORSCHE',\
            'PORS':'PORSCHE','PORSCHE':'PORSCHE','RAM':'RAM','RANG':'LAND ROVER','RANGE':'LAND ROVER','RANGE ROVER':'LAND ROVER',\
            'RAPTOR':'FORD','ROVER':'LAND ROVER','SAA':'SAAB','SAAB':'SAAB','SABB':'SAAB','SAT':'SATURN','SATR':'SATURN', \
            'SATU':'SATURN','SATURN':'SATURN','SCI':'SCION','SCIO':'SCION','SCION':'SCION','SCIONIA':'SCION', 'SCOIN':'SCION', \
            'SMART':'MERCEDES BENZ','SMARTCAR':'MERCEDES BENZ','STRN':'SATURN','SU BARU':'SUBARU','SUB':'SUBARU','SUBA':'SUBARU', \
            'SUBARU':'SUBARU','SUBR':'SUBARU','SUBURU':'SUBARU','SUZI':'SUZUKI','SUZU':'SUZUKI','SUZUKI':'SUZUKI','SX4':'SUZUKI', \
            'TAOT':'TAOTAO','TAOTAO':'TAOTAO','TAOTOA50':'TAOTAO','TESLA':'TESLA','TOMO':'HONDA','TOT':'TOYOTA','TOTOTA':'TOYOTA', \
            'TOTY':'TOYOTA','TOTYOTA':'TOYOTA','TOY':'TOYOTA','TOYATA':'TOYOTA','TOYO':'TOYOTA','TOYOA':'TOYOTA','TOYORA':'TOYOTA',\
            'TOYOTA':'TOYOTA','TOYOTA SCION':'TOYOTA','TOYOTAO':'TOYOTA','TOYOYA':'TOYOTA','TOYPTA':'TOYOTA','TOYT':'TOYOTA', \
            'TOYTA':'TOYOTA','TOYTOA':'TOYOTA','TOYTOTA':'TOYOTA','TOYT`':'TOYOTA','TRIUMPH':'TRIUMPH','VESPA':'VESPA', \
            'VOKLS':'VOLKSWAGEN','VOKS':'VOLKSWAGEN','VOLK':'VOLKSWAGEN','VOLKS':'VOLKSWAGEN','VOLKS WAGON':'VOLKSWAGEN', \
            'VOLKSWAGAN':'VOLKSWAGEN','VOLKSWAGEN':'VOLKSWAGEN','VOLKSWAGON':'VOLKSWAGEN','VOLKWAGEN':'VOLKSWAGEN', \
            'VOLTS':'VOLKSWAGEN','VOLTSWAGON':'VOLKSWAGEN','VOLV':'VOLVO','VOLVO':'VOLVO','VOVLO':'VOLVO','VW':'VOLKSWAGEN', \
            'WILDFIRE':'DODGE','WOLKSWAGEN':'VOLKSWAGEN','WV':'VOLKSWAGEN','YAMAHA':'YAMAHA'}

# print(len(drunk_driving_df['Make'].value_counts()))
# print(drunk_driving_df['Make'].sort_values().unique())

print("initial shape", drunk_driving_df.shape)


drunk_driving_df['Make'] = drunk_driving_df['Make'].map(car_make)
print(len(drunk_driving_df['Make'].value_counts()))

drunk_driving_df['Make'].value_counts(dropna=False)

drunk_driving_df = drunk_driving_df.dropna()
print("drunk driving dropped nan ", drunk_driving_df.shape)
drunk_driving_df.head(3)

initial shape (17974, 16)
61
drunk driving dropped nan  (13727, 16)


Unnamed: 0,Date Of Stop,Time Of Stop,Latitude,Longitude,Belts,Commercial License,Commercial Vehicle,State,VehicleType,Year,Make,Color,Violation Type,Contributed To Accident,Race,Gender
51,Tuesday,0,39.082775,-76.948403,No,No,No,MD,02 - Automobile,2000.0,FORD,TAN,Citation,No,BLACK,F
98,Tuesday,1,39.083948,-77.153498,No,No,No,MD,02 - Automobile,2016.0,TOYOTA,WHITE,Citation,No,WHITE,M
204,Saturday,22,39.058375,-77.048198,No,No,No,MD,02 - Automobile,1998.0,NISSAN,BLACK,Citation,No,BLACK,M


In [30]:
# Create dictionaries for mapping to integer values
# Convert data in dataframes to numbers via mapping function

drunk_driving_df['Time Of Stop'] = drunk_driving_df['Time Of Stop'].astype(int)
drunk_driving_df['Year'] = drunk_driving_df['Year'].astype(float)

weekdays = {k: v for v, k in enumerate(drunk_driving_df['Date Of Stop'].sort_values().unique())}
print("weekdays dict \n", weekdays)
print()
drunk_driving_df['Date Of Stop'] = drunk_driving_df['Date Of Stop'].map(weekdays)

states = {k: v for v, k in enumerate(drunk_driving_df['State'].sort_values().unique())}
print("State dict \n", states)
print()
drunk_driving_df['State'] = drunk_driving_df['State'].map(states)

vehicle_types = {k: v for v, k in enumerate(drunk_driving_df['VehicleType'].sort_values().unique())}
print("VehicleType dict \n", vehicle_types)
print()
drunk_driving_df['VehicleType'] = drunk_driving_df['VehicleType'].map(vehicle_types)

makes = {k: v for v, k in enumerate(drunk_driving_df['Make'].sort_values().unique())}
print("Make dict \n", makes)
print()
drunk_driving_df['Make'] = drunk_driving_df['Make'].map(makes)


colors = {k: v for v, k in enumerate(drunk_driving_df['Color'].sort_values().unique())}
print("Color dict \n", colors)
print()
drunk_driving_df['Color'] = drunk_driving_df['Color'].map(colors)


violations = {k: v for v, k in enumerate(drunk_driving_df['Violation Type'].sort_values().unique())}
print("violations dict \n", violations)
print()
drunk_driving_df['Violation Type'] = drunk_driving_df['Violation Type'].map(violations)


races = {k: v for v, k in enumerate(drunk_driving_df['Race'].sort_values().unique())}
print("races dict \n", races)
print()
drunk_driving_df['Race'] = drunk_driving_df['Race'].map(races)


gender = {k: v for v, k in enumerate(drunk_driving_df['Gender'].sort_values().unique())}
print("gender dict \n", gender)
print()
drunk_driving_df['Gender'] = drunk_driving_df['Gender'].map(gender)


yes_no = {'Yes': 1, 'No': 0}
drunk_driving_df['Contributed To Accident'] = drunk_driving_df['Contributed To Accident'].map(yes_no)
drunk_driving_df['Belts'] = drunk_driving_df['Belts'].map(yes_no)
drunk_driving_df['Commercial License'] = drunk_driving_df['Commercial License'].map(yes_no)
drunk_driving_df['Commercial Vehicle'] = drunk_driving_df['Commercial Vehicle'].map(yes_no)

print("Shape before dropping NAN:", drunk_driving_df.shape)

drunk_driving_df = drunk_driving_df.dropna()

print("Shape after dropping NAN:", drunk_driving_df.shape)

drunk_driving_df.head(3)

weekdays dict 
 {'Friday': 0, 'Monday': 1, 'Saturday': 2, 'Sunday': 3, 'Thursday': 4, 'Tuesday': 5, 'Wednesday': 6}

State dict 
 {'AK': 0, 'AL': 1, 'AR': 2, 'AZ': 3, 'CA': 4, 'CT': 5, 'DC': 6, 'DE': 7, 'FL': 8, 'GA': 9, 'HI': 10, 'IA': 11, 'ID': 12, 'IL': 13, 'IN': 14, 'KS': 15, 'KY': 16, 'LA': 17, 'MA': 18, 'MD': 19, 'ME': 20, 'MI': 21, 'MN': 22, 'MO': 23, 'MS': 24, 'MT': 25, 'NB': 26, 'NC': 27, 'ND': 28, 'NE': 29, 'NH': 30, 'NJ': 31, 'NM': 32, 'NV': 33, 'NY': 34, 'OH': 35, 'OK': 36, 'ON': 37, 'OR': 38, 'PA': 39, 'RI': 40, 'SC': 41, 'SD': 42, 'TN': 43, 'TX': 44, 'US': 45, 'VA': 46, 'VT': 47, 'WA': 48, 'WV': 49, 'WY': 50, 'XX': 51}

VehicleType dict 
 {'01 - Motorcycle': 0, '02 - Automobile': 1, '03 - Station Wagon': 2, '04 - Limousine': 3, '05 - Light Duty Truck': 4, '06 - Heavy Duty Truck': 5, '07 - Truck/Road Tractor': 6, '08 - Recreational Vehicle': 7, '09 - Farm Vehicle': 8, '13 - Ambulance(Emerg)': 9, '15 - Fire(Emerg)': 10, '19 - Moped': 11, '20 - Commercial Rig': 12, '22 - Mob

Unnamed: 0,Date Of Stop,Time Of Stop,Latitude,Longitude,Belts,Commercial License,Commercial Vehicle,State,VehicleType,Year,Make,Color,Violation Type,Contributed To Accident,Race,Gender
51,5,0,39.082775,-76.948403,0,0,0,19,1,2000.0,15,21,0,0,1,0
98,5,1,39.083948,-77.153498,0,0,0,19,1,2016.0,55,22,0,0,5,1
204,2,22,39.058375,-77.048198,0,0,0,19,1,1998.0,42,1,0,0,1,1


In [62]:
# Try to oversample by making fake but similar data entries with
# positive "contributed to accident" features
# !pip install imblearn
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Separate feature columns from class label column
drunk_label_df = drunk_driving_df['Contributed To Accident']
drunk_features_df = drunk_driving_df.drop(['Contributed To Accident'], axis=1)

# Rescale data!
scaler_drunk_features_df = StandardScaler().fit(drunk_features_df)
rescaled_drunk_features_df = scaler_drunk_features_df.transform(drunk_features_df)

drunk_df_train_features, drunk_df_test_features, drunk_df_train_label, drunk_df_test_label = train_test_split(rescaled_drunk_features_df, drunk_label_df, train_size = 0.8, test_size = 0.2)
print("Training Set Size before oversampling: ", len(drunk_df_train_label))
print("Counts of label 1: ", sum(drunk_df_train_label==1))
print("Counts of label 0: ", sum(drunk_df_train_label==0))
print()

sm = SMOTE(random_state=2, k_neighbors=3)

drunk_df_train_features, drunk_df_train_label = sm.fit_sample(drunk_df_train_features, drunk_df_train_label.ravel())

print("Training Set Size after oversampling: ", len(drunk_df_train_label))
print("Counts of label 1: ", sum(drunk_df_train_label==1))
print("Counts of label 0: ", sum(drunk_df_train_label==0))

Training Set Size before oversampling:  10981
Counts of label 1:  756
Counts of label 0:  10225

Training Set Size after oversampling:  20450
Counts of label 1:  10225
Counts of label 0:  10225


  return self.partial_fit(X, y)
  


In [63]:
# Export cleaned drunk driving data to csv
export_csv = drunk_driving_df.to_csv (r'..\drunk_driving_violations.csv', index = None, header=True)

export_csv = pd.DataFrame(drunk_df_train_features).to_csv (r'..\drunk_df_train_features_oversampled.csv', index = None, header=None)
export_csv = pd.DataFrame(drunk_df_train_label).to_csv (r'..\drunk_df_train_label_oversampled.csv', index = None, header=None)

export_csv = pd.DataFrame(drunk_df_test_features).to_csv (r'..\drunk_df_test_features.csv', index = None, header=None)
export_csv = pd.DataFrame(drunk_df_test_label).to_csv (r'..\drunk_df_test_label.csv', index = None, header=None)
