In [21]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, KFold, cross_val_score, StratifiedKFold, cross_val_score, GridSearchCV, RepeatedStratifiedKFold
import os
from pathlib import Path

In [22]:
# Column names for the raw CSV, in file order. Because `names=` is passed
# without `header=`, pandas treats the first line of the file as data.
cols = [
    'agency', 'subagency', 'latitude', 'longitude',
    'accident', 'belts', 'personal_injury', 'property_damage', 'fatal',
    'commercial_license', 'hazmat', 'commercial_vehicle', 'alcohol', 'work_zone',
    'search_conducted', 'search_disposition', 'search_outcome', 'search_reason',
    'search_reason_for_stop', 'search_type', 'search_arrest_reason',
    'state', 'vehicletype', 'year', 'color', 'class', 'charge', 'article',
    'contributed_to_accident', 'race', 'gender', 'driver_state', 'dl_state',
    'arrest_type',
]
df = pd.read_csv(
    'traffic_violations_edited.csv',
    names=cols,
    index_col=False,  # never promote the first column to the index
)

In [23]:
# Columns kept for the rest of the notebook.
#   - dl_state and driver_state removed: only ~5% are in different states.
#   - arrest_type removed: not relevant for our task; we only care whether
#     an arrest took place.
cols = [
    'accident',
    'belts',
    'personal_injury',
    'property_damage',
    'fatal',
    'commercial_license',
    'commercial_vehicle',
    'alcohol',
    'work_zone',
    'search_conducted',
    'search_outcome',
    'race',
    'gender',
]
df = df[cols]
# Row count after column selection (displayed as the cell output).
len(df)

1578154

In [24]:
# '?' is treated as the missing-value marker in the two search columns:
# turn it into NaN so that dropna() can remove the affected rows.
# (The former `df.isin(['?']).sum(axis=0)` line was removed: it was not the
# last expression of the cell, so its result was computed and thrown away.)
for column in ('search_conducted', 'search_outcome'):
    df[column] = df[column].replace('?', np.nan)

# Reassign instead of `inplace=True` so the cell does not mutate a frame that
# other names may still reference; rows with a NaN in any column are dropped.
df = df.dropna(how='any')

# Row count after removing incomplete rows (displayed as the cell output).
len(df)

958160

In [25]:
# Integer encodings for every categorical column, keyed by column name.
# All Yes/No flag columns share a single mapping.
yes_no = {'No': 0, 'Yes': 1}
yes_no_columns = [
    'accident', 'belts', 'personal_injury', 'property_damage', 'fatal',
    'commercial_license', 'commercial_vehicle', 'alcohol', 'work_zone',
    'search_conducted',
]

encodings = {
    # 'Arrest' is encoded as 0, every other outcome as 1.
    'search_outcome': {'Warning': 1, 'Citation': 1, 'SERO': 1, "'Recovered Evidence'": 1, 'Arrest': 0},
}
encodings.update((column, yes_no) for column in yes_no_columns)
# NOTE(review): these labels must match the edited CSV byte-for-byte;
# 'BLACKLIVESMATTER' looks like the result of a bulk edit of the source
# file -- confirm against the data before changing it.
encodings['race'] = {'OTHER': 1, 'BLACKLIVESMATTER': 0}
encodings['gender'] = {'M': 2, 'F': 1, 'U': 0}

# Encode every column first and only then touch `df`, so a failure leaves the
# frame unchanged instead of half-encoded.
encoded_columns = {}
for column, mapping in encodings.items():
    encoded = df[column].map(mapping)
    # Series.map yields NaN for labels missing from the mapping; report them
    # explicitly rather than letting astype(int) fail with an opaque cast error.
    # Re-running this cell on already-encoded data ends up here as well.
    unmapped = df.loc[encoded.isna(), column].unique()
    if len(unmapped) > 0:
        raise ValueError(
            "Column {!r} contains values with no encoding: {}".format(column, list(unmapped))
        )
    encoded_columns[column] = encoded.astype(int)

df = df.assign(**encoded_columns).reset_index(drop=True)

In [27]:
# Persist the cleaned, integer-encoded frame; the row index is not written.
df.to_csv(
    path_or_buf='traffic_violations_cleaned.csv',
    index=False,
)

In [28]:
# Work on a random subsample of the cleaned data.
df_s = df.sample(
    n=200000,        # fixed subsample size
    random_state=1,  # pinned seed so the same rows are drawn on every run
)

In [29]:
# Replace the index with a fresh 0..n-1 range; drop=True discards the old
# row labels instead of keeping them as an extra column.
df_s = df_s.reset_index(drop=True)

In [31]:
# Output directory for the generated files, located directly under the
# current working directory.
path = os.getcwd()
data_dir = Path(path, 'generated_data')
# exist_ok=True keeps the cell re-runnable: os.mkdir raised FileExistsError
# whenever the directory was left over from an earlier run.
data_dir.mkdir(exist_ok=True)

In [37]:
# Hold out 33% of the subsample as the test set; shuffling uses a pinned seed
# so the split is the same on every run.
train, test = train_test_split(
    df_s,
    test_size=0.33,
    shuffle=True,
    random_state=1,
)

In [38]:
# Give both splits a fresh 0..n-1 index, discarding the row labels they
# carried over from the frame they were split from.
train, test = (part.reset_index(drop=True) for part in (train, test))

In [40]:
# Destination files for the two splits, both inside data_dir.
train_path = data_dir / 'train.csv'
test_path = data_dir / 'test.csv'

In [41]:
# Write each split to its CSV file (comma-separated, without the row index).
for frame, destination in ((train, train_path), (test, test_path)):
    frame.to_csv(destination, sep=',', index=False)