In [None]:
from sklearn.preprocessing import LabelEncoder
import pandas as pd
import re

# Load the raw traffic-violations table that every later cell cleans in turn.
violations_df = pd.read_csv("./dataset/Traffic_Violations.csv")

# One LabelEncoder instance is shared by the later cells; each cell re-fits it
# with fit_transform, so it only ever holds the classes of the last column encoded.
enc = LabelEncoder()

# Last expression of the cell: preview the first rows.
violations_df.head()

In [None]:
# Print the frame summary (columns, non-null counts, dtypes) before any cleaning.
violations_df.info()

<h1><b>SeqID</b></h1>

In [None]:
# Keep only the first row seen for each SeqID and renumber the index from 0.
violations_df = (
    violations_df
    .drop_duplicates(subset=["SeqID"], keep="first")
    .reset_index(drop=True)
)

# Store the identifier with pandas' "string" dtype.
violations_df["SeqID"] = violations_df["SeqID"].astype("string")

<h1><b>Date Of Stop</b></h1>

In [None]:
from utils.date_of_stop import clean_date_of_stop

# Run clean_date_of_stop over every value, then discard the rows whose
# "Date Of Stop" is missing afterwards.
date_column = "Date Of Stop"
cleaned_dates = violations_df[date_column].apply(clean_date_of_stop)
violations_df[date_column] = cleaned_dates
violations_df.dropna(subset=[date_column], inplace=True)

<h1><b>Time Of Stop</b></h1>

In [None]:
from utils.time_of_stop import clean_time_of_stop

# Run clean_time_of_stop over every value, then discard the rows whose
# "Time Of Stop" is missing afterwards.
time_column = "Time Of Stop"
cleaned_times = violations_df[time_column].apply(clean_time_of_stop)
violations_df[time_column] = cleaned_times
violations_df.dropna(subset=[time_column], inplace=True)

<h1><b>Agency</b></h1>

In [None]:
# Normalise the Agency column to upper case using the vectorised string accessor.
violations_df["Agency"] = violations_df["Agency"].str.upper()

<h1><b>SubAgency</b></h1>

In [None]:
# Remove any whitespace around "/" so that spellings such as "A / B" and
# "A/ B" both become "A/B".
# The vectorised .str accessor is used instead of re.sub inside apply():
# re.sub raises TypeError when it is handed a missing value (NaN), whereas
# .str.replace passes missing values through untouched.
violations_df["SubAgency"] = violations_df["SubAgency"].str.replace(r"\s*/\s*", "/", regex=True)

<h1><b>Description</b></h1>

In [None]:
from utils.description import clean_description
# Replace each Description value with the result of the clean_description
# helper from utils.description, applied element by element.
violations_df["Description"] = violations_df["Description"].apply(clean_description)

<h1><b>Location</b></h1>

In [None]:
from utils.location import clean_location

# Run clean_location over every value, then discard the rows whose
# "Location" is missing afterwards.
location_column = "Location"
cleaned_locations = violations_df[location_column].apply(clean_location)
violations_df[location_column] = cleaned_locations
violations_df.dropna(subset=[location_column], inplace=True)

<h1><b>Latitude & Longitude</b></h1>

In [None]:
from utils.coordinates import clean_latitude, clean_longitude

# Latitude and Longitude go through the same steps, one column at a time and
# in this order: coerce to float64 (errors="coerce" turns unparseable values
# into NaN), round to 6 decimals, run the column's cleaner from
# utils.coordinates, then drop the rows left without a value.
for coord_column, coord_cleaner in (
    ("Latitude", clean_latitude),
    ("Longitude", clean_longitude),
):
    as_float = pd.to_numeric(violations_df[coord_column], errors="coerce").astype("float64")
    violations_df[coord_column] = as_float.round(6)
    violations_df[coord_column] = violations_df[coord_column].apply(coord_cleaner)
    violations_df.dropna(subset=[coord_column], inplace=True)


<h1><b>Accident, Belts, Personal Injury, Property Damage, Fatal, Commercial License, HAZMAT, Commercial Vehicle, Alcohol, Work Zone, Search Conducted</b></h1>

In [None]:
cols = ["Accident", "Belts", "Personal Injury", "Property Damage", "Fatal", "Commercial License", "HAZMAT", "Commercial Vehicle", "Alcohol", "Work Zone", "Search Conducted"]

# Canonical spellings for the yes/no flag columns. Keys are lower-case because
# values are lower-cased (and stripped) before the lookup, so "Yes", "YES",
# "TRUE", "True" and "true" all collapse to the same label.
mapping = {
        'y': 'true',
        'yes': 'true',
        'true': 'true',
        '1': 'true',
        'n': 'false',
        'no': 'false',
        'false': 'false',
        '0': 'false',
        '': 'false'
    }


def _normalize_flag(value):
    """Return the canonical 'true'/'false' label for a flag value.

    String values are matched against ``mapping`` ignoring case and
    surrounding whitespace. Strings with no entry in ``mapping`` and all
    non-string values (including NaN) are returned unchanged, so missing
    values can still be dropped afterwards.
    """
    if isinstance(value, str):
        return mapping.get(value.strip().lower(), value)
    return value


for col in cols:
    # Collapse the different spellings first so the encoder does not treat
    # e.g. 'TRUE' and 'true' as two separate classes.
    violations_df[col] = violations_df[col].map(_normalize_flag)
    violations_df.dropna(subset=[col], inplace=True)
    # NOTE(review): the encoder is re-fitted per column, so the integer codes
    # depend on which labels actually occur in that column — confirm this is
    # acceptable for columns that contain a single label only.
    violations_df[col] = enc.fit_transform(violations_df[col])

<h1><b>Search Disposition</b></h1>

In [None]:
def _format_disposition(value):
    """Return the display label for one Search Disposition value.

    Missing values become "Not Applicable". Everything else is converted to
    a string and title-cased, unless it is 3 characters or shorter, in which
    case the string is returned as is.
    """
    if pd.isna(value):
        return "Not Applicable"
    text = str(value)
    if len(text) > 3:
        return text.title()
    return text


violations_df["Search Disposition"] = violations_df["Search Disposition"].apply(_format_disposition)

# Replace the labels with integer codes using the shared encoder.
violations_df["Search Disposition"] = enc.fit_transform(violations_df["Search Disposition"])

<h1><b>Search Outcome</b></h1>

In [None]:
def _title_case(value):
    """Return the string form of ``value`` in title case."""
    return str(value).title()


# Rows without a Search Outcome are removed before the column is normalised.
violations_df.dropna(subset=["Search Outcome"], inplace=True)

violations_df["Search Outcome"] = violations_df["Search Outcome"].apply(_title_case)

# Replace the labels with integer codes using the shared encoder.
violations_df["Search Outcome"] = enc.fit_transform(violations_df["Search Outcome"])

<h1><b>Search Reason</b></h1>

In [None]:
def _format_search_reason(value):
    """Return the display label for one Search Reason value.

    Missing values become "Other". Everything else is converted to a string
    and title-cased, unless it is 3 characters or shorter, in which case the
    string is returned as is. The short branch returns the string form as
    well, so the column never mixes strings with other types before it is
    handed to the encoder (same rule as the Search Disposition cell).
    """
    if pd.isna(value):
        return "Other"
    text = str(value)
    if len(text) > 3:
        return text.title()
    return text


violations_df["Search Reason"] = violations_df["Search Reason"].apply(_format_search_reason)

# str.title() lower-cases the acronym ("Cds"); restore it.
violations_df["Search Reason"] = violations_df["Search Reason"].replace({"Probable Cause For Cds": "Probable Cause For CDS"})

# Replace the labels with integer codes using the shared encoder.
violations_df["Search Reason"] = enc.fit_transform(violations_df["Search Reason"])