In [279]:
from sklearn.preprocessing import LabelEncoder
import pandas as pd
import re

# One shared encoder object; later cells call enc.fit_transform() on a single
# column at a time, which refits it on every call.
enc = LabelEncoder()

# Read the raw CSV export and show the first five rows as a quick sanity check.
violations_df = pd.read_csv(filepath_or_buffer="./dataset/Traffic_Violations.csv")
violations_df.head(n=5)

Unnamed: 0,SeqID,Date Of Stop,Time Of Stop,Agency,SubAgency,Description,Location,Latitude,Longitude,Accident,...,Charge,Article,Contributed To Accident,Race,Gender,Driver City,Driver State,DL State,Arrest Type,Geolocation
0,52282e8c-f2e1-4bb5-8509-2d5e4f8da8ca,05/01/2023,23:11:00,MCP,"3rd District, Silver Spring",OPERATING UNREGISTERED MOTOR VEHICLE ON HIGHWAY,BRIGGS CHANEY RD @ COLUMIBA PIKE,0.0,0.0,No,...,13-401(b1),Transportation Article,False,WHITE,M,GAITHERSBURG,MD,MD,A - Marked Patrol,"(0.0, 0.0)"
1,b66f253b-af29-4bc4-bb73-93755ca2a779,08/31/2023,16:41:00,MCP,"6th District, Gaithersburg / Montgomery Village",DRIVING TO DRIVE MOTOR VEHICLE ON HIGHWAY WITH...,OAKMONT AVE @ GROVEMONT CIR,39.097965,-77.15301,No,...,16-101(a1),Transportation Article,False,HISPANIC,M,GAITHERSBURG,MD,MD,A - Marked Patrol,"(39.097965, -77.15301)"
2,b66f253b-af29-4bc4-bb73-93755ca2a779,08/31/2023,16:41:00,MCP,"6th District, Gaithersburg / Montgomery Village",FAILURE TO DISPLAY REGISTRATION CARD UPON DEMA...,OAKMONT AVE @ GROVEMONT CIR,39.097965,-77.15301,No,...,13-409(b),Transportation Article,False,HISPANIC,M,GAITHERSBURG,MD,MD,A - Marked Patrol,"(39.097965, -77.15301)"
3,b66f253b-af29-4bc4-bb73-93755ca2a779,08/31/2023,16:41:00,MCP,"6th District, Gaithersburg / Montgomery Village",DRIVER OF MOTOR VEHICLE FOLLOWING VEHICLE CLOS...,OAKMONT AVE @ GROVEMONT CIR,39.097965,-77.15301,No,...,21-310(a),Transportation Article,False,HISPANIC,M,GAITHERSBURG,MD,MD,A - Marked Patrol,"(39.097965, -77.15301)"
4,b66f253b-af29-4bc4-bb73-93755ca2a779,08/31/2023,16:41:00,MCP,"6th District, Gaithersburg / Montgomery Village",FAILURE TO CONTROL VEH. SPEED ON HWY. TO AVOID...,OAKMONT AVE @ GROVEMONT CIR,39.097965,-77.15301,No,...,21-801(b),Transportation Article,False,HISPANIC,M,GAITHERSBURG,MD,MD,A - Marked Patrol,"(39.097965, -77.15301)"


In [280]:
# Print the frame summary (row index range, column names and dtypes) so the
# raw schema can be checked before any cleaning is applied.
violations_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2070115 entries, 0 to 2070114
Data columns (total 43 columns):
 #   Column                   Dtype  
---  ------                   -----  
 0   SeqID                    object 
 1   Date Of Stop             object 
 2   Time Of Stop             object 
 3   Agency                   object 
 4   SubAgency                object 
 5   Description              object 
 6   Location                 object 
 7   Latitude                 float64
 8   Longitude                float64
 9   Accident                 object 
 10  Belts                    object 
 11  Personal Injury          object 
 12  Property Damage          object 
 13  Fatal                    object 
 14  Commercial License       object 
 15  HAZMAT                   object 
 16  Commercial Vehicle       object 
 17  Alcohol                  object 
 18  Work Zone                object 
 19  Search Conducted         object 
 20  Search Disposition       object 
 21  Search O

<h1><b>SeqID</b></h1>

In [281]:
# The preview of the raw data shows several rows that share one SeqID while
# carrying different Description and Charge values, so SeqID is not unique per
# violation. De-duplicating on SeqID alone would silently discard every
# charge after the first one with the same SeqID. Only rows that are identical
# in every column are treated as duplicates here.
violations_df = violations_df.drop_duplicates(keep="first").reset_index(drop=True)

# Store the identifier with pandas' nullable string dtype.
violations_df["SeqID"] = violations_df["SeqID"].astype("string")

<h1><b>Date Of Stop</b></h1>

In [282]:
from utils.date_of_stop import clean_date_of_stop

# Pass every raw value through clean_date_of_stop (utils.date_of_stop), then
# discard the rows whose cleaned date is missing.
cleaned_dates = violations_df["Date Of Stop"].apply(clean_date_of_stop)
violations_df["Date Of Stop"] = cleaned_dates
violations_df.dropna(inplace=True, subset=["Date Of Stop"])

<h1><b>Time Of Stop</b></h1>

In [283]:
from utils.time_of_stop import clean_time_of_stop

# Pass every raw value through clean_time_of_stop (utils.time_of_stop), then
# discard the rows whose cleaned time is missing.
cleaned_times = violations_df["Time Of Stop"].apply(clean_time_of_stop)
violations_df["Time Of Stop"] = cleaned_times
violations_df.dropna(inplace=True, subset=["Time Of Stop"])

<h1><b>Agency</b></h1>

In [284]:
# Upper-case the agency code; the .str accessor leaves missing values missing.
agency_upper = violations_df["Agency"].str.upper()
violations_df["Agency"] = agency_upper

<h1><b>SubAgency</b></h1>

In [285]:
# Remove any whitespace surrounding "/" so that spellings such as
# "Gaithersburg / Montgomery Village" and "Gaithersburg/Montgomery Village"
# collapse to one value. The vectorised .str accessor is used instead of a
# per-row re.sub call because re.sub raises TypeError on a missing (NaN)
# SubAgency, whereas .str.replace simply leaves missing values missing.
violations_df["SubAgency"] = violations_df["SubAgency"].str.replace(r"\s*/\s*", "/", regex=True)

<h1><b>Description</b></h1>

In [286]:
from utils.description import clean_description
# Pass every raw description through clean_description (utils.description).
cleaned_descriptions = violations_df["Description"].apply(clean_description)
violations_df["Description"] = cleaned_descriptions

<h1><b>Location</b></h1>

In [287]:
from utils.location import clean_location

# Pass every raw value through clean_location (utils.location), then discard
# the rows whose cleaned location is missing.
cleaned_locations = violations_df["Location"].apply(clean_location)
violations_df["Location"] = cleaned_locations
violations_df.dropna(inplace=True, subset=["Location"])

<h1><b>Latitude & Longitude</b></h1>

In [288]:
from utils.coordinates import clean_latitude, clean_longitude

# Both coordinate columns get the same three steps, latitude first:
#   1. coerce to float64 (unparseable values become NaN) and round to 6 decimals,
#   2. run the column-specific helper imported from utils.coordinates,
#   3. drop the rows whose value is missing afterwards.
for coord_col, coord_cleaner in (("Latitude", clean_latitude), ("Longitude", clean_longitude)):
    as_float = pd.to_numeric(violations_df[coord_col], errors="coerce").astype("float64")
    violations_df[coord_col] = as_float.round(6)
    violations_df[coord_col] = violations_df[coord_col].apply(coord_cleaner)
    violations_df.dropna(subset=[coord_col], inplace=True)


<h1><b>Accident, Belts, Personal Injury, Property Damage, Fatal, Commercial License, HAZMAT, Commercial Vehicle, Alcohol, Work Zone, Search Conducted</b></h1>

In [289]:
# Yes/no style flag columns that are normalised and label-encoded the same way.
cols = ["Accident", "Belts", "Personal Injury", "Property Damage", "Fatal", "Commercial License", "HAZMAT", "Commercial Vehicle", "Alcohol", "Work Zone", "Search Conducted"]

# Canonical spelling for each yes/no variant. Keys are lower-case because
# every value is stripped and lower-cased before the lookup, so "No", "NO",
# " no " and "FALSE" all resolve to the same label.
mapping = {
        'y': 'true',
        'yes': 'true',
        'n': 'false',
        'no': 'false',
        '1': 'true',
        '0': 'false',
        '': 'false'
    }

for col in cols:
    # replace() matches whole values exactly, and the raw file spells these
    # flags "Yes"/"No" (see the preview), so the lower-case keys never matched
    # the data as loaded. Normalise case and whitespace first; missing values
    # are passed through untouched so that dropna() below still removes them.
    normalised = violations_df[col].map(lambda value: str(value).strip().lower(), na_action="ignore")
    violations_df[col] = normalised.replace(mapping)

    violations_df.dropna(subset=[col], inplace=True)

    # Refit the shared encoder on this column and store the integer codes.
    violations_df[col] = enc.fit_transform(violations_df[col])

<h1><b>Search Disposition</b></h1>

In [290]:
def _label_search_disposition(raw):
    """Return the label for one raw Search Disposition value.

    Missing values become "Not Applicable"; values whose text is longer than
    three characters are title-cased; anything shorter is returned as text
    without changing its case.
    """
    if pd.isna(raw):
        return "Not Applicable"
    text = str(raw)
    if len(text) > 3:
        return text.title()
    return text


violations_df["Search Disposition"] = violations_df["Search Disposition"].apply(_label_search_disposition)

# Refit the shared encoder on the labels and keep the integer codes.
violations_df["Search Disposition"] = enc.fit_transform(violations_df["Search Disposition"])


<h1><b>Search Outcome</b></h1>

In [291]:
# Rows without a search outcome are removed before encoding.
violations_df.dropna(inplace=True, subset=["Search Outcome"])

# Title-case the text form of every value, then refit the shared encoder on
# the result and store the integer codes.
outcome_titled = violations_df["Search Outcome"].map(lambda value: str(value).title())
violations_df["Search Outcome"] = enc.fit_transform(outcome_titled)


<h1><b>Search Reason</b></h1>

In [292]:
def _label_search_reason(raw):
    """Return the label for one raw Search Reason value.

    Missing values become "Other"; values whose text is longer than three
    characters are title-cased; shorter values are returned unchanged.
    """
    if pd.isna(raw):
        return "Other"
    if len(str(raw)) > 3:
        return str(raw).title()
    return raw


reason_labels = violations_df["Search Reason"].apply(_label_search_reason)

# Title-casing lower-cases the acronym, so restore its upper-case spelling.
violations_df["Search Reason"] = reason_labels.replace({"Probable Cause For Cds": "Probable Cause For CDS"})

violations_df["Search Reason"] = enc.fit_transform(violations_df["Search Reason"])

<h1><b>Search Reason For Stop</b></h1>

In [293]:
# Rows without a stop reason are removed first.
violations_df.dropna(subset=["Search Reason For Stop"], inplace=True)

# Keep only reasons longer than five characters. Boolean filtering returns a
# frame that pandas tracks as derived from the original, so assigning a column
# on it afterwards is a chained assignment (SettingWithCopyWarning). The
# explicit .copy() makes the filtered frame independent before it is modified.
has_long_reason = violations_df["Search Reason For Stop"].str.len() > 5
violations_df = violations_df[has_long_reason].copy()

# Upper-case the text form of every remaining value.
violations_df["Search Reason For Stop"] = violations_df["Search Reason For Stop"].apply(lambda x: str(x).upper())

<h1><b>Search Type</b></h1>

In [294]:
def _label_search_type(raw):
    """Return the label for one raw Search Type value.

    Missing values become "Not Applicable"; values whose text is longer than
    two characters are title-cased; shorter values are returned unchanged.
    """
    if pd.isna(raw):
        return "Not Applicable"
    if len(str(raw)) > 2:
        return str(raw).title()
    return raw


violations_df["Search Type"] = violations_df["Search Type"].apply(_label_search_type)

# Refit the shared encoder on the labels and keep the integer codes.
violations_df["Search Type"] = enc.fit_transform(violations_df["Search Type"])

<h1><b>Search Arrest Reason</b></h1>

In [295]:
def _label_search_arrest_reason(raw):
    """Return the label for one raw Search Arrest Reason value.

    Missing values become "Other"; values whose text is longer than three
    characters are title-cased; shorter values are returned unchanged.
    """
    if pd.isna(raw):
        return "Other"
    if len(str(raw)) > 3:
        return str(raw).title()
    return raw


violations_df["Search Arrest Reason"] = violations_df["Search Arrest Reason"].apply(_label_search_arrest_reason)

# Refit the shared encoder on the labels and keep the integer codes.
violations_df["Search Arrest Reason"] = enc.fit_transform(violations_df["Search Arrest Reason"])

<h1><b>State</b></h1>

In [296]:
# Rows without a State value are removed, then the text form of every
# remaining value is upper-cased.
violations_df.dropna(inplace=True, subset=["State"])

state_upper = violations_df["State"].map(lambda code: str(code).upper())
violations_df["State"] = state_upper

<h1><b>VehicleType</b></h1>

In [297]:
# Rename the column so it is spelled with a space like the other headers.
violations_df = violations_df.rename(columns={"VehicleType": "Vehicle Type"})

# Whole-value substitutions applied before the column is split.
mapping = dict([
    ("18 - Police Vehicle", "18 - Police(Non-Emerg)"),
    ("28 - Electric Bicycle", "28 - Other"),
    ("29 - Other", "29 - Unknown"),
])

# Apply the substitutions, then title-case the text form of every value.
recoded_vehicle_type = violations_df["Vehicle Type"].replace(mapping)
violations_df["Vehicle Type"] = recoded_vehicle_type.map(lambda label: str(label).title())

# Split on the first " - " only: the left part becomes the code, the right
# part the category.
code_and_category = violations_df["Vehicle Type"].str.split(" - ", n=1, expand=True)
violations_df[["Vehicle Code", "Vehicle Category"]] = code_and_category

# The code part is stored as an integer.
violations_df["Vehicle Code"] = violations_df["Vehicle Code"].astype(int)

<h1><b>Year</b></h1>

In [298]:
# Inclusive range of accepted Year values; anything outside is treated as missing.
MIN_YEAR, MAX_YEAR = 1960, 2025

# Unparseable years become <NA> in the nullable integer dtype.
year = pd.to_numeric(violations_df["Year"], errors="coerce").astype("Int64")

# A per-row lambda using `1960 <= x <= 2025` raises
# "boolean value of NA is ambiguous" as soon as one Year is <NA>, which is
# exactly what errors="coerce" produces for bad input. The vectorised
# comparison yields <NA> for those rows instead; they are counted as out of
# range, and the column keeps its Int64 dtype.
in_range = year.between(MIN_YEAR, MAX_YEAR).fillna(False).astype(bool)
violations_df["Year"] = year.where(in_range)

# Drop rows whose year was missing, unparseable or out of range.
violations_df.dropna(subset=["Year"], inplace=True)

<h1><b>Make</b></h1>

In [299]:
from utils.make import mappings

# Strip every character that is not an ASCII letter, then apply the
# whole-value substitutions imported from utils.make.
letters_only = violations_df["Make"].str.replace(r"[^A-Za-z]", "", regex=True)
violations_df["Make"] = letters_only.replace(mappings)

# Rows left without a Make value are removed.
violations_df.dropna(inplace=True, subset=["Make"])

<h1><b>Color</b></h1>

In [300]:
# Raw "<COLOUR>, <SHADE>" spellings and the label each one is rewritten to.
mapping = dict([
    ("BLUE, DARK", "Dark Blue"),
    ("BLUE, LIGHT", "Light Blue"),
    ("GREEN, LGT", "Light Green"),
    ("GREEN, DK", "Dark Green"),
])

# Rows without a colour are removed before recoding.
violations_df.dropna(inplace=True, subset=["Color"])

# Rewrite the listed spellings, title-case the text form of every value, then
# refit the shared encoder on the result and store the integer codes.
recoded_color = violations_df["Color"].replace(mapping)
titled_color = recoded_color.map(lambda name: str(name).title())
violations_df["Color"] = enc.fit_transform(titled_color)

<h1><b>Violation Type</b></h1>

In [301]:
# Refit the shared encoder on the raw values and store the integer codes.
violation_type_codes = enc.fit_transform(violations_df["Violation Type"])
violations_df["Violation Type"] = violation_type_codes

<h1><b>Charge</b></h1>

In [302]:
# Keep only charges longer than three characters; rows with a missing Charge
# are excluded as well, because their length compares as False. Boolean
# filtering returns a frame that pandas tracks as derived from the original,
# so assigning a column on it afterwards is a chained assignment
# (SettingWithCopyWarning). The explicit .copy() makes the filtered frame
# independent before it is modified.
has_long_charge = violations_df["Charge"].str.len() > 3
violations_df = violations_df[has_long_charge].copy()

# Upper-case the text form of every remaining value.
violations_df["Charge"] = violations_df["Charge"].apply(lambda x: str(x).upper())

<h1><b>Article</b></h1>

In [303]:
# Article values whose rows are removed from the data set.
invalid_articles = ["Maryland Rules", "00", "BR", "1A"]

# Rows with a missing Article are kept: isin() reports False for them.
is_invalid_article = violations_df["Article"].isin(invalid_articles)
violations_df = violations_df[~is_invalid_article]

<h1><b>Contributed To Accident</b></h1>

In [304]:
contributed = violations_df["Contributed To Accident"]

# astype(bool) on text is a trap: every non-empty string, including "False"
# and "No", converts to True, and so does NaN. Boolean and numeric columns
# are cast directly as before; any other column is parsed from its text
# form, with missing or unrecognised values counted as False.
if not (pd.api.types.is_bool_dtype(contributed) or pd.api.types.is_numeric_dtype(contributed)):
    truthy_labels = {"true", "yes", "y", "1"}
    contributed = contributed.map(lambda value: str(value).strip().lower() in truthy_labels)

violations_df["Contributed To Accident"] = contributed.astype(bool)

<h1><b>Race</b></h1>

In [305]:
# Upper-case the text form of every value, then refit the shared encoder on
# the result and store the integer codes.
race_upper = violations_df["Race"].map(lambda label: str(label).upper())
violations_df["Race"] = enc.fit_transform(race_upper)

<h1><b>Gender</b></h1>

In [306]:
# Spell out the "U" code, then refit the shared encoder on the result and
# store the integer codes.
gender_labels = violations_df["Gender"].replace({"U": "Unknown"})
violations_df["Gender"] = enc.fit_transform(gender_labels)

<h1><b>Driver State</b></h1>

In [307]:
# Upper-case the text form of every value. na_action="ignore" skips missing
# entries, so they stay missing; calling str() on them would otherwise store
# the literal text "NAN", which is indistinguishable from a real code.
violations_df["Driver State"] = violations_df["Driver State"].map(lambda x: str(x).upper(), na_action="ignore")

<h1><b>DL State</b></h1>

In [308]:
# Rows without a DL State value are removed, then the text form of every
# remaining value is upper-cased.
violations_df.dropna(inplace=True, subset=["DL State"])

dl_state_upper = violations_df["DL State"].map(lambda code: str(code).upper())
violations_df["DL State"] = dl_state_upper

<h1><b>Arrest Type</b></h1>

In [309]:
# Split on the first " - " only: the left part becomes the code column, the
# right part the description column. The original column is kept.
arrest_type_parts = violations_df["Arrest Type"].str.split(" - ", n=1, expand=True)
violations_df[["Arrest Type Code", "Arrest Type Description"]] = arrest_type_parts

<h1><b>Geolocation</b></h1>

In [310]:
from utils.geolocation import format_and_clean_geolocation, validate_geolocation

# First pass: each raw value goes through format_and_clean_geolocation.
formatted_geolocation = violations_df["Geolocation"].apply(format_and_clean_geolocation)
violations_df["Geolocation"] = formatted_geolocation

# Second pass: validate_geolocation is called once per row (axis=1) and its
# return value replaces the Geolocation entry for that row.
validated_geolocation = violations_df.apply(validate_geolocation, axis=1)
violations_df["Geolocation"] = validated_geolocation