## Cleaning

In [1]:
import pandas as pd

# Directory (relative to this notebook) that holds the input and output parquet files.
base_path = "../datasets"

# Load the sampled dataset using the pyarrow parquet backend.
dataset = pd.read_parquet(
    path=f"{base_path}/sample_data.parquet",
    engine="pyarrow",
)

Merge issue date and violation time

In [2]:
# Convert the date to the same format as the weather data.
# format="mixed" lets pandas infer the format of each element individually.
dataset['issue_date'] = pd.to_datetime(dataset["issue_date"], format="mixed")

# The raw time ends with a bare meridiem letter; appending "M" turns it into
# AM/PM so it can be read as a 12-hour clock value.
# NOTE(review): presumably values look like "0752A" / "0115P" - confirm against the raw data.
violation_time = dataset["violation_time"].str.upper() + "M"

# %I only accepts hours 01-12, so an hour written as "00" is rewritten to "12".
violation_time = violation_time.str.replace(r'^00', '12', regex=True)

# Parse the 12-hour time; anything that does not match becomes NaT instead of raising.
violation_time = pd.to_datetime(violation_time, format="%I%M%p", errors="coerce")

# Combine date and time with datetime arithmetic instead of formatting both to
# strings and re-parsing: strip any time-of-day from the date, then add the
# time-of-day offset extracted from the parsed time (its date part is a placeholder).
# Rows whose date or time is NaT still end up as NaT, exactly as before.
time_of_day = violation_time - violation_time.dt.normalize()
dataset['issue_date'] = dataset["issue_date"].dt.normalize() + time_of_day
dataset = dataset.drop(["violation_time"], axis=1)

Clean vehicle expiration date

In [3]:
# Count missing values before parsing so the effect of coercion can be measured.
initial_nan = dataset['vehicle_expiration_date'].isna().sum()

# Parse compact YYYYMMDD values; anything that does not match the format becomes NaT.
# NOTE(review): the recorded run shows more than half of the values turning into NaT
# here - confirm these are genuinely invalid entries rather than dates in another format.
dataset['vehicle_expiration_date'] = pd.to_datetime(dataset['vehicle_expiration_date'], format='%Y%m%d', errors='coerce')
final_nan = dataset['vehicle_expiration_date'].isna().sum()

total_rows = len(dataset)
print(f"Percentage of initial NaNs: {initial_nan / total_rows * 100:.2f}%")
print(f"Percentage of final NaNs: {final_nan / total_rows * 100:.2f}%")
# final - initial is the share of values that coercion turned INTO NaN,
# so it is reported as "introduced" (the previous label said "removed").
print(f"Percentage of NaNs introduced: {(final_nan - initial_nan) / total_rows * 100:.2f}%")

Percentage of initial NaNs: 6.85%
Percentage of final NaNs: 60.22%
Percentage of NaNs removed: 53.37%


Clean first observation datetime

In [4]:
# Count missing values before parsing so the effect of coercion can be measured.
initial_nan = dataset['time_first_observed'].isna().sum()

# Append "M" to the trailing meridiem letter so the value reads as a 12-hour AM/PM time.
dataset['time_first_observed'] = dataset["time_first_observed"].str.upper() + "M"
# %I only accepts hours 01-12, so an hour written as "00" is rewritten to "12".
dataset['time_first_observed'] = dataset['time_first_observed'].str.replace(r'^00', '12', regex=True)
# Parse the 12-hour time; anything that does not match becomes NaT instead of raising.
dataset['time_first_observed'] = pd.to_datetime(dataset['time_first_observed'], format='%I%M%p', errors='coerce')
final_nan = dataset['time_first_observed'].isna().sum()

total_rows = len(dataset)
print(f"Percentage of initial NaNs: {initial_nan / total_rows * 100:.2f}%")
print(f"Percentage of final NaNs: {final_nan / total_rows * 100:.2f}%")
# final - initial is the share of values that coercion turned INTO NaN,
# so it is reported as "introduced" (the previous label said "removed").
print(f"Percentage of NaNs introduced: {(final_nan - initial_nan) / total_rows * 100:.2f}%")

Percentage of initial NaNs: 93.40%
Percentage of final NaNs: 94.55%
Percentage of NaNs removed: 1.15%


In [5]:
# Treat the placeholder values "0" and "0001-01-03T12:00:00.000" as missing
# before parsing, so they count as initial NaNs rather than parse failures.
dataset['date_first_observed'] = dataset['date_first_observed'].replace(
    ['0', '0001-01-03T12:00:00.000'], pd.NaT
)

initial_nan = dataset['date_first_observed'].isna().sum()
# Parse compact YYYYMMDD values; anything that does not match the format becomes NaT.
dataset['date_first_observed'] = pd.to_datetime(dataset['date_first_observed'], format='%Y%m%d', errors='coerce')
final_nan = dataset['date_first_observed'].isna().sum()

total_rows = len(dataset)
print(f"Percentage of initial NaNs: {initial_nan / total_rows * 100:.2f}%")
print(f"Percentage of final NaNs: {final_nan / total_rows * 100:.2f}%")
# final - initial is the share of values that coercion turned INTO NaN,
# so it is reported as "introduced" (the previous label said "removed").
print(f"Percentage of NaNs introduced: {(final_nan - initial_nan) / total_rows * 100:.2f}%")

Percentage of initial NaNs: 98.76%
Percentage of final NaNs: 98.76%
Percentage of NaNs removed: 0.00%


In [6]:
# Merge the observation date and time into a single timestamp.
# Done with datetime arithmetic instead of formatting both parts to strings and
# re-parsing: the parsed time carries only a placeholder date, so subtracting its
# normalized value leaves the time-of-day offset, which is added to the date.
# The result is NaT whenever either the date or the time is missing.
first_observed_time = dataset["time_first_observed"]
time_of_day = first_observed_time - first_observed_time.dt.normalize()
dataset['date_first_observed'] = dataset["date_first_observed"].dt.normalize() + time_of_day
dataset = dataset.drop(["time_first_observed"], axis=1)

Clean violation county

In [7]:
# Translate the raw county spellings to canonical borough abbreviations.
# Lookup is exact (case- and whitespace-sensitive); any value that is not a key
# here becomes NaN after .map(). The explicit pd.NA entries record values that
# were reviewed and deliberately left unassigned.
# The duplicated "BRONX" key from the earlier version has been removed; the
# mapping itself is unchanged.
county_to_borough = {
    # Bronx
    "BRONX": "BX",
    "BX": "BX",
    "Bronx": "BX",
    # Brooklyn, known as Kings
    "BK": "K",
    "K": "K",
    "Kings": "K",
    "KINGS": "K",
    "KING": "K",
    # Queens
    "Q": "Q",
    "QN": "Q",
    "Qns": "Q",
    "QUEEN": "Q",
    "QUEENS": "Q",
    "QNS": "Q",
    "QU": "Q",
    # Manhattan, known as New York
    "NY": "NY",
    "MN": "NY",
    "MAN": "NY",
    "NEW Y": "NY",
    "NEWY": "NY",
    "NYC": "NY",
    "MH": "NY",
    "MS": "NY",
    "N": "NY",
    "P": "NY",
    "PBX": "NY",
    "USA": "NY",
    "VINIS": "NY",
    # Staten Island, known as Richmond
    "ST": "R",
    "R": "R",
    "Rich": "R",
    "RICH": "R",
    "RICHM": "R",
    "RC": "R",
    "103": "R",  # Staten Island zip code (per the original author's note)
    # Values intentionally mapped to missing
    "A": pd.NA,
    "F": pd.NA,
    "ABX": pd.NA,
    "108": pd.NA,
    "00000": pd.NA,
    "K   F": pd.NA,
}

dataset['violation_county'] = dataset['violation_county'].map(county_to_borough)

# Numeric code for each borough abbreviation; missing boroughs stay missing.
borough_to_code = {
    'NY': 1,
    'BX': 2,
    'K': 3,
    'Q': 4,
    'R': 5,
}

dataset['violation_county'] = dataset['violation_county'].map(borough_to_code)

In [9]:
# Preview the first five rows of the cleaned frame as a sanity check.
dataset.head(n=5)

Unnamed: 0,summons_number,plate_id,registration_state,plate_type,issue_date,violation_code,vehicle_body_type,vehicle_make,issuing_agency,street_code1,...,unregistered_vehicle,vehicle_year,meter_number,feet_from_curb,violation_post_code,violation_description,no_standing_or_stopping_violation,hydrant_violation,double_parking_violation,DataYear
30356,5069663409,HSP9388,PA,PAS,2013-07-20 18:09:00,7,CP,DODGE,V,0,...,,1998,,0,,FAILURE TO STOP AT RED LIGHT,,,,2014
87255,7433097510,T466979C,NY,OMT,2013-07-29 08:06:00,21,SUBN,TOYOT,T,51090,...,,2013,,0,H -,21-No Parking (street clean),,,,2014
11675,1358115771,49722JG,NY,COM,2013-07-10 09:44:00,19,VAN,MAZDA,X,0,...,0.0,1995,-,0,,,,,,2014
91091,7780960298,K5669F,TN,PAS,2013-07-29 14:36:00,14,4DSD,DODGE,T,68020,...,,0,,0,35 -,14-No Standing,,,,2014
36443,5069699234,EEV3130,NY,PAS,2013-07-21 22:07:00,7,2DSD,ME/BE,V,0,...,,2004,,0,,FAILURE TO STOP AT RED LIGHT,,,,2014


In [28]:
# Persist the cleaned dataset next to the input file, using the pyarrow backend.
dataset.to_parquet(
    path=f"{base_path}/sample_data_cleaned.parquet",
    engine="pyarrow",
)