In [3]:
import pandas as pd

# Read the sampled US Accidents CSV from the working directory, report its
# dimensions, and preview the first five rows as the cell's rich output.
df = pd.read_csv(filepath_or_buffer="US_Accidents_March23_Sampled.csv")
print(df.shape)
df.head(n=5)

(500000, 46)


Unnamed: 0,ID,Source,Severity,Start_Time,End_Time,Start_Lat,Start_Lng,End_Lat,End_Lng,Distance(mi),...,Roundabout,Station,Stop,Traffic_Calming,Traffic_Signal,Turning_Loop,Sunrise_Sunset,Civil_Twilight,Nautical_Twilight,Astronomical_Twilight
0,A-2047758,Source2,2,2019-06-12 10:10:56,2019-06-12 10:55:58,30.641211,-91.153481,,,0.0,...,False,False,False,False,True,False,Day,Day,Day,Day
1,A-4694324,Source1,2,2022-12-03 23:37:14.000000000,2022-12-04 01:56:53.000000000,38.990562,-77.39907,38.990037,-77.398282,0.056,...,False,False,False,False,False,False,Night,Night,Night,Night
2,A-5006183,Source1,2,2022-08-20 13:13:00.000000000,2022-08-20 15:22:45.000000000,34.661189,-120.492822,34.661189,-120.492442,0.022,...,False,False,False,False,True,False,Day,Day,Day,Day
3,A-4237356,Source1,2,2022-02-21 17:43:04,2022-02-21 19:43:23,43.680592,-92.993317,43.680574,-92.972223,1.054,...,False,False,False,False,False,False,Day,Day,Day,Day
4,A-6690583,Source1,2,2020-12-04 01:46:00,2020-12-04 04:13:09,35.395484,-118.985176,35.395476,-118.985995,0.046,...,False,False,False,False,False,False,Night,Night,Night,Night


In [None]:
import pandas as pd
import numpy as np
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

# Two-letter codes treated as valid values for the State column:
# the 50 US states plus the District of Columbia (51 codes in total).
US_STATES = set(
    """
    AL AK AZ AR CA CO CT DE FL GA HI ID IL IN IA KS KY LA
    ME MD MA MI MN MS MO MT NE NV NH NJ NM NY NC ND OH OK
    OR PA RI SC SD TN TX UT VT VA WA WV WI WY DC
    """.split()
)

def _is_text_dtype(dtype) -> bool:
    """Return True for object columns and pandas string-dtype columns."""
    return pd.api.types.is_object_dtype(dtype) or isinstance(dtype, pd.StringDtype)


def _parse_mixed_timestamps(values: pd.Series) -> pd.Series:
    """Parse timestamp strings whose precision varies from row to row.

    The accident data mixes ``YYYY-MM-DD HH:MM:SS`` with
    ``YYYY-MM-DD HH:MM:SS.000000000``. From pandas 2.0 on, ``to_datetime``
    infers a single format from the first value and, with ``errors="coerce"``,
    turns every value in the other format into NaT. ``format="mixed"`` restores
    per-element parsing; older pandas versions already parse per element and do
    not know that option.

    Unparseable or missing values come back as NaT.
    """
    if int(pd.__version__.split(".")[0]) >= 2:
        return pd.to_datetime(values, errors="coerce", format="mixed")
    return pd.to_datetime(values, errors="coerce")


def _coerce_within_bounds(values: pd.Series, lo=None, hi=None) -> pd.Series:
    """Convert ``values`` to numeric and blank out entries outside ``[lo, hi]``.

    Non-numeric entries become missing. ``Series.mask`` is used instead of
    ``.loc`` assignment so that integer columns are upcast cleanly when a value
    has to be replaced by NaN (setitem-style upcasting is deprecated in pandas).
    """
    numeric = pd.to_numeric(values, errors="coerce")
    if lo is not None:
        numeric = numeric.mask(numeric < lo)
    if hi is not None:
        numeric = numeric.mask(numeric > hi)
    return numeric


def clean_us_accidents(df: pd.DataFrame) -> pd.DataFrame:
    """Return a cleaned copy of a US Accidents data frame.

    The input frame is never modified. Every step is skipped when the column
    it works on is absent. Steps, in order:

    1. Strip surrounding whitespace from all text columns.
    2. Drop rows with a duplicated ``ID`` (first occurrence wins), then drop
       the redundant columns ``ID``, ``Source``, ``Zipcode``, ``Timezone``,
       ``Airport_Code``, ``End_Lat`` and ``End_Lng``.
    3. Parse ``Start_Time``, ``End_Time`` and ``Weather_Timestamp`` as
       datetimes (unparseable values become NaT).
    4. Set out-of-range ``Severity`` (outside 1..4) and coordinates to missing;
       ``Severity`` is rounded and stored as nullable ``Int64``.
    5. Drop rows lacking ``Start_Time``, ``Start_Lat`` or ``Start_Lng``.
    6. Upper-case ``State``; codes not in ``US_STATES`` become missing.
    7. Set out-of-range weather measurements to missing, then fill missing
       weather values with ``IterativeImputer`` (only when at least two
       weather columns with observed data are available).
    8. Normalise ``Wind_Direction`` spellings and the four twilight columns
       (``Day`` / ``Night`` / ``Unknown``); store them as categoricals.
    9. Convert the point-of-interest flags to nullable booleans, treating
       missing or unrecognised values as ``False``.

    Relies on the module-level names ``US_STATES`` and ``IterativeImputer``.

    Parameters
    ----------
    df : pd.DataFrame
        Raw accident records.

    Returns
    -------
    pd.DataFrame
        Cleaned records; the original index labels of surviving rows are kept.
    """
    df = df.copy()

    # Remove surrounding whitespace in text columns
    for c in df.columns:
        if _is_text_dtype(df[c].dtype):
            df[c] = df[c].astype("string").str.strip()

    # Drop rows with duplicated IDs
    if "ID" in df.columns:
        df = df.drop_duplicates(subset=["ID"])

    # Remove redundant columns
    cols_to_drop = ["ID", "Source", "Zipcode", "Timezone", "Airport_Code", "End_Lat", "End_Lng"]
    df = df.drop(columns=cols_to_drop, errors="ignore")

    # Parse timestamps; wrong/missing values become NaT
    for c in ("Start_Time", "End_Time", "Weather_Timestamp"):
        if c in df.columns:
            df[c] = _parse_mixed_timestamps(df[c])

    # Blank out-of-bounds severity values and store severity as nullable integer
    if "Severity" in df.columns:
        df["Severity"] = _coerce_within_bounds(df["Severity"], 1, 4).round().astype("Int64")

    # Blank out-of-bounds coordinates
    coord_bounds = {"Start_Lat": (-90, 90), "Start_Lng": (-180, 180)}
    for col, (lo, hi) in coord_bounds.items():
        if col in df.columns:
            df[col] = _coerce_within_bounds(df[col], lo, hi)

    # Drop rows without a start time or start coordinates. The explicit copy
    # detaches the result from the unfiltered frame, so the column assignments
    # below do not raise SettingWithCopyWarning.
    required = [c for c in ("Start_Time", "Start_Lat", "Start_Lng") if c in df.columns]
    if required:
        df = df.loc[df[required].notna().all(axis=1)].copy()

    # Upper-case State and set codes that are not in US_STATES to NA
    if "State" in df.columns:
        state = df["State"].astype("string").str.upper()
        df["State"] = state.where(state.isin(US_STATES)).astype("category")

    weather_bounds = {
        "Temperature(F)": (-70, 130),
        "Wind_Chill(F)": (-50, 130),
        "Humidity(%)": (0, 100),
        "Pressure(in)": (15, 32),
        "Visibility(mi)": (0, 100),
        "Wind_Speed(mph)": (0, 200),
        "Precipitation(in)": (0, 50),
    }

    # Apply bounds to weather data
    for col, (lo, hi) in weather_bounds.items():
        if col in df.columns:
            df[col] = _coerce_within_bounds(df[col], lo, hi)

    # Columns without a single observed value are left out of the imputation:
    # IterativeImputer drops such features by default, which would break both
    # the per-feature limits and the assignment back into the frame. This also
    # skips imputation for an empty frame.
    weather_num = [c for c in weather_bounds if c in df.columns and df[c].notna().any()]

    # Fill missing weather values, keeping imputed values inside the bounds
    if len(weather_num) >= 2:
        min_vals = np.array([weather_bounds[c][0] for c in weather_num], dtype=float)
        max_vals = np.array([weather_bounds[c][1] for c in weather_num], dtype=float)
        imp = IterativeImputer(
            random_state=0,
            max_iter=20,
            min_value=min_vals,
            max_value=max_vals
        )
        # Plain float64 input turns nullable-dtype NA values into NaN.
        df[weather_num] = imp.fit_transform(df[weather_num].astype("float64"))

    # Standardize wind directions
    if "Wind_Direction" in df.columns:
        wind = df["Wind_Direction"].astype("string").str.strip().str.upper()
        wind = wind.replace({
            "VARIABLE": "VAR",
            "NORTH": "N",
            "SOUTH": "S",
            "EAST": "E",
            "WEST": "W",
        })
        df["Wind_Direction"] = wind.astype("category")

    # Normalise all the twilight columns to Day/Night/Unknown
    twilight_cols = ["Sunrise_Sunset", "Civil_Twilight", "Nautical_Twilight", "Astronomical_Twilight"]
    for c in twilight_cols:
        if c not in df.columns:
            continue
        phase = df[c].astype("string").str.strip().str.title()
        df[c] = phase.where(phase.isin(["Day", "Night"]), "Unknown").astype("category")

    bool_cols = [
        "Amenity", "Bump", "Crossing", "Give_Way", "Junction", "No_Exit", "Railway",
        "Roundabout", "Station", "Stop", "Traffic_Calming", "Traffic_Signal", "Turning_Loop"
    ]
    for c in bool_cols:
        if c not in df.columns:
            continue

        flag = df[c]
        # Handle the case where the boolean is stored as text: only "true"
        # (case-insensitive) counts as True, everything else ends up False.
        if _is_text_dtype(flag.dtype):
            flag = flag.astype("string").str.strip().str.lower() == "true"

        # Assume all missing boolean values to be False
        df[c] = flag.astype("boolean").fillna(False)

    return df

