In [2]:
import pandas as pd

# Read the raw accident records; the CSV is expected in the working directory.
df = pd.read_csv(filepath_or_buffer="US_Accidents_March23.csv")
print(df.shape)
df.head(n=5)

(7728394, 46)


Unnamed: 0,ID,Source,Severity,Start_Time,End_Time,Start_Lat,Start_Lng,End_Lat,End_Lng,Distance(mi),...,Roundabout,Station,Stop,Traffic_Calming,Traffic_Signal,Turning_Loop,Sunrise_Sunset,Civil_Twilight,Nautical_Twilight,Astronomical_Twilight
0,A-1,Source2,3,2016-02-08 05:46:00,2016-02-08 11:00:00,39.865147,-84.058723,,,0.01,...,False,False,False,False,False,False,Night,Night,Night,Night
1,A-2,Source2,2,2016-02-08 06:07:59,2016-02-08 06:37:59,39.928059,-82.831184,,,0.01,...,False,False,False,False,False,False,Night,Night,Night,Day
2,A-3,Source2,2,2016-02-08 06:49:27,2016-02-08 07:19:27,39.063148,-84.032608,,,0.01,...,False,False,False,False,True,False,Night,Night,Day,Day
3,A-4,Source2,3,2016-02-08 07:23:34,2016-02-08 07:53:34,39.747753,-84.205582,,,0.01,...,False,False,False,False,False,False,Night,Day,Day,Day
4,A-5,Source2,2,2016-02-08 07:39:07,2016-02-08 08:09:07,39.627781,-84.188354,,,0.01,...,False,False,False,False,True,False,Day,Day,Day,Day


In [5]:
import pandas as pd
import numpy as np
# from sklearn.impute import IterativeImputer

# Two-letter codes accepted in the "State" column: the 50 states plus DC.
# Any other value is blanked out by clean_us_accidents().
US_STATES = set(
    "AL AK AZ AR CA CO CT DE FL GA HI ID IL IN IA KS KY LA "
    "ME MD MA MI MN MS MO MT NE NV NH NJ NM NY NC ND OH OK "
    "OR PA RI SC SD TN TX UT VT VA WA WV WI WY DC".split()
)

def _is_text_dtype(dtype) -> bool:
    """Return True for columns that hold text (object or pandas string dtype)."""
    return dtype == "object" or isinstance(dtype, pd.StringDtype)


def _parse_datetimes(values: pd.Series, max_passes: int = 4) -> pd.Series:
    """Convert ``values`` to datetimes, tolerating more than one timestamp layout.

    pandas 2.x infers a single format from the first value and, with
    ``errors="coerce"``, turns every value written in a different layout
    (e.g. ``...:00`` vs ``...:00.000000000``) into NaT. To avoid silently
    losing those values, anything that was present but failed to parse is
    parsed again on its own, so a format can be inferred for that subset.

    Args:
        values: Column to convert.
        max_passes: Upper bound on parsing passes (the first pass included).

    Returns:
        A Series with the same index as ``values``; unparseable entries are NaT.
    """
    # Positional index: keeps the alignment below unambiguous even when the
    # caller's index contains duplicate labels.
    raw = values.reset_index(drop=True)
    parsed = pd.to_datetime(raw, errors="coerce")

    for _ in range(max_passes - 1):
        pending = parsed.isna() & raw.notna()
        if not pending.any():
            break
        retry = pd.to_datetime(raw[pending], errors="coerce")
        # Only merge plain (tz-naive) datetime results; anything else keeps the
        # first-pass outcome rather than risking a dtype clash.
        recoverable = (
            retry.notna().any()
            and pd.api.types.is_datetime64_dtype(parsed)
            and pd.api.types.is_datetime64_dtype(retry)
        )
        if not recoverable:
            break
        parsed = parsed.fillna(retry.astype(parsed.dtype))

    parsed.index = values.index
    return parsed


def _null_outside_bounds(values: pd.Series, lo=None, hi=None) -> pd.Series:
    """Coerce ``values`` to numeric and blank out entries outside ``[lo, hi]``.

    ``Series.mask`` is used instead of ``.loc[...] = np.nan`` so that integer
    columns are upcast by a regular (non-setitem) operation.
    """
    numeric = pd.to_numeric(values, errors="coerce")
    if lo is not None:
        numeric = numeric.mask(numeric < lo)
    if hi is not None:
        numeric = numeric.mask(numeric > hi)
    return numeric


def _keep_rows(df: pd.DataFrame, keep: pd.Series) -> pd.DataFrame:
    """Return the rows of ``df`` where ``keep`` is True as an independent frame."""
    if keep.all():
        return df
    # Explicit copy: later column assignments must not hit a filtered slice
    # (that is what triggers SettingWithCopyWarning).
    return df.loc[keep].copy()


def clean_us_accidents(df: pd.DataFrame) -> pd.DataFrame:
    """Return a cleaned copy of a US-Accidents style DataFrame.

    The input frame is never modified. Every step is applied only when the
    column(s) it needs are present. Steps, in order:

    1. Strip surrounding whitespace from text columns.
    2. Drop rows with a duplicated ``ID``, then drop redundant columns.
    3. Parse ``Start_Time``/``End_Time``/``Weather_Timestamp``; drop rows
       whose ``Start_Time`` cannot be parsed.
    4. Blank out-of-range ``Severity`` (valid 1-4) and store it as ``Int64``.
    5. Blank out-of-range coordinates; drop rows missing a coordinate.
    6. Upper-case ``State`` and blank values not in module-level ``US_STATES``.
    7. Blank out-of-range weather readings, then fill missing readings with
       medians computed on a fixed-seed sample.
    8. Normalise ``Wind_Direction`` and the four twilight columns to categories.
    9. Convert the point-of-interest flags to nullable booleans, missing -> False.

    Args:
        df: Raw accident records.

    Returns:
        A new, cleaned DataFrame.
    """
    df = df.copy()

    # 1. Remove surrounding whitespace in text columns.
    for col in df.columns:
        if _is_text_dtype(df[col].dtype):
            df[col] = df[col].astype("string").str.strip()

    # 2. Drop rows with duplicated IDs (first occurrence wins).
    if "ID" in df.columns:
        df["ID"] = df["ID"].astype("string")
        df = df.drop_duplicates(subset=["ID"])

    # Remove redundant columns.
    cols_to_drop = ["ID", "Source", "Zipcode", "Timezone", "Airport_Code", "End_Lat", "End_Lng"]
    df = df.drop(columns=cols_to_drop, errors="ignore")

    # 3. Parse timestamps; wrong/missing values become NaT.
    for col in ("Start_Time", "End_Time", "Weather_Timestamp"):
        if col in df.columns:
            df[col] = _parse_datetimes(df[col])

    # Remove rows with a missing start time.
    if "Start_Time" in df.columns:
        df = _keep_rows(df, df["Start_Time"].notna())

    # 4. Severity must lie in 1..4; anything else becomes <NA>.
    if "Severity" in df.columns:
        severity = _null_outside_bounds(df["Severity"], 1, 4)
        df["Severity"] = severity.round().astype("Int64")

    # 5. Coordinates must be valid latitude/longitude values.
    coord_bounds = {"Start_Lat": (-90, 90), "Start_Lng": (-180, 180)}
    coord_cols = [c for c in coord_bounds if c in df.columns]
    for col in coord_cols:
        lo, hi = coord_bounds[col]
        df[col] = _null_outside_bounds(df[col], lo, hi)

    # Remove rows missing Start_Lat or Start_Lng.
    if coord_cols:
        df = _keep_rows(df, df[coord_cols].notna().all(axis=1))

    # 6. Upper-case State and set states which are not in US_STATES to <NA>.
    if "State" in df.columns:
        state = df["State"].astype("string").str.upper()
        df["State"] = state.where(state.isin(US_STATES), pd.NA).astype("category")

    # 7. Plausibility limits for the weather readings.
    weather_bounds = {
        "Temperature(F)": (-70, 130),
        "Wind_Chill(F)": (-50, 130),
        "Humidity(%)": (0, 100),
        "Pressure(in)": (15, 32),
        "Visibility(mi)": (0, 100),
        "Wind_Speed(mph)": (0, 200),
        "Precipitation(in)": (0, 50),
    }
    weather_cols = [c for c in weather_bounds if c in df.columns]
    for col in weather_cols:
        lo, hi = weather_bounds[col]
        df[col] = _null_outside_bounds(df[col], lo, hi)

    # Fill missing readings using medians of a sample: 2% of the rows, but at
    # least 50,000 (or every row when the frame is smaller than that).
    median_sample_frac = 0.02
    median_min_samples = 50000
    if weather_cols:
        weather = df[weather_cols]  # already numeric after the bounds step
        n_rows = len(weather)
        sample_n = int(min(n_rows, max(median_min_samples, round(median_sample_frac * n_rows))))
        if sample_n < n_rows:
            # Fixed seed keeps the imputed values reproducible between runs.
            sample = weather.sample(n=sample_n, random_state=0)
        else:
            sample = weather
        medians = sample.median(skipna=True)
        df[weather_cols] = weather.fillna(medians)

    # 8. Standardise wind directions to the short upper-case codes.
    if "Wind_Direction" in df.columns:
        wind = df["Wind_Direction"].astype("string").str.strip().str.upper()
        wind = wind.replace({
            "VARIABLE": "VAR",
            "NORTH": "N",
            "SOUTH": "S",
            "EAST": "E",
            "WEST": "W",
        })
        df["Wind_Direction"] = wind.astype("category")

    # Normalise the twilight columns to Day/Night/Unknown.
    twilight_cols = ["Sunrise_Sunset", "Civil_Twilight", "Nautical_Twilight", "Astronomical_Twilight"]
    for col in twilight_cols:
        if col not in df.columns:
            continue
        phase = df[col].astype("string").str.strip().str.title()
        df[col] = phase.where(phase.isin(["Day", "Night"]), "Unknown").astype("category")

    # 9. Point-of-interest flags.
    bool_cols = [
        "Amenity", "Bump", "Crossing", "Give_Way", "Junction", "No_Exit", "Railway",
        "Roundabout", "Station", "Stop", "Traffic_Calming", "Traffic_Signal", "Turning_Loop",
    ]
    for col in bool_cols:
        if col not in df.columns:
            continue
        flags = df[col]
        # Handle the case where the boolean arrived as text ("True"/"False").
        if _is_text_dtype(flags.dtype):
            flags = flags.str.lower().map({"true": True, "false": False})
        # Assume all missing boolean values to be False.
        df[col] = flags.astype("boolean").fillna(False)

    return df



In [6]:
# Build the cleaned frame; the cleaner works on a copy, so `df` keeps the raw data.
dt = clean_us_accidents(df=df)