In [10]:
import json
import math
from typing import Dict, Any, Optional, List
from pyspark import RDD

def clean_us_accidents_pyspark(
    # Quoted so the annotation is not evaluated when the function is defined;
    # RDD is not guaranteed to be subscriptable at runtime in every PySpark release.
    rows: "RDD[Dict[str, Any]]",
    header_cols: List[str],
    output_dir: str,
) -> None:
    """Clean raw US-Accidents records and write them to ``output_dir`` as JSON lines.

    Pipeline: de-duplicate on ``ID`` (one record per non-blank ID is kept; rows
    without an ID are never merged with each other), clean every row with
    ``clean_row``, discard rejected rows, then save one JSON object per line.

    A row is rejected when ``Start_Time`` is blank, when a ``Severity`` column
    is present but is not a number that rounds to 1-4, or when ``Start_Lat`` /
    ``Start_Lng`` are missing, non-numeric, non-finite or out of range.
    Other invalid values (unknown state code, unparsable or out-of-range
    weather readings) are replaced with ``None`` and the row is kept.

    Args:
        rows: RDD of records, one dict per accident keyed by column name.
            The input dicts are not modified.
        header_cols: Column names of the source file. Only the (currently
            disabled) median-imputation step reads it.
        output_dir: Directory handed to ``RDD.saveAsTextFile``.

    Returns:
        None. The cleaned records are written to ``output_dir``.
    """
    US_STATES = {
        "AL","AK","AZ","AR","CA","CO","CT","DE","FL","GA","HI","ID","IL","IN","IA","KS","KY","LA",
        "ME","MD","MA","MI","MN","MS","MO","MT","NE","NV","NH","NJ","NM","NY","NC","ND","OH","OK",
        "OR","PA","RI","SC","SD","TN","TX","UT","VT","VA","WA","WV","WI","WY","DC"
    }

    # Inclusive (low, high) limits; readings outside them are replaced with None.
    WEATHER_BOUNDS = {
        "Temperature(F)": (-70.0, 130.0),
        "Wind_Chill(F)": (-50.0, 130.0),
        "Humidity(%)": (0.0, 100.0),
        "Pressure(in)": (15.0, 32.0),
        "Visibility(mi)": (0.0, 100.0),
        "Wind_Speed(mph)": (0.0, 200.0),
        "Precipitation(in)": (0.0, 50.0),
    }

    COLS_TO_DROP = {"ID", "Source", "Zipcode", "Timezone", "Airport_Code", "End_Lat", "End_Lng"}

    # Upper-cased spellings that are collapsed to a short code; any other
    # value is kept as its upper-cased form.
    WIND_DIR_MAP = {
        "VARIABLE": "VAR",
        "VAR": "VAR",
        "CALM": "CALM",
        "NORTH": "N",
        "SOUTH": "S",
        "EAST": "E",
        "WEST": "W",
    }

    TWILIGHT_COLS = ["Sunrise_Sunset","Civil_Twilight","Nautical_Twilight","Astronomical_Twilight"]

    BOOL_COLS = [
        "Amenity","Bump","Crossing","Give_Way","Junction","No_Exit","Railway",
        "Roundabout","Station","Stop","Traffic_Calming","Traffic_Signal","Turning_Loop"
    ]

    def strip(s: Any) -> Optional[str]:
        """Return ``str(s)`` without surrounding whitespace, or None if s is None/blank."""
        if s is None:
            return None
        s = str(s).strip()
        return s if s != "" else None

    def to_float(x: Any) -> Optional[float]:
        """Parse x as a finite float; None for blank, unparsable, NaN or infinite input."""
        sx = strip(x)
        if sx is None:
            return None
        try:
            f = float(sx)
        except ValueError:
            # A single malformed cell must not abort the whole Spark job.
            return None
        # NaN slips through range comparisons and serialises as invalid JSON;
        # infinities overflow int(round(...)). Treat both as missing.
        return f if math.isfinite(f) else None

    def to_int_round(x: Any) -> Optional[int]:
        """Parse x as a float and round it to the nearest int; None if unparsable."""
        f = to_float(x)
        if f is None:
            return None
        return int(round(f))

    def in_bounds(v: Optional[float], lo: float, hi: float) -> Optional[float]:
        """Return v when lo <= v <= hi, otherwise None (None stays None)."""
        if v is None:
            return None
        # Written as a positive range test so a NaN can never pass.
        return v if lo <= v <= hi else None

    def normalise_twilight(v: Any) -> str:
        """Map a day/night flag to "Day", "Night" or "Unknown" (case-insensitive)."""
        v = strip(v)
        if v is None:
            return "Unknown"
        v = v.title()
        return v if v in ("Day", "Night") else "Unknown"

    def parse_bool(v: Any) -> Optional[bool]:
        """Interpret a bool or a "true"/"false" string (any case); None for anything else."""
        if v is None:
            return None
        if isinstance(v, bool):
            return v
        s = strip(v)
        if s is None:
            return None
        s = s.lower()
        if s == "true":
            return True
        if s == "false":
            return False
        return None

    def clean_row(row: Dict[str, Any]) -> Optional[Dict[str, Any]]:
        """Return a cleaned copy of row, or None when the row must be discarded."""
        # Trim all strings. Building a new dict leaves the caller's record untouched.
        row = {k: (v.strip() if isinstance(v, str) else v) for k, v in row.items()}

        if strip(row.get("Start_Time")) is None:
            return None

        # Severity must round to an integer from 1 to 4
        if "Severity" in row:
            severity = to_int_round(row.get("Severity"))
            if severity is None or severity < 1 or severity > 4:
                return None
            row["Severity"] = severity

        # Bound coordinates and remove rows with missing coordinates
        lat = in_bounds(to_float(row.get("Start_Lat")), -90.0, 90.0)
        lng = in_bounds(to_float(row.get("Start_Lng")), -180.0, 180.0)
        if lat is None or lng is None:
            return None
        row["Start_Lat"] = lat
        row["Start_Lng"] = lng

        # Upper-case the state code; anything that is not a known US state
        # becomes None (the row itself is kept).
        if "State" in row:
            st = strip(row.get("State"))
            st = st.upper() if st else None
            row["State"] = st if (st in US_STATES) else None

        # Bound weather data
        for c, (lo, hi) in WEATHER_BOUNDS.items():
            if c in row:
                row[c] = in_bounds(to_float(row.get(c)), lo, hi)

        # Standardise wind direction
        if "Wind_Direction" in row:
            wd = strip(row.get("Wind_Direction"))
            wd = wd.upper() if wd else None
            row["Wind_Direction"] = None if wd is None else WIND_DIR_MAP.get(wd, wd)

        # Normalise twilight cols
        for c in TWILIGHT_COLS:
            if c in row:
                row[c] = normalise_twilight(row.get(c))

        # Assume all missing booleans to be false
        for c in BOOL_COLS:
            if c in row:
                b = parse_bool(row.get(c))
                row[c] = False if b is None else b

        # Drop all redundant cols
        for c in COLS_TO_DROP:
            row.pop(c, None)

        return row

    def dedupe_key(pair):
        """Key a (row, unique_id) pair by its ID, or by unique_id when the ID is blank."""
        row, uid = pair
        rid = strip(row.get("ID"))
        # Rows without an ID get a key of their own; sharing one key would
        # collapse every such row into a single record.
        key = ("id", rid) if rid is not None else ("row", uid)
        return key, row

    # Remove duplicated IDs (one arbitrary record per ID is kept)
    deduped = (
        rows
        .zipWithUniqueId()
        .map(dedupe_key)
        .reduceByKey(lambda a, b: a)
        .values()
    )

    # Not cached: the RDD is consumed exactly once by the write below.
    cleaned = deduped.map(clean_row).filter(lambda r: r is not None)

    # # Fill in missing weather data using median from sampled data
    # # (disabled; when re-enabling, cache `cleaned` because it is then read twice)
    # sc = rows.context
    # cleaned = cleaned.cache()
    # weather_cols = [c for c in WEATHER_BOUNDS.keys() if c in header_cols]
    # median_sample_frac= 0.02
    # median_min_samples = 50000

    # if weather_cols:
    #     sampled = (
    #         cleaned
    #         .sample(withReplacement=False, fraction=median_sample_frac, seed=1)
    #         .take(median_min_samples)
    #     )

    #     medians: Dict[str, Optional[float]] = {}
    #     for c in weather_cols:
    #         vals = []
    #         for r in sampled:
    #             v = r.get(c)
    #             if v is None:
    #                 continue
    #             fv = float(v)
    #             if math.isnan(fv):
    #                 continue
    #             vals.append(fv)

    #         if not vals:
    #             medians[c] = None
    #         else:
    #             vals.sort()
    #             n = len(vals)
    #             mid = n // 2
    #             medians[c] = vals[mid] if (n % 2 == 1) else (vals[mid - 1] + vals[mid]) / 2.0

    #     medians_bc = sc.broadcast(medians)

    #     def fill_weather_medians(r: Dict[str, Any]) -> Dict[str, Any]:
    #         m = medians_bc.value
    #         for c in weather_cols:
    #             if r.get(c) is None and m.get(c) is not None:
    #                 r[c] = float(m[c])
    #         return r

    #     cleaned = cleaned.map(fill_weather_medians)

    # Write to output directory, one JSON object per line
    cleaned.map(lambda r: json.dumps(r, ensure_ascii=False)).saveAsTextFile(output_dir)

In [18]:
from pyspark.sql import SparkSession
import csv

# Create (or reuse) a Spark session running in local mode ("local[*]").
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("clean-us-accidents")
    .getOrCreate()
)
sc = spark.sparkContext

# Load the raw CSV as an RDD of text lines; its first line is used as the header row.
lines = sc.textFile("US_Accidents_March23.csv", minPartitions=8)
header = lines.first()

def parse_csv_line(line: str):
    """Split a single CSV-formatted text line into its list of field strings.

    Parsing is delegated to ``csv.reader`` so that quoted fields containing
    commas are returned as one field.
    """
    single_line_reader = csv.reader([line])
    fields = next(single_line_reader)
    return fields

# Column names, taken from the header line of the file.
header_cols = parse_csv_line(header)

# One dict per data line, keyed by column name.
rows_rdd = (
    lines.filter(lambda x: x != header)   # skip the header line itself
         .map(parse_csv_line)
         # zip() silently truncates to the shorter sequence, so a line whose
         # field count differs from the header would yield a partial or
         # misaligned record; discard such lines instead.
         .filter(lambda vals: len(vals) == len(header_cols))
         .map(lambda vals: dict(zip(header_cols, vals)))
)

ConnectionRefusedError: [Errno 61] Connection refused

In [12]:
import shutil
import os

# Please change the output dir accordingly
OUTPUT_DIR = "out/cleaned_jsonl"

# saveAsTextFile fails when the target directory already exists, so remove the
# output of a previous run first. NOTE: this deletes OUTPUT_DIR and everything in it.
if os.path.isdir(OUTPUT_DIR):
    shutil.rmtree(OUTPUT_DIR)

clean_us_accidents_pyspark(
    rows=rows_rdd,
    header_cols=header_cols,
    output_dir=OUTPUT_DIR,
)

ConnectionRefusedError: [Errno 61] Connection refused