# Prerocessing Data


In [3]:
###############################################################################
# data_preprocessing.py
#
# Steps:
#   1) Load 5 CSV datasets: KSI, TMC, Toronto Police Collisions,
#      Env Canada Weather, ERA5 Weather
#   2) Inspect random samples
#   3) Filter all data to 2015–2020
#   4) Subset columns to only those needed for traffic/weather modeling
#   5) Normalize coordinates:
#       - KSI geometry -> (lon, lat)
#       - TMC rename "longitude"->"lon", "latitude"->"lat"
#       - Collisions: keep them as is (0,0 if missing, no geocoding)
#       - Env Canada rename "x"->"lon", "y"->"lat"
#       - ERA5 parse .geo -> (lon, lat)
#   6) Round timestamps to hourly
#   7) [Optional] Aggregate TMC from 15-min to hourly
#   8) [Optional] Basic Weather transformations (ERA5 Kelvin->Celsius, etc.)
#   9) Print final shapes, heads
###############################################################################

import pandas as pd
import numpy as np
import ast
from IPython.display import display, Markdown
from datetime import datetime, timedelta

###############################################################################
# 1) LOAD DATA
###############################################################################

def load_datasets():
    print("Loading datasets...")
    df_ksi = pd.read_csv("Motor Vehicle Collisions with KSI Data - 4326.csv")
    df_tmc = pd.read_csv("tmc_raw_data_2010_2019.csv")
    df_collisions = pd.read_csv("Traffic_Collisions_Toronto_data.csv")
    df_env = pd.read_csv("hourly_final.csv")  # Env Canada Weather
    df_era5 = pd.read_csv("ERA.csv")          # ERA5 from GEE
    return df_ksi, df_tmc, df_collisions, df_env, df_era5


###############################################################################
# 2) INSPECT SAMPLES
###############################################################################

def show_random_unique_rows(df, title, n=10, subset=None):
    """
    Displays n unique random rows from df for quick inspection.
    If subset is provided, ensures uniqueness first by those columns.
    """
    if subset:
        df = df.drop_duplicates(subset=subset)
    else:
        df = df.drop_duplicates()
    sample = df.sample(min(n, len(df)), random_state=42)
    display(Markdown(f"### {title} – {len(sample)} Unique Rows"))
    display(sample.reset_index(drop=True))

def inspect_data(df_ksi, df_tmc, df_collisions, df_env, df_era5):
    show_random_unique_rows(df_ksi, "KSI Dataset", n=10, subset=["STREET1","STREET2"])
    show_random_unique_rows(df_tmc, "TMC Dataset", n=10, subset=["location_name"])
    show_random_unique_rows(df_collisions, "Toronto Police Collision Dataset", n=10, subset=["Neighbourhood"])
    show_random_unique_rows(df_env, "Env Canada Weather Dataset", n=10)
    show_random_unique_rows(df_era5, "ERA5 GEE Weather Dataset", n=10)


###############################################################################
# 3) DATE/TIME FILTER
###############################################################################

def to_datetime_and_filter(df, date_col, start="2015-01-01", end="2020-12-31"):
    df[date_col] = pd.to_datetime(df[date_col], errors='coerce')
    df[date_col] = df[date_col].dt.tz_localize(None)  # remove tz if present
    mask = (df[date_col] >= pd.Timestamp(start)) & (df[date_col] <= pd.Timestamp(end))
    return df[mask].copy()


###############################################################################
# 4) SUBSET COLUMNS
###############################################################################

def subset_columns(df_ksi, df_tmc, df_collisions, df_env, df_era5):
    """
    Keep only the columns directly relevant to:
      - KSI (collision severity, environment factors)
      - TMC (traffic volumes)
      - Collisions (time, location, severity)
      - Env Canada (key weather variables)
      - ERA5 (key weather variables)
    Adjust as needed for your final model goals.
    """
    # --- KSI ---
    ksi_keep = [
        'DATE', 'TIME', 'geometry',  # For time + coords
        'ACCLASS', 'IMPACTYPE', 'VEHTYPE', 'PEDESTRIAN', 'CYCLIST', 'AUTOMOBILE',
        'LIGHT', 'RDSFCOND', 'VISIBILITY', 'INJURY', 'FATAL_NO'
    ]
    df_ksi = df_ksi[[c for c in ksi_keep if c in df_ksi.columns]].copy()

    # --- TMC ---
    tmc_keep = ['count_date', 'start_time', 'end_time', 'longitude', 'latitude']
    tmc_volume_cols = [col for col in df_tmc.columns 
                       if any(prefix in col for prefix in ['n_appr_','s_appr_','e_appr_','w_appr_'])]
    df_tmc = df_tmc[tmc_keep + tmc_volume_cols].copy()

    # --- Collisions ---
    collisions_keep = [
        'OccurrenceDate', 'Neighbourhood',  # we won't geocode, just keep if needed for analysis
        'Longitude', 'Latitude',
        'Fatalities', 'Injury_Collisions', 'PD_Collisions'
    ]
    df_collisions = df_collisions[[c for c in collisions_keep if c in df_collisions.columns]].copy()

    # --- Env Canada ---
    env_keep = [
        'LOCAL_DATE', 'TEMP', 'WINDCHILL', 'PRECIP_AMOUNT',
        'RELATIVE_HUMIDITY', 'VISIBILITY', 'WEATHER_ENG_DESC',
        'x','y'
    ]
    df_env = df_env[[c for c in env_keep if c in df_env.columns]].copy()

    # --- ERA5 ---
    era5_keep = [
        'timestamp', 'temperature_2m', 'dewpoint_temperature_2m',
        'total_precipitation', 'u_component_of_wind_10m',
        'v_component_of_wind_10m', '.geo'
    ]
    df_era5 = df_era5[[c for c in era5_keep if c in df_era5.columns]].copy()

    return df_ksi, df_tmc, df_collisions, df_env, df_era5


###############################################################################
# 5) COORDINATE NORMALIZATION
###############################################################################

def parse_ksi_geometry(geom_str):
    """ Parse KSI geometry -> (lon, lat). """
    if pd.isna(geom_str):
        return pd.Series([None, None])
    try:
        geom_dict = ast.literal_eval(geom_str)
        coords = geom_dict.get("coordinates", None)
        if isinstance(coords, list) and len(coords) > 0:
            first_elem = coords[0]
            if (isinstance(first_elem, list) and len(first_elem) > 0 and
                isinstance(first_elem[0], list)):
                # Possibly a MultiLineString
                first_pair = first_elem[0]
            else:
                first_pair = first_elem
            if isinstance(first_pair, list) and len(first_pair) == 2:
                return pd.Series([float(first_pair[0]), float(first_pair[1])])
    except:
        pass
    return pd.Series([None, None])

def normalize_coordinates(df_ksi, df_tmc, df_collisions, df_env, df_era5):
    """
    - KSI: parse geometry -> (lon, lat)
    - TMC: rename (longitude->lon, latitude->lat)
    - Collisions: keep as is, rename (Longitude->lon, Latitude->lat) if present
    - Env Canada: rename (x->lon, y->lat) if present
    - ERA5: parse .geo -> (lon, lat)
    """

    # === KSI: parse geometry ===
    if 'geometry' in df_ksi.columns:
        df_ksi[['lon','lat']] = df_ksi['geometry'].apply(parse_ksi_geometry)
        df_ksi.drop(columns=['geometry'], inplace=True, errors='ignore')

    # === TMC: rename ===
    if 'longitude' in df_tmc.columns:
        df_tmc.rename(columns={'longitude':'lon','latitude':'lat'}, inplace=True)

    # === Collisions: just rename "Longitude","Latitude" => "lon","lat" if present
    if 'Longitude' in df_collisions.columns:
        df_collisions.rename(columns={'Longitude':'lon'}, inplace=True)
    if 'Latitude' in df_collisions.columns:
        df_collisions.rename(columns={'Latitude':'lat'}, inplace=True)
    # (We do NOT attempt geocoding for 0,0 or NaN in collisions here.)

    # === Env Canada: rename x->lon, y->lat if present
    if 'x' in df_env.columns and 'y' in df_env.columns:
        df_env.rename(columns={'x':'lon','y':'lat'}, inplace=True)

    # === ERA5: parse .geo -> (lon, lat)
    if '.geo' in df_era5.columns:
        def parse_era5_geo(geo_str):
            if pd.isna(geo_str):
                return pd.Series([None, None])
            try:
                geo_dict = ast.literal_eval(geo_str)
                coords = geo_dict.get("coordinates", None)
                if coords and len(coords) == 2:
                    return pd.Series([float(coords[0]), float(coords[1])])
            except:
                pass
            return pd.Series([None, None])

        df_era5[['lon','lat']] = df_era5['.geo'].apply(parse_era5_geo)
        df_era5.drop(columns=['.geo'], inplace=True)

    return df_ksi, df_tmc, df_collisions, df_env, df_era5


###############################################################################
# 6) TIME NORMALIZATION (ROUND TO HOUR)
###############################################################################

def round_all_times(df_ksi, df_tmc, df_collisions, df_env, df_era5):
    """
    Round all timestamps to hourly:
      - KSI: combine DATE + TIME -> 'datetime'
      - TMC: parse 'start_time'
      - Collisions: parse 'OccurrenceDate'
      - Env Canada: parse 'LOCAL_DATE'
      - ERA5: parse 'timestamp'
    """

    # --- KSI ---
    if 'DATE' in df_ksi.columns and 'TIME' in df_ksi.columns:
        def combine_ksi_time(row):
            if pd.isna(row['DATE']):
                return pd.NaT
            t_str = str(row['TIME']).zfill(4)  # e.g. 624 -> '0624'
            hh = int(t_str[:-2]) if len(t_str) >= 2 else 0
            mm = int(t_str[-2:]) if len(t_str) >= 2 else 0
            dt = row['DATE'] + pd.Timedelta(hours=hh, minutes=mm)
            return dt

        df_ksi['datetime'] = df_ksi.apply(combine_ksi_time, axis=1)
        df_ksi['datetime'] = pd.to_datetime(df_ksi['datetime'], errors='coerce')
        df_ksi['datetime'] = df_ksi['datetime'].dt.round('H')

    # --- TMC ---
    if 'start_time' in df_tmc.columns:
        df_tmc['start_time'] = pd.to_datetime(df_tmc['start_time'], errors='coerce')
        df_tmc['start_time'] = df_tmc['start_time'].dt.round('H')

    # --- Collisions ---
    if 'OccurrenceDate' in df_collisions.columns:
        df_collisions['OccurrenceDate'] = pd.to_datetime(df_collisions['OccurrenceDate'], errors='coerce')
        df_collisions['OccurrenceDate'] = df_collisions['OccurrenceDate'].dt.round('H')

    # --- Env Canada ---
    if 'LOCAL_DATE' in df_env.columns:
        df_env['LOCAL_DATE'] = pd.to_datetime(df_env['LOCAL_DATE'], errors='coerce')
        df_env['LOCAL_DATE'] = df_env['LOCAL_DATE'].dt.round('H')

    # --- ERA5 ---
    if 'timestamp' in df_era5.columns:
        df_era5['timestamp'] = pd.to_datetime(df_era5['timestamp'], errors='coerce')
        df_era5['timestamp'] = df_era5['timestamp'].dt.round('H')

    return df_ksi, df_tmc, df_collisions, df_env, df_era5


###############################################################################
# 7) (Optional) AGGREGATE TMC TO HOURLY
###############################################################################

def aggregate_tmc_hourly(df_tmc):
    """
    Converts TMC from 15-min intervals to hourly by summing volumes across
    each hour. This is optional but often useful if your weather & collisions
    are hourly.
    - Summarizes direction-based columns (cars, trucks, etc.).
    - Group by (lon, lat, 'hour_start').

    Returns df_tmc_hourly with columns:
        ['lon', 'lat', 'hour_start', <summed volume columns>].
    """
    if 'start_time' not in df_tmc.columns:
        print("Warning: TMC has no 'start_time' column, skipping aggregation.")
        return df_tmc

    # Create a new column 'hour_start' from 'start_time'
    df_tmc['hour_start'] = df_tmc['start_time']

    # Identify columns that hold numeric traffic volumes
    vol_cols = [
        col for col in df_tmc.columns 
        if any(prefix in col for prefix in ['n_appr_','s_appr_','e_appr_','w_appr_'])
    ]

    group_cols = ['lon','lat','hour_start']
    df_agg = df_tmc.groupby(group_cols, dropna=False)[vol_cols].sum().reset_index()

    # Example: rename group_cols if you'd like
    # df_agg.rename(columns={'hour_start':'datetime'}, inplace=True)

    return df_agg


###############################################################################
# 8) (Optional) Weather Transformations
###############################################################################

def transform_weather_era5(df_era5):
    """
    Example: Convert ERA5 temperature_2m from Kelvin to Celsius, etc.
    Create new columns like 'temp_c' and 'wind_speed'.
    """
    if 'temperature_2m' in df_era5.columns:
        df_era5['temp_c'] = df_era5['temperature_2m'] - 273.15

    if 'u_component_of_wind_10m' in df_era5.columns and 'v_component_of_wind_10m' in df_era5.columns:
        df_era5['wind_speed_10m'] = np.sqrt(
            df_era5['u_component_of_wind_10m']**2 + 
            df_era5['v_component_of_wind_10m']**2
        )

    return df_era5


###############################################################################
# MAIN
###############################################################################

def main():
    # ----------- 1) LOAD -----------
    df_ksi, df_tmc, df_collisions, df_env, df_era5 = load_datasets()
    print('Size of datasets: ', df_ksi.shape, df_tmc.shape, df_collisions.shape, df_env.shape, df_era5.shape)
    print('KSI columns: ', df_ksi.columns)
    print('TMC columns: ', df_tmc.columns)  
    print('Collisions columns: ', df_collisions.columns)
    print('Env Canada columns: ', df_env.columns)
    print('ERA5 columns: ', df_era5.columns)
    
    # ----------- 2) INSPECT -----------
    inspect_data(df_ksi, df_tmc, df_collisions, df_env, df_era5)

    # ----------- 3) FILTER 2015-2020 -----------
    df_ksi = to_datetime_and_filter(df_ksi, 'DATE')
    df_tmc = to_datetime_and_filter(df_tmc, 'count_date')
    df_collisions = to_datetime_and_filter(df_collisions, 'OccurrenceDate')
    df_env = to_datetime_and_filter(df_env, 'LOCAL_DATE')
    df_era5 = to_datetime_and_filter(df_era5, 'timestamp')

    # ----------- 4) SUBSET COLUMNS -----------
    df_ksi, df_tmc, df_collisions, df_env, df_era5 = subset_columns(
        df_ksi, df_tmc, df_collisions, df_env, df_era5
    )

    # ----------- 5) COORD NORMALIZATION -----------
    df_ksi, df_tmc, df_collisions, df_env, df_era5 = normalize_coordinates(
        df_ksi, df_tmc, df_collisions, df_env, df_era5
    )

    # ----------- 6) TIME NORMALIZATION -----------
    df_ksi, df_tmc, df_collisions, df_env, df_era5 = round_all_times(
        df_ksi, df_tmc, df_collisions, df_env, df_era5
    )

    # ----------- 7) (Optional) AGGREGATE TMC TO HOURLY -----------
    # If you want to combine TMC with hourly collisions/weather:
    df_tmc_hourly = aggregate_tmc_hourly(df_tmc)

    # ----------- 8) (Optional) WEATHER TRANSFORMATIONS -----------
    df_era5 = transform_weather_era5(df_era5)
    # Similarly, you could transform Env Canada if needed
    # e.g. parse TEMP flags or unify units with ERA5

    # ----------- PRINT RESULTS -----------
    print("\n===== FINAL DATAFRAMES =====")
    print(f"KSI shape: {df_ksi.shape}")
    print(f"TMC shape (raw 15-min or partial): {df_tmc.shape}")
    print(f"TMC hourly shape (aggregated): {df_tmc_hourly.shape}")
    print(f"Collisions shape: {df_collisions.shape}")
    print(f"Env Canada shape: {df_env.shape}")
    print(f"ERA5 shape: {df_era5.shape}")

    print("\n--- KSI HEAD ---")
    print(df_ksi.head())

    print("\n--- TMC HEAD (15-min intervals) ---")
    print(df_tmc.head())

    print("\n--- TMC HOURLY HEAD (aggregated) ---")
    print(df_tmc_hourly.head())

    print("\n--- Collisions HEAD (no geocoding) ---")
    print(df_collisions.head())

    print("\n--- Env Canada HEAD ---")
    print(df_env.head())

    print("\n--- ERA5 HEAD (with optional transforms) ---")
    print(df_era5.head())


if __name__ == "__main__":
    main()


Loading datasets...


  df_env = pd.read_csv("hourly_final.csv")  # Env Canada Weather


Size of datasets:  (18957, 50) (223817, 55) (499538, 19) (97782, 37) (526080, 10)
KSI columns:  Index(['_id', 'ACCNUM', 'DATE', 'TIME', 'STREET1', 'STREET2', 'OFFSET',
       'ROAD_CLASS', 'DISTRICT', 'ACCLOC', 'TRAFFCTL', 'VISIBILITY', 'LIGHT',
       'RDSFCOND', 'ACCLASS', 'IMPACTYPE', 'INVTYPE', 'INVAGE', 'INJURY',
       'FATAL_NO', 'INITDIR', 'VEHTYPE', 'MANOEUVER', 'DRIVACT', 'DRIVCOND',
       'PEDTYPE', 'PEDACT', 'PEDCOND', 'CYCLISTYPE', 'CYCACT', 'CYCCOND',
       'PEDESTRIAN', 'CYCLIST', 'AUTOMOBILE', 'MOTORCYCLE', 'TRUCK',
       'TRSN_CITY_VEH', 'EMERG_VEH', 'PASSENGER', 'SPEEDING', 'AG_DRIV',
       'REDLIGHT', 'ALCOHOL', 'DISABILITY', 'HOOD_158', 'NEIGHBOURHOOD_158',
       'HOOD_140', 'NEIGHBOURHOOD_140', 'DIVISION', 'geometry'],
      dtype='object')
TMC columns:  Index(['_id', 'count_id', 'count_date', 'location_name', 'longitude',
       'latitude', 'centreline_type', 'centreline_id', 'px', 'start_time',
       'end_time', 'n_appr_cars_r', 'n_appr_cars_t', 'n_appr_car

### KSI Dataset – 10 Unique Rows

Unnamed: 0,_id,ACCNUM,DATE,TIME,STREET1,STREET2,OFFSET,ROAD_CLASS,DISTRICT,ACCLOC,...,AG_DRIV,REDLIGHT,ALCOHOL,DISABILITY,HOOD_158,NEIGHBOURHOOD_158,HOOD_140,NEIGHBOURHOOD_140,DIVISION,geometry
0,4447,1097712.0,2009-04-04,228,LIPPINCOTT ST,BLOOR ST W,,Major Arterial,Toronto and East York,,...,,,Yes,,79,University,79,University (79),D14,"{""coordinates"": [[-79.4099900003758, 43.665344..."
1,259,899126.0,2006-03-26,1614,CALEDONIA RD,NORMAN AVE,,Minor Arterial,Toronto and East York,,...,,,,,92,Corso Italia-Davenport,92,Corso Italia-Davenport (92),D13,"{""coordinates"": [[-79.4557899995631, 43.677744..."
2,11502,,2015-04-22,1402,3 RAINIER SQ,,4 m South of,Local,Scarborough,Private Driveway,...,Yes,,,,148,East L'Amoreaux,117,L'Amoreaux (117),D42,"{""coordinates"": [[-79.3030119995988, 43.794242..."
3,15991,,2019-10-05,1611,DUPONT ST,BEDFORD RD,,Minor Arterial,Toronto and East York,At Intersection,...,,,,,95,Annex,95,Annex (95),D53,"{""coordinates"": [[-79.4005079994135, 43.676299..."
4,15656,,2019-06-20,1315,REGENT PARK BLVD,,,Major Arterial,Toronto and East York,At Intersection,...,,,,,72,Regent Park,72,Regent Park (72),D51,"{""coordinates"": [[-79.3617430003326, 43.660287..."
5,16973,1000499000.0,2021-03-18,1056,GREENWIN VILLAGE RD,BATHURST ST,50 m East of,Collector,North York,At/Near Private Drive,...,Yes,,,,36,Newtonbrook West,36,Newtonbrook West (36),D32,"{""coordinates"": [[-79.4448350001569, 43.790754..."
6,6095,1180034.0,2010-07-16,1011,SPADINA AVE,ST ANDREW ST,,Major Arterial,Toronto and East York,,...,,,,,78,Kensington-Chinatown,78,Kensington-Chinatown (78),D52,"{""coordinates"": [[-79.3985930000871, 43.654345..."
7,8566,1315841.0,2012-08-12,951,CHURCH ST,CARLTON ST,,Major Arterial,Toronto and East York,At Intersection,...,Yes,Yes,,,168,Downtown Yonge East,75,Church-Yonge Corridor (75),D51,"{""coordinates"": [[-79.3793900004206, 43.661845..."
8,2426,990402.0,2007-08-27,1441,FINCH Aven E,LESLIE Stre,,Major Arterial,North York,At Intersection,...,,,,,49,Bayview Woods-Steeles,49,Bayview Woods-Steeles (49),D33,"{""coordinates"": [[-79.3680900003581, 43.790045..."
9,7158,1251049.0,2011-07-17,630,BREMNER Boul,VAN DE WATER Cres,,Minor Arterial,Toronto and East York,At Intersection,...,Yes,,,,165,Harbourfront-CityPlace,77,Waterfront Communities-The Island (77),D52,"{""coordinates"": [[-79.3889900000155, 43.640345..."


### TMC Dataset – 10 Unique Rows

Unnamed: 0,_id,count_id,count_date,location_name,longitude,latitude,centreline_type,centreline_id,px,start_time,...,w_appr_bus_t,w_appr_bus_l,n_appr_peds,s_appr_peds,e_appr_peds,w_appr_peds,n_appr_bike,s_appr_bike,e_appr_bike,w_appr_bike
0,65665,29519,2012-12-04,King St W / Stanley Ter,-79.410122,43.642417,2,13467925,2314.0,2012-12-04T07:30:00,...,7,0,0,0,0,2,0,0,0,9
1,42623,28013,2011-10-17,Comstock Rd / Pharmacy Ave,-79.294925,43.719318,2,13454827,941.0,2011-10-17T07:30:00,...,0,0,0,0,0,0,0,0,0,0
2,114332,32687,2015-07-20,Lake Shore Blvd W / Brown's Line / Thirty Eigh...,-79.539402,43.5931,2,13470669,,2015-07-20T07:30:00,...,3,2,5,4,0,1,0,0,4,8
3,95360,31459,2014-10-16,Windermere Ave / Annette St,-79.48391,43.659196,2,13465467,,2014-10-16T07:30:00,...,3,0,1,1,0,3,0,1,2,6
4,152610,35121,2016-11-03,Torbarrie Rd / Judy Sgro Ave,-79.523647,43.727535,2,20145216,,2016-11-03T07:30:00,...,0,0,0,0,0,0,0,0,0,0
5,165171,35931,2017-05-09,Indian Rd / High Park Blvd,-79.453168,43.645499,2,13467619,,2017-05-09T07:30:00,...,1,0,2,8,4,0,3,1,1,6
6,32162,27291,2011-05-02,Willowdale Ave / Bishop Ave / Finch Corridor Trl,-79.407872,43.783441,2,13445655,1640.0,2011-05-02T07:30:00,...,0,0,3,0,0,1,0,0,0,2
7,147353,34773,2016-11-09,Queen's Park Cres E / St Joseph St,-79.391135,43.664851,2,13464371,2318.0,2016-11-09T07:30:00,...,0,0,2,12,28,0,0,0,0,0
8,12082,25913,2010-05-18,Bloor St W / Lansdowne Ave,-79.442734,43.658338,2,13465512,326.0,2010-05-18T07:30:00,...,0,0,36,0,28,0,17,0,9,0
9,110891,32461,2015-05-06,Warden Ave / Clonmore Dr / Hollis Kalmar Park Trl,-79.273203,43.693282,2,13459016,,2015-05-06T07:30:00,...,0,0,2,2,2,2,0,0,0,0


### Toronto Police Collision Dataset – 10 Unique Rows

Unnamed: 0,X,Y,OBJECTID,EventUniqueId,OccurrenceDate,Month,Day_of_Week,Year,Hour,Division,Atom,Neighbourhood,Fatalities,Injury_Collisions,FTR_Collisions,PD_Collisions,Longitude,Latitude,ObjectId2
0,-8832802.0,5416546.0,211103,GO-20142589039,2014/07/28 04:00:00+00,July,Monday,2014,14,D54/D55,57,Broadview North (57),0,YES,NO,NO,-79.346409,43.683194,211120
1,-8843931.0,5417016.0,371464,GO-20158008179,2015/02/20 05:00:00+00,February,Friday,2015,11,D13,109,Caledonia-Fairbank (109),0,NO,NO,YES,-79.446384,43.686249,371222
2,0.0,0.0,69459,GO-20191146878,2019/06/20 04:00:00+00,June,Thursday,2019,20,D11,88,High Park North (88),0,NO,YES,NO,0.0,0.0,69871
3,-8828159.0,5413933.0,161859,GO-20208017470,2020/06/09 04:00:00+00,June,Tuesday,2020,13,D54/D55,70,South Riverdale (70),0,NO,NO,YES,-79.3047,43.66622,161631
4,-8840497.0,5431424.0,330701,GO-20158046140,2015/11/05 05:00:00+00,November,Thursday,2015,15,D32,36,Newtonbrook West (36),0,NO,NO,YES,-79.415536,43.779769,330003
5,-8856014.0,5421364.0,473327,GO-20148004896,2014/02/01 05:00:00+00,February,Saturday,2014,18,D23,4,Rexdale-Kipling (4),0,NO,NO,YES,-79.554925,43.714484,473037
6,-8837780.0,5413070.0,279912,GO-2020796549,2020/04/27 04:00:00+00,April,Monday,2020,17,D52,79,University (79),0,YES,NO,NO,-79.391125,43.660609,279055
7,-8846494.0,5419266.0,397302,GO-2021888119,2021/05/14 04:00:00+00,May,Friday,2021,0,D12,30,Brookhaven-Amesbury (30),0,NO,YES,YES,-79.469409,43.700861,397039
8,-8854640.0,5420960.0,462993,GO-20168056458,2016/12/15 05:00:00+00,December,Thursday,2016,14,D23,5,Elms-Old Rexdale (5),0,NO,NO,YES,-79.542589,43.711865,462430
9,0.0,0.0,22395,GO-2017417456,2017/03/07 05:00:00+00,March,Tuesday,2017,18,D43,136,West Hill (136),0,YES,NO,NO,0.0,0.0,22501


### Env Canada Weather Dataset – 10 Unique Rows

Unnamed: 0,x,y,LOCAL_DATE,STATION_PRESSURE,TEMP_FLAG,WINDCHILL,LOCAL_HOUR,RELATIVE_HUMIDITY,WIND_DIRECTION_FLAG,WIND_DIRECTION,...,LOCAL_DAY,PROVINCE_CODE,UTC_DATE,DEW_POINT_TEMP,TEMP,WINDCHILL_FLAG,VISIBILITY,RELATIVE_HUMIDITY_FLAG,HUMIDEX,VISIBILITY_FLAG
0,-79.4,43.666667,2018-11-23 17:00:00,100.79,,,17,60.0,M,,...,23,ON,2018-11-23T22:00:00,-6.0,0.8,,,,,
1,-79.4,43.666667,2024-06-04 15:00:00,99.96,,,15,56.0,,,...,4,ON,2024-06-04T20:00:00,16.9,26.4,,,,32.0,
2,-79.4,43.666667,2020-06-24 16:00:00,99.37,,,16,34.0,,,...,24,ON,2020-06-24T21:00:00,6.2,22.7,,,,,
3,-79.4,43.666667,2018-02-15 13:00:00,99.11,,,13,75.0,M,,...,15,ON,2018-02-15T18:00:00,3.8,7.9,,,,,
4,-79.4,43.666667,2014-05-12 17:00:00,100.44,,,17,37.0,M,,...,12,ON,2014-05-12T22:00:00,2.0,16.7,,,,,
5,-79.4,43.666667,2017-01-14 08:00:00,102.25,,,8,66.0,M,,...,14,ON,2017-01-14T13:00:00,-13.2,-7.9,,,,,
6,-79.4,43.666667,2025-02-15 22:00:00,99.7,,,22,88.0,,,...,15,ON,2025-02-16T03:00:00,-4.4,-2.7,,,,,
7,-79.4,43.666667,2017-03-28 12:00:00,100.16,,,12,69.0,M,,...,28,ON,2017-03-28T17:00:00,2.6,8.0,,,,,
8,-79.4,43.666667,2019-07-28 18:00:00,100.07,,,18,49.0,,,...,28,ON,2019-07-28T23:00:00,16.0,27.5,,,,32.0,
9,-79.4,43.666667,2015-02-08 19:00:00,100.12,,,19,84.0,M,,...,8,ON,2015-02-09T00:00:00,-12.9,-10.6,,,,,


### ERA5 GEE Weather Dataset – 10 Unique Rows

Unnamed: 0,system:index,dewpoint_temperature_2m,location,surface_pressure,temperature_2m,timestamp,total_precipitation,u_component_of_wind_10m,v_component_of_wind_10m,.geo
0,9_20190303T01,268.240677,Agincourt,100170.890625,270.709167,2019-03-03 01:00,2.066698e-05,2.189865,-3.053253,"{""type"":""Point"",""coordinates"":[-79.2939,43.7996]}"
1,4_20200706T14,291.267044,East York,100313.261719,296.455444,2020-07-06 14:00,8.583069e-07,-4.281723,0.109268,"{""type"":""Point"",""coordinates"":[-79.3017,43.665]}"
2,5_20151230T10,273.556503,Guildwood,100968.5625,275.845764,2015-12-30 10:00,0.0002257153,5.781815,3.189896,"{""type"":""Point"",""coordinates"":[-79.1845,43.7636]}"
3,5_20200510T02,269.01358,Guildwood,100428.4375,276.657867,2020-05-10 02:00,0.0,6.243011,-1.323898,"{""type"":""Point"",""coordinates"":[-79.1845,43.7636]}"
4,1_20201202T02,269.730377,North York,98513.074219,272.274414,2020-12-02 02:00,5.876273e-05,4.092887,-4.099899,"{""type"":""Point"",""coordinates"":[-79.5181,43.7731]}"
5,5_20151017T00,273.65033,Guildwood,100559.398438,281.55542,2015-10-17 00:00,0.000784798,7.381165,-4.987549,"{""type"":""Point"",""coordinates"":[-79.1845,43.7636]}"
6,6_20150111T13,261.082565,High Park,101304.058594,265.088928,2015-01-11 13:00,4.99934e-06,3.057251,3.38732,"{""type"":""Point"",""coordinates"":[-79.4309,43.6816]}"
7,3_20170814T08,288.141953,Etobicoke,99507.597656,291.820892,2017-08-14 08:00,0.0,0.243805,-0.874802,"{""type"":""Point"",""coordinates"":[-79.62050000000..."
8,4_20191107T13,270.301544,East York,100867.046875,273.623306,2019-11-07 13:00,0.00600183,-0.419662,-4.553178,"{""type"":""Point"",""coordinates"":[-79.3017,43.665]}"
9,4_20190805T01,289.511612,East York,100103.890625,295.110657,2019-08-05 01:00,0.0,0.110672,0.722921,"{""type"":""Point"",""coordinates"":[-79.3017,43.665]}"



===== FINAL DATAFRAMES =====
KSI shape: (5571, 16)
TMC shape (raw 15-min or partial): (122570, 50)
TMC hourly shape (aggregated): (42167, 47)
Collisions shape: (417795, 7)
Env Canada shape: (52355, 9)
ERA5 shape: (525850, 10)

--- KSI HEAD ---
            DATE  TIME           ACCLASS         IMPACTYPE  \
11303 2015-01-01   624  Non-Fatal Injury       Approaching   
11304 2015-01-01   624  Non-Fatal Injury       Approaching   
11305 2015-01-01   624  Non-Fatal Injury       Approaching   
11306 2015-01-02   949  Non-Fatal Injury  Turning Movement   
11307 2015-01-02   949  Non-Fatal Injury  Turning Movement   

                            VEHTYPE PEDESTRIAN CYCLIST AUTOMOBILE  \
11303     Automobile, Station Wagon        NaN     NaN        Yes   
11304     Automobile, Station Wagon        NaN     NaN        Yes   
11305                           NaN        NaN     NaN        Yes   
11306     Automobile, Station Wagon        NaN     NaN        Yes   
11307  Truck - Closed (Blazer, etc)  

  df_ksi['datetime'] = df_ksi['datetime'].dt.round('H')
  df_tmc['start_time'] = df_tmc['start_time'].dt.round('H')
  df_collisions['OccurrenceDate'] = df_collisions['OccurrenceDate'].dt.round('H')
  df_env['LOCAL_DATE'] = df_env['LOCAL_DATE'].dt.round('H')
  df_era5['timestamp'] = df_era5['timestamp'].dt.round('H')
