In [137]:
import math

import numpy as np
import pandas as pd

In [150]:
# Read the cleaned, imputed accident dataset from the working directory.
df = pd.read_csv(
    filepath_or_buffer="US_Accidents_Dec20_updated_cleaned_imputed.csv",
)

## We drop the raw timestamp columns because we already transformed them into individual hour, day, and month values

In [152]:
# Remove the raw start/end timestamp columns from the frame, in place.
df.drop(columns=["Start_Time", "End_Time"], inplace=True)

## We drop the distance metric because it is only known after the accident occurs; keeping it would leak data that we would not have access to at prediction time

In [153]:
# Remove the post-accident distance column from the frame, in place.
df.drop(columns=["Distance(mi)"], inplace=True)

## We encode the cyclical time columns with sine and cosine transforms, using a period equal to the number of distinct values each column takes (24 for hour of day, 7 for day of week, 12 for month of year)

In [154]:
def _cyclical_encode(values, period):
    """Return the (sine, cosine) encoding of a cyclical feature.

    Each value ``v`` is mapped to the angle ``2 * pi * v / period`` and the
    sine and cosine of that angle are returned, so values one full period
    apart receive the same encoding.

    Parameters
    ----------
    values : pandas.Series
        Numeric column holding the cyclical values.
    period : int or float
        Length of one full cycle (e.g. 24 for hour of day).

    Returns
    -------
    tuple of pandas.Series
        ``(sin_series, cos_series)``; both keep the index and the name of
        ``values``.
    """
    # Vectorized: NumPy ufuncs work on the whole column at once instead of
    # calling a Python lambda once per row via ``Series.apply``.
    angle = 2 * math.pi * values / period
    return np.sin(angle), np.cos(angle)


# The resulting Series deliberately keep their source column names
# ("Month_Of_Year", "Day_Of_Week", "Hour_Of_Day").
month_of_year_sin, month_of_year_cos = _cyclical_encode(df["Month_Of_Year"], 12)
day_of_week_sin, day_of_week_cos = _cyclical_encode(df["Day_Of_Week"], 7)
hour_of_the_day_sin, hour_of_the_day_cos = _cyclical_encode(df["Hour_Of_Day"], 24)

In [155]:
# Remove the three original time columns from the frame, in place.
df.drop(
    columns=["Month_Of_Year", "Day_Of_Week", "Hour_Of_Day"],
    inplace=True,
)

In [156]:
# Append the six cyclical features to the frame. Each Series is given its
# final "_sin" / "_cos" column name before concatenation; the three sine
# columns are added first, followed by the three cosine columns.
df = pd.concat(
    [
        df,
        month_of_year_sin.rename("Month_Of_Year_sin"),
        day_of_week_sin.rename("Day_Of_Week_sin"),
        hour_of_the_day_sin.rename("Hour_Of_Day_sin"),
        month_of_year_cos.rename("Month_Of_Year_cos"),
        day_of_week_cos.rename("Day_Of_Week_cos"),
        hour_of_the_day_cos.rename("Hour_Of_Day_cos"),
    ],
    axis=1,
)

## Here we one-hot encode the state, wind direction, and weather condition columns

In [158]:
# Pull the "State" column out of the frame and replace it with one indicator
# column per distinct value.
# NOTE(review): the indicator columns are named after the raw values with no
# prefix; if a value also occurs in another one-hot encoded column the frame
# would end up with duplicate column names — confirm against the data.
state_one_hot = pd.get_dummies(df.pop("State"))
df = pd.concat([df, state_one_hot], axis=1)

In [160]:
# Pull the "Wind_Direction" column out of the frame and replace it with one
# indicator column per distinct value.
# NOTE(review): the indicator columns carry no prefix; a direction label that
# matches an existing column name (possibly "NE", if it is both a state code
# and a wind direction) would produce duplicate column names — verify.
wind_direction_one_hot = pd.get_dummies(df.pop("Wind_Direction"))
df = pd.concat([df, wind_direction_one_hot], axis=1)

In [162]:
# Pull the "Weather_Condition" column out of the frame and replace it with
# one indicator column per distinct value.
# NOTE(review): the indicator columns carry no prefix; confirm that no
# condition label matches an existing column name.
weather_condition_one_hot = pd.get_dummies(df.pop("Weather_Condition"))
df = pd.concat([df, weather_condition_one_hot], axis=1)

In [165]:
# Write the prepared frame to disk with a header row and without the index.
df.to_csv(
    "US_Accidents_Dec20_updated_cleaned_imputed_data_prepared.csv",
    header=True,
    index=False,
)