# Imports

In [76]:
# File managment:
import zipfile
import os
from google.colab import drive

# Common:
import pandas as pd
import numpy as np

# Visualization
import plotly.express as px
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Data

Read Data

In [77]:
# Get CSV from drive
drive.mount('/content/drive')
file_path = f'/content/drive/Shared drives/Visualization Project (Anton and Omri)/data/US_Accidents_March23_sampled_500k.csv' # Small Sample
# file_path = f'/content/drive/Shared drives/Visualization Project (Anton and Omri)/data/US_Accidents_March23.csv' # All Data
data = pd.read_csv(file_path, low_memory=False)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Create Data & Time Columns

This code casts the data in Start_Time and End_Time columns to be of type DateTime and breacks down the columns Start_Time to 4 more columns to represent: Year,Month,Day,Hour.

In [78]:
# Remove milliseconds from datetime strings (if present)
data['Start_Time'] = data['Start_Time'].apply(lambda x: x.split('.')[0] if '.' in x else x)
data['End_Time'] = data['End_Time'].apply(lambda x: x.split('.')[0] if '.' in x else x)
# Convert the datetime columns to datetime object
data['Start_Time'] = pd.to_datetime(data['Start_Time'], format='%Y-%m-%d %H:%M:%S')
data['End_Time'] = pd.to_datetime(data['End_Time'], format='%Y-%m-%d %H:%M:%S')

# Create Date & Time Features
data['Year'] = data['Start_Time'].dt.year
data['Month'] = data['Start_Time'].dt.month
data['Day'] = data['Start_Time'].dt.day
data['Hour'] = data['Start_Time'].dt.hour

Add holiday data

In [79]:
def defin_holidy(row):
  date = row['Date']

  # New Years
  if (date.month == 12 and date.day == 31) or (date.month == 1 and date.day == 1):
    return "New Year's"

  # Independence Day
  elif date.month == 7 and date.day == 4:
    return "Independence Day"

  # Thanksgiving
  elif str(date) in ['2017-11-23', '2017-11-24', '2018-11-22', '2018-11-23','2019-11-28', '2019-11-29', '2020-11-26', '2020-11-27', '2022-11-24', '2022-11-25']:
    return "Thanksgiving"

  # Christmas
  elif date.month == 12 and date.day == 25:
    return "Christmas"

  else:
    return "Non Holiday"

In [80]:
# Copy relevenet columns
df = data[["Start_Time","Year","Day","Sunrise_Sunset"]].copy()

# Extract the date part from 'Start_Time'
df['Date'] = df['Start_Time'].dt.date

# Create holidy column
df['Holiday'] = df.apply(defin_holidy, axis=1)

# df['Year'] = df['Start_Time'].dt.year
# df['Month'] = df['Start_Time'].dt.month
# df['Day'] = df['Start_Time'].dt.day
# df['Hour'] = df['Start_Time'].dt.hour

# df[(df['Year'] == 2020) & (df['Month'] == 12) & (df['Day'] == 31)].sort_values(by='Start_Time')




In [81]:
# # Example dictionary mapping holidays to the number of days they span in a year
# holiday_days = {
#     "New Year's": 1,
#     'Independence Day': 1,
#     'Thanksgiving': 2,
#     'Christmas': 1,
#     'Non Holiday': 365 - 5
# }

# # Grouping and calculating the accident counts
# accident_counts = df.groupby(['Year', "Day", 'Holiday', 'Sunrise_Sunset']).size().reset_index(name='Accident_Count')
# accident_daily_avg = accident_counts.groupby(['Year', 'Holiday', 'Sunrise_Sunset'])['Accident_Count'].mean().reset_index(name='Accident_Rate')

# # # Normalizing by the number of days each holiday spans
# # accident_counts['Days'] = accident_counts['Holiday'].map(holiday_days)
# # accident_counts['Accident_Rate'] = accident_counts['Accident_Count'] / accident_counts['Days']

In [87]:
# Grouping by Year, Holiday, Sunrise_Sunset and calculating daily accident count and number of days
df_grouped = df.groupby(['Year', 'Holiday', 'Sunrise_Sunset']).agg(
    Daily_Accident_Count=('Date', 'size'),
    Number_of_Days=('Date', 'nunique')
).reset_index()

# Calculating average daily accident count
df_grouped['Average_Daily_Accident_Count'] = df_grouped['Daily_Accident_Count'] / df_grouped['Number_of_Days']
df_grouped

Unnamed: 0,Year,Holiday,Sunrise_Sunset,Daily_Accident_Count,Number_of_Days,Average_Daily_Accident_Count
0,2016,Christmas,Day,12,1,12.000000
1,2016,Christmas,Night,17,1,17.000000
2,2016,Independence Day,Day,65,1,65.000000
3,2016,Independence Day,Night,21,1,21.000000
4,2016,New Year's,Day,25,1,25.000000
...,...,...,...,...,...,...
65,2022,Thanksgiving,Night,35,1,35.000000
66,2023,New Year's,Day,107,1,107.000000
67,2023,New Year's,Night,188,1,188.000000
68,2023,Non Holiday,Day,9217,63,146.301587


In [88]:

# Filtering the DataFrame
df_grouped = df_grouped[(df_grouped['Year'] != 2023) & (df_grouped['Year'] != 2016)]

# Creating the combined accident rate DataFrame
df_combined = df_grouped.groupby(['Year', 'Holiday'])['Average_Daily_Accident_Count'].sum().reset_index()

# Creating separate dataframes for Day and Night
df_day = df_grouped[df_grouped['Sunrise_Sunset'] == 'Day'].groupby(['Year', 'Holiday'])['Average_Daily_Accident_Count'].sum().reset_index()
df_night = df_grouped[df_grouped['Sunrise_Sunset'] == 'Night'].groupby(['Year', 'Holiday'])['Average_Daily_Accident_Count'].sum().reset_index()

# Creating subplots
fig = make_subplots(
    rows=2, cols=2,
    subplot_titles=(
        "Total Accident Rate for Each Holiday Across Different Years",
        "Day Accident Rates",
        "Night Accident Rates"
    ),
    specs=[[{"type": "bar", "colspan": 2}, None], [{"type": "bar"}, {"type": "bar"}]],
    shared_xaxes=True
)

# Define specific colors for each holiday
holiday_colors = {
    "New Year's": "#aa6f73",
    'Independence Day': "#a39193",
    'Thanksgiving': "#eea990",
    'Christmas': "#f6e0b5",
    'Non Holiday': "#66545e"
}

# Adding the combined accident rates bar plot with specific colors
for holiday in df_combined['Holiday'].unique():
    holiday_data = df_combined[df_combined['Holiday'] == holiday]
    fig.add_trace(
        go.Bar(name=holiday, x=holiday_data['Year'], y=holiday_data['Average_Daily_Accident_Count'], legendgroup=holiday, marker_color=holiday_colors[holiday]),
        row=1, col=1
    )

# Adding the day accident rates bar plot with specific colors
for holiday in df_day['Holiday'].unique():
    holiday_data = df_day[df_day['Holiday'] == holiday]
    fig.add_trace(
        go.Bar(name=holiday, x=holiday_data['Year'], y=holiday_data['Average_Daily_Accident_Count'], legendgroup=holiday, showlegend=False, marker_color=holiday_colors[holiday]),
        row=2, col=1
    )

# Adding the night accident rates bar plot with specific colors
for holiday in df_night['Holiday'].unique():
    holiday_data = df_night[df_night['Holiday'] == holiday]
    fig.add_trace(
        go.Bar(name=holiday, x=holiday_data['Year'], y=holiday_data['Average_Daily_Accident_Count'], legendgroup=holiday, showlegend=False, marker_color=holiday_colors[holiday]),
        row=2, col=2
    )

# Updating layout
fig.update_layout(
    height=800,
    title_text="Accident Rates by Holiday and Time of Day",
    barmode='group',
    template='seaborn'  # Change this to any theme you prefer, e.g., 'plotly', 'plotly_white', etc.
)

# Showing the plot
fig.show()

# Questions and Visualizations

## Question 1 - Can we use the coordiante data to answer location based questions ?

Sub Questions:
1. Are there roads that their traffic is heavly affected by accidents ?
2. Are there locationas that are prone to accidents during specific hours of the day ?

### Chosen Visualization - US Map

#### Visualization Funcction

In [None]:
def map_viz(df,color_feature):
    ######################################################################### Color #########################################################################
    color_features = {
        "Traffic Affect Severity": "Severity",
        "Hour": "Hour",
        "Day or Night": "Civil_Twilight",
    }

    color_feature = color_features[color_feature]

    if color_feature == 'Severity':
        color_discrete_map = {
            '1': '#28B463',
            '2': '#3498DB',
            '3': '#E74C3C',
            '4': '#17202A'
        }
        df['Severity'] = df['Severity'].astype(str)

    elif color_feature == "Hour":
        color_scale = ['#000000', '#00001a', '#000033', '#00004d', '#000066', '#990000', '#e62e00', '#ff8000',
                       '#ffb31a', '#66ccff', '#1ab2ff', '#0099ff']

        color_scale = color_scale + color_scale[::-1]

        color_scale = ["#000000","#00E1E5","#F40000","#000000"]
        color_discrete_map = None

    elif color_feature == "Civil_Twilight":
        color_discrete_map = {
            'Day': "#FFB200",
            'Night': "#000000"
        }


    if color_feature in ['Civil_Twilight', 'Severity']:
        fig = px.scatter_mapbox(
            df,
            lat='Start_Lat',
            lon='Start_Lng',
            color=color_feature,
            color_discrete_map=color_discrete_map,
            size_max=15,
            zoom=3,
            mapbox_style="carto-positron",
            title="Geospatial Visualization of Accidents",
            hover_data={'State': True, 'Country': True, 'City': True, 'Street': True},
            opacity=0.5
        )
    else:
        fig = px.scatter_mapbox(
            df,
            lat='Start_Lat',
            lon='Start_Lng',
            color=color_feature,
            color_continuous_scale=color_scale,
            size_max=15,
            zoom=3,
            mapbox_style="carto-positron",
            title="Geospatial Visualization of Accidents",
            hover_data={'State': True, 'Country': True, 'City': True, 'Street': True},
            opacity=1
        )


    # Update the layout of the figure
    fig.update_layout(
        width=1400,
        height=800,
        template='seaborn',
        font=dict(family='Arial', size=12, color='black'),  # Set font style
    )

    fig.show()

#### Visualization

In [None]:

map_viz(data,"Traffic Affect Severity")
# map_viz(data,"Day or Night").show()