In [None]:
# Import Dependencies
import os
import pandas as pd
from pathlib import Path
import matplotlib.pyplot as plt
import folium
from folium.plugins import MarkerCluster

In [None]:
# Load the accidents dataset from the project's data folder and preview it.
US_Accidents_Path = Path("data/Accidents.csv")
US_Accidents_df = pd.read_csv(filepath_or_buffer=US_Accidents_Path)
# Show the first rows as the cell output.
US_Accidents_df.head()

In [None]:
# Load Licensed_Drivers_clean.csv and keep only the 'Drivers' column,
# indexed by 'State'. Note that the result is a Series, not a DataFrame.
drivers_path = Path("data/Licensed_Drivers_clean.csv")
drivers_df = (
    pd.read_csv(drivers_path)
    .set_index('State')
    ['Drivers']
)
drivers_df.head()


In [None]:
# Number of accidents in every state, using only 2022 records (the downstream
# scatter plot is titled "... in 2022").
accidents_2022_df = US_Accidents_df[US_Accidents_df.Year == 2022]

# Group the 2022-only frame. Grouping the full US_Accidents_df here would
# silently count accidents from every year in the dataset.
state_accidents = accidents_2022_df.groupby('State').size()

state_accidents.head()

In [None]:

# Accidents per 100 drivers for each state. The division aligns the two Series
# on their index labels; a state present in only one of them yields NaN.
state_accidents_per_driver = (state_accidents * 100) / drivers_df

# Combine both Series into one frame and drop states without a complete pair.
df = pd.DataFrame({
    "Drivers": drivers_df,
    "AccidentsPerDriverPerYear": state_accidents_per_driver
}).dropna()

# Scatter plot: number of drivers (log-scaled x axis) against the accident rate.
fig, ax = plt.subplots(figsize=(12, 10))
ax.scatter(df['Drivers'], df['AccidentsPerDriverPerYear'])

# Label every point with its state. itertuples() yields (index, Drivers, rate)
# without building a Series per row the way iterrows() does.
for state, drivers, accidents_per_driver in df.itertuples():
    ax.text(drivers, accidents_per_driver, state, ha='right', va='bottom')

ax.set_xlabel('Drivers (log scale)')
ax.set_ylabel('Accidents per Driver per Year (%)')
ax.set_xscale('log')

ax.set_title("State Accidents per Driver in 2022")
plt.show()


In [None]:
# Work on a copy so the raw accidents frame is left untouched.
US_Accidents_with_date = US_Accidents_df.copy()

# Parse both timestamp columns from ISO 8601 text into datetime values.
for time_column in ('Start_Time', 'End_Time'):
    US_Accidents_with_date[time_column] = pd.to_datetime(
        US_Accidents_with_date[time_column],
        format='ISO8601',
    )

# Derive two helper columns from the accident start timestamp:
#   Date     - the calendar date without the time of day
#   Week_Day - the weekday name (e.g. 'Monday')
start_times = US_Accidents_with_date['Start_Time']
US_Accidents_with_date['Date'] = start_times.dt.date
US_Accidents_with_date['Week_Day'] = start_times.dt.day_name()

US_Accidents_with_date.head()

In [None]:
# On which days of the week do accidents occur the most?
# Calendar order used to sort the weekday names (alphabetical order is useless here).
day_order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']

# Count rows per 'Week_Day', turn the weekday into an ordered categorical,
# and sort so the rows run Monday through Sunday.
day_counts = (
    US_Accidents_with_date
    .groupby('Week_Day')
    .size()
    .reset_index(name='Count')
    .assign(
        Week_Day=lambda counts: pd.Categorical(
            counts['Week_Day'], categories=day_order, ordered=True
        )
    )
    .sort_values('Week_Day')
)

print(day_counts)

In [None]:
# Bar chart of accident counts per weekday.
weekday_ax = day_counts.plot(x='Week_Day', y='Count', kind='bar', figsize=(10, 6), color='skyblue')

# Title and axis labels, set on the axes returned by pandas.
weekday_ax.set_title('Accident Counts by Day of the Week')
weekday_ax.set_xlabel('Day of the Week')
weekday_ax.set_ylabel('Number of Accidents')

# Tilt the weekday labels, tighten the layout and render the figure.
plt.setp(weekday_ax.get_xticklabels(), rotation=45)
plt.tight_layout()
plt.show()


In [None]:
# Tally how often each distinct value of the 'Roundabout' column occurs.
roundabout_counts = US_Accidents_df.loc[:, "Roundabout"].value_counts()
print(roundabout_counts)

In [None]:
# Do roundabouts help with reducing accidents?
# Compare the number of accidents flagged with a roundabout against all accidents.
roundabout_flags = US_Accidents_with_date["Roundabout"]
total_accidents = len(US_Accidents_with_date)
# Summing the column counts its truthy entries (presumably a boolean flag - verify).
roundabout_accidents = roundabout_flags.sum()
print(f"Number of accidents with roundabouts {roundabout_accidents}. Total accidents {total_accidents}")

# Share of accidents with and without a roundabout, in percent.
non_roundabout_accidents = total_accidents - roundabout_accidents
percentage_roundabout = (roundabout_accidents / total_accidents) * 100
percentage_no_roundabout = (non_roundabout_accidents / total_accidents) * 100
print (f"Percentage of accidents with roundabouts is: {percentage_roundabout}%")
print (f"Percentage of accidents without roundabouts is: {percentage_no_roundabout:.2f}%")


## 1. Weather and Environment
# Does weather affect accident frequency (rain, fog, snow, wind)?
# Accidents are grouped by state and by date.
# The goal is to see whether, within one month in one state, the number of accidents differs between rain/snow and clear conditions.



In [None]:
# How do weather conditions affect accident frequency?
# Count accident IDs for every (precipitation, date, state) combination.
weather_keys = ['Precipitation(in)', 'Date', 'State']
weather_grouped = (
    US_Accidents_with_date
    .groupby(weather_keys)['ID']
    .count()
    .reset_index(name='Accident_Count')
)
weather_grouped.head()



In [None]:
# Put the grouped counts in chronological order.
weather_grouped_sorted = weather_grouped.sort_values('Date')
weather_grouped_sorted.head()


In [None]:
# Keep only the rows whose 'State' value is 'WA'.
is_wa_row = weather_grouped_sorted['State'] == 'WA'
WA = weather_grouped_sorted[is_wa_row]
WA.head()


In [None]:
# Colour for each row: blue when Precipitation(in) is exactly 0, red otherwise.
# WA was produced by boolean filtering, so writing a new column into it in place
# triggers pandas' SettingWithCopyWarning. assign() returns a new frame instead,
# which also keeps this cell safe to re-run.
WA = WA.assign(
    Color=WA['Precipitation(in)'].apply(lambda x: 'blue' if x == 0 else 'red')
)

# Bar chart of accident counts per date, coloured by the precipitation flag.
# NOTE(review): WA holds one row per (precipitation, date) pair, so several bars
# can share the same date and are drawn on top of each other.
fig, ax = plt.subplots(figsize=(10, 6))
ax.bar(WA['Date'], WA['Accident_Count'], color=WA['Color'])

# Axis labels and title.
ax.set_xlabel('Date')
ax.set_ylabel('Accident Count')
ax.set_title('Accidents in WA by Precipitation Levels')

# Rotate the date labels so they stay readable.
plt.setp(ax.get_xticklabels(), rotation=90)

plt.show()


In [None]:
# Subsets of accidents whose 'Weather_Condition' text mentions wind, rain, fog
# or snow. na=False treats rows with a missing condition as non-matching.
weather_text = US_Accidents_df['Weather_Condition'].str
windy_conditions = US_Accidents_df[weather_text.contains('Windy', na=False)]
rainy_conditions = US_Accidents_df[weather_text.contains('Rain', na=False)]
foggy_conditions = US_Accidents_df[weather_text.contains('Fog', na=False)]
snow_conditions = US_Accidents_df[weather_text.contains('Snow', na=False)]

In [None]:
# Share (in percent) of all accidents with a non-null ID that fall into each
# weather subset. The denominator is identical for all four, so compute it once.
accidents_with_id = US_Accidents_df["ID"].count()

rainy_perc = (rainy_conditions["ID"].count() / accidents_with_id) * 100
windy_perc = (windy_conditions["ID"].count() / accidents_with_id) * 100
foggy_perc = (foggy_conditions["ID"].count() / accidents_with_id) * 100
snow_perc = (snow_conditions["ID"].count() / accidents_with_id) * 100

In [None]:
# Report each weather share, rounded to two decimals, one line per condition.
occurrence_lines = (
    f"Occurance of accidents when its Raining : {rainy_perc:.2f}%",
    f"Occurance of accidents when its Windy : {windy_perc:.2f}%",
    f"Occurance of accidents when its Foggy : {foggy_perc:.2f}%",
    f"Occurance of accidents when its Snowing : {snow_perc:.2f}%",
)
for occurrence_line in occurrence_lines:
    print(occurrence_line)

In [None]:
# Report the mean 'Severity' of each weather subset, rounded to two decimals.
severity_lines = (
    f"Average severity of accidents when its Windy : {windy_conditions['Severity'].mean():.2f}",
    f"Average severity of accidents when its Rainy : {rainy_conditions['Severity'].mean():.2f}",
    f"Average severity of accidents when its Foggy : {foggy_conditions['Severity'].mean():.2f}",
    f"Average severity of accidents when its Snowing : {snow_conditions['Severity'].mean():.2f}",
)
for severity_line in severity_lines:
    print(severity_line)



## Visibility

In [None]:
# How does visibility relate to the number of accidents?
# Count accident IDs per distinct visibility value, ordered by visibility.
visibility_counts = (
    US_Accidents_with_date
    .groupby('Visibility(mi)')['ID']
    .count()
    .reset_index(name='Number of Accidents')
    .sort_values(by='Visibility(mi)')
)

# One bar per distinct visibility value.
visibility_ax = visibility_counts.plot(
    x='Visibility(mi)',
    y='Number of Accidents',
    kind='bar',
    figsize=(10, 6),
    color='skyblue',
)

# Title and axis labels.
visibility_ax.set_title('Number of Accidents by Visibility')
visibility_ax.set_xlabel('Visibility (miles)')
visibility_ax.set_ylabel('Number of Accidents')

# Tilt the tick labels, tighten the layout and render.
plt.setp(visibility_ax.get_xticklabels(), rotation=45)
plt.tight_layout()
plt.show()

In [None]:
# Same visibility bar chart, with a logarithmic y axis so small counts stay visible.
visibility_log_ax = visibility_counts.plot(
    x='Visibility(mi)',
    y='Number of Accidents',
    kind='bar',
    figsize=(10, 6),
    color='skyblue',
    logy=True,
)

# Title and axis labels.
visibility_log_ax.set_title('Number of Accidents by Visibility (Log Scale)')
visibility_log_ax.set_xlabel('Visibility (miles)')
visibility_log_ax.set_ylabel('Number of Accidents (Log Scale)')
plt.setp(visibility_log_ax.get_xticklabels(), rotation=45)
plt.tight_layout()
plt.show()


In [None]:
# Visibility bin edges in miles and their labels. With right=False below every
# bin is left-closed: '0-0.3' covers [0, 0.3) and '10+' covers [10, inf).
bins = [0, 0.3, 0.5, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, float('inf')]
labels = ['0-0.3', '0.3-0.5', '0.5-1', '1-2', '2-3', '3-4', '4-5', '5-6', '6-7', '7-8', '8-9', '9-10', '10+']

# Assign every accident to a visibility bin (stored as a categorical column).
US_Accidents_with_date['Visibility Category'] = pd.cut(US_Accidents_with_date['Visibility(mi)'], bins=bins, labels=labels, right=False)

# Count accident IDs per bin. observed=False is spelled out because grouping on
# a categorical column without it raises a FutureWarning on recent pandas, and
# it keeps empty bins in the result with a count of 0.
visibility_accidents = US_Accidents_with_date.groupby('Visibility Category', observed=False)['ID'].count().reset_index(name='Number of Accidents')

print(visibility_accidents)


In [None]:
# Plot the number of accidents per visibility range.
category_ax = visibility_accidents.plot(
    x='Visibility Category',
    y='Number of Accidents',
    kind='bar',
    figsize=(10, 6),
    color='skyblue',
)
category_ax.set_title('Number of Accidents by Visibility Range')
category_ax.set_xlabel('Visibility Category')
category_ax.set_ylabel('Number of Accidents')
plt.setp(category_ax.get_xticklabels(), rotation=45)
plt.tight_layout()
plt.show()

In [None]:


# Exclude accidents recorded at exactly 10 miles visibility before counting.
# Previously this cell repeated the binning of the full data and excluded
# nothing, so its output was identical to the earlier binning cell.
# NOTE(review): "exclude 10 miles" is read as `!= 10`; values above 10 still
# land in the '10+' bin - confirm this matches the intended analysis.
visibility_without_10 = US_Accidents_with_date.loc[
    US_Accidents_with_date['Visibility(mi)'] != 10,
    ['ID', 'Visibility(mi)'],
]

# Bin the remaining rows with the same edges/labels, without touching the
# 'Visibility Category' column of the shared frame.
visibility_without_10 = visibility_without_10.assign(**{
    'Visibility Category': pd.cut(
        visibility_without_10['Visibility(mi)'],
        bins=bins,
        labels=labels,
        right=False,
    )
})

# Group by visibility category and count accidents (empty bins kept as 0).
visibility_accidents = (
    visibility_without_10
    .groupby('Visibility Category', observed=False)['ID']
    .count()
    .reset_index(name='Number of Accidents')
)

# Print the result
print(visibility_accidents)


In [None]:

# Bin the original data (including 10 miles) with the shared edges and labels;
# right=False makes every bin left-closed.
US_Accidents_with_date['Visibility Category'] = pd.cut(
    US_Accidents_with_date['Visibility(mi)'],
    bins=bins,
    labels=labels,
    right=False
)

# Count accident IDs per visibility bin. observed=False is explicit so that
# empty bins stay in the result and recent pandas versions do not raise a
# FutureWarning about the changing default for categorical groupers.
visibility_accidents = (
    US_Accidents_with_date
    .groupby('Visibility Category', observed=False)['ID']
    .count()
    .reset_index(name='Number of Accidents')
)

# Smooth the counts with a centred 3-bin moving average. The first and last
# bins have no full window, so their smoothed value is NaN and is not drawn.
visibility_accidents['Smoothed Accidents'] = visibility_accidents['Number of Accidents'].rolling(window=3, center=True).mean()

# Plot both the original and the smoothed counts on the same axes.
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(
    visibility_accidents['Visibility Category'],
    visibility_accidents['Number of Accidents'],
    marker='o', label='Original Data'
)
ax.plot(
    visibility_accidents['Visibility Category'],
    visibility_accidents['Smoothed Accidents'],
    marker='o', label='Smoothed Data'
)

ax.set_title('Number of Accidents by Visibility Category', fontsize=14)
ax.set_xlabel('Visibility Category (miles)', fontsize=12)
ax.set_ylabel('Number of Accidents', fontsize=12)
plt.setp(ax.get_xticklabels(), rotation=45)
ax.legend()
plt.tight_layout()
plt.show()


In [None]:
# Interactive map with one clustered marker per accident.
# The map object is called `accident_map` instead of `map` so the built-in
# map() function is not shadowed for the rest of the notebook.
accident_map = folium.Map(
    # Centre the view on the mean accident coordinates.
    location=[US_Accidents_with_date['Start_Lat'].mean(), US_Accidents_with_date['Start_Lng'].mean()],
    zoom_start=6,
)

# MarkerCluster groups nearby points together.
marker_cluster = MarkerCluster().add_to(accident_map)

# Only the four columns used below are kept, and rows without coordinates are
# dropped because folium rejects NaN locations.
# NOTE(review): this adds one marker per row; if the dataset is very large,
# consider sampling it first - confirm the size of Accidents.csv.
marker_rows = (
    US_Accidents_with_date[['ID', 'Severity', 'Start_Lat', 'Start_Lng']]
    .dropna(subset=['Start_Lat', 'Start_Lng'])
)

# itertuples() avoids building a Series per row, unlike iterrows().
for accident in marker_rows.itertuples(index=False):
    folium.Marker(
        location=[accident.Start_Lat, accident.Start_Lng],
        popup=f"Accident ID: {accident.ID}. Severity: {accident.Severity}",
        icon=folium.Icon(color='red', icon='info-sign')
    ).add_to(marker_cluster)

# Show the map
accident_map