# USA Accidents

__External Dependencies__

In [45]:
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
import dash_bootstrap_components as dbc
from dash import Dash, Input, Output, html, dcc
import plotly.express as px
import os
import dash

__Graph colors constants__

In [46]:
# graph colors
GRADE1COLOR = 'rgb(77, 175, 74)'
GRADE2COLOR = 'rgb(255, 255, 51)'
GRADE3COLOR = 'rgb(255, 127, 0)'
GRADE4COLOR = 'rgb(228, 26, 20)'

__Configuration Variables__

In [47]:
# population dataset path
Path_to_population_dataset = 'Dataset\\US_Population.csv'

# accident dataset path
Path_to_accident_dataset = 'Dataset\\US_Accidents_Sampled.csv'

# sampling factor (>=1)
Sampling_Factor = 100
Already_Sampled = True # for efficency the dataset is already sampled

__Load population Dataset__

In [48]:
# load dataset for US population
df_pop = pd.read_csv(Path_to_population_dataset)
df_pop.dropna()
df_pop.head()

Unnamed: 0,Year,AL,AK,AZ,AR,CA,CO,CT,DE,FL,...,SD,TN,TX,UT,VT,VA,WA,WV,WI,WY
0,2016,4860545,741522,6941072,2989918,39250017,5540545,3576452,952065,20612439,...,865454,6651194,27862596,3051217,624594,8411808,7288000,1831102,5778708,585501
1,2017,4874747,739786,7044008,3001345,39536653,5607154,3573880,961939,20928863,...,869666,6715984,28304596,3101833,623657,8470020,7423362,1818157,5790186,584910
2,2018,4887681,735139,7158024,3009733,39776830,5691287,3573297,971180,21244317,...,878698,6771631,28704330,3153550,624344,8517685,7535591,1804291,5807406,578668
3,2019,4903185,731158,7278717,3017804,39576757,5758736,3571520,981822,21538187,...,882235,6829174,29145505,3205958,624358,8565256,7614893,1792147,5822434,577601
4,2020,4921532,727890,7421401,3029887,39368078,5845526,3565287,990837,21899341,...,886667,6886834,29618533,3251617,623989,8616207,7693612,1778070,5837466,567025


__Load accident Dataset__

In [49]:
df_acc = pd.read_csv(Path_to_accident_dataset)

# remove unecessary columns
df_acc.drop(['Source', 'Start_Lat', 'Start_Lng', 'End_Lat', 'End_Lng', 'Description', 'Airport_Code'], axis=1, inplace=True)

# remove null values
df_acc.dropna()



if(not(Already_Sampled)):
    number_of_samples = (df_acc.index)/Sampling_Factor
    df_acc = df_acc.sample(n = number_of_samples, random_state = 351)

# Convert 'Start_Time' and 'End_Time to datetime
df_acc['Start_Time'] = pd.to_datetime(df_acc['Start_Time'],format='ISO8601')
df_acc['End_Time'] = pd.to_datetime(df_acc['End_Time'],format='ISO8601')

df_acc['Year'] = df_acc['Start_Time'].dt.year

# remove year 2023 because it is incomplete
df_acc = df_acc[(df_acc['Start_Time'].dt.year != 2023)]

df_acc.head()

Unnamed: 0.1,Unnamed: 0,ID,Severity,Start_Time,End_Time,Distance(mi),Street,City,County,State,...,Station,Stop,Traffic_Calming,Traffic_Signal,Turning_Loop,Sunrise_Sunset,Civil_Twilight,Nautical_Twilight,Astronomical_Twilight,Year
0,2909380,A-2919259,3,2018-02-28 19:10:31,2018-02-28 19:55:17,0.0,Carson St,Lakewood,Los Angeles,CA,...,False,False,False,False,False,Night,Night,Night,Day,2018
1,4123847,A-4154878,2,2022-11-22 16:09:03,2022-11-22 17:26:55,0.99,I-40 W,Nashville,Davidson,TN,...,False,False,False,False,False,Day,Day,Day,Day,2022
2,7006131,A-7055452,2,2020-06-24 20:28:00,2020-06-24 21:02:05,0.0,I-94 E,Minneapolis,Hennepin,MN,...,False,False,False,False,False,Day,Day,Day,Day,2020
3,1191012,A-1200791,2,2021-01-08 18:01:03,2021-01-08 18:47:15,0.0,S Choctaw Dr,Baton Rouge,East Baton Rouge,LA,...,False,False,False,True,False,Night,Night,Day,Day,2021
4,453540,A-453553,2,2017-05-11 08:05:29,2017-05-11 08:34:56,0.02,Meridian Ave N,Seattle,King,WA,...,False,False,False,True,False,Day,Day,Day,Day,2017


In [50]:
df_acc.info()

<class 'pandas.core.frame.DataFrame'>
Index: 67722 entries, 0 to 69999
Data columns (total 41 columns):
 #   Column                 Non-Null Count  Dtype         
---  ------                 --------------  -----         
 0   Unnamed: 0             67722 non-null  int64         
 1   ID                     67722 non-null  object        
 2   Severity               67722 non-null  int64         
 3   Start_Time             67722 non-null  datetime64[ns]
 4   End_Time               67722 non-null  datetime64[ns]
 5   Distance(mi)           67722 non-null  float64       
 6   Street                 67627 non-null  object        
 7   City                   67718 non-null  object        
 8   County                 67722 non-null  object        
 9   State                  67722 non-null  object        
 10  Zipcode                67705 non-null  object        
 11  Country                67722 non-null  object        
 12  Timezone               67660 non-null  object        
 13  Weathe

__First Graph (SingleBar)__

In [51]:
def SingleBarChart(df, TimeInterval, Sampling_Factor):

    if (TimeInterval == 'Yearly'):
        yearly_counts = (df['Start_Time'].dt.year.value_counts().sort_index())*Sampling_Factor
        fig = px.bar(
            x = yearly_counts.index, 
            y = yearly_counts.values,          
            labels = {'x': 'Year', 'y': 'Number of Accidents'},
            title = 'Number of Accidents per Year'
        )
        return fig
    
    if (TimeInterval == 'Monthly'):
        monthly_counts = (df['Start_Time'].dt.month.value_counts().sort_index())*Sampling_Factor
        fig = px.bar(
            x = monthly_counts.index, 
            y = monthly_counts.values,          
            labels = {'x': 'Month', 'y': 'Number of Accidents'},
            title = 'Number of Accidents per Month'
        )
        return fig
   
    if (TimeInterval == 'Daily'):
        daily_counts = (df['Start_Time'].dt.dayofweek.value_counts().sort_index())*Sampling_Factor
        day_names = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
        fig = px.bar(
            x = [day_names[day] for day in daily_counts.index], 
            y = daily_counts.values,          
            labels = {'x': 'Day of the Week', 'y': 'Number of Accidents'},
            title = 'Number of Accidents per Day'
        )
        return fig
    
    if (TimeInterval == 'Hourly'):
        hourly_counts = (df['Start_Time'].dt.hour.value_counts().sort_index())*Sampling_Factor
        fig = px.bar(
            x = hourly_counts.index, 
            y = hourly_counts.values,          
            labels = {'x': 'Hour', 'y': 'Number of Accidents'},
            title = 'Number of Accidents per Hour'
        )
        return fig

In [52]:
SingleBarChart(df_acc, 'Daily', Sampling_Factor).show()

__First graph (multi bar)__

In [53]:
def GrouBySeverity(df, TimeInterval, Severity):
    if (TimeInterval == 'Yearly'):
        categories = df['Start_Time'].dt.year.value_counts().sort_index().index
        values1 = Severity[0]['Start_Time'].dt.year.value_counts().sort_index()
        values2 = Severity[1]['Start_Time'].dt.year.value_counts().sort_index()
        values3 = Severity[2]['Start_Time'].dt.year.value_counts().sort_index()
        values4 = Severity[3]['Start_Time'].dt.year.value_counts().sort_index()
        return [categories,values1,values2,values3,values4]
    
    if (TimeInterval == 'Monthly'):
        categories = df['Start_Time'].dt.month.value_counts().sort_index().index
        values1 = Severity[0]['Start_Time'].dt.month.value_counts().sort_index()
        values2 = Severity[1]['Start_Time'].dt.month.value_counts().sort_index()
        values3 = Severity[2]['Start_Time'].dt.month.value_counts().sort_index()
        values4 = Severity[3]['Start_Time'].dt.year.value_counts().sort_index()
        return [categories,values1,values2,values3,values4]
    
    if (TimeInterval == 'Daily'):
        categories = df['Start_Time'].dt.dayofweek.value_counts().sort_index().index
        values1 = Severity[0]['Start_Time'].dt.dayofweek.value_counts().sort_index()
        values2 = Severity[1]['Start_Time'].dt.dayofweek.value_counts().sort_index()
        values3 = Severity[2]['Start_Time'].dt.dayofweek.value_counts().sort_index()
        values4 = Severity[3]['Start_Time'].dt.year.value_counts().sort_index()
        return [categories,values1,values2,values3,values4]
    
    if (TimeInterval == 'Hourly'):
        categories = df['Start_Time'].dt.hour.value_counts().sort_index().index
        values1 = Severity[0]['Start_Time'].dt.hour.value_counts().sort_index()
        values2 = Severity[1]['Start_Time'].dt.hour.value_counts().sort_index()
        values3 = Severity[2]['Start_Time'].dt.hour.value_counts().sort_index()
        values4 = Severity[3]['Start_Time'].dt.year.value_counts().sort_index()
        return [categories,values1,values2,values3,values4]

In [54]:
def MultiBarChart(df, TimeInterval, Sampling_Factor):
    #devide dataset on severity of the accident
    Severity = [df[df.Severity == 1], df[df.Severity == 2], df[df.Severity == 3], df[df.Severity == 4]]
    [categories,values1,values2,values3,values4] = GrouBySeverity(df, TimeInterval, Severity)
        
    # create single bars
    trace1 = go.Bar(
        x=categories,
        y=values1.values * Sampling_Factor,
        name='Very Light',
        marker=dict(color = GRADE1COLOR),
        textposition='auto'
    )
    trace2 = go.Bar(
        x=categories,
        y=values2.values * Sampling_Factor,
        name='Light',
        marker=dict(color = GRADE2COLOR),
        textposition='auto'
    )
    trace3 = go.Bar(
        x=categories,
        y=values3.values * Sampling_Factor,
        name='Medium',
        marker=dict(color = GRADE3COLOR),
        textposition='auto'
    )
    trace4 = go.Bar(
        x=categories,
        y=values4.values * Sampling_Factor,
        name='High',
        marker=dict(color = GRADE4COLOR),
        textposition='auto'
    )

    # Create the figure
    fig = go.Figure(data=[trace1, trace2, trace3, trace4])

    # Update layout for better visualization
    fig.update_layout(
        title='Effect on Traffic',
        xaxis=dict(title='Categories'),
        yaxis=dict(title='Values'),
        barmode='group',  # This will group the bars side by side
        bargap=0.15,      # Gap between bars of adjacent location coordinates
        bargroupgap=0.1   # Gap between bars of the same location coordinate
    )

    return fig

In [55]:
MultiBarChart(df_acc, 'Hourly', Sampling_Factor).show()

__Second Graph (pie chart)__

In [56]:
def PieChart(df_acc, time_interval):
    # Filter the DataFrame based on the provided time interval
    if time_interval != 'all':
        df_acc = df_acc[df_acc['Start_Time'].dt.year == int(time_interval)]

    # Calculate the distribution of 'Severity'
    severity_counts = df_acc['Severity'].value_counts().sort_index()

    # Define the severity names corresponding to severity counts index
    severity_names = ['Very Light', 'Light', 'Medium', 'High']

    # Create a mapping from severity index to names
    severity_index_to_name = {i: name for i, name in zip(severity_counts.index, severity_names)}

    # Map the severity index to names
    severity_counts.index = severity_counts.index.map(severity_index_to_name)

    # Create a pie chart
    fig = px.pie(values=severity_counts, 
                 names=severity_counts.index,
                 title="Distribution of Severity",
                 color=severity_counts.index,  # Specify the column for the colors
                 color_discrete_map={
                     'Very Light': GRADE1COLOR,
                     'Light': GRADE2COLOR,
                     'Medium': GRADE3COLOR,
                     'High': GRADE4COLOR
                    }
                )

    # Update the layout for the legend
    fig.update_layout(legend_title_text='Effect on traffic')

    return fig

In [57]:
PieChart(df_acc, '2020').show()

__Third Graph (orizontal bar chart)__

In [58]:
def BestWorstAcc(df_acc, df_pop, time_interval, orderby, SampleRescalingFactor):
    
    # Group accident data by state and year
    accidents_grouped = df_acc.groupby(['State', 'Year']).size().reset_index(name='Accident_Count')

    # Reshape population data from wide to long format
    population_long = pd.melt(df_pop, id_vars=['Year'], var_name='State', value_name='Population')

    # Merge population data with accidents data
    merged_data = pd.merge(accidents_grouped, population_long, how='inner', on=['State', 'Year'])

    # Calculate accident rate per 100,000 population
    merged_data['Accident_Rate_per_100k'] = (merged_data['Accident_Count'] / merged_data['Population']) * 100000 * SampleRescalingFactor

    data = merged_data
    # Select data for a specific year
    if(time_interval != 'all'):
        data = merged_data[merged_data['Year'] == int(time_interval)]

    if(orderby=='WorstToBest'):
        # Sort the data by Accident_Rate_per_100k in descending order
        data = data.sort_values(by='Accident_Rate_per_100k', ascending=False)
    if(orderby=='BestToWorst'):
        data = data.sort_values(by='Accident_Rate_per_100k', ascending=True)
    
    # Create the bar chart using Plotly
    fig = px.bar(
        data,
        y='State',
        x='Accident_Rate_per_100k',
        title=f'Accidents per 100,000 Residents by State in {time_interval}',
        labels={'Accident_Rate_per_100k': 'Accidents per 100,000 Residents'},
        height=850
    )
    return fig

In [59]:
BestWorstAcc(df_acc, df_pop, '2020', 'WorstToBest', Sampling_Factor).show()

__Graph=Weather conditions__

In [None]:
#df_acc['Temperature(F)']

# convert temperature from fareneight to celsius
df_acc['Temperature(F)'] = (df_acc['Temperature(F)'] - 32) * 5.0/9.0
df_acc.rename(columns={'Temperature(F)': 'Tempetature(C)'}, inplace=True)








__Fourth Graph (scatterplot)__

In [75]:
df_acc['Tempetature(C)'].mean()

np.float64(-22.475299975136423)

In [61]:
def ScatterPloLocations(df_acc):
    df_acc = df_acc[df_acc['Severity'] == 1]
    return 0


#mask_query = ((data["Salario"]<1500) & (data["Età"] <=25))

#print(data[data["Salario"]<1500]["Salario"].head())

#data.loc[data["Salario"]<1500, "Salario"] = data[data["Salario"]<1500]["Salario"]+150

#print(data[mask_query]["Salario"].head())

In [62]:
# Load the dataset
#df = pd.read_csv('Dataset\\US_Accidents_Sampled.csv')

#a random sample of approximately 10% of the rows from the DataFrame df.
#sample_df=df.sample(int(1*len(df))) 
#sns.scatterplot(x=df.Start_Lng, y=sample_df.Start_Lat, size=0.001)

#px.scatter(df_acc, x='Start_Lng', y='Start_Lat', color = 'Severity', width=500, 
           #height=50
           #).show()

In [63]:
# whether conditions
#import pandas as pd

# Load the dataset
#data = pd.read_csv('Dataset\\US_Accidents_Sampled.csv')

# Count the frequency of accidents for each weather condition
#weather_accidents = data['Weather_Condition'].value_counts().reset_index()
#weather_accidents.columns = ['Weather_Condition', 'Frequency']

In [64]:
#import plotly.express as px

# Create a bar plot
#fig = px.bar(weather_accidents, x='Weather_Condition', y='Frequency',
#             title='Frequency of Accidents by Weather Condition',
#             labels={'Weather_Condition': 'Weather Condition', 'Frequency': 'Number of Accidents'},
#             template='plotly_white')

# Show the plot
#fig.show()

In [65]:
# Create a pivot table for heatmap
#pivot_table = pd.pivot_table(data, values='ID', index='Visibility(mi)', columns='Temperature(F)', aggfunc='count').fillna(0)

# Create a heatmap
#fig = px.imshow(data,
#                labels=dict(x="Visibility(mi)", y="Temperature(F)", color="Number of Accidents"),
#                title='Heatmap of Accidents by Weather Condition and Time of Day',
 #               template='plotly_white')

# Show the plot
#fig.show()

In [66]:
#import pandas as pd
#import plotly.express as px

# Load the dataset
#data = pd.read_csv('Dataset\\US_Accidents_Sampled.csv')

# Define the bin edges and labels
#bins = [0, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100, float('inf')]

#for element in bins:
    
#labels = [0, 10, 20, 30, 40, 50, 60, 70, 80, 90]

# Discretize the "Visibility(mi)" column
#data['Visibility(mi)'] = pd.cut(data['Visibility(mi)'], bins=bins, right=False)

# Filter relevant columns and handle missing values
#filtered_data = data[['ID', 'Precipitation(in)', 'Visibility(mi)']]
#filtered_data.dropna(subset=['Precipitation(in)', 'Visibility(mi)'], inplace=True)




# Create a pivot table

#pivot_table = pd.pivot_table(filtered_data, values='ID', index='Precipitation(in)', columns='Visibility(mi)', aggfunc='count').fillna(0)

# Create a heatmap
#fig = px.imshow(pivot_table,
#                labels=dict(x="Visibility (mi)", y="Precipitation (in)", color="Number of Accidents"),
#                title='Heatmap of Accidents by Precipitation and Visibility',
#                template='plotly_white')

# Customize the color scale for better visibility
#fig.update_layout(coloraxis_colorscale='Viridis')

# Show the plot
#fig.show()
#data['Visibility(mi)']

In [67]:
#import pandas as pd

# Load the dataset
#data = pd.read_csv('Dataset\\US_Accidents_Sampled.csv')

# Define the bins and labels
#bins = list(range(0, 11, 1))  # Bins: [0-10), [10-20), ..., [100-110)
#labels = [i for i in range(0, 10, 1)]  # Labels: 0, 10, 20, ..., 90

# Discretize the 'Visibility(mi)' column
#data['Visibility_Discretized'] = pd.cut(data['Visibility(mi)'], bins=bins, labels=labels, right=False)

# Optional: Convert labels to integer type for convenience
#data['Visibility_Discretized'] = data['Visibility_Discretized'].astype(float).fillna(100).astype(int)

# Display the first few rows to verify
#print(data[['Visibility(mi)', 'Visibility_Discretized']].head())
# Seleziona la colonna categoriale (sostituisci 'colonna_categoriale' con il nome effettivo della tua colonna)
#condizioni_meteo = data['Weather_Condition']

# Utilizza il metodo value_counts() per ottenere il conteggio delle voci uniche
#count = condizioni_meteo.value_counts()

# Stampa i risultati
#print(count)

In [68]:
import pandas as pd
import seaborn as sns
import plotly.express as px
from constants import *

# Load the dataset
df = pd.read_csv('Dataset\\US_Accidents_Sampled.csv')

#a random sample of approximately 10% of the rows from the DataFrame df.
sample_df=df.sample(int(1*len(df))) 
#sns.scatterplot(x=df.Start_Lng, y=sample_df.Start_Lat, size=0.001)

px.scatter(df, x='Start_Lng', y='Start_Lat', color = 'Severity', width=500, 
           #height=50
           ).show()


