In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Render every column when a frame is displayed, then read the accidents CSV
# and preview its first five rows.
pd.set_option("display.max_columns", None)
data = pd.read_csv("US_Accidents_March23.csv")
data.head()

In [None]:
# Remove the end-point coordinate columns.
# Reassigning the result (rather than inplace=True) and passing
# errors="ignore" keeps this cell re-runnable: executing it a second time no
# longer raises KeyError because the columns are already gone.
data = data.drop(columns=['End_Lat', 'End_Lng'], errors="ignore")
# Preview a handful of rows instead of rendering the whole frame.
data.head()

In [None]:
# List the column labels of the frame.
data.columns

In [None]:
# Dimensions of the frame as (n_rows, n_columns).
data.shape # we have 7.7M records in our dataset

In [None]:
# Missing-value count for every column, most incomplete columns first
(
    data.isna()
    .sum()
    .sort_values(ascending=False)
)

In [None]:
# Impute missing values in these numeric weather columns with the column mean.
# The result is assigned back to the frame: calling
# `data[col].fillna(..., inplace=True)` operates on an intermediate Series,
# which pandas 2.x warns about and which does not update `data` at all when
# Copy-on-Write is enabled.
for col in ("Visibility(mi)", "Temperature(F)", "Pressure(in)"):
    data[col] = data[col].fillna(data[col].mean())

In [None]:
# Re-count missing values per column after imputation, largest first
(
    data.isna()
    .sum()
    .sort_values(ascending=False)
)

In [None]:
# Print the frame's column names, non-null counts and dtypes.
data.info() # checking for the data types

In [None]:
# Number of rows that repeat an earlier row across all columns.
data.duplicated().sum() # check if any duplicate value is present

In [None]:
# Convert End_Time to datetime and derive calendar / clock columns from it.
# Any fractional-seconds suffix (".<digits>") is stripped first so a single
# fixed format parses every value.
# The dtype guard makes the cell safe to re-run: once End_Time is already
# datetime, the `.str` accessor would raise AttributeError.
# NOTE(review): the derived Year/Month/... columns come from End_Time, not
# Start_Time — confirm this is the intended reference timestamp.
if not pd.api.types.is_datetime64_any_dtype(data['End_Time']):
    data['End_Time'] = pd.to_datetime(
        data['End_Time'].str.replace(r'\.\d+', '', regex=True),
        format="%Y-%m-%d %H:%M:%S",
    )

end_time = data['End_Time'].dt
data["Year"] = end_time.year
data['Month'] = end_time.month
data['Day'] = end_time.day
data['Hour'] = end_time.hour
data['Minute'] = end_time.minute
data['Second'] = end_time.second
data["Date"] = end_time.date
data["Time"] = end_time.time

In [None]:
# Build one text column: visibility (as a string), a space, then the weather condition
data["w_v_combined"] = (
    data["Visibility(mi)"]
    .astype(str)
    .str.cat(data["Weather_Condition"], sep=" ")
)
data["w_v_combined"]

In [None]:
# Print column names, non-null counts and dtypes once more, now that the
# derived columns have been added.
data.info() # checking for the data types again

# UNIVARIATE ANALYSIS

In [None]:
# Summary statistics of the data, using DataFrame.describe() with its
# default settings.
data.describe()

In [None]:
# Number of accident records contributed by each data source
row_data_sources = data["Source"].value_counts()
ax = sns.barplot(x=row_data_sources.index, y=row_data_sources, palette="viridis")
sns.despine()  # top and right spines are removed by default
# Write each bar's count just above the bar
for position, count in enumerate(row_data_sources):
    ax.text(position, count + 1, str(count), ha="center", va="bottom")



In [None]:
# State-wise accident counts for the 12 states with the most records.
# The bar order is computed from the data (most accidents first) instead of
# being a hand-maintained list, so the chart stays correct if the underlying
# data changes. The column is referenced by name because `data=` is supplied.
plt.figure(figsize=(16,10))
top_states = data["State"].value_counts().head(12).index.tolist()
sns.countplot(data=data, x="State", order=top_states)
plt.xticks(rotation=90,size=14)
plt.yticks(size=14)
plt.ylabel(None)
plt.title("State wise distribution of accidents",fontsize=14)
plt.show()

In [None]:
# The twenty California (CA) cities with the most recorded accidents
CA_data = data.loc[data["State"] == "CA"]
CA_City = (
    CA_data.groupby("City")["City"]
    .count()
    .sort_values(ascending=False)
    .head(20)
)
ax = sns.barplot(x=CA_City.index, y=CA_City)
sns.despine()  # top and right spines are removed by default
# Annotate every bar with its accident count
for position, count in enumerate(CA_City):
    ax.text(position, count + 1, str(count), rotation=30)
plt.ylabel(None)
plt.xticks(rotation=90)
plt.show()

In [None]:
# The twenty cities (all states) with the highest accident counts
city_data = (
    data.groupby("City")["City"]
    .count()
    .sort_values(ascending=False)
    .head(20)
)

In [None]:
# Bar chart of the per-city accident counts held in `city_data`
ax = sns.barplot(x=city_data.index, y=city_data)
sns.despine()  # top and right spines are removed by default
# Annotate every bar with its accident count
for position, count in enumerate(city_data):
    ax.text(position, count + 1, str(count), rotation=30)
plt.ylabel(None)
plt.xticks(rotation=90)
plt.show()

In [None]:
# The fifty cities with the highest accident counts, drawn as a bar chart
plt.figure(figsize=(12, 6))
city_wise_groupd = (
    data.groupby("City")["City"]
    .count()
    .sort_values(ascending=False)
    .head(50)
)
ax = city_wise_groupd.plot.bar()
# Vertical count label above each bar
for position, count in enumerate(city_wise_groupd):
    ax.text(position, count + 1, str(count),
            ha="center", va="bottom", rotation=90, size=11)
plt.title("Top 50 City with Highest accident rates")
plt.xlabel("City")
plt.ylabel("Number of accidents")
plt.xticks(rotation=90)
plt.show()

In [None]:
# Accident count per year (the Year column derived from End_Time)
year_data = data.groupby("Year")["ID"].count()
ax = sns.barplot(x=year_data.index, y=year_data, palette="rocket")
sns.despine()  # top and right spines are removed by default
# Count label centred above each bar
for position, count in enumerate(year_data):
    ax.text(position, count + 1, str(count), ha="center", va="bottom", size=9)
plt.title("Year wise number of accidents", size=14)
plt.ylabel(None)
plt.show()

In [None]:
# Accident count per time zone, largest first
nm = (
    data.groupby("Timezone")["Timezone"]
    .count()
    .sort_values(ascending=False)
)
ax = sns.barplot(x=nm.index, y=nm)
sns.despine()  # top and right spines are removed by default
plt.title("Time zone wise accidents")
plt.ylabel(None)
# Fixed y-range of 0 to 4,000,000
plt.ylim(0, 4000000)
# Count label centred above each bar
for position, count in enumerate(nm):
    ax.text(position, count + 1, str(count), ha="center", va="bottom")
plt.show()

In [None]:
# Horizontal bar chart: accident count per reported wind direction
plt.figure(figsize=(10, 6))
wind_dir = (
    data["Wind_Direction"]
    .value_counts()
    .sort_values(ascending=False)
)
sns.barplot(x=wind_dir, y=wind_dir.index, palette="rocket")
sns.despine()  # top and right spines are removed by default
plt.title("Wind direction during accidents")
plt.ylabel(None)
plt.show()

In [None]:
# Share of accidents by the value of the Crossing column
crossing = data.groupby("Crossing")["Crossing"].count()
print(crossing)
fig, ax = plt.subplots()
ax.pie(
    crossing,
    labels=crossing.index,
    autopct="%1.1f%%",
    shadow=True,
    textprops=dict(color="black", size=20),
)
ax.set_title("crossing during accident")
ax.legend()
plt.show()

In [None]:
# Share of accidents by the value of the Junction column
junction = data.groupby("Junction")["Junction"].count()
print(junction)
fig, ax = plt.subplots()
ax.pie(
    junction,
    labels=junction.index,
    autopct="%1.1f%%",
    shadow=True,
    textprops=dict(color="black", size=20),
)
ax.set_title("junction during accident")
ax.legend()
plt.show()

In [None]:
# Share of accidents by the value of the Station column
station = data.groupby("Station")["Station"].count()
print(station)
fig, ax = plt.subplots()
ax.pie(
    station,
    labels=station.index,
    autopct="%1.1f%%",
    shadow=True,
    textprops=dict(color="black", size=12),
)
ax.set_title("station during accident")
ax.legend()
plt.show()

In [None]:
# Share of accidents by the value of the Sunrise_Sunset column
day_night = data["Sunrise_Sunset"].value_counts()
print(day_night)
fig, ax = plt.subplots()
ax.pie(
    day_night,
    labels=day_night.index,
    autopct="%1.1f%%",
    shadow=True,
    textprops=dict(color="black", size=20),
)
plt.show()

In [None]:
# Share of accidents falling in each severity level
severity_per = data.groupby("Severity")["Severity"].count()
print(severity_per)
fig, ax = plt.subplots()
ax.pie(
    severity_per,
    labels=severity_per.index,
    autopct="%1.1f%%",
    shadow=True,
    textprops=dict(size=12, rotation=45),
)
ax.set_title("Severity level percentage according to accidents")
plt.show()

In [None]:
# The twenty weather conditions under which the most accidents occur
plt.figure(figsize=(18, 8))
weather_cond = (
    data.groupby("Weather_Condition")["Weather_Condition"]
    .count()
    .sort_values(ascending=False)
    .head(20)
)
ax = sns.barplot(x=weather_cond.index, y=weather_cond)
sns.despine()  # top and right spines are removed by default
# Slightly tilted count label above each bar
for position, count in enumerate(weather_cond):
    ax.text(position, count + 1, str(count),
            ha="center", va="bottom", size=14, rotation=15)
plt.title("Top weather condition when most of the accidents occure", fontsize=14)
plt.ylabel(None)
plt.xticks(rotation=45, size=14)
plt.show()

In [None]:
# Share of accidents by the value of the Railway column
railway = data.groupby("Railway")["Railway"].count()
print(railway)
fig, ax = plt.subplots()
ax.pie(
    railway,
    labels=railway.index,
    autopct="%1.1f%%",
    shadow=True,
    textprops=dict(size=13),
)
plt.show()

In [None]:
# Mean number of accident records per distinct Date value
date_wise = data["ID"].groupby(data["Date"]).count()
date_wise.mean()

In [None]:
# Average number of accidents per year
year_data = data.groupby("Year")["ID"].count()
avg_acc = year_data.mean()
print(year_data)
# \033[1m turns bold text on; \033[0m resets the style afterwards so that
# output printed later is not left in bold.
print("average accidents per year: ", f'\033[1m{avg_acc}\033[0m')

# BIVARIATE ANALYSIS

In [None]:
# Accident counts per weather condition, split by severity level
# Double-click to zoom the chart
plt.figure(figsize=(50, 20))
sns.countplot(
    data=data,
    x="Weather_Condition",
    hue="Severity",
    palette="husl",
)
plt.title("Severity and Weather Conditions", size=20)
plt.xlabel("Weather_Conditions", size=20)
plt.ylabel(None)
plt.xticks(rotation=90, size=20)
plt.show()

In [None]:
# Accident counts per state, split by severity level
plt.figure(figsize=(15, 6))
sns.countplot(
    data=data,
    x="State",
    hue="Severity",
)
plt.title("Severity and States", size=15)
plt.xlabel("States", size=14)
plt.ylabel(None)
plt.xticks(rotation=90, size=10)
plt.yticks(size=12)
plt.show()

In [None]:
# Joint distribution of temperature and pressure, using only rows where
# both readings are present
columns = data[["Temperature(F)", "Pressure(in)"]]
plot_data = columns.dropna()
sns.jointplot(
    data=plot_data,
    x="Temperature(F)",
    y="Pressure(in)",
)
plt.show()

In [None]:
# Scatter of temperature readings for each weather condition
ax = sns.scatterplot(data=data, x="Weather_Condition", y="Temperature(F)")
# Widen the figure the scatter was drawn on
fig = ax.figure
fig.set_size_inches(30, 10)
plt.title("Temperature distribution during different weather conditions", size=20)
plt.xticks(rotation=90, size=14)
plt.show()

In [None]:
# Scatter of temperature readings for each state
ax = sns.scatterplot(data=data, x="State", y="Temperature(F)")
# Widen the figure the scatter was drawn on
fig = ax.figure
fig.set_size_inches(30, 10)
plt.title("State and Temperature", size=15)
plt.xticks(rotation=90, size=14)
plt.show()

In [None]:
# Mean visibility (miles) for each weather condition
# Double-click to zoom the chart
plt.figure(figsize=(25, 10))
avg_visibility = data.groupby('Weather_Condition')['Visibility(mi)'].mean()
sns.barplot(
    x=avg_visibility.index,
    y=avg_visibility,
)
sns.despine()  # top and right spines are removed by default
plt.title("Average visibility in different Weather conditions", size=20)
plt.xlabel("Weather Conditions", size=20)
plt.ylabel("Visibility(mi)", size=20)
plt.xticks(rotation=90)
plt.show()

In [None]:
# Wind speed plotted against temperature
sns.relplot(
    data=data,
    x="Temperature(F)",
    y="Wind_Speed(mph)",
)
# Resize the current figure (the one relplot just created), then title it
fig = plt.gcf()
fig.set_size_inches(20, 7)
plt.title("Wind spead and Temperature", size=14)
plt.show()

# MULTIVARIATE ANALYSIS

In [None]:
# Correlation heatmap across the numeric columns of the frame.
# `num_cols` now actually drives the selection: it was previously computed
# and then ignored in favour of a hand-typed list of column names, which
# would silently go stale whenever a numeric column is added or dropped.
# NOTE(review): select_dtypes(include="number") is expected to yield the same
# 17 columns the hand-typed list named (Severity ... Second) — confirm.
num_cols = data.select_dtypes(include="number").columns.tolist()
num_data = data[num_cols]
plt.figure(figsize=(8,6)) # Adjust the figure size if needed
correlation_matrix = num_data.corr()
# Annotate each cell with the coefficient rounded to two decimals
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt=".2f")
plt.title("Correlation matrix for numeric columns")
plt.show()