# VI: First Practical Work

**Authors:** Gerard Comas & Marc Franquesa.

In [None]:
import pandas as pd
import numpy as np
import altair as alt
import geopandas as gpd
import warnings

warnings.simplefilter(action='ignore', category=FutureWarning)

# Load the two tabular inputs (collisions and weather) and the district map.
collisions, weather = (
    pd.read_csv(csv_path)
    for csv_path in (
        "./processed-data/collisions.csv",
        "./processed-data/weather.csv",
    )
)
map_data = gpd.read_file("./processed-data/map.geojson")

## Design and implementation

**Q IDEAS:**

* Q1: basic barplot?
* Q2: slope chart
* Q3: histogram?
* Q4: basic map plot
* Q5:

---
**O IDEAS:**
* Color for vehicle type


In [None]:
# Helpful functions

def before_covid(df: pd.DataFrame) -> pd.DataFrame:
    """Return the rows of *df* whose "AFTER COVID" value equals False."""
    is_before = df["AFTER COVID"].eq(False)
    return df.loc[is_before]

def after_covid(df: pd.DataFrame) -> pd.DataFrame:
    """Return the rows of *df* whose "AFTER COVID" value equals True."""
    is_after = df["AFTER COVID"].eq(True)
    return df.loc[is_after]

### 1. Are accidents more frequent during weekdays or weekends? Is there any difference between before COVID-19 and after?

With an ambitious goal in mind, let's first plot the total number of collisions for each day of the week before COVID.

In [None]:
# Number of pre-COVID collisions for every day of the week.
before_covid_day_count = (
    before_covid(collisions)
    .groupby(["CRASH WEEKDAY"])
    .size()
    .reset_index(name="counts")
)

# Explicit calendar order, passed as the sort order of the x axis.
weekdayorder = ["Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"]

alt.Chart(before_covid_day_count).mark_bar().encode(
    x=alt.X("CRASH WEEKDAY:O", sort=weekdayorder, axis=alt.Axis(title="Week Day")),
    y=alt.Y("counts:Q", axis=alt.Axis(title="Collisions")),
).properties(width=400)

Lets now make a grouped bar chart, separating before and after covid.

In [None]:
# Collisions per weekday, split by the "AFTER COVID" flag.
days_df = (
    collisions
    .groupby(["CRASH WEEKDAY", "AFTER COVID"])
    .size()
    .reset_index(name="counts")
)

# Legend labels: the two periods plus the label used for the whole dataset.
before = "Summer 2018 (Before Covid)"
after = "Summer 2020 (After Covid)"
all_time = "All"

# Human-readable period label derived from the boolean flag.
days_df["MOMENT"] = np.where(days_df["AFTER COVID"], after, before)

weekdayorder = ["Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"]

opacity = 0.5

# One fixed colour per legend label.
colors = {
    before: "#fdc086",
    after: "#7fc97f",
    all_time: "#beaed4",
}

# Colour encoding with an explicit domain/range so each label keeps its colour.
moment_color = alt.Color(
    "MOMENT:O",
    scale=alt.Scale(domain=list(colors), range=list(colors.values())),
    legend=alt.Legend(title=None),
)

# Grouped bars: one group per weekday, one bar per period.
days_ch = alt.Chart(days_df).mark_bar(opacity=opacity).encode(
    x=alt.X("CRASH WEEKDAY:O", axis=alt.Axis(labelAngle=-30, title=None), sort=weekdayorder),
    xOffset="MOMENT:O",
    y=alt.Y("counts:Q", axis=alt.Axis(title="Collisions", grid=True)),
    color=moment_color,
)

days_ch

Lets now add the average of before and after covid.

In [None]:
# One horizontal rule per period at the mean of its daily collision counts.
averages = (
    alt.Chart(days_df)
    .mark_rule(opacity=1)
    .encode(
        y="mean(counts):Q",
        size=alt.value(2),
        color="MOMENT:O",
    )
)

# The rules are the first layer, the grouped bars the second.
averages + days_ch

Lets now separate the days of the week in two categories, weekdays and weekends.

In [None]:
# Split the week into working days and weekend days.
weekdays = ["Monday", "Tuesday", "Wednesday", "Thursday", "Friday"]
weekends = ["Saturday", "Sunday"]

is_weekday = days_df["CRASH WEEKDAY"].isin(weekdays)
is_weekend = days_df["CRASH WEEKDAY"].isin(weekends)
weekdays_df = days_df[is_weekday]
weekends_df = days_df[is_weekend]

# Encoding pieces used by both panels; the fixed y domain keeps the two
# panels on the same vertical scale.
day_axis = alt.X("CRASH WEEKDAY:O", axis=alt.Axis(labelAngle=-30, title=None), sort=weekdayorder)
count_scale = alt.Scale(domain=[0, 13000])
moment_scale = alt.Scale(domain=list(colors), range=list(colors.values()))

# Left panel: weekdays, carrying the y axis title and labels.
weekdays_ch = alt.Chart(weekdays_df).mark_bar(opacity=opacity).encode(
    x=day_axis,
    xOffset="MOMENT:O",
    y=alt.Y(
        "counts:Q",
        axis=alt.Axis(title="Collisions / Means", grid=True),
        scale=count_scale,
    ),
    color=alt.Color("MOMENT:O", scale=moment_scale),
).properties(title=alt.Title("Weekdays", fontSize=10, fontWeight=600))

averages_weekday = alt.Chart(weekdays_df).mark_rule(opacity=1).encode(
    y="mean(counts):Q",
    size=alt.value(2),
    color=alt.Color("MOMENT:O"),
)

# Right panel: weekends, with the y axis decorations hidden (grid only).
weekends_ch = alt.Chart(weekends_df).mark_bar(opacity=opacity).encode(
    x=day_axis,
    xOffset="MOMENT:O",
    y=alt.Y(
        "counts:Q",
        axis=alt.Axis(title=None, labels=False, domain=False, ticks=False, grid=True),
        scale=count_scale,
    ),
    color=alt.Color("MOMENT:O", scale=moment_scale, legend=alt.Legend(title=None)),
).properties(title=alt.Title("Weekends", fontSize=10, fontWeight=600))

averages_weekend = alt.Chart(weekends_df).mark_rule(opacity=1).encode(
    y="mean(counts):Q",
    size=alt.value(2),
    color="MOMENT:O",
)

# Each panel is bars plus mean rules; the two panels sit side by side.
weekday_panel = weekdays_ch + averages_weekday
weekend_panel = weekends_ch + averages_weekend
q1 = weekday_panel | weekend_panel

q1.configure_legend(symbolOpacity=1)

### 2. Is there any type of vehicle more prone to participate in accidents?
Obviously, this cannot be answered with the data we currently have: cars are the predominant vehicle by a large margin, so they will naturally be involved in the most collisions. Let's start by viewing this data with a simple bar plot.

In [None]:
# Number of collisions recorded for each vehicle type.
vehicles = (
    collisions
    .groupby(["VEHICLE"])
    .size()
    .reset_index(name="counts")
)

alt.Chart(vehicles).mark_bar().encode(
    x=alt.X("VEHICLE:O", axis=alt.Axis(title=None, labelAngle=-30)),
    y=alt.Y("counts:Q", axis=alt.Axis(title="Collisions")),
).properties(width=400)

This confirms what we hypothesized earlier.

La primera idea que vam tenir va ser la de fer un gràfic de coordenades paral·leles (parallel coordinates plot), on tinguéssim els següents plans:
- Percentatge d'accidents
- Percentatge de circulació
- Percentatge de ferits
- Percentatge de morts
- Ratio de ferits/accident
- Ratio de ferits/mort

Però a les dades proporcionades no disposem del percentatge de circulació de cada vehicle i, buscant per internet, no hem trobat cap dataset que ens pugui proporcionar aquesta informació. Ara mirarem com es distribueixen els següents plans:

In [None]:
# Keep only the columns needed for the per-vehicle statistics and drop the
# rows whose vehicle type is "Unknown".
vehicles = collisions[["VEHICLE", "NUMBER OF PERSONS INJURED", "NUMBER OF PERSONS KILLED"]]
vehicles = vehicles[vehicles["VEHICLE"] != "Unknown"]

# Per vehicle type: number of collisions plus total injured and killed.
vehicles = (
    vehicles
    .groupby("VEHICLE")
    .agg({
        "VEHICLE": "count",
        "NUMBER OF PERSONS INJURED": "sum",
        "NUMBER OF PERSONS KILLED": "sum",
    })
    .rename(columns={"VEHICLE": "COLLISIONS"})
    .reset_index()
)

total_collisions = vehicles["COLLISIONS"].sum()

# Share of all collisions that involve each vehicle type.
vehicles["% COLLISIONS"] = vehicles["COLLISIONS"].div(total_collisions).mul(100)

# Totals of injured and killed people over all vehicle types.
total_injured = vehicles["NUMBER OF PERSONS INJURED"].sum()
total_killed = vehicles["NUMBER OF PERSONS KILLED"].sum()

# Share of all injured / killed people attributed to each vehicle type.
vehicles["% INJURED"] = vehicles["NUMBER OF PERSONS INJURED"].div(total_injured).mul(100)
vehicles["% KILLED"] = vehicles["NUMBER OF PERSONS KILLED"].div(total_killed).mul(100)

# Injured / killed people per collision for each vehicle type.
vehicles["INJURED PER COLLISION"] = vehicles["NUMBER OF PERSONS INJURED"].div(vehicles["COLLISIONS"])
vehicles["KILLED PER COLLISION"] = vehicles["NUMBER OF PERSONS KILLED"].div(vehicles["COLLISIONS"])

vehicles.head()

In [None]:
# Columns folded into key/value pairs; each one becomes a vertical axis.
folded_columns = ["% COLLISIONS", "INJURED PER COLLISION", "KILLED PER COLLISION"]

# Shared data pipeline: number the rows, fold the columns, then rescale each
# folded column to [0, 1] using its own min and max.
base = (
    alt.Chart(vehicles, width=800)
    .transform_window(index="count()")
    .transform_fold(folded_columns)
    .transform_joinaggregate(
        min="min(value)",
        max="max(value)",
        groupby=["key"],
    )
    .transform_calculate(
        norm_val="(datum.value - datum.min) / (datum.max - datum.min)",
        mid="(datum.min + datum.max) / 2",
    )
)

# One line per row (detail on the window index), coloured by vehicle type.
lines = base.mark_line(opacity=0.3).encode(
    x="key:N",
    y=alt.Y("norm_val:Q", axis=None),
    color="VEHICLE:N",
    detail="index:N",
    opacity=alt.value(0.5),
)

# Light grey vertical rule at each folded column.
rules = base.mark_rule(color="#ccc", tooltip=None).encode(
    x="key:N",
    detail="count():Q",
)

def ytick(yvalue, field):
    """Return a text label plus a tick mark for every axis of the plot.

    Args:
        yvalue: vertical pixel position passed to ``alt.value``.
        field: name of the computed field ("min", "mid" or "max" in the
            calls below) whose value is shown as the label text.
    """
    tick_base = base.encode(
        x="key:N",
        y=alt.value(yvalue),
        text=f"min({field}):Q",
    )
    label = tick_base.mark_text(baseline="middle", align="right", dx=-5, tooltip=None)
    tick = tick_base.mark_tick(size=8, color="#ccc", orient="horizontal", tooltip=None)
    return alt.layer(label, tick)

# Lines, axis rules and three rows of tick labels at y = 0, 150 and 300 px.
parallel_layers = [
    lines,
    rules,
    ytick(0, "max"),
    ytick(150, "mid"),
    ytick(300, "min"),
]

alt.layer(*parallel_layers).configure_axisX(
    domain=False,
    labelAngle=0,
    tickColor="#ccc",
    title=None,
).configure_view(stroke=None)

Lets now try a scatter plot.

In [None]:
# Reference values shown in the size legend.
collision_counts = vehicles["COLLISIONS"]
maximum = int(collision_counts.max())
minimum = int(collision_counts.min())
mean = collision_counts.mean()

size_legend = alt.Legend(
    title="Total collisions (min-mean-max)",
    values=[minimum, mean, maximum],
)

# One circle per vehicle type, sized by its number of collisions. It uses the
# colour assigned to the whole dataset ("All").
scatter = alt.Chart(vehicles).mark_circle(color=colors[all_time]).encode(
    x=alt.X("INJURED PER COLLISION:Q", axis=alt.Axis(title="Injured per collision")),
    y=alt.Y("KILLED PER COLLISION:Q", axis=alt.Axis(title="Deaths per collision")),
    size=alt.Size("COLLISIONS:Q", scale=alt.Scale(range=[10, 700]), legend=size_legend),
).properties(width=500, height=300)

# Vehicle name next to each circle, shifted 15 px to the left.
labels = scatter.mark_text(align="right", dx=-15, dy=0).encode(
    text="VEHICLE:N",
    size=alt.value(10),
)

q2 = scatter + labels

q2

This one seems to be easier to understand and also looks nicer, we have decided to keep this one.

In [None]:
# Intermediate dashboard: Q1 stacked above Q2, with opaque legend symbols and
# an independent size scale per sub-chart.
(q1 & q2).configure_legend(symbolOpacity=1).resolve_scale(size="independent")

### 3. At what time of the day are accidents more common?
Let's make a simple histogram with the overall average, as well as a small label marking the hour with the most collisions.

In [None]:
# Hour of day of every collision. The column is written to `collisions`
# itself, which is what the previous aliasing (`time_df = collisions`) did
# implicitly.
collisions["HOUR"] = pd.to_datetime(collisions["CRASH DATETIME"]).dt.hour
time_df = collisions.groupby(["HOUR", "AFTER COVID"]).size().reset_index(name="counts")

time_df["MOMENT"] = np.where(time_df["AFTER COVID"], after, before)

# Bars coloured by period; with no xOffset the two periods stack per hour.
time_ch = alt.Chart(time_df).mark_bar(opacity=opacity).encode(
    x=alt.X("HOUR:O", axis=alt.Axis(labelAngle=0), title="Hour"),
    y=alt.Y("counts:Q", title="Collisions / Mean"),
    color=alt.Color(
        "MOMENT:O",
        scale=alt.Scale(domain=list(colors.keys()), range=list(colors.values())),
        legend=alt.Legend(title=None)
    ),
    order=alt.Order("MOMENT:O", sort="ascending")
)

# Total collisions per hour over both periods. Only "counts" is summed:
# summing the whole frame would also add up the boolean flag and concatenate
# the MOMENT strings.
time_all_df = time_df.groupby("HOUR", as_index=False)["counts"].sum()

# Mean of the hourly totals, drawn in the colour of the whole dataset.
hourly_average = alt.Chart(time_all_df).mark_rule(opacity=1, color=colors[all_time]).encode(
    y="mean(counts):Q",
    size=alt.value(2),
)

# Label the busiest hour with its total. The hour is looked up in the data
# and positioned through the chart scales instead of being hard-coded.
peak_hour_df = time_all_df.nlargest(1, "counts")
max_hour = alt.Chart(peak_hour_df).mark_text(
    text=str(int(peak_hour_df["counts"].iloc[0])), angle=0, dy=-8
).encode(
    x=alt.X("HOUR:O", title="Hour"),
    y=alt.Y("counts:Q", title="Collisions / Mean"),
)

q3 = (time_ch + hourly_average + max_hour)
q3

In [None]:
# Intermediate dashboard: Q1 and Q3 side by side in the top row, Q2 below.
# NOTE(review): q1 is itself a horizontal concatenation, so `q1 | q3` may add
# q3 to q1's row rather than nest q1 as a unit - confirm against Altair.
((q1 | q3) & q2).configure_legend(symbolOpacity=1).resolve_scale(size="independent")

### 4. Are there any areas with a larger number of accidents?
Let's make a choropleth map. First, let's just plot a couple of collisions in NYC. We are using a district map.

In [None]:
# District outlines used as the background layer.
base = (
    alt.Chart(map_data)
    .mark_geoshape(fill="lightgray", stroke="black")
    .project(type="albersUsa")
    .properties(width=700, height=700)
)

# First 5000 collisions that have a non-null LOCATION value.
located_sample = collisions[collisions["LOCATION"].notna()].head(5000)

pts = alt.Chart(located_sample).mark_circle().encode(
    latitude="LATITUDE",
    longitude="LONGITUDE",
    color='BOROUGH',
    tooltip=['LATITUDE', "LONGITUDE"],
)

(base + pts)

Now making the Choropleth Map! We will be using the purple scale as we will be using the entire dataset, not just before/after covid. Keep in mind that we will only be looking at area, there are other factors too, like total km of streets. However, we have decided to go with this path as any other variable would be tricky to use.

In [None]:
# Districts shaded by their "collision_count" column with the purple scheme.
district_color = alt.Color(
    "collision_count:Q",
    scale=alt.Scale(scheme='purples'),
    legend=alt.Legend(title="Collisions"),
)

base = (
    alt.Chart(map_data)
    .mark_geoshape()
    .project(type="albersUsa")
    .encode(color=district_color)
    .properties(width=600, height=600, title="NYC Community Districts")
)

base

Lets add labels to the top 3 areas with most collisions. Only 3 as getting too many more would overcrowd the map. Getting the labels from [here](https://furmancenter.org/files/sotc/SOC2007_IndexofCommunityDistricts_000.pdf). Using the centroids of the areas to get where to place the labels. Lets see how that looks. 

In [None]:
# The three districts with the highest collision count.
top = map_data.sort_values(by="collision_count", ascending=False).head(3)

# Place each label at the centroid of its district geometry.
centroids = top["geometry"].centroid
top["LATITUDE"] = centroids.y
top["LONGITUDE"] = centroids.x

# Names for the three district codes, typed in by hand.
labels = {
    "boro_cd": ["412", "413", "305"],
    "LABELS": ["Jamaica / Hollis", "Queens Village", "East New York"],
}

top = top.merge(pd.DataFrame(labels), on="boro_cd")

text_labels = (
    alt.Chart(top)
    .mark_text(angle=0, dx=0, dy=0, fill="white", size=8)
    .encode(
        longitude='LONGITUDE:Q',
        latitude='LATITUDE:Q',
        text='LABELS:N',
    )
)

base + text_labels

The labels look good except for Queens Village, which is barely visible. Let's move it to a place where it can be read properly. Let's also add a couple of icons for "interesting vehicles"! These icons are placed wherever those vehicles collided.

In [None]:
# Move the "Queens Village" label to hand-picked coordinates.
is_queens_village = top["LABELS"] == "Queens Village"
top.loc[is_queens_village, ["LATITUDE", "LONGITUDE"]] = [40.66605, -73.75998]

text_labels = (
    alt.Chart(top)
    .mark_text(angle=0, dx=0, dy=0, fill="white", size=9)
    .encode(
        longitude="LONGITUDE:Q",
        latitude="LATITUDE:Q",
        text="LABELS:N",
    )
)


def _vehicle_icon(vehicle_name, icon):
    """Return a text layer drawing *icon* at every collision of *vehicle_name*."""
    rows = collisions[collisions["ORIGINAL VEHICLE"] == vehicle_name]
    return alt.Chart(rows).mark_text(text=icon, size=18).encode(
        longitude="LONGITUDE:Q",
        latitude="LATITUDE:Q",
    )


horse = _vehicle_icon("Horse", "🐎")
gokart = _vehicle_icon("Go kart", "🏎️")

# Choropleth, district labels and the two icon layers.
q4 = (base + text_labels + horse + gokart).properties(width=600, height=600)

q4

Great! Lets now put it all together.

In [None]:
# Dashboard for Q1-Q4: the map (Q4) on the left of the top row, Q1 above Q3
# on its right, and Q2 below. Size scales are resolved independently and
# colour scales are shared.
((q4 | (q1 & q3)) & q2).configure_legend(symbolOpacity=1).resolve_scale(size="independent", color="shared")

### 5.  Is there a correlation between weather conditions and accidents?

In [None]:
# Load the weather table again.
weather = pd.read_csv("./processed-data/weather.csv")

# Pairwise correlation of every column except "valid".
weather_measurements = weather.drop(columns=["valid"])
weather_corr = weather_measurements.corr()
weather_corr

In [None]:
# Long format: one row per (x, y) pair of columns with their correlation.
corr_long = weather_corr.stack().reset_index()
corr_long.columns = ['x', 'y', 'value']

# One coloured cell per pair of columns.
heatmap = (
    alt.Chart(corr_long)
    .mark_rect()
    .encode(x='x:O', y='y:O', color='value:Q')
    .properties(width=300, height=300)
)

# Print the correlation in each cell: white text above 0.5, black otherwise.
label_color = alt.condition(
    alt.datum.value > 0.5,
    alt.value('white'),
    alt.value('black'),
)
text = heatmap.mark_text(baseline='middle').encode(
    text=alt.Text('value:Q', format='.2f'),
    color=label_color,
)

heatmap + text

From this heatmap, we can see that there is a significant relationship between the columns `vsby` and `relh`; low visibility values are associated with high relative humidity values. There is also a strong correlation between the columns `relh` and `tmpf`. All of this makes a lot of sense when we consider the thermodynamics relation between climatic variables.

In [None]:
# Parse the timestamp columns of both datasets into a common "DATE" column.
collisions['DATE'] = pd.to_datetime(collisions['CRASH DATETIME'])
weather['DATE'] = pd.to_datetime(weather['valid'])


# Inner join on "DATE": only rows whose timestamp appears in both datasets
# are kept.
collisions_weather = pd.merge(collisions, weather, on="DATE")

# Show the columns available after the merge.
print(collisions_weather.columns)

# Keep only the columns used by the weather analysis. The explicit copy makes
# this an independent frame, so the later cells that add columns to it do not
# trigger pandas' SettingWithCopyWarning.
collisions_weather_selected = collisions_weather[
    ['DATE', 'NUMBER OF PERSONS INJURED', 'NUMBER OF PERSONS KILLED', 'VEHICLE', 'tmpf', 'relh', 'sknt', 'p01i', 'vsby']
].copy()

# Lift Altair's row limit so the full datasets can be embedded in the charts.
alt.data_transformers.disable_max_rows()

In [None]:
def violinPlot(dataset, column, rang):
    """Build a violin plot of *column* with its three quartiles overlaid.

    The colour, axis title and axis side depend on whether *dataset* equals
    the module-level ``weather`` frame ('Normal', axis on the right) or not
    ('Collisions', axis on the left).

    Args:
        dataset: DataFrame that contains *column*.
        column: name of the column whose density is drawn.
        rang: two-element list passed as the ``extent`` of the density
            transform.

    Returns:
        A layered Altair chart: the density area followed by one horizontal
        rule for each of the 25%, 50% and 75% quantiles.
    """
    # DataFrame.equals compares the whole frame, so evaluate it only once.
    is_weather = dataset.equals(weather)
    if is_weather:
        color, title, orient = '#7fc97fbb', 'Normal', 'right'
    else:
        color, title, orient = '#beaed4', 'Collisions', 'left'

    chart = alt.Chart(dataset, width=100).transform_density(
        column,
        as_=[column, 'density'],
        extent=rang
    ).mark_area(orient='horizontal', color=color).encode(
        alt.X('density:Q')
            .stack('center')
            .impute(None)
            .title(None)
            .axis(labels=False, values=[0], grid=False, ticks=True),
        alt.Y(column + ':Q').title(title).axis(titleColor=color, orient=orient)
    )

    # Quantile level -> rule colour, from lightest to darkest red.
    quartile_colors = {0.25: '#fee0d2', 0.5: '#fc9272', 0.75: '#de2d26'}

    layered = chart
    for level, rule_color in quartile_colors.items():
        quantile_value = dataset[column].quantile(level)
        quartile_rule = alt.Chart(pd.DataFrame({'y': [quantile_value]})).mark_rule(
            color=rule_color, strokeWidth=2
        ).encode(y='y')
        layered = layered + quartile_rule

    return layered

# (column, density extent, panel title) for every weather variable.
violin_specs = [
    ('tmpf', [5, 45], "Temperature"),
    ('relh', [0, 100], "Humidity"),
    ('sknt', [0, 25], "Speed of wind"),
    ('p01i', [0, 0.5], "Rainfall level"),
    ('vsby', [0, 20], "Visibility"),
]

# Each variable gets its own titled pair (collisions on the left, all weather
# observations on the right). The pairs are combined with alt.hconcat rather
# than by chaining `|`: `|` on an existing horizontal concatenation appends
# to it, which left the first title ("Temperature") on the whole row instead
# of on its own pair.
alt.hconcat(*[
    alt.hconcat(
        violinPlot(collisions_weather_selected, column, extent),
        violinPlot(weather, column, extent),
    ).properties(title=title)
    for column, extent, title in violin_specs
])


With this plot, we can compare the distribution of climatic variables when accidents occur versus their distribution at all times. The intention behind creating this graph was to help us understand if these distributions vary when accidents happen, that is, whether certain meteorological variables affect the number of accidents.

In some cases, we observe slightly different distributions, but it's not easy to compare them directly in this form. We need to find another way to address the question. The idea will be to look for the most extreme cases. We'll start with visibility, where we clearly see that the first quartile is at a lower level when accidents occur.

#### Visibility

In [None]:
# Summary statistics of visibility: at collision times vs. all observations.
vsby_in_accidents = collisions_weather_selected['vsby'].describe()
vsby_in_general = weather['vsby'].describe()

print(f"Visibility in accidents: {vsby_in_accidents}")
print(f"Visibility  in general: {vsby_in_general}")


Based on this data, it can be concluded that when visibility is lower, there are more accidents, as indicated by the lower mean and the lower first quartile. The other quartiles are at a value of 16.093440 kilometers, equivalent to 10 miles, which is considered complete visibility according to the data source.

However, it would be essential to study the probability of an accident with low visibility compared to the probability of an accident with high visibility.

A clearer way to represent this would be the following:

Histogram where the X-axis represents visibility, and the Y-axis represents the number of accidents/occurrences in weather conditions.
This way, we can determine the collision ratio, providing more valuable information about the likelihood of accidents concerning different visibility conditions.

In [None]:
# Number of equal-width visibility bins; they are labelled 0 .. 16.
n_vsby_bins = 17

# Bin edges shared by both datasets. Cutting each dataset on its own range
# gives different edges whenever the ranges differ, and the per-bin ratio
# computed afterwards would then compare mismatched visibility intervals.
vsby_values = pd.concat([collisions_weather_selected["vsby"], weather["vsby"]]).dropna()
vsby_edges = np.linspace(vsby_values.min(), vsby_values.max(), n_vsby_bins + 1)

# Bin of every collision row; rows with missing visibility get NaN and are
# dropped by the groupby.
bins = pd.cut(
    collisions_weather_selected["vsby"],
    bins=vsby_edges,
    labels=list(range(n_vsby_bins)),
    include_lowest=True,
)

# observed=False keeps empty bins, so both count series cover all 17 bins.
grouped = collisions_weather_selected.groupby(bins, observed=False)

# Number of collisions in each visibility bin.
counts = grouped.size()


# Same binning applied to every weather observation.
bins_weather = pd.cut(
    weather["vsby"],
    bins=vsby_edges,
    labels=list(range(n_vsby_bins)),
    include_lowest=True,
)

grouped_weather = weather.groupby(bins_weather, observed=False)

# Number of weather observations in each visibility bin.
counts_weather = grouped_weather.size()

In [None]:
# Collisions and weather observations per visibility bin, side by side, plus
# their ratio and the bin label taken from the index.
df = pd.DataFrame({'counts': counts, 'counts_weather': counts_weather}).assign(
    ratio=lambda frame: frame['counts'] / frame['counts_weather'],
    visibility=lambda frame: frame.index,
)

# Ratio of collisions to observations for each visibility bin.
chart = (
    alt.Chart(df)
    .mark_bar()
    .encode(
        x=alt.X('visibility:O', axis=alt.Axis(title='Visibility')),
        y=alt.Y('ratio:Q', axis=alt.Axis(title='Ratio of Collisions')),
    )
    .properties(width=400, height=300)
)

# Dashed red rule at the mean ratio.
mean_line = (
    alt.Chart(df)
    .mark_rule(color='red', strokeDash=[5, 5])
    .encode(y='mean(ratio):Q')
)

chart + mean_line

It could be beneficial for the user to understand how far each data point is from the average value. To achieve this, we will calculate the z-score for each data point, which measures the number of standard deviations a particular data point is from the mean. By plotting the data using a divergent color scheme based on the z-scores, we can emphasize the more extreme values in the dataset. This visual representation will highlight instances where the data significantly deviates from the mean, providing a clearer insight into the distribution and identifying any outliers.

In [None]:
# Mean and standard deviation of the per-bin ratio.
mean_ratio = df['ratio'].mean()
std_ratio = df['ratio'].std()

# Z-score: distance of each bin's ratio from the mean, in standard deviations.
df['z_score'] = df['ratio'].sub(mean_ratio).div(std_ratio)

# One point per visibility bin, coloured with a diverging scheme.
scatter = (
    alt.Chart(df)
    .mark_circle()
    .encode(
        x=alt.X('visibility:O', axis=alt.Axis(title='Visibility')),
        y=alt.Y('z_score:Q', axis=alt.Axis(title='Z-Score of Ratio')),
        color=alt.Color('z_score:Q', scale=alt.Scale(scheme='purplegreen')),
        tooltip=['visibility', 'z_score'],
    )
    .properties(width=400, height=300)
)

# Dashed red rule at the mean z-score.
mean_line = (
    alt.Chart(df)
    .mark_rule(color='red', strokeDash=[5, 5])
    .encode(y='mean(z_score):Q')
)

scatter + mean_line

What we can do is combine the two graphs, and we have the following solutions:

In [None]:
# Collisions and weather observations per visibility bin, side by side.
df = pd.DataFrame({'counts': counts, 'counts_weather': counts_weather})

# Ratio of collisions to weather observations in each bin.
df['ratio'] = df['counts'] / df['counts_weather']

df['visibility'] = df.index

mean_ratio = df['ratio'].mean()
std_ratio = df['ratio'].std()

# Band of two standard deviations around the mean ratio.
df['pstd'] = mean_ratio + 2*std_ratio
df['nstd'] = mean_ratio - 2*std_ratio

# Z-score of each bin's ratio.
df['z_score'] = (df['ratio'] - mean_ratio) / std_ratio

# Bars coloured by z-score; this is the layer that owns the colour legend.
bar = alt.Chart(df).mark_bar().encode(
    x=alt.X('visibility:O', axis=alt.Axis(title='Visibility')),
    y=alt.Y('ratio:Q', axis=alt.Axis(title='Ratio of Collisions')),
    color=alt.Color('z_score:Q', scale=alt.Scale(scheme='purplegreen'), legend = alt.Legend(title='Z-Score of Ratio'))
).properties(
    width=400,
    height=300
)

# Rule layer of the second view (legend hidden).
rule = alt.Chart(df).mark_rule().encode(
    x=alt.X('visibility:O', axis=alt.Axis(title='Visibility')),
    y=alt.Y('ratio:Q', axis=alt.Axis(title='Ratio of Collisions')),
    color=alt.Color('z_score:Q', scale=alt.Scale(scheme='purplegreen'), legend = None)
).properties(
    width=400,
    height=300
)

# Point layer of the second view. z_score is encoded as quantitative, like
# the bars and rules, so all three marks use the same continuous colour scale
# (it was ordinal here, which conflicts with the rule layer it is drawn on).
point = alt.Chart(df).mark_circle().encode(
    x=alt.X('visibility:O', axis=alt.Axis(title='Visibility')),
    y=alt.Y('ratio:Q', axis=alt.Axis(title='Ratio of Collisions')),
    color=alt.Color('z_score:Q', scale=alt.Scale(scheme='purplegreen'), legend= None)
).properties(
    width=400,
    height=300
)

# Dashed reference lines: the mean (gray) and mean +/- 2 std (black).
mean_line = alt.Chart(df).mark_rule(color='gray', strokeDash=[5,5]).encode(
    y='mean(ratio):Q'
)

pstd_line = alt.Chart(df).mark_rule(color='black', strokeDash=[5,5]).encode(
    y='pstd:Q'
)

nstd_line = alt.Chart(df).mark_rule(color='black', strokeDash=[5,5]).encode(
    y='nstd:Q'
)

# Bar view on the left, rule-and-point view on the right.
(bar + mean_line + pstd_line + nstd_line) | (rule + point + mean_line + pstd_line + nstd_line)

The plot indicates a noticeable trend: as visibility decreases, the likelihood of an accident appears to increase. This observation is supported by the analysis of Z-Scores, which suggests that instances of extremely low visibility, represented by points around 1.5 standard deviations below the mean, are associated with a higher probability of accidents. In simpler terms, when visibility is severely reduced, the data suggests a greater chance of accidents occurring. This aligns with the common understanding that adverse weather conditions leading to poor visibility can contribute to an elevated risk of accidents.


Still, a doubt arises: why are the results less conclusive than expected? This is because visibility is closely related to humidity, and humidity, in turn, is related to temperature, which is highly influenced by the time of day. Therefore, we can imagine that the hours with lower visibility coincide with the hours when there are fewer accidents. This is something we will verify shortly.

In [None]:
# Hour of day of each merged collision row.
collisions_weather_selected['HOUR'] = collisions_weather_selected['DATE'].dt.hour

# Mean visibility over the collision rows of each hour.
mean_visibility = collisions_weather_selected.groupby('HOUR')['vsby'].mean()

# Both layers read the same two-column (HOUR, vsby) table.
mean_visibility_table = mean_visibility.reset_index()

visby_hour = (
    alt.Chart(mean_visibility_table)
    .mark_bar()
    .encode(
        x=alt.X('HOUR:O', axis=alt.Axis(title='Hour')),
        y=alt.Y('vsby:Q', axis=alt.Axis(title='Mean Visibility')),
    )
    .properties(width=400, height=300)
)

# Dashed red rule at the mean of the hourly means.
mean_line = (
    alt.Chart(mean_visibility_table)
    .mark_rule(color='red', strokeDash=[5, 5])
    .encode(y='mean(vsby):Q')
)

visby_hour + mean_line

We can see that between 6 and 7 a.m. are the hours with the least visibility, coinciding with hours with fewer collisions. This is not a causal relationship, meaning that lower visibility doesn't directly cause fewer accidents; that wouldn't make sense. Instead, the hours with lower visibility align with times when fewer people are driving, and therefore, there are fewer accidents. 

In [None]:
# Visibility category of every collision: above 16 is "High Visibility",
# anything else "Low Visibility". Rows without a visibility reading are left
# uncategorised (NaN): `NaN > 16` is False, so they would otherwise be
# counted as low visibility.
vsby = collisions_weather_selected['vsby']
visibility_category = pd.Series(
    np.where(vsby > 16, 'High Visibility', 'Low Visibility'),
    index=vsby.index,
)
collisions_weather_selected['VISIBILITY CATEGORY'] = visibility_category.where(vsby.notna())

# Collisions per hour and category; groupby drops the uncategorised rows.
hourly_visibility = collisions_weather_selected.groupby(['HOUR', 'VISIBILITY CATEGORY']).size().reset_index(name='counts')


# Total number of categorised collisions per hour.
hourly_total = hourly_visibility.groupby('HOUR')['counts'].sum().reset_index(name='total')

# Attach the hourly total to every (hour, category) row.
hourly_visibility = pd.merge(hourly_visibility, hourly_total, on='HOUR')

# Share of each category within its hour, in percent.
hourly_visibility['percentage'] = hourly_visibility['counts'] / hourly_visibility['total'] * 100

# Stacked bars: one bar per hour, split by visibility category.
stacked_bar = alt.Chart(hourly_visibility).mark_bar().encode(
    x=alt.X('HOUR:O', axis=alt.Axis(title='Hour')),
    y=alt.Y('percentage:Q', axis=alt.Axis(title='Percentage of Collisions')),
    color=alt.Color('VISIBILITY CATEGORY:N', scale=alt.Scale(domain=['Low Visibility', 'High Visibility'], range=['#1f77b4', '#ff7f0e']), legend=alt.Legend(title='Visibility Category'))
).properties(
    width=400,
    height=300
)

stacked_bar

#### Rainfall

Lets now do the same with rainfall.

In [None]:

# Collision rows without precipitation are counted on their own.
zero_p01i = collisions_weather_selected.loc[collisions_weather_selected['p01i'] == 0]
num_zero_p01i = len(zero_p01i)

# Collision rows where p01i is not 0.
nonzero_p01i = collisions_weather_selected.loc[collisions_weather_selected['p01i'] != 0]

# Same split for every weather observation.
zero_p01i_weather = weather.loc[weather['p01i'] == 0]
num_zero_p01i_weather = len(zero_p01i_weather)
nonzero_p01i_weather = weather.loc[weather['p01i'] != 0]

# Ten equal-width bins shared by both datasets. Cutting each dataset on its
# own range gives different edges whenever the ranges differ, and the counts
# that are later divided row by row would then belong to different intervals.
rain_values = pd.concat([nonzero_p01i['p01i'], nonzero_p01i_weather['p01i']]).dropna()
rain_edges = np.linspace(rain_values.min(), rain_values.max(), 10 + 1)

# Bin every rainy collision row and label the bin by its rounded midpoint.
bins = pd.cut(nonzero_p01i['p01i'], bins=rain_edges, include_lowest=True)
midpoints = bins.apply(lambda x: x.mid.round(2))

# observed=False keeps empty bins so both tables have one row per bin.
grouped = nonzero_p01i.groupby(midpoints, observed=False)

# Collisions per rain bin, as a two-column table (p01i, counts).
grouped_df = grouped.size().reset_index(name='counts')
grouped_df['p01i'] = grouped_df['p01i'].astype(float)

# Single row holding the number of collisions without rain.
zero_row = pd.DataFrame({'p01i': [0], 'counts': [num_zero_p01i]})

# No-rain row first, then the bins in increasing order.
counts = pd.concat([zero_row, grouped_df], ignore_index=True)

# Same steps for the weather observations.
bins_weather = pd.cut(nonzero_p01i_weather['p01i'], bins=rain_edges, include_lowest=True)
midpoints_weather = bins_weather.apply(lambda x: x.mid.round(2))

grouped_weather = nonzero_p01i_weather.groupby(midpoints_weather, observed=False)

# Weather observations per rain bin.
grouped_df_weather = grouped_weather.size().reset_index(name='counts')
grouped_df_weather['p01i'] = grouped_df_weather['p01i'].astype(float)

# Single row holding the number of observations without rain.
zero_row_weather = pd.DataFrame({'p01i': [0], 'counts': [num_zero_p01i_weather]})

counts_weather = pd.concat([zero_row_weather, grouped_df_weather], ignore_index=True)

In [None]:
# Collisions and weather observations per rain level, aligned row by row.
df = pd.DataFrame({'p01i': counts['p01i'] ,'counts': counts['counts'], 'counts_weather': counts_weather['counts']})

# Ratio of collisions to weather observations at each rain level.
df['ratio'] = df['counts'] / df['counts_weather']

mean_ratio = df['ratio'].mean()
std_ratio = df['ratio'].std()

# Reference levels: the mean and a band of two standard deviations.
df['mean'] = mean_ratio
df['pstd'] = mean_ratio + 2*std_ratio
df['nstd'] = mean_ratio - 2*std_ratio

# Z-score of each rain level's ratio.
df['z_score'] = (df['ratio'] - mean_ratio) / std_ratio

# Undefined ratios and z-scores (e.g. 0 / 0 for an empty level) are drawn
# as 0.
df = df.fillna(0)

# Bars coloured by z-score; this is the layer that owns the colour legend.
bar = alt.Chart(df).mark_bar().encode(
    x=alt.X('p01i:O', axis=alt.Axis(title='Rain level')),
    y=alt.Y('ratio:Q', axis=alt.Axis(title='Ratio of Collisions')),
    color=alt.Color('z_score:Q', scale=alt.Scale(scheme='purplegreen'), legend = alt.Legend(title='Z-Score of Ratio'))
).properties(
    width=400,
    height=300
)

# Rule layer of the second view (legend hidden).
rule = alt.Chart(df).mark_rule().encode(
    x=alt.X('p01i:O', axis=alt.Axis(title='Rain level')),
    y=alt.Y('ratio:Q', axis=alt.Axis(title='Ratio of Collisions')),
    color=alt.Color('z_score:Q', scale=alt.Scale(scheme='purplegreen'), legend = None)
).properties(
    width=400,
    height=300
)

# Point layer of the second view. z_score is encoded as quantitative, like
# the bars and rules, so all three marks use the same continuous colour scale
# (it was ordinal here, which conflicts with the rule layer it is drawn on).
point = alt.Chart(df).mark_circle().encode(
    x=alt.X('p01i:O', axis=alt.Axis(title='Rain Level')),
    y=alt.Y('ratio:Q', axis=alt.Axis(title='Ratio of Collisions')),
    color=alt.Color('z_score:Q', scale=alt.Scale(scheme='purplegreen'), legend= None)
).properties(
    width=400,
    height=300
)

# Dashed reference lines: the mean (gray) and mean +/- 2 std (black).
mean_line = alt.Chart(df).mark_rule(color='gray', strokeDash=[5,5]).encode(
    y='mean:Q'
)

pstd_line = alt.Chart(df).mark_rule(color='black', strokeDash=[5,5]).encode(
    y='pstd:Q'
)

nstd_line = alt.Chart(df).mark_rule(color='black', strokeDash=[5,5]).encode(
    y='nstd:Q'
)

# Bar view on the left, rule-and-point view on the right.
(bar + mean_line + pstd_line + nstd_line) | (rule + point + mean_line + pstd_line + nstd_line)

The result we might expect is that the collision ratio increases as the rainfall level rises, and this is true up to values of 2.69 cm of rainfall. Except for the bar at 2.28, as it seems to be an outlier, indicating only one occurrence of rain in that interval. It is also surprising that the ratio decreases for 3.93, but again, this is an outlier that has only occurred once. As a solution, we could consider creating the following plot: raining vs. not raining.

In [None]:
# Collision rows recorded without rain (p01i == 0).
rain = collisions_weather_selected['p01i']
zero_p01i = collisions_weather_selected.loc[rain == 0]
num_zero_p01i = len(zero_p01i)

# Collision rows recorded with rain. Missing readings are excluded
# explicitly: `NaN != 0` is True, so they would otherwise count as rain.
nonzero_p01i = collisions_weather_selected.loc[rain.notna() & (rain != 0)]
num_nonzero_p01i = len(nonzero_p01i)

# Two-row table: p01i = 0 (no rain) and p01i = 1 (rain).
zero_row = pd.DataFrame({'p01i': [0], 'counts': [num_zero_p01i]})
non_zero_row = pd.DataFrame({'p01i': [1], 'counts': [num_nonzero_p01i]})

counts = pd.concat([zero_row, non_zero_row], ignore_index=True)

# Same split for every weather observation.
rain_weather = weather['p01i']
zero_p01i_weather = weather.loc[rain_weather == 0]
num_zero_p01i_weather = len(zero_p01i_weather)

nonzero_p01i_weather = weather[rain_weather.notna() & (rain_weather != 0)]
num_nonzero_p01i_weather = len(nonzero_p01i_weather)

zero_row_weather = pd.DataFrame({'p01i': [0], 'counts': [num_zero_p01i_weather]})
non_zero_row_weather = pd.DataFrame({'p01i': [1], 'counts': [num_nonzero_p01i_weather]})

counts_weather = pd.concat([zero_row_weather, non_zero_row_weather], ignore_index=True)

print(counts)

In [None]:
# Collisions and weather observations for the two levels (0 and 1), plus the
# ratio of collisions to observations.
df = pd.DataFrame({
    'p01i': counts['p01i'],
    'counts': counts['counts'],
    'counts_weather': counts_weather['counts'],
})
df['ratio'] = df['counts'].div(df['counts_weather'])

# One bar per level showing its ratio.
bar = (
    alt.Chart(df)
    .mark_bar()
    .encode(
        x=alt.X('p01i:O', axis=alt.Axis(title='Rain level')),
        y=alt.Y('ratio:Q', axis=alt.Axis(title='Ratio of Collisions')),
    )
    .properties(width=400, height=300)
)

bar