In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# 1. Load and process BIKE data
bike_df = pd.read_csv("data/rides.csv", sep=",")
bike_df["date"] = pd.to_datetime(bike_df["date"])
bike_yearly = bike_df.groupby(bike_df["date"].dt.year)["n"].sum().reset_index()
bike_yearly.columns = ["year", "bike_count"]

# 2. Load and process ROAD data (Parquet format)
road_df = pd.read_parquet("data/traffic_data.parquet")
road_df.index = pd.to_datetime(road_df.index)

# Get numeric traffic columns and clean them
traffic_cols = road_df.select_dtypes(include=["number"]).columns
road_df[traffic_cols] = road_df[traffic_cols].replace(-1, pd.NA)
road_yearly = road_df[traffic_cols].sum(axis=1).resample('YE').sum().reset_index()
road_yearly.columns = ["year", "road_count"]

# Convert road_yearly's 'year' column to int for matching data type with bike_yearly
road_yearly["year"] = road_yearly["year"].dt.year

# 3. Merge and calculate shares
traffic_df = pd.merge(bike_yearly, road_yearly, on="year", how="inner")
traffic_df["total"] = traffic_df["bike_count"] + traffic_df["road_count"]
traffic_df["bike_share"] = traffic_df["bike_count"] / traffic_df["total"] * 100  # Multiply by 100 for percentage
traffic_df["road_share"] = traffic_df["road_count"] / traffic_df["total"] * 100  # Multiply by 100 for percentage

# Drop any rows with missing data
traffic_df = traffic_df.dropna(subset=["bike_share", "road_share"])

# 4. Smooth the shares using a rolling window (for annual data, a window of 2 years is typically smooth)
traffic_df["bike_share"] = traffic_df["bike_share"].rolling(window=2, center=True).mean()
traffic_df["road_share"] = traffic_df["road_share"].rolling(window=2, center=True).mean()

# 5. Plot
plt.figure(figsize=(14, 6))
plt.stackplot(
    traffic_df["year"],
    [traffic_df["road_share"], traffic_df["bike_share"]],
    labels=["Road Traffic", "Bike Traffic"],
    colors=["#d62728", "#2ca02c"],
    alpha=0.85,
    edgecolor="black"  # Highlight the separating line between the two areas
)
plt.title("Proportional Share of Road vs Bike Traffic in Copenhagen Over Time (Yearly)", fontsize=16)
plt.ylabel("Proportion of Total Traffic (%)")  # Update y-axis label to show percentage
plt.xlabel("Year")
plt.legend(loc="upper left")

# Format y-axis as percentages
plt.gca().yaxis.set_major_formatter(plt.FuncFormatter(lambda x, _: f'{x:.0f}%'))

# Adjust y-axis limits from 50% to 100%
plt.ylim(50, 100)

plt.tight_layout()
plt.show()


**Shift in Traffic Trends: Cars to Bikes in Copenhagen (2005–2014)**

The stacked area chart illustrates the proportional share of road traffic (motor vehicles) versus bike traffic in Copenhagen over a 10-year period, from 2005 to 2014. The red area represents road traffic, while the green area represents bike traffic, shown as a percentage of the total traffic volume each year.

From the visualization, a clear trend emerges: road traffic has steadily decreased in proportion, while bike traffic has gained ground. In 2006, road traffic accounted for the overwhelming majority of total traffic—over 90%. However, by 2014, its share had dropped to just over 80%, with bike traffic growing correspondingly.

This shift suggests a gradual but meaningful transition in urban mobility habits in Copenhagen, aligning with the city’s efforts to promote sustainable transportation. Investments in bike infrastructure, such as dedicated lanes and bike-friendly urban planning, likely contributed to this change. Although road traffic remains dominant, the consistent growth in bike usage indicates a cultural and infrastructural shift towards more environmentally friendly and health-conscious commuting options.

Overall, this data highlights how policy and infrastructure changes can influence transportation behavior, making Copenhagen a model for sustainable urban transit development.

In [None]:
import pandas as pd
from bokeh.io import show, output_file
from bokeh.layouts import column
from bokeh.models import Select, ColumnDataSource, CustomJS, HoverTool, FixedTicker
from bokeh.plotting import figure
from bokeh.palettes import Spectral11

# Load dataset
acidentes = pd.read_csv("data/acidentes.csv")

# Days of the week
days_of_week = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']

# Initial source (Monday)
source = ColumnDataSource(data={
    'x': acidentes['Hour'],
    'y': acidentes['Monday']
})

# Full dataset for JS callback
all_data = {day: acidentes[day].tolist() for day in days_of_week}
all_data['Hour'] = acidentes['Hour'].tolist()

# Create figure with X-axis from 0 to 23 and all ticks
p = figure(title="Number of Accidents Throughout the Day (Monday)",
           x_axis_label="Hour of Day", y_axis_label="Number of Accidents",
           x_range=(0, 23), y_range=(0, max(acidentes[days_of_week].max()) + 50),
           height=400, width=800, tools="pan,wheel_zoom,box_zoom,reset")

# Show every hour on X-axis
p.xaxis.ticker = FixedTicker(ticks=list(range(24)))
p.xaxis.major_label_orientation = 1  # vertical labels if needed

# Add line and hover
line = p.line(x='x', y='y', source=source, line_width=3, line_color=Spectral11[0])
hover = HoverTool(tooltips=[("Hour", "@x"), ("Accidents", "@y")], mode='vline')
p.add_tools(hover)

# Dropdown
select = Select(title="Select Day of the Week", value="Monday", options=days_of_week)

# JS callback
callback = CustomJS(args=dict(source=source, full_data=all_data, select=select, plot=p, line=line), code="""
    const day = select.value;
    source.data['y'] = full_data[day];
    source.change.emit();
    plot.title.text = "Number of Accidents Throughout the Day (" + day + ")";
    line.legend_label = day;
""")

select.js_on_change('value', callback)

# Layout and show
layout = column(select, p)

output_file("accidents_by_hour.html")
show(layout)


**Daily Accident Patterns by Day of the Week in Copenhagen**

The interactive line chart displays the number of traffic accidents in Copenhagen across different hours of the day, with the ability to select each day of the week. This visualization reveals distinct daily traffic patterns that reflect human behavior, commuting routines, and differences between weekdays and weekends.

A consistent trend can be observed on weekdays (Monday to Friday): accident numbers spike during morning (around 7–9 AM) and evening rush hours (around 3–6 PM). These peaks correlate with times when people commute to and from work or school, resulting in increased road activity and a higher likelihood of accidents.

In contrast, weekends (Saturday and Sunday) show a significantly lower volume of accidents throughout the day. This is likely due to reduced traffic as fewer people commute for work. On weekends, the peaks—though smaller—tend to occur in the afternoon hours, suggesting more recreational or leisure travel rather than routine commutes.

This pattern highlights the impact of urban activity and human mobility on road safety. Rush hours during the workweek present higher risks, emphasizing the need for targeted traffic management, road safety campaigns, and perhaps infrastructure adjustments to reduce accident occurrences during these periods.



In [None]:
import pyarrow.parquet as pq
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# === Load Parquet File ===
df = pq.read_table('traffic_data.parquet').to_pandas()

# Bucket rainfall into categories
def categorize_rain(rain):
    if rain == 0:
        return "No Rain"
    elif rain <= 0.5:
        return "Light Rain"
    elif rain <= 2:
        return "Moderate Rain"
    else:
        return "Heavy Rain"

# Categorizing rainfall
df["rainfall_category"] = df["rain (mm)"].apply(categorize_rain)

# Use `.loc[]` to filter and avoid creating a new copy
traffic_df = df.loc[df["rainfall_category"] == "Heavy Rain"].copy()  # Using `.copy()` to avoid view issues

# === Load Accident Dataset ===
accidents_df = pd.read_csv("acidentes.csv")

# Convert 'datetime' index (if applicable) for clarity
traffic_df.index.name = "datetime"

# === Extract Hour from datetime index ===
traffic_df["Hour"] = traffic_df.index.hour

# === Ensure all vehicle count columns are numeric ===
traffic_df[traffic_df.columns] = traffic_df[traffic_df.columns].apply(pd.to_numeric, errors='coerce')

# === Identify vehicle count columns ===
vehicle_columns = traffic_df.columns

# Sum across vehicle columns per row
traffic_df["vehicle_count"] = traffic_df[vehicle_columns].sum(axis=1)

# === Group vehicle counts by Hour ===
vehicles_by_hour = traffic_df.groupby("Hour")["vehicle_count"].sum().reset_index()

# === Merge with Accidents Data ===
merged = pd.merge(accidents_df, vehicles_by_hour, on="Hour", how="inner")

# === Normalize Data Using .loc to Avoid SettingWithCopyWarning ===
merged.loc[:, "normalized_accidents"] = merged["Total"] / merged["Total"].max()
merged.loc[:, "vehicle_count_normalized"] = merged["vehicle_count"] / merged["vehicle_count"].max()

# === Prepare Data for Heatmap ===
heatmap_data = merged.pivot_table(
    index="Hour",  # Rows: hours
    values="normalized_accidents",  # Values to be plotted
    aggfunc="mean"  # Mean for each hour if there are multiple entries
)

# === Plot Heatmap ===
plt.figure(figsize=(16, 2))
sns.heatmap(
    heatmap_data.T,  # Transpose to have hours along the x-axis
    cmap="coolwarm",
    annot=True,
    fmt=".2f",  # Display with 2 decimals
    cbar_kws={"label": "Normalized Accidents"}
)

plt.title("Normalized Hourly Accident Rate During Heavy Rain")
plt.xlabel("Hour of Day")
plt.ylabel("Hour of Day")
plt.tight_layout()
plt.show()


The heatmap illustrates the normalized accident rate per hour during heavy rain conditions, adjusted relative to the number of vehicles on the road. This means it highlights not just when most accidents happen, but when the risk of accidents is highest in proportion to traffic volume.

Peak risk hours occur in the afternoon between 14:00 and 17:00, with the normalized accident rate reaching the maximum value of 1.00. This indicates that even though traffic may be high during these hours, accidents occur disproportionately more often during heavy rain.

Morning rush hours (7:00 to 9:00) also show elevated accident risk, with normalized values between 0.57 and 0.73. Rain significantly increases the chances of accidents during these busy commute hours.

The lowest accident risks are seen in the early morning (0:00 to 5:00) and late evening (20:00 to 23:00), where normalized values drop below 0.35. These times naturally have less traffic, and the impact of rain appears to be less severe.

The data reveals how heavy rain disproportionately increases the likelihood of traffic accidents during already busy periods, especially in the afternoon. Even if the number of cars is not at its highest, wet conditions make driving more hazardous during these hours.