In [None]:
import pandas as pd
import networkx as nx

# Load the cleaned trip export, shorten the column names, and keep only rows
# that describe a usable trip. Written as one chain so `df` is bound once.
df = (
    pd.read_csv("../data/2021-04_cleaned.csv")
    .rename(
        columns={
            "Departure": "departure_time",
            "Return": "return_time",
            "Departure station id": "dep_id",
            "Departure station name": "dep_name",
            "Return station id": "ret_id",
            "Return station name": "ret_name",
            "Covered distance (m)": "distance_m",
            "Duration (sec.)": "duration_s",
        }
    )
    # A trip must have a strictly positive distance and duration ...
    .loc[lambda trips: (trips["distance_m"] > 0) & (trips["duration_s"] > 0)]
    # ... and a known station id at both ends.
    .dropna(subset=["dep_id", "ret_id"])
)

# Directed station graph: one node per station id, one edge per
# (departure station, return station) pair that occurs in `df`.
G = nx.DiGraph()

# Rows are walked with itertuples() rather than iterrows(): iterrows() packs
# each row into a Series with a single dtype, so in an all-numeric frame such
# as `agg` the integer trip counts (and numeric station ids) come back as
# floats. itertuples() keeps each column's own type and is faster.

# Departure-side names are added first, return-side names second. add_node on
# an id that already exists updates its attributes, so for a station seen in
# both roles the return-side name is the one that remains.
for station_id, station_name in (
    df[["dep_id", "dep_name"]].drop_duplicates().itertuples(index=False, name=None)
):
    G.add_node(station_id, name=station_name)

for station_id, station_name in (
    df[["ret_id", "ret_name"]].drop_duplicates().itertuples(index=False, name=None)
):
    G.add_node(station_id, name=station_name)

# One row per directed station pair: trip count plus summed distance/duration.
agg = (
    df.groupby(["dep_id", "ret_id"])
    .agg(
        trip_count=("dep_id", "size"),
        total_distance=("distance_m", "sum"),
        total_duration=("duration_s", "sum"),
    )
    .reset_index()
)

# Columns are selected explicitly so the tuple unpacking below cannot drift
# out of step with the column order of `agg`.
edge_columns = ["dep_id", "ret_id", "trip_count", "total_distance", "total_duration"]
for dep_station, ret_station, trip_count, total_distance, total_duration in (
    agg[edge_columns].itertuples(index=False, name=None)
):
    # `weight` is the number of trips; it drives the weighted degrees used later.
    G.add_edge(
        dep_station,
        ret_station,
        weight=trip_count,
        total_distance=total_distance,
        total_duration=total_duration,
    )

# Headline figures for the graph and the trip table, printed one per line
# as "<label> <value>".
summary = (
    ("Number of nodes (stations):", G.number_of_nodes()),
    ("Number of edges (station pairs):", G.number_of_edges()),
    ("Total trips:", len(df)),
    ("Average trip distance (m):", df["distance_m"].mean()),
    ("Average trip duration (s):", df["duration_s"].mean()),
)
for label, value in summary:
    print(label, value)

# Weighted degrees: with trip counts as edge weights, the in-degree is the
# number of trips ending at a station and the out-degree the number starting there.
in_deg = G.in_degree(weight="weight")
out_deg = G.out_degree(weight="weight")

# Five busiest stations in each direction, highest total first.
top_inbound = sorted(in_deg, key=lambda pair: pair[1], reverse=True)[:5]
top_outbound = sorted(out_deg, key=lambda pair: pair[1], reverse=True)[:5]

# Both rankings share one layout: a heading, then "<station name> : <trips>".
for heading, ranking in (
    ("\nTop inbound stations (by trips arriving):", top_inbound),
    ("\nTop outbound stations (by trips departing):", top_outbound),
):
    print(heading)
    for node, val in ranking:
        print(G.nodes[node]["name"], ":", val)

print("\nIs the graph strongly connected?", nx.is_strongly_connected(G))

FileNotFoundError: [Errno 2] No such file or directory: 'data/2021-04_cleaned.csv'

In [None]:
import matplotlib.pyplot as plt

def bike_network_map(df, year=None, top_edges=800, node_scale=0.05, label_top=25):
    """
    Plot the directed bike network using station coords from the cleaned data.

    Stations are drawn at (longitude, latitude). Node size is the weighted
    degree over the kept edges multiplied by ``node_scale``; edge width is
    ``trip_count ** 0.4``. Only the ``top_edges`` station pairs with the most
    trips are drawn, and only the ``label_top`` highest-degree stations are
    labelled, to keep the figure readable.

    Parameters
    ----------
    df : pandas.DataFrame
        Trip table with columns ``dep_id``, ``dep_name``, ``ret_id``,
        ``ret_name``, ``distance_m``, ``duration_s``, ``Departure_lat``,
        ``Departure_lon``, ``Return_lat``, ``Return_lon`` and, when ``year``
        is given, ``departure_time``. The caller's frame is not modified.
    year : int, optional
        Keep only trips whose departure timestamp falls in this year.
    top_edges : int
        Number of highest-volume station pairs to keep as edges.
    node_scale : float
        Multiplier turning a station's weighted degree into a marker size.
    label_top : int
        Number of highest-degree stations that get a name label.

    Returns
    -------
    networkx.DiGraph
        Graph with every station that has coordinates (after the year filter)
        as a node carrying ``name`` and ``pos``, and the kept station pairs as
        edges carrying ``weight`` (trip count), ``total_distance`` and
        ``total_duration``.
    """
    if year is not None:
        # Only the departure timestamp decides the year, so it is parsed on
        # demand; this avoids copying the whole frame and converting the
        # return timestamps, which are never used here.
        departure_year = pd.to_datetime(df['departure_time']).dt.year
        df = df[departure_year == year]

    # Require coords
    df = df.dropna(subset=['Departure_lat', 'Departure_lon', 'Return_lat', 'Return_lon'])

    agg = (
        df.groupby(['dep_id', 'ret_id']).agg(
            trip_count=('dep_id', 'size'),
            total_distance=('distance_m', 'sum'),
            total_duration=('duration_s', 'sum'),
        ).reset_index()
        .sort_values('trip_count', ascending=False)
        .head(top_edges)
    )

    G_map = nx.DiGraph()

    # One row per station id for each role. Departure rows are added first and
    # return rows second, so a station seen in both roles ends up with the
    # attributes from its return-side row.
    dep_stations = df[['dep_id', 'dep_name', 'Departure_lat', 'Departure_lon']].drop_duplicates(subset=['dep_id'])
    ret_stations = df[['ret_id', 'ret_name', 'Return_lat', 'Return_lon']].drop_duplicates(subset=['ret_id'])
    for stations in (dep_stations, ret_stations):
        for station_id, station_name, lat, lon in stations.itertuples(index=False, name=None):
            G_map.add_node(station_id, name=station_name, pos=(lon, lat))

    # itertuples() keeps each column's own type; iterrows() would turn the
    # integer trip counts into floats because every `agg` column is numeric.
    edge_columns = ['dep_id', 'ret_id', 'trip_count', 'total_distance', 'total_duration']
    for dep_station, ret_station, trip_count, total_distance, total_duration in (
        agg[edge_columns].itertuples(index=False, name=None)
    ):
        G_map.add_edge(
            dep_station,
            ret_station,
            weight=trip_count,
            total_distance=total_distance,
            total_duration=total_duration,
        )

    pos = nx.get_node_attributes(G_map, 'pos')
    deg_weight = dict(G_map.degree(weight='weight'))
    # Stations without a kept edge have degree 0 and are drawn with size 0.
    node_sizes = [deg_weight[n] * node_scale for n in G_map.nodes()]
    # The 0.4 exponent compresses the range of trip counts into usable line widths.
    edge_widths = [(d['weight']) ** 0.4 for _, _, d in G_map.edges(data=True)]

    _, ax = plt.subplots(figsize=(12, 12))
    nx.draw_networkx_nodes(G_map, pos, node_size=node_sizes, node_color='#4c78a8', alpha=0.7, ax=ax)
    nx.draw_networkx_edges(
        G_map,
        pos,
        width=edge_widths,
        edge_color='grey',
        alpha=0.3,
        arrows=True,
        arrowsize=6,
        connectionstyle='arc3,rad=0.05',
        ax=ax,
    )

    top_nodes = sorted(deg_weight.items(), key=lambda x: x[1], reverse=True)[:label_top]
    labels = {n: G_map.nodes[n]['name'] for n, _ in top_nodes}
    nx.draw_networkx_labels(G_map, pos, labels=labels, font_size=7, ax=ax)

    ax.axis('off')
    ax.set_title('Directed bike network (top flows)')
    plt.show()
    return G_map

# Example usage: `df` is the cleaned trip frame built in the first cell. Draws the
# 800 station pairs with the most trips among trips that departed in 2021 and
# keeps the resulting graph for later cells.
G_map = bike_network_map(df, year=2021, top_edges=800)


In [None]:
import folium


def map_plot(df, year=None, median_radius=3, max_radius=None):
    """
    Interactive bubble map of departure stations sized by trip frequency.
    Uses station coordinates from the cleaned data (no external fetch).

    Each distinct (station id, name, latitude, longitude) combination gets one
    circle. Its radius is proportional to the number of departures, scaled so
    that a station with the median departure count has radius
    ``median_radius``.

    Parameters
    ----------
    df : pandas.DataFrame
        Trip table with columns ``dep_id``, ``dep_name``, ``Departure_lat``,
        ``Departure_lon`` and, when ``year`` is given, ``departure_time``.
        The caller's frame is not modified.
    year : int, optional
        Keep only trips whose departure timestamp falls in this year.
    median_radius : float
        Circle radius given to a station with the median departure count.
        Must be positive.
    max_radius : float, optional
        Upper bound applied to every circle radius so that a few very busy
        stations cannot cover the map. ``None`` (default) leaves radii uncapped.

    Returns
    -------
    folium.Map
        The map, which is also shown via ``display`` before returning.

    Raises
    ------
    ValueError
        If ``median_radius`` is not positive, or if no rows remain after
        filtering.
    """
    if median_radius <= 0:
        raise ValueError('median_radius must be positive')

    if year is not None:
        # Timestamps are parsed only when a year filter is requested, and
        # without copying the frame: nothing below needs the parsed column.
        departure_year = pd.to_datetime(df['departure_time']).dt.year
        df = df[departure_year == year]

    df = df.dropna(subset=['Departure_lat', 'Departure_lon'])

    freq = (
        df.groupby(['dep_id', 'dep_name', 'Departure_lat', 'Departure_lon'])
          .size()
          .reset_index(name='freq')
    )
    if freq.empty:
        raise ValueError('No data after filtering; check year or input data')

    # `or 1` guards the division below against a zero median.
    median_freq = freq['freq'].median() or 1
    freq['freq_adj'] = freq['freq'] / (median_freq / median_radius)
    if max_radius is not None:
        freq['freq_adj'] = freq['freq_adj'].clip(upper=max_radius)

    # Centre the map on the mean station coordinate.
    center_lat = freq['Departure_lat'].mean()
    center_lon = freq['Departure_lon'].mean()
    hel_map = folium.Map([center_lat, center_lon], zoom_start=12, tiles='cartodbpositron')

    marker_columns = ['Departure_lat', 'Departure_lon', 'dep_name', 'freq', 'freq_adj']
    for lat, lon, station_name, trips, radius in freq[marker_columns].itertuples(index=False, name=None):
        folium.CircleMarker(
            [lat, lon],
            radius=radius,
            popup=f"{station_name} ({int(trips)} trips)",
            color="#eab544",
            fill=True,
            fill_opacity=0.7,
            fill_color="#eab544",
        ).add_to(hel_map)

    # NOTE(review): `display` is not imported in this cell; it is presumably the
    # builtin that IPython/Jupyter provides, so this needs a notebook session.
    display(hel_map)
    return hel_map

# Example usage:
# map_plot(df, year=2021)
