# Mobi Vancouver Bike Share Data

Download and process complete history (2018-2025, ~7.6M trips).

Works on **Databricks** or locally.

In [0]:
%pip install requests pandas pyarrow beautifulsoup4 openpyxl mlflow markdownify

In [0]:
%restart_python

%reload_ext autoreload
%autoreload 2

In [0]:
import sys
from pathlib import Path

# Prepend <current working directory>/src to sys.path (once) so modules that
# live under src/ can be imported directly.
src_path = Path.cwd() / "src"
src_entry = str(src_path)
if src_entry not in sys.path:
    sys.path.insert(0, src_entry)

In [0]:
import re
from pathlib import Path
import pandas as pd
import requests
from bs4 import BeautifulSoup
import mlflow

from src.mobi import fetch_station_info_from_gbfs, fetch_station_status_from_gbfs, BasicSiteScraper, download_all_trip_data, combine_trip_data, save_to_parquet

In [0]:
# Load config.yaml and register one text widget per setting, using the
# configured value as the widget default.
config = mlflow.models.ModelConfig(development_config='config.yaml')
for widget_name in ('catalog', 'schema'):
    dbutils.widgets.text(widget_name, config.get(widget_name))

# Local directory under which the downloaded data is stored.
data_dir = Path("./data")

## Download Trip Data

In [0]:
# All trip-related files live under <data_dir>/trip_data.
trip_data_dir = data_dir / "trip_data"
raw_trip_dir = trip_data_dir / "raw"
trips_parquet_path = trip_data_dir / "mobi_trips.parquet"

# Step 1: download the trip files into the "raw" subdirectory.
print("Downloading trip data...")
files = download_all_trip_data(raw_trip_dir)
print(f"Downloaded {len(files)} files")

# Step 2: combine the downloaded files into one dataset.
print("\nProcessing...")
trips_df = combine_trip_data(files)
print(f"Total trips: {len(trips_df):,}")

# Step 3: persist the combined trips as Parquet.
save_to_parquet(trips_df, trips_parquet_path)

## Download Station Data

In [0]:
# Retrieve station information and station status via the GBFS helpers.
print("Fetching stations...")
stations = fetch_station_info_from_gbfs()
status = fetch_station_status_from_gbfs()

# Left join on station_id: every station row is kept, with status columns
# attached where a matching status record exists.
stations = stations.merge(status, how="left", on="station_id")
print(f"Total stations: {len(stations)}")

# Persist the merged table as both Parquet and CSV, without the index.
stations_parquet_path = trip_data_dir / "mobi_stations.parquet"
stations_csv_path = trip_data_dir / "mobi_stations.csv"
stations.to_parquet(stations_parquet_path, index=False)
stations.to_csv(stations_csv_path, index=False)

## Scrape Mobi Website

In [0]:
# Import through the same ``src.mobi`` path as the notebook's main import
# cell. Importing the class via the bare ``mobi`` name would load a second
# copy of the module and rebind BasicSiteScraper to that copy.
from src.mobi import BasicSiteScraper

# Configure scraper: the crawl starts at the site's English landing page.
BASE_URL = "https://www.mobibikes.ca/en/"
START_URL = BASE_URL

scraper = BasicSiteScraper(
    base_url=BASE_URL,
    delay=0.75,      # seconds between requests
    max_depth=1,     # 0 = only start page; 1 = start + its links
)

# Run the scrape; the result is a mapping keyed by page URL.
pages = scraper.scrape_recursive(START_URL)

print(f"Pages scraped: {len(pages)}")
# Preview a few URLs (last expression, so the notebook displays it)
list(pages)[:10]


In [0]:
# Save markdown content per page
from pathlib import Path
from urllib.parse import urlparse
import re

# Scraped site output goes under data/mobi_site; the per-page markdown files
# are kept in its "raw" subdirectory, which is created here if missing.
site_data_dir = Path("data/mobi_site/")
raw_site_data_dir = site_data_dir.joinpath("raw")
raw_site_data_dir.mkdir(exist_ok=True, parents=True)


def url_to_filename(url: str) -> str:
    """Derive a filesystem-safe markdown filename from a page URL.

    The name is built from the URL path (trailing slashes removed; an empty
    path becomes ``index``) plus the query string when one is present, so
    pages that differ only by query parameters do not overwrite each other.
    Every run of characters outside ``[a-zA-Z0-9._-]`` collapses to a single
    underscore, and ``.md`` is appended unless the name already ends with it.

    Args:
        url: URL of the scraped page.

    Returns:
        A filename such as ``_en_how-it-works.md``. The host and fragment
        are not part of the name.
    """
    parsed = urlparse(url)
    stem = parsed.path.rstrip("/") or "index"
    if parsed.query:
        # Keep the query so /news?page=1 and /news?page=2 get distinct files.
        stem = f"{stem}?{parsed.query}"
    safe = re.sub(r"[^a-zA-Z0-9._-]+", "_", stem)
    if not safe.endswith(".md"):
        safe += ".md"
    return safe

# Write each page's markdown to its own file, named after the page URL.
# ``count`` ends up as the number of pages processed (0 when there are none).
count = 0
for count, (page_url, page) in enumerate(pages.items(), start=1):
    target = raw_site_data_dir / url_to_filename(page_url)
    target.write_text(page.get("content", ""), encoding="utf-8")

print(f"Wrote {count} markdown files to {raw_site_data_dir.resolve()}")


def _page_record(page_url, page):
    """Flatten one scraped page into a single row for the content table."""
    meta = page.get("metadata", {})
    return {
        "url": page_url,
        "title": meta.get("title", ""),
        "description": meta.get("description", ""),
        "main_heading": meta.get("main_heading", ""),
        "scraped_at": meta.get("scraped_at"),
        "status": page.get("status", ""),
        "content_md": page.get("content", ""),
    }


# Save to parquet: one row per scraped page.
records = [_page_record(page_url, page) for page_url, page in pages.items()]
df = pd.DataFrame.from_records(records)
output_path = site_data_dir / "mobibikes_ca_content.parquet"
save_to_parquet(df, output_path)

## Load and Explore

In [0]:
# Read the three Parquet datasets back from disk.
trips_path = trip_data_dir / "mobi_trips.parquet"
stations_path = trip_data_dir / "mobi_stations.parquet"
site_path = site_data_dir / "mobibikes_ca_content.parquet"
trips = pd.read_parquet(trips_path)
stations = pd.read_parquet(stations_path)
mobi_site = pd.read_parquet(site_path)

# Quick sanity summary: row counts and the span of departure timestamps.
print(f"Trips: {len(trips):,}")
print(f"Stations: {len(stations)}")
departures = trips["departure_time"]
print(f"Date range: {departures.min()} to {departures.max()}")
print(f"Site pages scraped: {len(mobi_site)}")

## Write out bronze tables

In [0]:
# Read the destination catalog and schema from the notebook widgets.
catalog, schema = (dbutils.widgets.get(key) for key in ('catalog', 'schema'))

# (table name suffix, pandas DataFrame) pairs to publish as bronze tables.
tables = [("trips", trips), ("stations", stations), ("site", mobi_site)]

# Maps each character replaced in column names (space , ; { } ( ) newline
# tab = . and backtick) to an underscore, one-for-one.
_COLUMN_CHAR_TABLE = str.maketrans({ch: "_" for ch in " ,;{}()\n\t=.`"})


def clean_column(col):
    """Return ``col`` lower-cased with each invalid character replaced by "_".

    Replacement is per character, so consecutive invalid characters yield
    consecutive underscores.
    """
    return col.lower().translate(_COLUMN_CHAR_TABLE)

def _write_bronze(name, table):
    """Publish one pandas DataFrame as the ``bronze_<name>`` table.

    The table is written in overwrite mode with ``overwriteSchema`` enabled,
    into the catalog and schema selected by the notebook widgets.
    """
    spark_table = spark.createDataFrame(table)
    # Rename columns one at a time; the column list is captured before any
    # rename happens, so every original name is visited exactly once.
    for original in spark_table.columns:
        spark_table = spark_table.withColumnRenamed(original, clean_column(original))
    writer = spark_table.write.mode("overwrite").option("overwriteSchema", True)
    writer.saveAsTable(f"`{catalog}`.`{schema}`.`bronze_{name}`")


for name, table in tables:
    _write_bronze(name, table)