In [1]:
#!/usr/bin/env python3
"""
select_top3_locations.py

Scan every raw Citibike CSV in CSV_DIR and print out the top-3 most
frequently used start stations (locations) by ride count.
"""

import os
import glob
import pandas as pd

# ──────────────────────────────────────────────────────────────────────────────
# CONFIGURATION — update to your local raw CSV folder
# Folder whose "*.csv" files are scanned for ride records.
CSV_DIR = "/Users/manu/Desktop/cda_final/data/processed/raw_citibike_csvs"
# How many of the busiest start stations to report.
TOP_K   = 3
# ──────────────────────────────────────────────────────────────────────────────

def select_top_k_start_stations(csv_dir: str, k: int) -> pd.Series:
    """
    Rank start stations by total ride count across a folder of CSV files.

    Every file matching ``*.csv`` directly inside `csv_dir` is loaded
    (only its 'start_station_name' column), the per-file occurrence counts
    are summed into one running tally, and the `k` largest totals are
    returned.

    Args:
        csv_dir: Directory containing the ride CSV files.
        k: Number of stations to keep.

    Returns:
        Integer-valued Series indexed by station name, ordered from the
        highest count to the lowest. Empty when no CSV file matches.
    """
    pattern = os.path.join(csv_dir, "*.csv")
    running_total = pd.Series(dtype=int)

    for csv_file in glob.glob(pattern):
        station_column = pd.read_csv(csv_file, usecols=["start_station_name"])
        per_file_tally = station_column["start_station_name"].value_counts()
        # fill_value=0 lets stations seen in only one of the two operands
        # keep their count instead of turning into NaN.
        running_total = running_total.add(per_file_tally, fill_value=0)

    # The aligned addition produces floats; cast back to whole ride counts.
    return running_total.astype(int).nlargest(k)

def main():
    """Print the TOP_K busiest start stations found in CSV_DIR, one per line."""
    ranking = select_top_k_start_stations(CSV_DIR, TOP_K)
    print(f"Top {TOP_K} start locations by ride count:\n")
    # Pair each station label with its count as a plain Python int.
    for name, rides in zip(ranking.index, ranking.tolist()):
        print(f"• {name}: {rides:,} rides")

# Run the report only when this file is executed directly, not when imported.
if __name__ == "__main__":
    main()

Top 3 start locations by ride count:

• W 21 St & 6 Ave: 139,932 rides
• Broadway & W 58 St: 115,744 rides
• West St & Chambers St: 110,814 rides
