In [9]:
import pandas as pd
import os
from keplergl import KeplerGl
from pyproj import CRS 
import numpy as np
from matplotlib import pyplot as plt
from pathlib import Path

In [10]:
DATA_DIR = Path(r"C:\Users\nikik\citibike_tripdata\citibike-dashboard\2022-citibike-tripdata")

In [11]:
csv_files = list(DATA_DIR.rglob("*.csv"))
len(csv_files)

38

In [22]:
usecols = ["started_at","start_station_name","start_lat","start_lng","end_station_name","end_lat","end_lng","member_casual"]

TARGET_N = 150_000   
CHUNKSIZE = 200_000 
take_per_chunk = 1_000

samples = []
n = 0

for f in csv_files:
    for chunk in pd.read_csv(
        f,
        usecols=lambda c: c in usecols,
        chunksize=CHUNKSIZE,
        low_memory=False
    ):
        # drop rows missing start OR end station OR time
        chunk = chunk.dropna(subset=["start_station_name", "end_station_name", "started_at"])
        if chunk.empty:
            continue

        k = min(take_per_chunk, len(chunk), TARGET_N - n)
        if k <= 0:
            break

        samples.append(chunk.sample(n=k, random_state=42))
        n += k

        if n >= TARGET_N:
            break

    if n >= TARGET_N:
        break

bike_df = pd.concat(samples, ignore_index=True)

In [23]:
bike_df.head()

Unnamed: 0,started_at,start_station_name,end_station_name,start_lat,start_lng,end_lat,end_lng,member_casual
0,2022-01-17 05:45:12.848,W 100 St & Manhattan Ave,Broadway & W 122 St,40.795,-73.9645,40.812056,-73.961255,member
1,2022-01-14 12:52:57.277,E 67 St & Park Ave,E 59 St & Madison Ave,40.767801,-73.965921,40.763505,-73.971092,member
2,2022-01-10 17:08:24.611,Bergen St & Vanderbilt Ave,4 Ave & 3 St,40.679439,-73.968044,40.673746,-73.985649,casual
3,2022-01-20 09:29:10.265,6 Ave & W 33 St,W 15 St & 6 Ave,40.749013,-73.988484,40.738046,-73.99643,member
4,2022-01-18 09:19:59.891,W 56 St & 10 Ave,Cathedral Pkwy & Broadway,40.768254,-73.988639,40.804213,-73.966991,member


In [24]:
bike_df.columns

Index(['started_at', 'start_station_name', 'end_station_name', 'start_lat',
       'start_lng', 'end_lat', 'end_lng', 'member_casual'],
      dtype='str')

In [25]:
bike_df[["start_station_name","end_station_name"]].head()

Unnamed: 0,start_station_name,end_station_name
0,W 100 St & Manhattan Ave,Broadway & W 122 St
1,E 67 St & Park Ave,E 59 St & Madison Ave
2,Bergen St & Vanderbilt Ave,4 Ave & 3 St
3,6 Ave & W 33 St,W 15 St & 6 Ave
4,W 56 St & 10 Ave,Cathedral Pkwy & Broadway


In [26]:
bike_df["value"] = 1

In [27]:
df_group = (bike_df.groupby(["start_station_name","start_lat","start_lng","end_station_name", "end_lat", "end_lng" ])["value"].count().reset_index().rename(columns={"value": "trips"}))

In [28]:
df_group.head()

Unnamed: 0,start_station_name,start_lat,start_lng,end_station_name,end_lat,end_lng,trips
0,1 Ave & E 110 St,40.792217,-73.937957,2 Ave & E 96 St,40.783964,-73.947167,1
1,1 Ave & E 110 St,40.792271,-73.938198,E 93 St & 2 Ave,40.782454,-73.94892,1
2,1 Ave & E 110 St,40.792272,-73.938151,Pleasant Ave & E 116 St,40.794988,-73.933335,1
3,1 Ave & E 110 St,40.792286,-73.938152,W 116 St & Amsterdam Ave,40.806758,-73.960708,1
4,1 Ave & E 110 St,40.792305,-73.9382,Lenox Ave & W 111 St,40.798786,-73.9523,1


In [29]:
df_group["trips"].sum(), bike_df.shape[0]

(150000, 150000)

In [30]:
m = KeplerGl(height=700,data={"bike_trips": df_group})

User Guide: https://docs.kepler.gl/docs/keplergl-jupyter


In [31]:
m

KeplerGl(data={'bike_trips':            start_station_name  start_lat  start_lng          end_station_name  \
…

# Map customization and design choices

The station point layer was styled using a single neutral color to clearly display station locations without distracting from overall trip patterns. An arc layer connecting starting and ending stations was enabled to visualize the flow of bike trips across the city. The arc colors were adjusted using a sequential color palette so that frequently used routes stand out more prominently than less common trips.

# Interpretation
After applying a filter to highlight only the most frequent trips, the map shows that bike traffic is heavily concentrated in central areas of New York City, particularly in Manhattan and parts of Brooklyn. These areas appear especially busy, with repeated trips occurring between nearby stations. This pattern is consistent with New York City’s high population density and the presence of major employment centers, transit hubs, and tourist attractions. Areas such as Midtown and Lower Manhattan attract commuters for short “last-mile” travel, while nearby recreational and tourist locations further increase bike usage. In contrast, outer areas of the city show far fewer high-frequency routes, indicating lower bike-sharing demand and a less dense station network in those zones. Overall, the filtered map suggests that bike-sharing demand is strongest in dense, mixed-use neighborhoods where commuting, tourism, and leisure activities overlap.

In [32]:
config = m.config

In [33]:
type(config)

dict

In [34]:
m.save_to_html(file_name="NYC_BikeTrips_Kepler.html",read_only=False,config=config)

Map saved to NYC_BikeTrips_Kepler.html!


In [35]:
os.getcwd()

'C:\\Users\\nikik\\citibike_tripdata\\citibike-dashboard'

In [36]:
os.listdir()

['.git',
 '.gitignore',
 '.ipynb_checkpoints',
 '2022-citibike-tripdata',
 'bike_sample.csv',
 'Ex_2.2_project_planning.ipynb',
 'Ex_2.3_Fundamentals.ipynb',
 'Ex_2.4.ipynb',
 'Ex_2.5.ipynb',
 'NYC_BikeTrips_Kepler.html',
 'README.md']