# Importing libraries

In [1]:
import pandas as pd
import numpy as np

from keplergl import KeplerGl

# Loading the data 

In [2]:
df = pd.read_csv("data/processed/citibike_weather_2022.csv")
df.head()

  df = pd.read_csv("data/processed/citibike_weather_2022.csv")


Unnamed: 0,ride_id,rideable_type,started_at,ended_at,start_station_name,start_station_id,end_station_name,end_station_id,start_lat,start_lng,end_lat,end_lng,member_casual,date,PRCP,TMAX,TMIN
0,0CC1D7F53FA21F9A,electric_bike,2022-11-02 05:54:11.481,2022-11-02 06:08:27.471,Park Ave & E 162 St,8016.07,Jerome Ave & W 193 St,8619.02,40.825701,-73.915644,40.86659,-73.89794,casual,2022-11-02,0.0,21.7,13.9
1,EDAECDAE6BF903DE,classic_bike,2022-11-02 18:20:15.611,2022-11-02 19:00:24.787,Broadway & W 61 St,7014.12,Leonard St & Church St,5359.11,40.77003,-73.981968,40.717571,-74.005549,casual,2022-11-02,0.0,21.7,13.9
2,37C06FED49069B80,electric_bike,2022-11-04 18:39:39.873,2022-11-04 18:44:01.851,W 54 St & 11 Ave,6955.05,11 Ave & W 59 St,7059.01,40.768333,-73.992573,40.771497,-73.99046,member,2022-11-04,0.0,22.2,11.1
3,63751973E9A95FB1,classic_bike,2022-11-09 18:02:29.616,2022-11-09 18:19:28.693,Broadway & W 41 St,6560.01,11 Ave & W 59 St,7059.01,40.755136,-73.98658,40.771497,-73.99046,member,2022-11-09,0.0,12.2,5.0
4,F7410DEDF925FBA8,electric_bike,2022-11-12 10:23:11.805,2022-11-12 10:28:24.794,William St & Pine St,5065.12,Leonard St & Church St,5359.11,40.707317,-74.008854,40.717571,-74.005549,member,2022-11-12,2.3,20.6,16.1


# Creating column “1” and aggregate station-to-station counts

In [3]:
df["trip"] = 1

# Building the aggregated dataframe (3 columns)

In [4]:
trips_od = (df.groupby(["start_station_name", "end_station_name"], dropna=False)["trip"]
      .sum()
      .reset_index()
      .rename(columns={"trip": "trip_count"}))

trips_od.sort_values("trip_count", ascending=False).head(10)

Unnamed: 0,start_station_name,end_station_name,trip_count
295518,Central Park S & 6 Ave,Central Park S & 6 Ave,12041
148062,7 Ave & Central Park South,7 Ave & Central Park South,8541
783627,Roosevelt Island Tramway,Roosevelt Island Tramway,8213
549131,Grand Army Plaza & Central Park S,Grand Army Plaza & Central Park S,7287
801875,Soissons Landing,Soissons Landing,7275
898485,W 21 St & 6 Ave,9 Ave & W 22 St,6345
119554,5 Ave & E 72 St,5 Ave & E 72 St,6037
6546,1 Ave & E 62 St,1 Ave & E 68 St,5826
1015111,Yankee Ferry Terminal,Yankee Ferry Terminal,5759
255565,Broadway & W 58 St,Broadway & W 58 St,5509


In [5]:
# Filtering out missing station names 

trips_od = trips_od.dropna(subset=["start_station_name", "end_station_name"])

# Comments 

take a bike → ride around the park → return it to the SAME dock.

Those are round-trips tipical for pure leisure activities.

# Removing round- trips 

In [6]:
trips_arc = (
    df.groupby(
        ["start_station_name", "end_station_name",
         "start_lat", "start_lng", "end_lat", "end_lng"],
        dropna=False
    )["trip"]
    .sum()
    .reset_index()
    .rename(columns={"trip": "trip_count"})
    .dropna(subset=["start_lat", "start_lng", "end_lat", "end_lng"])
)

trips_arc = trips_arc[trips_arc["start_station_name"] != trips_arc["end_station_name"]]

trips_arc.sort_values("trip_count", ascending=False).head(10)

Unnamed: 0,start_station_name,end_station_name,start_lat,start_lng,end_lat,end_lng,trip_count
4293565,W 21 St & 6 Ave,9 Ave & W 22 St,40.74174,-73.994156,40.745497,-74.001971,5987
41612,1 Ave & E 62 St,1 Ave & E 68 St,40.761227,-73.96094,40.765005,-73.958185,5479
3485478,Norfolk St & Broome St,Henry St & Grand St,40.717227,-73.988021,40.714211,-73.981095,4489
4935290,West St & Chambers St,Pier 40 - Hudson River Park,40.717548,-74.013221,40.727714,-74.011296,4411
3496145,North Moore St & Greenwich St,Vesey St & Church St,40.720195,-74.010301,40.71222,-74.010472,4405
4301753,W 21 St & 6 Ave,W 22 St & 10 Ave,40.74174,-73.994156,40.74692,-74.004519,4052
5037544,Yankee Ferry Terminal,Soissons Landing,40.687066,-74.016756,40.692317,-74.014866,4006
3823373,Soissons Landing,Yankee Ferry Terminal,40.692317,-74.014866,40.687066,-74.016756,4004
3576151,Pier 40 - Hudson River Park,West St & Chambers St,40.727714,-74.011296,40.717548,-74.013221,4004
2882358,Henry St & Grand St,Norfolk St & Broome St,40.714211,-73.981095,40.717227,-73.988021,3978


# Comments

The most frequent trips initially appeared to occur between identical stations. This revealed that a large portion of CitiBike usage consists of recreational round-trips rather than transportation. After filtering out same-station rides, the visualization highlighted directional flows across Manhattan, indicating commuting and cross-neighborhood mobility patterns.

# Initializing kepler.gl map

In [7]:
trips_arc = trips_arc.replace([np.inf, -np.inf], np.nan)

# drop rows with missing or invalid coords
trips_arc = trips_arc.dropna(subset=["start_lat", "start_lng", "end_lat", "end_lng"])

# sanity: NYC coords should be around lat ~40, lng ~ -74
trips_arc = trips_arc[
    trips_arc["start_lat"].between(40, 41) &
    trips_arc["end_lat"].between(40, 41) &
    trips_arc["start_lng"].between(-75, -73) &
    trips_arc["end_lng"].between(-75, -73)]

In [8]:
# reducing dataset before seding to krepler

trips_arc_small = trips_arc.nlargest(3000, "trip_count").copy()
trips_arc_small.shape

(3000, 7)

In [9]:
trips_arc_small_kepler = trips_arc_small.rename(columns={
    "start_lat": "start_latitude",
    "start_lng": "start_longitude",
    "end_lat": "end_latitude",
    "end_lng": "end_longitude"
})

In [10]:
trips_arc_small["trip_count"] = (
    trips_arc_small["trip_count"]
    .astype(float)
    .round(0)
    .astype(int))

In [11]:
trips_arc_small["trip_count"].describe()

count    3000.00000
mean     1011.39500
std       496.39095
min       633.00000
25%       717.00000
50%       844.50000
75%      1106.00000
max      5987.00000
Name: trip_count, dtype: float64

In [21]:
map_1 = KeplerGl(height=650)
map_1.add_data(data=trips_arc_small_kepler, name="Trips OD")
map_1

User Guide: https://docs.kepler.gl/docs/keplergl-jupyter


KeplerGl(data={'Trips OD': {'index': [1394110, 3738268, 632921, 3823191, 4293565, 2706068, 41612, 502907, 1474…

# Force trip_count to a clean integer

In [14]:
trips_arc_small["trip_count"] = trips_arc_small["trip_count"].round(0).astype(int)
trips_arc_small["trip_count"].describe()

count    3000.00000
mean     1011.39500
std       496.39095
min       633.00000
25%       717.00000
50%       844.50000
75%      1106.00000
max      5987.00000
Name: trip_count, dtype: float64

In [15]:
# 1) Force a real copy 
trips_arc_small = trips_arc_small.copy()

# 2) Convert
trips_arc_small["trip_count"] = trips_arc_small["trip_count"].round(0).astype("int64")

# 3) PROVE dtype 
print("dtype:", trips_arc_small["trip_count"].dtype)

# 4) Prove values are integers
print(trips_arc_small["trip_count"].head(10).tolist())

# 5) stats
trips_arc_small["trip_count"].describe()

dtype: int64
[5987, 5479, 4489, 4411, 4405, 4052, 4006, 4004, 4004, 3978]


count    3000.00000
mean     1011.39500
std       496.39095
min       633.00000
25%       717.00000
50%       844.50000
75%      1106.00000
max      5987.00000
Name: trip_count, dtype: float64

# Rebuilding the Kepler map fresh

In [16]:
trips_strong = trips_arc_small[trips_arc_small["trip_count"] >= 3000]

len(trips_strong)

40

# Clean Map

In [19]:
import pandas as pd
from keplergl import KeplerGl

# sanity check
assert isinstance(trips_strong, pd.DataFrame)

map_clean = KeplerGl(height=650, data={"Trips OD": trips_strong})

map_clean

User Guide: https://docs.kepler.gl/docs/keplergl-jupyter


KeplerGl(data={'Trips OD':                     start_station_name               end_station_name  \
4293565   …

# Comments

After filtering the data to retain only the highest trip frequencies, the network changes from a dense web into a small number of repeated connections. The strongest routes are short and geographically concentrated, mostly within central Manhattan and along waterfront areas. This indicates that high-frequency CitiBike usage is primarily local mobility rather than long commuting distances. The same station pairs appear repeatedly, suggesting habitual patterns such as commuting between nearby neighborhoods, parks, and transit hubs.

In [25]:
import os
import json
from keplergl import KeplerGl

# 1 — create output folder
os.makedirs("outputs", exist_ok=True)

# 2 — save the configuration of the map 
with open("outputs/task_2_5_kepler_config.json", "w") as f:
    json.dump(map_clean.config, f, indent=2)

# 3 — create a CLEAN export map (no widget memory)
export_map = KeplerGl(
    height=650,
    data={"Trips OD": trips_strong},
    config=map_clean.config)

# 4 — export to HTML
export_map.save_to_html(
    file_name="outputs/task_2_5_kepler_map.html",
    read_only=True
)

print("EXPORT FINISHED")

User Guide: https://docs.kepler.gl/docs/keplergl-jupyter
Map saved to outputs/task_2_5_kepler_map.html!
EXPORT FINISHED


In [26]:
os.listdir("outputs")

['trips_strong.csv', 'task_2_5_kepler_map.html', 'task_2_5_kepler_config.json']