# Importing Libraries

In [1]:
import pandas as pd
import os
from pathlib import Path
from keplergl import KeplerGl
import json

# Use your own directory
DATA_FP = Path("C:/Users/faisa/Desktop/Data analysis/Python Specialization/Achievement 2/02 Data/Updated Data/df_2.4.csv")

# Create an output folder for results
OUTPUT_DIR = DATA_FP.parent / "Visualizations"
os.makedirs(OUTPUT_DIR, exist_ok=True)

print("Data file exists:", DATA_FP.exists())

Data file exists: True


# Advanced grospacial plotting

## Loading the dataset

In [2]:
# loading the dataframe
df = pd.read_csv(DATA_FP, parse_dates=['started_at', 'ended_at', 'date'], low_memory=False)

print("Columns available:", df.columns.tolist())
print("Rows:", len(df))

Columns available: ['Unnamed: 0', 'ride_id', 'rideable_type', 'started_at', 'ended_at', 'start_station_name', 'start_station_id', 'end_station_name', 'end_station_id', 'start_lat', 'start_lng', 'end_lat', 'end_lng', 'user_type', 'date', 'temperature', 'trip_duration', 'month']
Rows: 895485


In [3]:
# keeping only the required columns
required_trip_cols = ['start_station_name', 'end_station_name', 'start_lat', 'start_lng', 'end_lat', 'end_lng']
missing = [c for c in required_trip_cols if c not in df.columns]
if missing:
    raise RuntimeError(f'Missing required trip columns: {missing}')

In [4]:
required_trip_cols

['start_station_name',
 'end_station_name',
 'start_lat',
 'start_lng',
 'end_lat',
 'end_lng']

## Creating "Count" columns and aggregating dataframe

In [5]:
# Add a column of 1 (each row = one trip)
df['value'] = 1

# Aggregate trips by start and end station
df_grouped = (
    df.groupby([
        'start_station_name', 'end_station_name',
        'start_lat', 'start_lng',
        'end_lat', 'end_lng'
    ])['value']
    .sum()
    .rename('trips')
    .reset_index()
)

print("Original trip count:", len(df))
print("Aggregated trip sum:", int(df_grouped['trips'].sum()))

Original trip count: 895485
Aggregated trip sum: 892281


In [6]:
df_grouped.head()

Unnamed: 0,start_station_name,end_station_name,start_lat,start_lng,end_lat,end_lng,trips
0,11 St & Washington St,11 St & Washington St,40.747251,-74.027879,40.749985,-74.02715,1
1,11 St & Washington St,11 St & Washington St,40.749817,-74.027383,40.749985,-74.02715,1
2,11 St & Washington St,11 St & Washington St,40.749857,-74.02753,40.749985,-74.02715,1
3,11 St & Washington St,11 St & Washington St,40.749882,-74.02738,40.749985,-74.02715,1
4,11 St & Washington St,11 St & Washington St,40.749885,-74.027409,40.749985,-74.02715,1


## Initializing Kepler.gl map

In [9]:
# Preparing arcs DataFrame for kepler
arcs_df = pd.DataFrame({
    'start_lat': df_grouped['start_lat'],
    'start_lng': df_grouped['start_lng'],
    'end_lat': df_grouped['end_lat'],
    'end_lng': df_grouped['end_lng'],
    'start_station_name': df_grouped['start_station_name'],
    'end_station_name': df_grouped['end_station_name'],
    'trips': df_grouped['trips'],
})

# Limit to top 500 trips
# TOP_N = 500
# top_arcs = arcs_df.nlargest(TOP_N, 'trips').copy()


In [17]:
# initializing the map and customizing the map inside kepler.gl map
m = KeplerGl(height=800, data={"popular_trips": arcs_df})
m

User Guide: https://docs.kepler.gl/docs/keplergl-jupyter


KeplerGl(data={'popular_trips':         start_lat  start_lng    end_lat    end_lng     start_station_name  \
0…

In [12]:
# saving the settings
config = m.config

## Saving and exporting the customized map.

In [13]:
m.save_to_html(file_name='Citibike bike trips.html', read_only = False, config = config)

Map saved to Citibike bike trips.html!


In [14]:
# Exporting the config file as a json file
import json
with open("config.json", "w") as outfile:
    json.dump(config, outfile)

In this visualization, I customized the Kepler map to highlight CitiBike trip flows.

I changed station points to green so they stand out against the base map.

I added an Arc Layer connecting start and end stations, with arc thickness and color mapped to count, so busier routes are emphasized.

I used a yellow color for the start of the journey and reddish for the end of the journey to clearly distinguish between them.

I also added a filter on count, which allows me to dynamically explore only the busiest trip flows, specifically over 500 trips.