In [1]:
import os
os.environ['CUDA_VISIBLE_DEVICES'] = '4'
os.environ['BOKEH_ALLOW_WS_ORIGIN'] = 'llm-interface-2-api.legit-ai.co.id'

# Load & Preprocessing

## Data Download

In [2]:
# import kagglehub

# # Download latest version
# path = kagglehub.dataset_download("yingwurenjian/chicago-divvy-bicycle-sharing-data")

# print("Path to dataset files:", path)

In [3]:
from pathlib import Path

DATA_DIR = Path("./data")
FILENAME = Path("data.csv")

In [4]:
# # Download and Extract the dataset
# ! wget -N -P {DATA_DIR} https://data.rapids.ai/viz-data/data.tar.xz
# ! tar -xf {DATA_DIR}/data.tar.xz -C {DATA_DIR}

## Imports

In [5]:
import cuxfilter
import cudf
import cugraph
from bokeh.models import NumeralTickFormatter
from bokeh.palettes import Inferno
from pathlib import Path
# from preprocess import * # for compactness we added functions to preprocess.py

In [6]:
# From preprocess.py
from pyproj import Proj, Transformer

## Load Data Into cuDF and Format Data

In [7]:
data = cudf.read_csv(DATA_DIR / FILENAME)

# Check
data.sample()

Unnamed: 0,trip_id,year,month,week,day,hour,usertype,gender,starttime,stoptime,...,from_station_id,from_station_name,latitude_start,longitude_start,dpcapacity_start,to_station_id,to_station_name,latitude_end,longitude_end,dpcapacity_end
841461,3773052,2014,9,39,6,10,Subscriber,Female,2014-09-28 10:40:00,2014-09-28 10:44:00,...,176,Clark St & Elm St,41.903233,-87.631253,19.0,140,Dearborn Pkwy & Delaware Pl,41.899007,-87.629928,19.0


In [8]:
def transform_coords(df, x='x', y='y'):
    transform_4326_to_3857 = Transformer.from_crs('epsg:4326', 'epsg:3857')
    # Use .to_numpy() instead of .to_array()
    df['x'], df['y'] = transform_4326_to_3857.transform(df[x].to_numpy(), df[y].to_numpy())
    return df

trips = transform_coords(data, x='latitude_start', y='longitude_start')


# Note: days 0-4 are weekedays, days 5-6 are weekends 
trips['day_type'] = 0
trips.loc[trips.query('day>4').index, 'day_type'] = 1


# Note: Data always has edge cases, such as the extra week anomalies of 2015 and 2016:
# trips.groupby('year').week.max().to_pandas().to_dict() is {2014: 52, 2015: 53, 2016: 53, 2017: 52}
# Since 2015 and 2016 have 53 weeks, we add 1 to global week count for their following years - 2016 & 2017
# (data.year/2016).astype('int') => returns 1 if year>=2016, else 0
year0 = int(trips.year.min()) #2014
trips['all_time_week'] = data.week + 52*(data.year - year0) + (data.year/2016).astype('int')

#Finally, we remove the unused columns and reorganize our dataframe:
# trips = trips[[
#     'year', 'month', 'week', 'day', 'hour', 'gender', 'from_station_name',
#     'from_station_id', 'to_station_id', 'x', 'y', 'from_station_name', 'to_station_name', 'all_time_week', 'day_type'
# ]]

In [9]:
data.head(1)

Unnamed: 0,trip_id,year,month,week,day,hour,usertype,gender,starttime,stoptime,...,dpcapacity_start,to_station_id,to_station_name,latitude_end,longitude_end,dpcapacity_end,x,y,day_type,all_time_week
0,2355134,2014,6,27,0,23,Subscriber,Male,2014-06-30 23:57:00,2014-07-01 00:07:00,...,15.0,303,Broadway & Cornelia Ave,41.945512,-87.64598,15.0,-9759200.0,5151901.0,0,27


In [10]:
# Remove unnessary data and format using the script in preprocess.py
trips = transform_coords(data, x='latitude_start', y='longitude_start')

trips['day_type'] = 0
trips.loc[trips.query('day>4').index, 'day_type'] = 1

year0 = int(trips.year.min()) #2014
trips['all_time_week'] = data.week + 52*(data.year - year0) + (data.year/2016).astype('int')

# trips_f = trips[[
#     'year', 'month', 'week', 'day', 'hour', 'gender', 'from_station_name',
#     'from_station_id', 'to_station_id', 'x', 'y', 'from_station_name', 'to_station_name', 'all_time_week', 'day_type'
# ]]

# Check
print(f"Columns {len(trips.columns)}: {trips.columns}")
trips.head(1)

Columns 27: Index(['trip_id', 'year', 'month', 'week', 'day', 'hour', 'usertype', 'gender',
       'starttime', 'stoptime', 'tripduration', 'temperature', 'events',
       'from_station_id', 'from_station_name', 'latitude_start',
       'longitude_start', 'dpcapacity_start', 'to_station_id',
       'to_station_name', 'latitude_end', 'longitude_end', 'dpcapacity_end',
       'x', 'y', 'day_type', 'all_time_week'],
      dtype='object')


Unnamed: 0,trip_id,year,month,week,day,hour,usertype,gender,starttime,stoptime,...,dpcapacity_start,to_station_id,to_station_name,latitude_end,longitude_end,dpcapacity_end,x,y,day_type,all_time_week
0,2355134,2014,6,27,0,23,Subscriber,Male,2014-06-30 23:57:00,2014-07-01 00:07:00,...,15.0,303,Broadway & Cornelia Ave,41.945512,-87.64598,15.0,-9759200.0,5151901.0,0,27


In [11]:
# Create a weekday string map
days_of_week_map = {
    0: 'monday',
    1: 'tuesday',
    2: 'wednesday',
    3: 'thursday',
    4: 'friday',
    5: 'saturday',
    6: 'sunday'
}

# month map
month_map = {
    1: 'jan', 2: 'feb', 3: 'mar', 4: 'apr', 5: 'may', 6: 'jun', 7: 'jul', 8: 'aug', 9: 'sep', 10: 'oct', 11: 'nov', 12: 'dec'
}

# weekend / weekday map
day_type_map = {0:'weekday', 1:'weekend', '':'all'}

# Cuxfilter Basic Dashboard, Adding Charts, and Custom Layouts

In [12]:
cux_df = cuxfilter.DataFrame.from_dataframe(data)

In [13]:
# Inferno Taken from bokeh color pallettes https://docs.bokeh.org/en/latest/docs/reference/palettes.html
colors = Inferno[10]

# Specify the charts and widgets to use with the selected columns of data and string maps
widget1 = cuxfilter.charts.multi_select('year')
widget2 = cuxfilter.charts.multi_select('day_type', label_map=day_type_map)
chart1 = cuxfilter.charts.bar('hour', title='trips per hour')
chart2 = cuxfilter.charts.bar('month', x_label_map=month_map)
chart3 = cuxfilter.charts.bar('day', x_label_map=days_of_week_map)

# Update the yaxis ticker to an easily readable format
chart1.y_axis_tick_formatter = NumeralTickFormatter(format="0,0")
chart2.y_axis_tick_formatter = NumeralTickFormatter(format="0,0")
chart3.y_axis_tick_formatter = NumeralTickFormatter(format="0,0")


# TRY:
# Add chart: 
chart4 = cuxfilter.charts.datashader.heatmap(x='hour', y='day', aggregate_col='hour', point_shape='rect_horizontal', point_size=10, color_palette=colors)

# TRY:
# Use custom layout with `layout_array` parameter:
layout_array = [[1, 2], [3, 2]]

# Preset layout with `layout` parameter
# layout = cuxfilter.layouts.feature_and_double_base

# Generate the dashboard and select a layout
d = cux_df.dashboard([chart1, chart2, chart3, chart4], sidebar=[widget1, widget2], layout_array=layout_array, title='Bike Trips Dashboard')

In [14]:
# d.show(port=8090)

In [15]:
# Specify the charts and widgets to use with the selected columns of data and string maps
chart1 = cuxfilter.charts.bar('all_time_week', title='rides per week')
chart2 = cuxfilter.charts.heatmap(x='all_time_week', y='day', aggregate_col='temperature',
                             aggregate_fn='mean', point_size=40, legend_position='right',
                             title='mean temperature by day')
widget1 = cuxfilter.charts.multi_select('day_type', label_map=day_type_map)

# Update the yaxis ticker to an easily readable format
chart1.y_axis_tick_formatter = NumeralTickFormatter(format="0,0")
chart2.y_axis_tick_formatter = NumeralTickFormatter(format="0,0")

# Generate the dashboard and select a layout
d = cux_df.dashboard([chart1, chart2], sidebar=[widget1], 
                     layout=cuxfilter.layouts.feature_and_base, 
                     title='Temperature Dashboard', 
                     theme=cuxfilter.themes.dark) #options: rapids, light, dark
# TRY: 
# Set different theme options and see the result in d.preview()

In [16]:
# d.show(port=8090)

## Cuxfilter Geospatial Graph Dashboard

In [17]:
# Build Graph
G = cugraph.Graph() 
G.from_cudf_edgelist(data, source='from_station_id', destination='to_station_id')
edges = G.edges()

In [18]:
# Trips have been converted into edges with source and destination based on station IDs.
edges.head()

Unnamed: 0,from_station_id,to_station_id
0,131,303
1,134,194
2,238,316
3,100,192
4,267,322


In [19]:
cux_df = cuxfilter.DataFrame.load_graph((trips, edges))

In [28]:
# Specifying a graph chart type will use Datashader and its required parameters
widget1 = cuxfilter.charts.multi_select('year')
widget2 = cuxfilter.charts.multi_select('day_type', label_map=day_type_map)

chart1 = cuxfilter.charts.graph(
            node_id='from_station_id',
            source='src', 
            edge_target='dst',
            node_aggregate_fn='count',
            # node_pixel_shade_type='linear', 
            node_point_size=35, #node size is fixed set
            edge_render_type='direct', #direct, curved
            edge_transparency=0.7, #0.1 - 0.9
            tile_provider='CARTODBPOSITRON', 
            title='Graph for trip source_stations (color by count)'
        )
chart2 = cuxfilter.charts.bar('from_station_id')
chart3 = cuxfilter.charts.bar('to_station_id')

# Update the yaxis ticker to an easily readable format
chart2.y_axis_tick_formatter = NumeralTickFormatter(format="0,0")
chart3.y_axis_tick_formatter = NumeralTickFormatter(format="0,0")

        
# Generate the dashboard, select a layout and theme
d = cux_df.dashboard([chart1, chart2, chart3], sidebar=[widget1, widget2], layout=cuxfilter.layouts.feature_and_double_base, theme=cuxfilter.themes.rapids, title='Geospatial Trips')

TypeError: graph() got an unexpected keyword argument 'source'

In [22]:
# NOTE: Often a good visualization result only comes from a lot of trial and error
# The below parameters produce useful clustering, but try experimenting with them further
ITERATIONS=500
THETA=10.0

# Using the previously created edge list, we calculate the FA2 layout positions here
trips_force_atlas2_layout = cugraph.layout.force_atlas2(G, 
                max_iter=ITERATIONS,
                strong_gravity_mode=False,
                outbound_attraction_distribution=True,
                lin_log_mode=False,
                barnes_hut_optimize=True,
                barnes_hut_theta=THETA,
                verbose=True)

iteration: 1, speed: 1.5, speed_efficiency: 1.3, jt: 10, swinging: 8.36988e+16, traction: 4.18494e+16
iteration: 2, speed: 2.25, speed_efficiency: 1.69, jt: 10, swinging: 7.56409e+16, traction: 4.60386e+16
iteration: 3, speed: 3.375, speed_efficiency: 2.197, jt: 10, swinging: 2.41294e+15, traction: 8.40309e+15
iteration: 4, speed: 5.0625, speed_efficiency: 2.8561, jt: 10, swinging: 4.89662e+15, traction: 5.81458e+15
iteration: 5, speed: 7.59375, speed_efficiency: 3.71293, jt: 10, swinging: 2.21015e+15, traction: 4.05768e+15
iteration: 6, speed: 11.3906, speed_efficiency: 4.82681, jt: 10, swinging: 1.67745e+15, traction: 2.94359e+15
iteration: 7, speed: 17.0859, speed_efficiency: 6.27485, jt: 10, swinging: 2.41089e+15, traction: 2.55081e+15
iteration: 8, speed: 25.6289, speed_efficiency: 8.15731, jt: 10, swinging: 2.1805e+15, traction: 2.14101e+15
iteration: 9, speed: 38.4434, speed_efficiency: 10.6045, jt: 10, swinging: 1.13082e+15, traction: 1.4476e+15
iteration: 10, speed: 57.665, sp

In [23]:
final_df = trips_force_atlas2_layout.merge(
                trips[['from_station_id', 'from_station_name','to_station_id', 'year', 'hour', 'day_type', 'x', 'y']],
                left_on='vertex',
                right_on='from_station_id',
                suffixes=('', '_original')
)

# Check
final_df.head()

Unnamed: 0,vertex,x,y,from_station_id,from_station_name,to_station_id,year,hour,day_type,x_original,y_original
0,46,-4129.19873,-7893.118164,46,Wells St & Walton St,181,2014,15,0,-9755420.0,5146001.0
1,303,-6022.07666,5135.026855,303,Broadway & Cornelia Ave,157,2014,15,0,-9756706.0,5152821.0
2,19,839.606995,-23765.392578,19,Loomis St & Taylor St,22,2014,15,0,-9758377.0,5141439.0
3,24,-9674.814453,-10489.93457,24,Fairbanks Ct & Grand Ave,50,2014,15,0,-9753883.0,5144794.0
4,309,1451.296875,-3448.668701,309,Leavitt St & Armitage Ave,222,2014,15,0,-9760764.0,5148675.0


In [32]:
cux_df = cuxfilter.DataFrame.load_graph((final_df, edges))

In [38]:
edges.head() #edges

Unnamed: 0,from_station_id,to_station_id
0,131,303
1,134,194
2,238,316
3,100,192
4,267,322


In [None]:
final_df.head() #location

Unnamed: 0,vertex,x,y,from_station_id,from_station_name,to_station_id,year,hour,day_type,x_original,y_original
0,46,-4129.19873,-7893.118164,46,Wells St & Walton St,181,2014,15,0,-9755420.0,5146001.0
1,303,-6022.07666,5135.026855,303,Broadway & Cornelia Ave,157,2014,15,0,-9756706.0,5152821.0
2,19,839.606995,-23765.392578,19,Loomis St & Taylor St,22,2014,15,0,-9758377.0,5141439.0
3,24,-9674.814453,-10489.93457,24,Fairbanks Ct & Grand Ave,50,2014,15,0,-9753883.0,5144794.0
4,309,1451.296875,-3448.668701,309,Leavitt St & Armitage Ave,222,2014,15,0,-9760764.0,5148675.0


In [41]:
# NOTE: Both scatter and graph chart types use Datashader 
chart1 = cuxfilter.charts.graph(
          edge_source='from_station_id', edge_target='to_station_id',
          edge_color_palette=['gray', 'black'],
      #     ode_pixel_shade_type='linear',
          edge_render_type='curved', #curved, direct
          edge_transparency=0.6, #0.1 - 0.9
          title='ForceAtlas2 Graph'
      )
chart2 = cuxfilter.charts.scatter(
        x='x_original', y='y_original', 
        # tile_provider='CARTODBPOSITRON',
        point_size=4,
        pixel_shade_type='linear',
        pixel_spread='spread',
        title='Scatter Layout'
      )
chart3 = cuxfilter.charts.bar('hour', title='Trips per hour')
chart4 = cuxfilter.charts.bar('from_station_id', title='Source station')
chart5 = cuxfilter.charts.bar('to_station_id', title='Destination station')

widget1 = cuxfilter.charts.multi_select('year')
widget2 = cuxfilter.charts.multi_select('day_type', label_map={0:'weekday', 1:'weekend', '':'all'})


# Update the yaxis ticker to an easily readable format
chart3.y_axis_tick_formatter = NumeralTickFormatter(format="0,0")
chart4.y_axis_tick_formatter = NumeralTickFormatter(format="0,0")
chart5.y_axis_tick_formatter = NumeralTickFormatter(format="0,0")

# Custom layout
layout_array_3rds = [[1,1,2],[1,1,2],[3,4,5]]

# Generate the dashboard, select a layout and theme
d = cux_df.dashboard([chart1, chart2, chart3, chart4, chart5], sidebar=[widget1, widget2], layout_array = layout_array_3rds, theme=cuxfilter.themes.rapids, title="Network and Geospatial Graph")

In [42]:
d.show(port=8090)

Dashboard running at port 8090


BokehModel(combine_events=True, render_bundle={'docs_json': {'2d4278a8-9cec-4061-93af-f1d037ac1716': {'version…

2024-12-15 09:00:47,647 ERROR: panel.reactive - Callback failed for object named 'day_type' changing property {'value': ['', 1]} 
Traceback (most recent call last):
  File "/home/ai-lead/miniconda3/envs/sna-playground/lib/python3.11/site-packages/panel/reactive.py", line 461, in _process_events
    self.param.update(**self_params)
  File "/home/ai-lead/miniconda3/envs/sna-playground/lib/python3.11/site-packages/param/parameterized.py", line 2319, in update
    restore = dict(self_._update(arg, **kwargs))
                   ^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/ai-lead/miniconda3/envs/sna-playground/lib/python3.11/site-packages/param/parameterized.py", line 2352, in _update
    self_._batch_call_watchers()
  File "/home/ai-lead/miniconda3/envs/sna-playground/lib/python3.11/site-packages/param/parameterized.py", line 2546, in _batch_call_watchers
    self_._execute_watcher(watcher, events)
  File "/home/ai-lead/miniconda3/envs/sna-playground/lib/python3.11/site-packages/param/parame

2024-12-15 09:00:47,655 - tornado.application - ERROR - Exception in callback functools.partial(<bound method IOLoop._discard_future_result of <tornado.platform.asyncio.AsyncIOMainLoop object at 0x7f8307e263d0>>, <Task finished name='Task-978761' coro=<ServerSession.with_document_locked() done, defined at /home/ai-lead/miniconda3/envs/sna-playground/lib/python3.11/site-packages/bokeh/server/session.py:77> exception=MixedTypeError('Cannot create column with mixed types')>)
Traceback (most recent call last):
  File "/home/ai-lead/miniconda3/envs/sna-playground/lib/python3.11/site-packages/tornado/ioloop.py", line 750, in _run_callback
    ret = callback()
          ^^^^^^^^^^
  File "/home/ai-lead/miniconda3/envs/sna-playground/lib/python3.11/site-packages/tornado/ioloop.py", line 774, in _discard_future_result
    future.result()
  File "/home/ai-lead/miniconda3/envs/sna-playground/lib/python3.11/site-packages/bokeh/server/session.py", line 98, in _needs_document_lock_wrapper
    result