In [7]:
!pip install vaex
!pip install pandas h3 folium geojson matplotlib shapely geopandas

Collecting shapely
  Using cached shapely-2.0.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (2.3 MB)
Collecting geopandas
  Using cached geopandas-0.13.2-py3-none-any.whl (1.1 MB)
Collecting numpy>=1.14
  Using cached numpy-1.25.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (17.6 MB)
Collecting pandas>=1.1.0
  Using cached pandas-2.0.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (12.3 MB)
Collecting fiona>=1.8.19
  Using cached Fiona-1.9.4.post1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (16.4 MB)
Collecting pyproj>=3.0.1
  Using cached pyproj-3.6.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.9 MB)
Collecting click-plugins>=1.0
  Using cached click_plugins-1.1.1-py2.py3-none-any.whl (7.5 kB)
Collecting click~=8.0
  Using cached click-8.1.6-py3-none-any.whl (97 kB)
Collecting cligj>=0.5
  Using cached cligj-0.7.2-py3-none-any.whl (7.1 kB)
Collecting tzdata>=2022.1
  Using cached tzdata-2023.3-py2.py3-none-any

# Improvement ideas
- Choropleth for the areas with different colors for the usage
- Flow-Arc for most trips pickup >> dropoff
- Dot-Maps for dropoff or maybe payment card vs. cash
- Heatmaps at 4 different points of the day

In [4]:
import pandas as pd
import numpy as np
import h3
import folium
from folium import Map, Marker, GeoJson
import json
from geojson.feature import *
import os
import sys
import shapely
import shapely.wkt
from shapely.geometry import Polygon
import geopandas as gpd
from folium import plugins
from folium.plugins import HeatMap
import vaex

## Constants

In [None]:
# constants

# City-center coordinates as [latitude, longitude].
city_coords = [41.8781, -87.6298]

# H3 resolution used when assigning trips to hexagons.
# NOTE(review): the original note read "The resolution was tested, when decreasing the
# number of polygons does not decrease" - presumably this value was picked empirically; confirm.
resolution = 10

## Import and clean data

In [None]:
# Load the cleaned trip sample into a vaex DataFrame.
df_cleaned = vaex.from_csv('./data/cleaned_trips_with_location_small.csv')

# Drop trips where either census tract is missing, then store both tract ids as integers.
tract_columns = ['dropoff_census_tract', 'pickup_census_tract']
df_cleaned = df_cleaned.dropna(tract_columns)
for tract_column in tract_columns:
    df_cleaned[tract_column] = df_cleaned[tract_column].astype('int')

# Show the available columns as the cell output.
df_cleaned.get_column_names()

In [None]:
# Step 1: For each pickup and drop-off calculate the correct hexagon in the resolution 7
df_cleaned['pickup_hex'] = df_cleaned.apply(geo_to_h3, [df_cleaned['pickup_centroid_latitude'], df_cleaned['pickup_centroid_longitude']])
df_cleaned['dropoff_hex'] = df_cleaned.apply(geo_to_h3, [df_cleaned['dropoff_centroid_latitude'], df_cleaned['dropoff_centroid_longitude']])

## Global Functions

In [None]:
def geo_to_h3(col1, col2):
    return h3.geo_to_h3(col1,col2, resolution)

In [None]:
def hex_geo_id_to_polygon(hex_id):
    return Polygon(h3.h3_to_geo_boundary(h=hex_id, geo_json=True))

## Display Hexagons

In [None]:
# Distinct hexagon ids that occur as a pickup or as a drop-off location.
unique_pickup_values = df_cleaned['pickup_hex'].unique()
unique_dropoff_values = df_cleaned['dropoff_hex'].unique()

# Use an explicit set union instead of `+`: `+` only concatenates when unique() returns
# plain lists. Sorting gives the same hexagon order on every run.
combined_unique_values = sorted(set(unique_pickup_values) | set(unique_dropoff_values))

# Build one shapely Polygon per hexagon, in the same order as combined_unique_values.
hex_geo = [hex_geo_id_to_polygon(hex_id) for hex_id in combined_unique_values]

In [None]:
# Wrap the hexagon polygons in a GeoDataFrame (EPSG:4326) and render the interactive map.
gdf = gpd.GeoDataFrame(
    geometry=hex_geo,
    crs='EPSG:4326',
)
gdf.explore(tiles='OpenStreetMap', cmap='viridis')

## Display Hexagons with pickup and dropoff count

In [None]:
# Number of trips that start in each hexagon.
# The bare count aggregate is renamed from '_count'; the group column is renamed to 'hex'.
pickup_counts = df_cleaned.groupby(by='pickup_hex', agg=vaex.agg.count())
pickup_counts.rename("_count","pickup_counts")
pickup_counts.rename("pickup_hex","hex")

# Number of trips that end in each hexagon.
dropoff_counts = df_cleaned.groupby(by='dropoff_hex', agg=vaex.agg.count())
dropoff_counts.rename("_count","dropoff_counts")
dropoff_counts.rename("dropoff_hex","hex")

# One row per known hexagon, with both counts attached via left joins.
merged_df = (
    pd.DataFrame(combined_unique_values, columns=['hex'])
    .merge(pickup_counts.to_pandas_df(), on='hex', how='left')
    .merge(dropoff_counts.to_pandas_df(), on='hex', how='left')
)

# A hexagon that only occurs on one side has no match in the other table: treat it as 0.
# Cast back to int because the NaNs introduced by the left join turn the counts into floats.
count_columns = ['pickup_counts', 'dropoff_counts']
merged_df[count_columns] = merged_df[count_columns].fillna(0).astype(int)

# Add geometry (no reset_index(): it would only add a meaningless 'index' column to the map).
merged_df['geometry'] = merged_df['hex'].apply(hex_geo_id_to_polygon)

In [None]:
# Turn the per-hexagon counts (including their 'geometry' column) into a GeoDataFrame
# in EPSG:4326 and render the interactive map.
gdf = gpd.GeoDataFrame(
    merged_df,
    crs='EPSG:4326',
)
gdf.explore(tiles='OpenStreetMap', cmap='viridis')

## Display Heatmap

In [None]:


hexagon_heat_map = folium.Map(location=city_coords, zoom_start=11)

# Pickup counts per hexagon as a pandas frame with generic column names.
df = pickup_counts.to_pandas_df()
df.columns = ['geo', 'value']

def h3_to_polygon(h3_index):
    """Build a shapely Polygon from the boundary of the H3 cell `h3_index`.

    `geo_json=True` makes h3 return (lng, lat) vertices, i.e. the (x, y) order shapely
    expects; without it the polygon would be built from (lat, lng) pairs.
    """
    return Polygon(h3.h3_to_geo_boundary(h3_index, geo_json=True))

df['geometry'] = df['geo'].apply(h3_to_polygon)
gdf = gpd.GeoDataFrame(df, geometry='geometry', crs='EPSG:4326')

# Build the [lat, lon] points for the heat map by stacking the two coordinate columns.
# This avoids a row-wise Python apply and leaves df_cleaned without an extra column.
pickup_points = np.column_stack((
    np.asarray(df_cleaned['pickup_centroid_latitude'].values, dtype=float),
    np.asarray(df_cleaned['pickup_centroid_longitude'].values, dtype=float),
))
# Skip points with a missing coordinate; they cannot be placed on the map.
pickup_points = pickup_points[~np.isnan(pickup_points).any(axis=1)]

heat_data = pickup_points.tolist()
HeatMap(heat_data).add_to(hexagon_heat_map)

hexagon_heat_map

# Census Tracts
A census tract is a small, relatively permanent statistical subdivision of a county or equivalent entity that is defined by the U.S. Census Bureau. Census tracts are designed to be relatively homogeneous in terms of population characteristics, economic status, and living conditions. They typically contain between 1,200 and 8,000 people, with an optimum size of about 4,000 people.

In [1]:
def transform_column_to_geometry(df, column_name):
    """Parse a column of WKT strings into shapely geometries.

    The parsed objects are stored in a 'geometry' column on `df` itself (added or
    overwritten in place), and the same DataFrame is returned.

    Args:
        df: pandas DataFrame holding the WKT column.
        column_name: name of the column whose values are passed to `shapely.wkt.loads`.

    Returns:
        `df`, now carrying the 'geometry' column.
    """
    # Assign a plain list so the values are placed row by row, whatever the index looks like.
    df['geometry'] = [shapely.wkt.loads(wkt_text) for wkt_text in df[column_name]]
    return df

In [2]:
def create_census_map(map, df, color='#0000ff'):
    """Draw every area in `df` on a folium map and return the map.

    Args:
        map: the folium map the layers are added to.
        df: pandas DataFrame with a 'the_geom' column of WKT strings. If it also has a
            'label' column, each row with a non-missing label gets a marker with that
            label as popup, placed at the centroid of the area.
        color: fill and outline color used for all areas of this call.

    Returns:
        The same `map` object, with the new layers added.
    """
    # One style for the whole call; the style function ignores the feature it is given.
    area_style = {'fillColor': color, 'color': color}
    has_labels = 'label' in df

    for _, row in df.iterrows():
        polygon = shapely.wkt.loads(row['the_geom'])

        # pd.notna() skips both None and NaN labels, so no "nan" popups are created.
        if has_labels and pd.notna(row['label']):
            centroid = polygon.centroid
            folium.Marker(location=[centroid.y, centroid.x], popup=row['label']).add_to(map)

        folium.GeoJson(
            polygon.__geo_interface__,
            style_function=lambda feature: area_style,
        ).add_to(map)

    return map

## Display Census Tracts

In [5]:
# Read the census tract table and parse its WKT column 'the_geom' into a 'geometry' column.
df_census_tracts = transform_column_to_geometry(
    pd.read_csv("./data/chicago_census_tracts.csv"),
    'the_geom',
)

# Render the tracts on an interactive map (EPSG:4326).
gdf = gpd.GeoDataFrame(
    df_census_tracts,
    crs='EPSG:4326',
)
gdf.explore(tiles='OpenStreetMap', cmap='viridis')

ImportError: The 'folium', 'matplotlib' and 'mapclassify' packages are required for 'explore()'. You can install them using 'conda install -c conda-forge folium matplotlib mapclassify' or 'pip install folium matplotlib mapclassify'.

## Display Community Area
A community area is a larger administrative division within a city or metropolitan area. It is a way of organizing neighborhoods and communities for planning and statistical purposes.

In [None]:
# Read the community area table and parse its WKT column 'the_geom' into a 'geometry' column.
df_community_areas = transform_column_to_geometry(
    pd.read_csv("./data/community_areas.csv"),
    'the_geom',
)

# Render the community areas on an interactive map (EPSG:4326).
gdf = gpd.GeoDataFrame(
    df_community_areas,
    crs='EPSG:4326',
)
gdf.explore(tiles='OpenStreetMap', cmap='viridis')

In [None]:
# Reuses df_community_areas from the previous cell instead of reading the CSV again.
community_areas_map = folium.Map(location=city_coords, zoom_start=11)

red_style = {'fillColor': 'red', 'color': 'red'}

for wkt_text in df_community_areas['the_geom']:
    area = shapely.wkt.loads(wkt_text)
    # The style function has to be passed to the constructor: folium decides at
    # construction time whether a layer is styled, so assigning `style_function`
    # afterwards leaves the areas in the default color.
    folium.GeoJson(
        area.__geo_interface__,
        style_function=lambda feature: red_style,
    ).add_to(community_areas_map)

community_areas_map

# Analysis

## Census tracts with most pickups

In [None]:
# Number of trips per pickup census tract, most frequent first.
most_pickups = df_cleaned.groupby(by='pickup_census_tract', agg=vaex.agg.count())
most_pickups.rename('_count', 'count')
most_pickups = most_pickups.sort(by='count', ascending=False).to_pandas_df()

# Lookup table: census tract id -> number of pickups.
pickups_per_tract = most_pickups.set_index('pickup_census_tract')['count']

# Keep only the tracts that have at least one pickup. The explicit copy lets us add a
# column without writing into a slice of df_census_tracts.
has_pickups = df_census_tracts['GEOID10'].isin(pickups_per_tract.index)
df_most_pickups_census = df_census_tracts.loc[has_pickups, ['GEOID10', 'the_geom', 'NAME10']].copy()

# Attach the counts in one vectorized lookup instead of one boolean mask per tract.
df_most_pickups_census['count'] = df_most_pickups_census['GEOID10'].map(pickups_per_tract)

df_most_pickups_census = transform_column_to_geometry(df_most_pickups_census, 'the_geom')
df_most_pickups_census = df_most_pickups_census.loc[:, ['geometry', 'count', 'NAME10']]

gdf = gpd.GeoDataFrame(df_most_pickups_census, crs='EPSG:4326')
gdf.explore(column='count',cmap='viridis', tiles='OpenStreetMap')

## Census tracts with most dropoffs

In [None]:
# Number of trips per drop-off census tract, most frequent first.
most_dropoffs = df_cleaned.groupby(by='dropoff_census_tract', agg=vaex.agg.count())
most_dropoffs.rename('_count', 'count')
most_dropoffs = most_dropoffs.sort(by='count', ascending=False).to_pandas_df()

# Lookup table: census tract id -> number of drop-offs.
dropoffs_per_tract = most_dropoffs.set_index('dropoff_census_tract')['count']

# Keep only the tracts that have at least one drop-off. The explicit copy lets us add a
# column without writing into a slice of df_census_tracts.
has_dropoffs = df_census_tracts['GEOID10'].isin(dropoffs_per_tract.index)
df_most_dropoffs_census = df_census_tracts.loc[has_dropoffs, ['GEOID10', 'the_geom', 'NAME10']].copy()

# Attach the counts in one vectorized lookup instead of one boolean mask per tract.
df_most_dropoffs_census['count'] = df_most_dropoffs_census['GEOID10'].map(dropoffs_per_tract)

df_most_dropoffs_census = transform_column_to_geometry(df_most_dropoffs_census, 'the_geom')
df_most_dropoffs_census = df_most_dropoffs_census.loc[:, ['geometry', 'count', 'NAME10']]

gdf = gpd.GeoDataFrame(df_most_dropoffs_census, crs='EPSG:4326')
gdf.explore(column='count',cmap='viridis', tiles='OpenStreetMap')

## Top 10 most driven routes
Each line runs from the pickup tract (red half) to the drop-off tract (green half).

In [None]:
most_driven_routes_map = folium.Map(location=[41.8781, -87.6298], zoom_start=11)

# Count trips per (pickup tract, drop-off tract) pair and keep the 10 most frequent pairs.
routes_counts = df_cleaned.groupby(by=['pickup_census_tract', 'dropoff_census_tract'], agg=vaex.agg.count())
routes_counts.rename('_count', 'count')
sorted_routes = routes_counts.sort(by='count', ascending=False).to_pandas_df()
top_routes = sorted_routes.head(10)

# WKT outline of every census tract, looked up by tract id.
polygons = df_census_tracts.set_index('GEOID10')['the_geom']

# Every tract that occurs in a top route, no matter whether as pickup or as drop-off.
census_tracts = list(
    set(top_routes['pickup_census_tract'].tolist())
    | set(top_routes['dropoff_census_tract'].tolist())
)

# Draw the outline of each of these tracts, skipping ids without a known polygon.
for tract_id in census_tracts:
    if tract_id not in polygons:
        continue
    outline = shapely.wkt.loads(polygons[tract_id])
    folium.GeoJson(outline.__geo_interface__).add_to(most_driven_routes_map)

# Draw one line per route between the tract centroids. folium's ColorLine colors the
# first half (start -> midpoint) red and the second half (midpoint -> end) green,
# which shows the direction of the route.
route_endpoints = zip(top_routes['pickup_census_tract'], top_routes['dropoff_census_tract'])
for pickup_tract, dropoff_tract in route_endpoints:
    if not (pickup_tract in polygons and dropoff_tract in polygons):
        continue

    start = shapely.wkt.loads(polygons[pickup_tract]).centroid
    end = shapely.wkt.loads(polygons[dropoff_tract]).centroid
    midpoint = [(start.y + end.y) / 2, (start.x + end.x) / 2]

    folium.features.ColorLine(
        [[start.y, start.x], midpoint, [end.y, end.x]],
        [0, 1],
        colormap=['red', 'green'],
        weight=5
    ).add_to(most_driven_routes_map)

print('Red means start and green end')

most_driven_routes_map

## Pickup Areas with the longest routes

In [None]:
# Longest single trip (in miles) that started in each pickup census tract.
most_miles_vaex = df_cleaned.groupby(by='pickup_census_tract', agg={'max_trip_miles': vaex.agg.max('trip_miles')})
most_miles_vaex = most_miles_vaex.sort(by='max_trip_miles', ascending=False)
df_most_miles = most_miles_vaex.to_pandas_df()

# Lookup table: census tract id -> longest trip.
max_miles_per_tract = df_most_miles.set_index('pickup_census_tract')['max_trip_miles']

# Keep only the tracts that occur as a pickup location. The explicit copy lets us add a
# column without writing into a slice of df_census_tracts.
has_trips = df_census_tracts['GEOID10'].isin(max_miles_per_tract.index)
df_pickups_most_miles_census = df_census_tracts.loc[has_trips, ['GEOID10', 'the_geom', 'NAME10']].copy()

# Attach the values in one vectorized lookup instead of one boolean mask per tract.
df_pickups_most_miles_census['max_trip_miles'] = df_pickups_most_miles_census['GEOID10'].map(max_miles_per_tract)

df_pickups_most_miles_census = transform_column_to_geometry(df_pickups_most_miles_census, 'the_geom')
df_pickups_most_miles_census = df_pickups_most_miles_census.loc[:, ['geometry', 'max_trip_miles', 'NAME10']]

gdf = gpd.GeoDataFrame(df_pickups_most_miles_census, crs='EPSG:4326')
gdf.explore(column='max_trip_miles',cmap='viridis', tiles='OpenStreetMap')

## Pickup areas by average tip

In [None]:
# Average tip and number of trips per pickup census tract.
df_trips_by_tip = df_cleaned.groupby(by='pickup_census_tract', agg={'mean_tips': vaex.agg.mean('tips'), 'count': vaex.agg.count()})
# Rename by name rather than by position, so the result does not depend on column order.
df_trips_by_tip = df_trips_by_tip.to_pandas_df().rename(
    columns={'mean_tips': 'average_tip', 'count': 'number_of_trips'}
)

# Highest average tip first, with a fresh 0..n-1 index.
df_census_pickup_highest_tip = df_trips_by_tip.sort_values('average_tip', ascending=False).reset_index(drop=True)

# Lookup table: census tract id -> average tip.
average_tip_per_tract = df_census_pickup_highest_tip.set_index('pickup_census_tract')['average_tip']

# Keep only the tracts that occur as a pickup location. The explicit copy lets us add a
# column without writing into a slice of df_census_tracts.
has_trips = df_census_tracts['GEOID10'].isin(average_tip_per_tract.index)
df_census_highest_tip = df_census_tracts.loc[has_trips, ['GEOID10', 'the_geom', 'NAME10']].copy()

# Attach the values in one vectorized lookup instead of one boolean mask per tract.
df_census_highest_tip['average_tip'] = df_census_highest_tip['GEOID10'].map(average_tip_per_tract)

df_census_highest_tip = transform_column_to_geometry(df_census_highest_tip, 'the_geom')
df_census_highest_tip = df_census_highest_tip.loc[:, ['geometry', 'average_tip', 'NAME10']]

gdf = gpd.GeoDataFrame(df_census_highest_tip, crs='EPSG:4326')
gdf.explore(column='average_tip',cmap='viridis', tiles='OpenStreetMap')

## Taxis with the greatest total trip distance

In [None]:
# Groupby and aggregation
taxis_with_longest_distance = df_cleaned.groupby(by='taxi_id', agg={'trip_miles_sum': vaex.agg.sum('trip_miles')})

# Sorting
taxis_with_longest_distance = taxis_with_longest_distance_vaex.sort(by='trip_miles_sum', ascending=False)

taxis_with_longest_distance.head(10)

## Most used payment methods for every area

In [None]:
payment_map = folium.Map(location=city_coords, zoom_start=11)

# Number of trips per (pickup census tract, payment type).
payment_counts = df_cleaned.groupby(by=['pickup_census_tract', 'payment_type'], agg=vaex.agg.count())
payment_counts.rename('_count', 'count')
payment_counts = payment_counts.to_pandas_df()

# For every row, the highest count reached by any payment type in the SAME tract.
# Comparing against this keeps a payment type only where it is the most used one in
# its own tract; on a tie, every tied payment type is kept.
max_counts = payment_counts.groupby('pickup_census_tract')['count'].transform('max')
most_used_payments = payment_counts[payment_counts['count'] == max_counts]

# Split the winners by payment type.
cash_payments = most_used_payments[most_used_payments['payment_type'] == 'Cash']
credit_card_payments = most_used_payments[most_used_payments['payment_type'] == 'Credit Card']

# Extract the census tract ids for each payment type.
cash_place_ids = cash_payments['pickup_census_tract'].unique().tolist()
credit_card_place_ids = credit_card_payments['pickup_census_tract'].unique().tolist()

# Select the matching tracts. df_census_tracts is only read here, never modified,
# so the other cells keep working with an unchanged tract table.
df_cash_census = df_census_tracts[df_census_tracts['GEOID10'].isin(cash_place_ids)]
df_credit_card_census = df_census_tracts[df_census_tracts['GEOID10'].isin(credit_card_place_ids)]

payment_map = create_census_map(payment_map, df_cash_census, 'green')
payment_map = create_census_map(payment_map, df_credit_card_census, 'red')
print("green is cash; red is credit card")

payment_map