## 1.0e Adding Hexagons

In [1]:
# Standard libraries - run pip install if necessary
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from datetime import datetime

# Geospatial libraries
from h3 import h3 
import geopandas as gp
import folium
from shapely.ops import unary_union
from shapely.geometry.polygon import Polygon
## Color for map 
import branca
import branca.colormap as cm

In [10]:
# Load the prepped taxi trips CSV; the path is relative to the notebook's working directory.
df = pd.read_csv("data/prepped/prep_taxidata.csv")

In [11]:
# Preview the first three rows to see which columns were loaded.
df.head(3)

Unnamed: 0.1,Unnamed: 0,trip_id,taxi_id,trip_start_timestamp,trip_end_timestamp,trip_seconds,trip_miles,pickup_census_tract,dropoff_census_tract,fare,...,pickup_centroid_location,dropoff_centroid_latitude,dropoff_centroid_longitude,dropoff_centroid_location,trip_hours,mph,pickup_community,pickup_area_number,dropoff_community,dropoff_area_number
0,0,4404c6835b9e74e9f74d70f235200a8ce09db14a,7e179f8ef66ae99ec2d1ec89224e0b7ee5469fe5627f6d...,2022-12-31 23:45:00,2023-01-01 00:15:00,2081.0,4.42,,,20.5,...,POINT (-87.6950125892 42.001571027),41.965812,-87.655879,POINT (-87.6558787862 41.96581197),0.578056,7.646324,WEST RIDGE,2,UPTOWN,3
1,1,466473fd2a196ebe92fb2983cb7e8af32e39aa1f,d1d88b89ceb6d753007b6e795e3c24f4bea905a51e9d47...,2022-12-31 23:45:00,2023-01-01 00:00:00,812.0,0.0,,,13.84,...,POINT (-87.6333080367 41.899602111),41.901207,-87.676356,POINT (-87.6763559892 41.9012069941),0.225556,0.0,NEAR NORTH SIDE,8,WEST TOWN,24
2,2,3f5cd3f78e5cab455606a31372a95d3204b2fb3f,847cf962bd6f62040673e6c24c24940aeb2d7fdaa54677...,2022-12-31 23:45:00,2023-01-01 00:00:00,600.0,0.9,,,7.0,...,POINT (-87.6333080367 41.899602111),41.899602,-87.633308,POINT (-87.6333080367 41.899602111),0.166667,5.4,NEAR NORTH SIDE,8,NEAR NORTH SIDE,8


In [12]:
# Drop index cols
# The CSV carries leftover "Unnamed: ..." columns (presumably indexes saved by
# earlier to_csv round-trips). Selecting them by prefix removes all of them and
# keeps this cell idempotent: re-running it drops nothing instead of raising
# KeyError, which dropping "Unnamed: 0" by name would do.
unnamed_cols = [col for col in df.columns if str(col).startswith("Unnamed")]
df = df.drop(columns=unnamed_cols)

In [14]:
# (rows, columns) of the frame after dropping the index column.
df.shape

(5251899, 27)

In [15]:
# Get hex ids
def add_h3_ids(df, res):
    """Add the H3 hexagon id of each trip's pickup and dropoff centroid.

    Two columns are written into ``df`` in place: ``h3_res{res}_pickup`` and
    ``h3_res{res}_dropoff``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``pickup_centroid_latitude``, ``pickup_centroid_longitude``,
        ``dropoff_centroid_latitude`` and ``dropoff_centroid_longitude``.
    res : int
        H3 resolution handed to ``h3.geo_to_h3``.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, now carrying the two hex-id columns.
    """
    # np.vectorize is a Python-level loop. Fixing `otypes` stops it from
    # calling h3 an extra time on the first row just to guess the output type,
    # and lets an empty frame pass through instead of raising ValueError.
    to_hex_id = np.vectorize(
        lambda lat, lng: h3.geo_to_h3(lat, lng, res), otypes=[object])

    for side in ("pickup", "dropoff"):
        latitudes = df[f"{side}_centroid_latitude"].to_numpy()
        longitudes = df[f"{side}_centroid_longitude"].to_numpy()
        df[f"h3_res{res}_{side}"] = to_hex_id(latitudes, longitudes)
    return df

# Get poly from hex ids - one polygon per distinct hexagon, reused for every row
def poly_from_hex(df, colname, res):
    """Add a column holding the boundary polygon of each row's H3 hexagon.

    Reads ``h3_res{res}_{colname}`` and writes ``poly_res{res}_{colname}``
    into ``df`` in place.

    Many trips share the same hexagon, so the boundary is built once per
    distinct hex id and then fanned out to the rows, instead of constructing
    one shapely ``Polygon`` per row.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that already contains the ``h3_res{res}_{colname}`` column.
    colname : str
        Suffix of the hex-id column, e.g. ``"pickup"`` or ``"dropoff"``.
    res : int
        H3 resolution used in the column names.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with the polygon column added. Rows whose
        hex id is missing get ``None`` as polygon.
    """
    hex_ids = df[f"h3_res{res}_{colname}"].to_numpy()

    # codes[i] is the position of hex_ids[i] in distinct_ids; missing ids get -1.
    codes, distinct_ids = pd.factorize(hex_ids)

    # One extra trailing slot is left as None so that code -1 (missing id)
    # indexes it and yields None rather than wrapping onto a real polygon.
    polygons = np.empty(len(distinct_ids) + 1, dtype=object)
    for slot, hex_id in enumerate(distinct_ids):
        polygons[slot] = Polygon(h3.h3_to_geo_boundary(hex_id, geo_json=True))

    df[f"poly_res{res}_{colname}"] = polygons[codes]
    return df

# Get count for each trip happening in the same hexagon
def get_poly_count(df, colname):
    """Attach, to every row, the number of trips sharing its ``colname`` value.

    The new column is named ``count`` followed directly by the second and
    third ``_``-separated parts of ``colname`` joined with ``_`` (so
    ``h3_res7_pickup`` yields ``countres7_pickup``). It is written into
    ``df`` in place and the same frame is returned.
    """
    parts = colname.split("_")
    suffix = f"{parts[1]}_{parts[2]}"
    trips_per_group = df.groupby(colname)['trip_id']
    df[f"count{suffix}"] = trips_per_group.transform('count')
    return df

To decide on an appropriate hexagon size, we used the table from https://towardsdatascience.com/exploring-location-data-using-a-hexagon-grid-3509b68b04a2 as a guide. Based on the size of the community areas, an H3 resolution of 7 or 8 is the best fit.

In [17]:
# Add hex ids at both candidate resolutions. Each call is fed the previous
# result, so the final frame holds all four columns without relying on
# add_h3_ids mutating `df` behind the scenes.
hex_df = df
for res in (7, 8):
    hex_df = add_h3_ids(hex_df, res)

In [20]:
# Preview the frame to confirm the four h3_res* columns were appended.
hex_df.head()

Unnamed: 0,trip_id,taxi_id,trip_start_timestamp,trip_end_timestamp,trip_seconds,trip_miles,pickup_census_tract,dropoff_census_tract,fare,tips,...,trip_hours,mph,pickup_community,pickup_area_number,dropoff_community,dropoff_area_number,h3_res7_pickup,h3_res7_dropoff,h3_res8_pickup,h3_res8_dropoff
0,4404c6835b9e74e9f74d70f235200a8ce09db14a,7e179f8ef66ae99ec2d1ec89224e0b7ee5469fe5627f6d...,2022-12-31 23:45:00,2023-01-01 00:15:00,2081.0,4.42,,,20.5,0.0,...,0.578056,7.646324,WEST RIDGE,2,UPTOWN,3,872664d8effffff,872664d89ffffff,882664d8e1fffff,882664d897fffff
1,466473fd2a196ebe92fb2983cb7e8af32e39aa1f,d1d88b89ceb6d753007b6e795e3c24f4bea905a51e9d47...,2022-12-31 23:45:00,2023-01-01 00:00:00,812.0,0.0,,,13.84,2.73,...,0.225556,0.0,NEAR NORTH SIDE,8,WEST TOWN,24,872664c1effffff,872664cacffffff,882664c1edfffff,882664cac3fffff
2,3f5cd3f78e5cab455606a31372a95d3204b2fb3f,847cf962bd6f62040673e6c24c24940aeb2d7fdaa54677...,2022-12-31 23:45:00,2023-01-01 00:00:00,600.0,0.9,,,7.0,2.0,...,0.166667,5.4,NEAR NORTH SIDE,8,NEAR NORTH SIDE,8,872664c1effffff,872664c1effffff,882664c1edfffff,882664c1edfffff
3,38292159642750da7b20419330566f9eb0961cde,81092e4881f56106fae845c3ae4492f8b3c3213c33c920...,2022-12-31 23:45:00,2023-01-01 00:00:00,546.0,0.85,,,6.5,0.0,...,0.151667,5.604396,NEAR NORTH SIDE,8,NEAR NORTH SIDE,8,872664c1effffff,872664c1effffff,882664c1edfffff,882664c1edfffff
4,3e01498f8ff771ad7eb37e4844cef20201b6c339,4ae32e2eb244ce143800e0c40055e537cc50e3358a07ce...,2022-12-31 23:45:00,2023-01-01 00:00:00,574.0,0.33,,,6.25,0.0,...,0.159444,2.069686,NEAR NORTH SIDE,8,NEAR NORTH SIDE,8,872664c1effffff,872664c1effffff,882664c1edfffff,882664c1edfffff


In [21]:
# (rows, columns) after adding the hex-id columns.
hex_df.shape

(5251899, 31)

In [23]:
# Check if unusable data is given:
# count the trips whose resolution-7 pickup or dropoff hex id is the string "0".
pickup_is_zero = hex_df["h3_res7_pickup"].eq("0")
dropoff_is_zero = hex_df["h3_res7_dropoff"].eq("0")
n_zero_ids = int((pickup_is_zero | dropoff_is_zero).sum())
print("Number of hex ids equal to 0: ", n_zero_ids)

Number of hex ids equal to 0:  0


In [24]:
# Get polygon from hex ids
# poly_from_hex writes the new column into hex_df and returns that same frame,
# so hex_df_poly and hex_df refer to one object.
hex_df_poly = poly_from_hex(hex_df, "pickup", 7)


KeyboardInterrupt



In [None]:
# Polygons for the resolution-7 dropoff hexagons.
hex_df_poly = poly_from_hex(hex_df, "dropoff", 7)

In [None]:
# Polygons for the resolution-8 pickup hexagons.
hex_df_poly = poly_from_hex(hex_df, "pickup", 8)

In [29]:
# Polygons for the resolution-8 dropoff hexagons.
hex_df_poly = poly_from_hex(hex_df, "dropoff", 8)


KeyboardInterrupt



In [None]:
# Preview the frame with the polygon columns added.
hex_df_poly.head()

In [26]:
# Each completed poly_from_hex call adds one column, so the column count shows how many ran.
hex_df_poly.shape

(5251899, 34)

In [27]:
# Number of missing values per column.
hex_df_poly.isna().sum()

trip_id                             0
taxi_id                             0
trip_start_timestamp                0
trip_end_timestamp                  0
trip_seconds                        0
trip_miles                          0
pickup_census_tract           2841435
dropoff_census_tract          2841435
fare                                0
tips                                0
tolls                               0
extras                              0
trip_total                          0
payment_type                        0
company                             0
pickup_centroid_latitude            0
pickup_centroid_longitude           0
pickup_centroid_location            0
dropoff_centroid_latitude           0
dropoff_centroid_longitude          0
dropoff_centroid_location           0
trip_hours                          0
mph                                 0
pickup_community                    0
pickup_area_number                  0
dropoff_community                   0
dropoff_area

In [30]:
# Make a geodf out of it for simple plotting
def _hex_count_gdf(frame, res, side):
    """Return a GeoDataFrame with one row per hexagon and its trip count.

    The plotting cell colours hexagons by a ``count`` column, so the trips are
    collapsed to one row per distinct ``h3_res{res}_{side}`` id here. Each row
    keeps the hex id, the number of trips (non-null ``trip_id``) in that
    hexagon as ``count``, and the hexagon polygon as ``geometry``. Using one
    geometry per trip would draw the same hexagon once for every trip in it.
    """
    hex_col = f"h3_res{res}_{side}"
    poly_col = f"poly_res{res}_{side}"
    per_hex = (
        frame.groupby(hex_col)
        .agg(count=("trip_id", "count"), geometry=(poly_col, "first"))
        .reset_index()
    )
    return gp.GeoDataFrame(per_hex, geometry="geometry", crs='EPSG:4326')


gdf_res7_pickup = _hex_count_gdf(hex_df_poly, 7, "pickup")
gdf_res7_dropoff = _hex_count_gdf(hex_df_poly, 7, "dropoff")
# gdf_res8_pickup = _hex_count_gdf(hex_df_poly, 8, "pickup")
# gdf_res8_dropoff = _hex_count_gdf(hex_df_poly, 8, "dropoff")

In [None]:
# Visualize
# Each GeoDataFrame listed below must provide a `count` column to colour by.
titles = ["Pickup (RES 7)", "Dropoff (RES 7)",
          # "Pickup (RES 8)", "Dropoff (RES 8)"
         ]
dfs = [gdf_res7_pickup, gdf_res7_dropoff,
       # gdf_res8_pickup, gdf_res8_dropoff
      ]

fig, axs = plt.subplots(nrows=2, ncols=2, figsize=(10, 10))
axs = axs.flatten()

# zip stops at the shortest sequence, so the loop follows the number of frames
# actually listed instead of a hard-coded range that can run past the lists.
for ax, gdf, title in zip(axs, dfs, titles):
    gdf.plot(column="count", ax=ax, legend=True)
    ax.set_title(title)

# Hide the grid cells that have no map (e.g. while the RES 8 entries are
# commented out) so the figure does not show empty axes.
for unused_ax in axs[len(dfs):]:
    unused_ax.set_axis_off()

plt.tight_layout()
plt.show()

In [None]:
# TODO: add a function that does the plotting like the commented-out code below, then save.

In [None]:
# df_pickup_res8['geometry'] = df_pickup_res7.apply(lambda x: Polygon(h3.h3_to_geo_boundary(x["h3_pickup_res8"], geo_json=True)), axis=1)
# trips_starts_geo = gp.GeoDataFrame(df_pickup_res7, geometry=df_pickup_res7['geometry'], crs='EPSG:4326')
# trips_starts_geo.plot(column='count')

In [None]:
# TODO: save this DataFrame as TAXI_HEXA_PREP.