# Porto Taxi Data Preprocessing

In [None]:
import holoviews as hv
from hvplot import pandas
from holoviews import opts
from holoviews.element import tiles

import sys
sys.path.append("..")
from mobiml.datasets import PortoTaxis
import pandas as pd
import geopandas as gpd
import movingpandas as mpd
import h3
from shapely.geometry import Polygon
from bokeh.io import output_notebook

# Enable Bokeh output inside the notebook.
output_notebook()

# Make wheel zoom the default active tool on every HoloViews overlay.
opts.defaults(opts.Overlay(active_tools=['wheel_zoom']))
# CartoLight tile source, kept as a shared background layer for map plots.
BG_TILES = tiles.CartoLight()

#### Approach 1: data for 1 day (possible to automate)

In [None]:
# Load all taxi data records from the training CSV into one DataFrame.
# The relative path assumes the notebook's folder layout.
taxis_all = pd.read_csv(r"../examples/data/train.csv")  # change to your file setup

In [None]:
# Create a new "date" column with a human-readable, timezone-aware (UTC)
# datetime parsed from the Unix epoch seconds stored in "TIMESTAMP".
taxis_all["date"] = pd.to_datetime(taxis_all["TIMESTAMP"], utc=True, unit='s', origin='unix')
# Preview the first two rows.
taxis_all.head(2)

In [None]:
# Break "date" down into one column per calendar/time component.
# The columns are appended in the order listed in the tuple.
taxis_all = taxis_all.assign(
    **{
        part: getattr(taxis_all["date"].dt, part)
        for part in ("year", "month", "day", "hour", "minute", "second")
    }
)

In [None]:
# Use "date" as the index so the frame can be grouped by time frequency
# (pd.Grouper(freq=...)) afterwards.
taxis_all = taxis_all.set_index("date")

In [None]:
# Grouping recipe: https://stackoverflow.com/a/67604265

# Split the date-indexed frame into a list with one DataFrame per calendar day
# (UTC, freq="1D"); for the full dataset this is expected to be ~365 entries.
results = [part for _, part in taxis_all.groupby(pd.Grouper(freq="1D"))]

In [None]:
# Wrap the first daily frame (expected to be 01.07.2013) in a PortoTaxis dataset.
# reset_index() turns the "date" index back into a regular column first.
taxi_2013_7_1 = PortoTaxis(results[0].reset_index())
# Preview the first rows of the dataset's .df attribute.
taxi_2013_7_1.df.head()

In [None]:
# Call to_trajs(); the result is not stored, only shown as the cell output.
# NOTE(review): presumably returns the day's data as trajectories - confirm
# against mobiml's PortoTaxis.to_trajs.
taxi_2013_7_1.to_trajs()

In [None]:
# Create a GeoDataFrame and derive plain "x" / "y" coordinate columns from its
# "geometry" column (uses GeoSeries.x / .y, so point geometries are expected).
taxi_gdf = taxi_2013_7_1.to_gdf()
taxi_gdf["x"] = taxi_gdf.geometry.x
taxi_gdf["y"] = taxi_gdf.geometry.y
# Preview the first two rows.
taxi_gdf.head(2)

In [None]:
# H3 recipe adapted from:
# https://medium.com/@jesse.b.nestler/how-to-convert-h3-cell-boundaries-to-shapely-polygons-in-python-f7558add2f63
# Coordinate convention: x = longitude, y = latitude; h3 takes (lat, lng).

# Identify the H3 cell of every row at the chosen resolution.
res = 7
col = f"H3_{res}"
# Iterate over the two coordinate columns directly instead of a row-wise
# DataFrame.apply(axis=1): apply builds a Series per row, which is much slower,
# and a plain list assignment also works when the frame is empty.
# NOTE: geo_to_h3 is the h3-py v3 name (renamed latlng_to_cell in v4).
taxi_gdf[col] = [
    str(h3.geo_to_h3(lat, lng, res))
    for lat, lng in zip(taxi_gdf["y"], taxi_gdf["x"])
]

In [None]:
# Remove rows whose cell id is the string "0" to avoid errors downstream.
# NOTE(review): "0" is presumably h3's null index for points that could not
# be indexed - confirm.
# Filter through `col` instead of the hard-coded attribute `H3_7`, so the cell
# keeps working when `res` is changed.
taxi_gdf = taxi_gdf[taxi_gdf[col] != "0"]

In [None]:
# Aggregation recipe adapted from:
# https://medium.com/@jesse.b.nestler/how-to-convert-h3-cell-boundaries-to-shapely-polygons-in-python-f7558add2f63

# Count the non-null "traj_id" values per H3 cell and expose the result as a
# "traj_id_count" column. Add "hour" to the groupby keys for counts per hour.
h3_df = (
    taxi_gdf.groupby([col])["traj_id"]
    .count()
    .rename("traj_id_count")
    .reset_index()
)
h3_df

In [None]:
# Polygon conversion adapted from:
# https://medium.com/@jesse.b.nestler/how-to-convert-h3-cell-boundaries-to-shapely-polygons-in-python-f7558add2f63

def cell_to_shapely(cell):
    """Return the boundary of H3 cell *cell* as a shapely Polygon.

    Each boundary vertex returned by h3 is reversed before building the
    polygon, i.e. (lat, lng) pairs become (lng, lat) = (x, y).
    """
    boundary = h3.h3_to_geo_boundary(cell)
    return Polygon([vertex[::-1] for vertex in boundary])


# One polygon per H3 cell, e.g. for visualisation in QGIS.
h3_geoms = h3_df[col].apply(cell_to_shapely)
h3_gdf = gpd.GeoDataFrame(h3_df, geometry=h3_geoms, crs=4326)

In [None]:
# Write the per-cell counts and polygons to a GeoPackage (layer "traj_id").
h3_gdf.to_file(r"../examples/data/taxi_h3.gpkg", driver="GPKG", layer="traj_id")

In [None]:
# Select the rows that fall into one specific H3 cell to spot-check its count.
# NOTE: the column name "H3_7" and the cell id are hard-coded, so this check
# only applies while the index was built with res == 7.
df_cell = taxi_gdf[taxi_gdf["H3_7"] == "87392201dffffff"]

In [None]:
#"traj_id" count in cell
# Number of rows of taxi_gdf that fall into the selected cell.
len(df_cell)

In [None]:
# Build a movingpandas TrajectoryCollection from the rows in the selected cell,
# grouped by "traj_id", with "timestamp" as time column and "x"/"y" as
# coordinates. Only rows inside the cell are passed in, hence (sub)trajectories.
traj_collection = mpd.TrajectoryCollection(df_cell, "traj_id", t="timestamp", x="x", y="y")
print(traj_collection)

In [None]:
# Convert the (sub)trajectories in the cell to a GeoDataFrame and write it to
# a GeoPackage (layer "traj_id") so they can be viewed in QGIS.
traj_gdf = traj_collection.to_traj_gdf()
traj_gdf.to_file(r"../examples/data/traj_gdf.gpkg", driver="GPKG", layer="traj_id")

#### Approach 2: data for 1 day

In [None]:
# Re-load all taxi records from the CSV; this replaces the date-indexed
# taxis_all built in Approach 1. Adjust the path to your file setup.
taxis_all = pd.read_csv(r"../examples/data/train.csv")

In [None]:
# Create a "date" column with a timezone-aware (UTC) datetime parsed from the
# Unix epoch seconds stored in "TIMESTAMP".
taxis_all["date"] = pd.to_datetime(taxis_all["TIMESTAMP"], utc=True, unit='s', origin='unix')
# Preview the first two rows.
taxis_all.head(2)

In [None]:
# Break "date" down into one column per calendar/time component.
# The columns are appended in the order listed in the tuple.
taxis_all = taxis_all.assign(
    **{
        part: getattr(taxis_all["date"].dt, part)
        for part in ("year", "month", "day", "hour", "minute", "second")
    }
)

In [None]:
# Keep only the records dated 01.07.2013 (UTC), using the component columns.
t_2013_7_1 = taxis_all.loc[
    (taxis_all["year"] == 2013)
    & (taxis_all["month"] == 7)
    & (taxis_all["day"] == 1)
]
t_2013_7_1.head(2)

In [None]:
# Rename the trip identifier column to "traj_id", the name the later
# per-cell aggregation groups on.
t_2013_7_1 = t_2013_7_1.rename(columns={"TRIP_ID":"traj_id"})

In [None]:
# Wrap the one-day frame in a PortoTaxis dataset.
t_2013_7_1_PT = PortoTaxis(t_2013_7_1)
# Preview the first rows of the dataset's .df attribute.
t_2013_7_1_PT.df.head()

In [None]:
# Call to_trajs(); the result is not stored, only shown as the cell output.
# NOTE(review): presumably returns the day's data as trajectories - confirm
# against mobiml's PortoTaxis.to_trajs.
t_2013_7_1_PT.to_trajs()

In [None]:
# Create a GeoDataFrame and derive plain "x" / "y" coordinate columns from its
# "geometry" column (uses GeoSeries.x / .y, so point geometries are expected).
t_gdf = t_2013_7_1_PT.to_gdf()
t_gdf["x"] = t_gdf.geometry.x
t_gdf["y"] = t_gdf.geometry.y
# Preview the first two rows.
t_gdf.head(2)

In [None]:
# H3 recipe adapted from:
# https://medium.com/@jesse.b.nestler/how-to-convert-h3-cell-boundaries-to-shapely-polygons-in-python-f7558add2f63
# Coordinate convention: x = longitude, y = latitude; h3 takes (lat, lng).

# Identify the H3 cell of every row at the chosen resolution.
res = 7
col = f"H3_{res}"
# Iterate over the two coordinate columns directly instead of a row-wise
# DataFrame.apply(axis=1): apply builds a Series per row, which is much slower,
# and a plain list assignment also works when the frame is empty.
# NOTE: geo_to_h3 is the h3-py v3 name (renamed latlng_to_cell in v4).
t_gdf[col] = [
    str(h3.geo_to_h3(lat, lng, res))
    for lat, lng in zip(t_gdf["y"], t_gdf["x"])
]

In [None]:
# Remove rows whose cell id is the string "0".
# NOTE(review): "0" is presumably h3's null index for points that could not
# be indexed - confirm.
# Filter through `col` instead of the hard-coded attribute `H3_7`, so the cell
# keeps working when `res` is changed.
t_gdf = t_gdf[t_gdf[col] != "0"]

In [None]:
# Aggregation recipe adapted from:
# https://medium.com/@jesse.b.nestler/how-to-convert-h3-cell-boundaries-to-shapely-polygons-in-python-f7558add2f63

# Count the non-null "traj_id" values per H3 cell and expose the result as a
# "traj_id_count" column.
h3_df = (
    t_gdf.groupby([col])["traj_id"]
    .count()
    .rename("traj_id_count")
    .reset_index()
)
h3_df

In [None]:
# Polygon conversion adapted from:
# https://medium.com/@jesse.b.nestler/how-to-convert-h3-cell-boundaries-to-shapely-polygons-in-python-f7558add2f63

def cell_to_shapely(cell):
    """Return the boundary of H3 cell *cell* as a shapely Polygon.

    Each boundary vertex returned by h3 is reversed before building the
    polygon, i.e. (lat, lng) pairs become (lng, lat) = (x, y).
    """
    boundary = h3.h3_to_geo_boundary(cell)
    return Polygon([vertex[::-1] for vertex in boundary])


# One polygon per H3 cell, attached to the per-cell counts.
h3_geoms = h3_df[col].apply(cell_to_shapely)
h3_gdf = gpd.GeoDataFrame(h3_df, geometry=h3_geoms, crs=4326)

In [None]:
# Write the per-cell counts and polygons to a GeoPackage (layer "traj_id").
h3_gdf.to_file(r"../examples/data/taxi_h3_approach2.gpkg", driver="GPKG", layer="traj_id")

#### Approach 3: data for 1 hour of 1 day

In [None]:
# Keep only the records dated 01.07.2013 whose hour component is 10 (UTC).
# Relies on the year/month/day/hour columns added to taxis_all earlier.
taxi_2013_7_1_10 = taxis_all.loc[
    (taxis_all["hour"] == 10)
    & (taxis_all["day"] == 1)
    & (taxis_all["month"] == 7)
    & (taxis_all["year"] == 2013)
]
taxi_2013_7_1_10.head(2)

In [None]:
# Rename the trip identifier column to "traj_id", the name the later
# per-cell aggregation groups on.
taxi_2013_7_1_10 = taxi_2013_7_1_10.rename(columns={"TRIP_ID":"traj_id"})

In [None]:
# Wrap the one-hour frame in a PortoTaxis dataset.
taxi_2013_7_1_10_PT = PortoTaxis(taxi_2013_7_1_10)
# Preview the first rows of the dataset's .df attribute.
taxi_2013_7_1_10_PT.df.head()

In [None]:
# Call to_trajs(); the result is not stored, only shown as the cell output.
# NOTE(review): presumably returns the hour's data as trajectories - confirm
# against mobiml's PortoTaxis.to_trajs.
taxi_2013_7_1_10_PT.to_trajs()

In [None]:
# Create a GeoDataFrame and derive plain "x" / "y" coordinate columns from its
# "geometry" column (uses GeoSeries.x / .y, so point geometries are expected).
taxi_gdf_10 = taxi_2013_7_1_10_PT.to_gdf()
taxi_gdf_10["x"] = taxi_gdf_10.geometry.x
taxi_gdf_10["y"] = taxi_gdf_10.geometry.y
# Preview the first two rows.
taxi_gdf_10.head(2)

In [None]:
# H3 recipe adapted from:
# https://medium.com/@jesse.b.nestler/how-to-convert-h3-cell-boundaries-to-shapely-polygons-in-python-f7558add2f63
# Coordinate convention: x = longitude, y = latitude; h3 takes (lat, lng).

# Identify the H3 cell of every row at the chosen resolution.
res = 7
col = f"H3_{res}"
# Iterate over the two coordinate columns directly instead of a row-wise
# DataFrame.apply(axis=1): apply builds a Series per row, which is much slower,
# and a plain list assignment also works when the frame is empty.
# NOTE: geo_to_h3 is the h3-py v3 name (renamed latlng_to_cell in v4).
taxi_gdf_10[col] = [
    str(h3.geo_to_h3(lat, lng, res))
    for lat, lng in zip(taxi_gdf_10["y"], taxi_gdf_10["x"])
]

In [None]:
# Remove rows whose cell id is the string "0".
# NOTE(review): "0" is presumably h3's null index for points that could not
# be indexed - confirm.
# Filter through `col` instead of the hard-coded attribute `H3_7`, so the cell
# keeps working when `res` is changed.
taxi_gdf_10 = taxi_gdf_10[taxi_gdf_10[col] != "0"]

In [None]:
# Aggregation recipe adapted from:
# https://medium.com/@jesse.b.nestler/how-to-convert-h3-cell-boundaries-to-shapely-polygons-in-python-f7558add2f63

# Count the non-null "traj_id" values per H3 cell and expose the result as a
# "traj_id_count" column.
h3_df = (
    taxi_gdf_10.groupby([col])["traj_id"]
    .count()
    .rename("traj_id_count")
    .reset_index()
)
h3_df

In [None]:
# Polygon conversion adapted from:
# https://medium.com/@jesse.b.nestler/how-to-convert-h3-cell-boundaries-to-shapely-polygons-in-python-f7558add2f63

def cell_to_shapely(cell):
    """Return the boundary of H3 cell *cell* as a shapely Polygon.

    Each boundary vertex returned by h3 is reversed before building the
    polygon, i.e. (lat, lng) pairs become (lng, lat) = (x, y).
    """
    boundary = h3.h3_to_geo_boundary(cell)
    return Polygon([vertex[::-1] for vertex in boundary])


# One polygon per H3 cell, attached to the per-cell counts.
h3_geoms = h3_df[col].apply(cell_to_shapely)
h3_gdf = gpd.GeoDataFrame(h3_df, geometry=h3_geoms, crs=4326)

In [None]:
# Write the per-cell counts and polygons to a GeoPackage (layer "traj_id").
h3_gdf.to_file(r"../examples/data/taxi_h3_approach3.gpkg", driver="GPKG", layer="traj_id")