## Data exploration

Adapted from: https://www.kaggle.com/code/nagellette/taxi-trajectory-data-analysis

In [1]:
import numpy as np
import os
import pandas as pd
import rich
import sys
from functools import partial
from tqdm import tqdm

In [2]:
data_path = "data/train.csv"

In [3]:
def load_data(path: str):
    df = pd.read_csv(path)
    return df

_df = load_data(data_path)
_df.head()

Unnamed: 0,TRIP_ID,CALL_TYPE,ORIGIN_CALL,ORIGIN_STAND,TAXI_ID,TIMESTAMP,DAY_TYPE,MISSING_DATA,POLYLINE
0,1372636858620000589,C,,,20000589,1372636858,A,False,"[[-8.618643,41.141412],[-8.618499,41.141376],[..."
1,1372637303620000596,B,,7.0,20000596,1372637303,A,False,"[[-8.639847,41.159826],[-8.640351,41.159871],[..."
2,1372636951620000320,C,,,20000320,1372636951,A,False,"[[-8.612964,41.140359],[-8.613378,41.14035],[-..."
3,1372636854620000520,C,,,20000520,1372636854,A,False,"[[-8.574678,41.151951],[-8.574705,41.151942],[..."
4,1372637091620000337,C,,,20000337,1372637091,A,False,"[[-8.645994,41.18049],[-8.645949,41.180517],[-..."


The column that we care about here is the `POLYLINE` column. This column contains a list of GPS coordinates that represent the path that the taxi took. The first and last coordinates in the list represent the start and end points of the trip, respectively. The rest of the coordinates represent the path that the taxi took from the start to the end point. We'll preprocess the data here to make it easier to work with later with `geopandas`.

In [4]:
df = _df.copy()

def preprocess_data(df: pd.DataFrame):
    df.drop(df[df["MISSING_DATA"] == True].index, axis=0, inplace=True)
    df["POLYLINE"] = df["POLYLINE"].str.replace("\[", "", regex=False)
    df["POLYLINE"] = df["POLYLINE"].str.replace("\]", "", regex=False)
    df["geo_len"] = df["POLYLINE"].apply(lambda x: len(x))
    df["POLYLINE"] = df["POLYLINE"].apply(lambda x: x.split(","))
    df["POLYLINE"] = df["POLYLINE"].str.join(" ")
    df.drop(df[df["geo_len"] == 0].index, axis=0, inplace=True)


preprocess_data(df)
df.head()

Unnamed: 0,TRIP_ID,CALL_TYPE,ORIGIN_CALL,ORIGIN_STAND,TAXI_ID,TIMESTAMP,DAY_TYPE,MISSING_DATA,POLYLINE,geo_len
0,1372636858620000589,C,,,20000589,1372636858,A,False,[[-8.618643 41.141412] [-8.618499 41.141376] [...,500
1,1372637303620000596,B,,7.0,20000596,1372637303,A,False,[[-8.639847 41.159826] [-8.640351 41.159871] [...,413
2,1372636951620000320,C,,,20000320,1372636951,A,False,[[-8.612964 41.140359] [-8.613378 41.14035] [-...,1408
3,1372636854620000520,C,,,20000520,1372636854,A,False,[[-8.574678 41.151951] [-8.574705 41.151942] [...,938
4,1372637091620000337,C,,,20000337,1372637091,A,False,[[-8.645994 41.18049] [-8.645949 41.180517] [-...,634


In [5]:
coords = df["POLYLINE"][0]
coords


'[[-8.618643 41.141412] [-8.618499 41.141376] [-8.620326 41.14251] [-8.622153 41.143815] [-8.623953 41.144373] [-8.62668 41.144778] [-8.627373 41.144697] [-8.630226 41.14521] [-8.632746 41.14692] [-8.631738 41.148225] [-8.629938 41.150385] [-8.62911 41.151213] [-8.629128 41.15124] [-8.628786 41.152203] [-8.628687 41.152374] [-8.628759 41.152518] [-8.630838 41.15268] [-8.632323 41.153022] [-8.631144 41.154489] [-8.630829 41.154507] [-8.630829 41.154516] [-8.630829 41.154498] [-8.630838 41.154489]]'

In [6]:
import re

pattern = r"\[(-?\d+\.\d+) (-?\d+\.\d+)\]"
matches = re.findall(pattern, coords)
for match in matches:
    print(f"x: {match[0]}, y: {match[1]}")

x: -8.618643, y: 41.141412
x: -8.618499, y: 41.141376
x: -8.620326, y: 41.14251
x: -8.622153, y: 41.143815
x: -8.623953, y: 41.144373
x: -8.62668, y: 41.144778
x: -8.627373, y: 41.144697
x: -8.630226, y: 41.14521
x: -8.632746, y: 41.14692
x: -8.631738, y: 41.148225
x: -8.629938, y: 41.150385
x: -8.62911, y: 41.151213
x: -8.629128, y: 41.15124
x: -8.628786, y: 41.152203
x: -8.628687, y: 41.152374
x: -8.628759, y: 41.152518
x: -8.630838, y: 41.15268
x: -8.632323, y: 41.153022
x: -8.631144, y: 41.154489
x: -8.630829, y: 41.154507
x: -8.630829, y: 41.154516
x: -8.630829, y: 41.154498
x: -8.630838, y: 41.154489


Here, we create a `geometry` column for working with the `LineString` data type

In [7]:
from shapely.geometry import LineString


def add_geometry_column(df: pd.DataFrame):
    temp_all = []
    pattern = r"\[(-?\d+\.\d+) (-?\d+\.\d+)\]"
    df["geometry"] = ""
    df["geometry"] = df["geometry"].astype("object")
    for index, row in tqdm(df.iterrows(), total=df.shape[0]):
        temp = []
        # i = 1
        # temp_coords = "".join(row["POLYLINE"])
        matches = re.findall(pattern, "".join(row["POLYLINE"]))
        # for coord in temp_coords:
        #     if i % 2 == 0:
        #         b = float(coord)
        #         temp.append((a, b))
        #     else:
        #         a = float(coord)

        #     i += 1
        for match in matches:
            temp.append((float(match[0]), float(match[1])))
            # print(f"x: {match[0]}, y: {match[1]}")

        temp_all.append(temp)
        df.at[index, "geometry"] = temp
    df["geo_len"] = df["geometry"].apply(lambda x: len(x))
    df.drop(df[df["geo_len"] < 2].index, axis=0, inplace=True)
    df["geometry"] = df["geometry"].apply(lambda x: LineString(x))


add_geometry_column(df)
df.head()

100%|██████████| 1710660/1710660 [02:48<00:00, 10149.09it/s]


Unnamed: 0,TRIP_ID,CALL_TYPE,ORIGIN_CALL,ORIGIN_STAND,TAXI_ID,TIMESTAMP,DAY_TYPE,MISSING_DATA,POLYLINE,geo_len,geometry
0,1372636858620000589,C,,,20000589,1372636858,A,False,[[-8.618643 41.141412] [-8.618499 41.141376] [...,23,"LINESTRING (-8.618643 41.141412, -8.618499 41...."
1,1372637303620000596,B,,7.0,20000596,1372637303,A,False,[[-8.639847 41.159826] [-8.640351 41.159871] [...,19,"LINESTRING (-8.639847 41.159826, -8.640351 41...."
2,1372636951620000320,C,,,20000320,1372636951,A,False,[[-8.612964 41.140359] [-8.613378 41.14035] [-...,65,"LINESTRING (-8.612964 41.140359, -8.613378 41...."
3,1372636854620000520,C,,,20000520,1372636854,A,False,[[-8.574678 41.151951] [-8.574705 41.151942] [...,43,"LINESTRING (-8.574678 41.151951, -8.574705 41...."
4,1372637091620000337,C,,,20000337,1372637091,A,False,[[-8.645994 41.18049] [-8.645949 41.180517] [-...,29,"LINESTRING (-8.645994 41.18049, -8.645949 41.1..."


Now that the data is formatted properly, we can use `geopandas`

In [8]:
import geopandas as gpd
from shapely import wkt

gdf = gpd.GeoDataFrame(df, geometry='geometry')
gdf.head()

Unnamed: 0,TRIP_ID,CALL_TYPE,ORIGIN_CALL,ORIGIN_STAND,TAXI_ID,TIMESTAMP,DAY_TYPE,MISSING_DATA,POLYLINE,geo_len,geometry
0,1372636858620000589,C,,,20000589,1372636858,A,False,[[-8.618643 41.141412] [-8.618499 41.141376] [...,23,"LINESTRING (-8.61864 41.14141, -8.61850 41.141..."
1,1372637303620000596,B,,7.0,20000596,1372637303,A,False,[[-8.639847 41.159826] [-8.640351 41.159871] [...,19,"LINESTRING (-8.63985 41.15983, -8.64035 41.159..."
2,1372636951620000320,C,,,20000320,1372636951,A,False,[[-8.612964 41.140359] [-8.613378 41.14035] [-...,65,"LINESTRING (-8.61296 41.14036, -8.61338 41.140..."
3,1372636854620000520,C,,,20000520,1372636854,A,False,[[-8.574678 41.151951] [-8.574705 41.151942] [...,43,"LINESTRING (-8.57468 41.15195, -8.57470 41.151..."
4,1372637091620000337,C,,,20000337,1372637091,A,False,[[-8.645994 41.18049] [-8.645949 41.180517] [-...,29,"LINESTRING (-8.64599 41.18049, -8.64595 41.180..."


## Assign labels

This is the major portion of the project, as deciding whether a user is lost, parking, or driving normally is the objective. We will use the `geopandas` library to help us with this task later.

In [None]:
# TODO: take polylines and convert to labels `['LOST','PARKING','NORMAL']`

## Google Maps API Setup
For geocoding and reverse geocoding, we will use the Google Maps API. You will need to set up a Google Cloud Platform account and enable the Google Maps API. You will also need to create an API key. You can find instructions on how to do this here: https://developers.google.com/maps/documentation/geocoding/get-api-key. For our application, we put the API key in a file called `GOOGLE_MAPS_API_KEY.txt` in the same directory as this notebook.

In [9]:
from geopy.geocoders import GoogleV3
api_key_file = "GOOGLE_MAPS_API_KEY.txt"

In [10]:
with open(api_key_file, "r") as f:
    api_key = f.read()
adapter = GoogleV3(api_key=api_key)

We'll use the Chihuly Garden and Glass museum in Seattle as an example. We'll first geocode the address to get the latitude and longitude coordinates. From there, we can find nearest parking spaces to the museum and display them appropriately. This is another portion of the project we are working on

In [11]:
glass_garden_address = "305 Harrison St, Seattle, WA 98109"  # Chihuly Garden and Glass
loc = adapter.geocode(glass_garden_address)
loc

Location(305 Harrison St, Seattle, WA 98109, USA, (47.6228059, -122.353007, 0.0))

In [12]:
lat, lon = loc.latitude, loc.longitude
loc.point

Point(47.6228059, -122.353007, 0.0)

### Find parking spots near location
https://stackoverflow.com/questions/23025011/google-place-api-for-parking-spots

https://developers.google.com/maps/documentation/places/web-service/search-nearby#PlaceSearchRequests

In [14]:
import requests

request_url = "https://maps.googleapis.com/maps/api/place/nearbysearch/json?"
radius = 1000 # meters
type_ = "parking"
location = f"{lat},{lon}"
request_url = f"{request_url}location={location}&radius={radius}&type={type_}&key={api_key}"
# request_url

In [15]:
# headers = {'content-type': 'application/json', 'Accept-Charset': 'UTF-8'}
res = requests.get(request_url).json()
res["results"][0]

{'business_status': 'OPERATIONAL',
 'geometry': {'location': {'lat': 47.62477159999999, 'lng': -122.353641},
  'viewport': {'northeast': {'lat': 47.6261295802915,
    'lng': -122.3523800197085},
   'southwest': {'lat': 47.62343161970851, 'lng': -122.3550779802915}}},
 'icon': 'https://maps.gstatic.com/mapfiles/place_api/icons/v1/png_71/parking-71.png',
 'icon_background_color': '#7B9EB0',
 'icon_mask_base_uri': 'https://maps.gstatic.com/mapfiles/place_api/icons/v2/parking_pinlet',
 'name': 'U park',
 'opening_hours': {'open_now': True},
 'photos': [{'height': 3024,
   'html_attributions': ['<a href="https://maps.google.com/maps/contrib/116898915965680338772">Ellie Lockhart</a>'],
   'photo_reference': 'AcJnMuEWkNLov14LZXNW7WXxMpEZJ5DAtAuSaxd8vrlDKA-FbcZ3SNokctv4TANBVm44SHltgxMM0wSZhdxmTB-PRAiYFVpaGfvgU9Q_Np-AWby-PlfbtZEzJjAa6PLqoge0C8SkyiqtAfInbhxbxLF4x1K95XAeuNLGvdNrklLcx8sanU5p',
   'width': 4032}],
 'place_id': 'ChIJq1yHREEVkFQR0cI7TcmJH6Q',
 'plus_code': {'compound_code': 'JJFW+WG 

In [16]:
def get_nearest_parking(lat: int = 0.0, lon: int = 0.0, query: str = None, radius=1000):
    if query is not None:
        loc = adapter.geocode(query)
        lat, lon = loc.latitude, loc.longitude
    request_url = "https://maps.googleapis.com/maps/api/place/nearbysearch/json?"
    location = f"{lat},{lon}"
    request_url = (
        f"{request_url}location={location}&radius={radius}&type=parking&key={api_key}"
    )
    res = requests.get(request_url).json()

    return res["results"]


nearby_parking = get_nearest_parking(lat, lon)
nearby_parking[0]["geometry"]["location"]

{'lat': 47.62477159999999, 'lng': -122.353641}

In [17]:
for parking in nearby_parking:
    lat = parking["geometry"]["location"]["lat"]
    lng = parking["geometry"]["location"]["lng"]
    print(parking["vicinity"], (lat, lng))


172 Mercer Street, Seattle (47.62477159999999, -122.353641)
220 1st Avenue North, Seattle (47.62035909999999, -122.3545646)
Parking lot, 5 West Harrison Street, Seattle (47.6219153, -122.3569417)
622 1st Avenue West, Seattle (47.6251632, -122.3576404)
100 West Mercer Place, Seattle (47.6248393, -122.3582943)
516 Harrison Street, Seattle (47.6222115, -122.3468784)
710 4th Avenue North, Seattle (47.6258037, -122.3485209)
710 4th Avenue North, Seattle (47.62580819999999, -122.3484942)
380 Broad Street, Seattle (47.6189839, -122.3492485)
Parking lot, 525 2nd Avenue West, Seattle (47.6241239, -122.359721)
3161 Elliott Avenue West, Seattle (47.61879380000001, -122.3585127)
3161 Elliott Avenue Suite 100, Seattle (47.618367, -122.358577)
619 Roy Street, Seattle (47.6256716, -122.3451308)
555 Vine Street, Seattle (47.618237, -122.3456354)
347 Vine Street, Seattle (47.6168334, -122.3469738)
714 Denny Way, Seattle (47.619214, -122.3432616)
2400 4th Avenue, Seattle (47.61653829999999, -122.3453164