In [1]:
import hashlib
import json
import os
import time

from concurrent.futures import ThreadPoolExecutor
from datetime import datetime, timedelta
from math import atan2, fabs, pi, pow, sqrt
from multiprocessing import cpu_count, Pool

import geopandas as gpd
import joblib
import numpy as np
import pandas as pd
import requests

from pytz import timezone
from requests import Session
from shapely.geometry import LineString, Point

%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns; sns.set()  # for plot styling

with open('../.config/connections.json') as json_file:  
    connections = json.load(json_file)

In [2]:
plt.rcParams["figure.figsize"] = (16,10)

In [3]:
end_time = datetime(2019, 9, 30)
start_time = end_time - timedelta(hours=1)
trip_params = {
    'min_end_time': int(start_time.timestamp()) * 1000,
    'max_end_time': int(end_time.timestamp()) * 1000
}
event_params = {
    'start_time': int(start_time.timestamp()) * 1000,
    'end_time': int(end_time.timestamp()) * 1000
}

In [4]:
print(trip_params)
print(event_params)

{'min_end_time': 1569798000000, 'max_end_time': 1569801600000}
{'start_time': 1569798000000, 'end_time': 1569801600000}


In [5]:
def _request(url, payload_key, params=None, results=[]):
        """
        Internal helper for sending requests.

        Returns payload(s).
        """
        retries = 0
        res = None

        while res is None:
            try:
                res = session.get(url, params=params)
                res.raise_for_status()
            except Exception as err:
                res = None
                retries = retries + 1
                if retries > 3:
                    raise Exception(
                        f"Unable to retrieve response from {url} after {3}.  Aborting...")

                print(
                    f"{err}. Retrying in 10 seconds... (retry {retries}/3)")
                time.sleep(10)

        if "Content-Type" in res.headers:
            cts = res.headers["Content-Type"].split(";")
            if "application/vnd.mds.provider+json" not in cts:
                print(
                    f"Incorrect content-type returned: {res.headers['Content-Type']}")
            cts = cts[1:]
            for ct in cts:
                if ct.strip().startswith("charset"):
                    pass
                if not ct.strip().startswith(f"version=0.3"):
                    print(
                        f"Incorrect content-type returned: {res.headers['Content-Type']}")
        else:
            print(f"Missing {self.version} content-type header.")

        page = res.json()

        if page["data"] is not None:
            results.extend(page["data"][payload_key])

        if "links" in page:
            next_page = page["links"].get("next")
            if next_page is not None:
                results = _request(url=next_page, payload_key=payload_key,
                                        results=results)

        return results

In [6]:
version = '0.3'
trips = []
for provider in ['Lime', 'Spin', 'Razor', 'Shared']:
    c = connections[provider]
    
    session = Session()
    if 'extra' in c:
        if 'headers' in c['extra']:
            session.headers.update(c['extra']['headers'])
    if 'token_url' in c:
        res = session.post(c['token_url'], data=c['auth_payload'])
        session.headers.update({'Authorization': f'Bearer {res.json()[c["token_key"] if "token_key" in c else "token"]}'})
    session.headers.update({"Accept": f"application/vnd.mds.provider+json;version={version}"})
    
    trips.extend(_request(c['host'].replace(':endpoint', 'trips').strip(), 'trips', params=trip_params))

Incorrect content-type returned: application/json; charset=utf-8
Incorrect content-type returned: application/json; charset=utf-8
Incorrect content-type returned: application/vnd.mds.provider+json; charset=utf-8; version=0.3
Incorrect content-type returned: application/vnd.mds.provider+json; charset=utf-8; version=0.3.0


In [7]:
# Get trips as a GeoDataFrame
trips = gpd.GeoDataFrame(pd.DataFrame.from_records(trips).drop_duplicates(subset='trip_id'))
trips.crs = {'init': 'epsg:4326'}

# Convert the route to a DataFrame now to make mapping easier
trips['route'] = trips.route.map(lambda x: x['features'])
trips['propulsion_type'] = trips.propulsion_type.map(
    lambda x: ','.join(sorted(x)))

trips[['trip_id', 'geometry']].to_file('../.data/shst_method_test_trips.geojson', driver='GeoJSON')

In [158]:
lens = [len(item) for item in trips['route']]

route_df = pd.DataFrame({
    "trip_id": np.repeat(trips['trip_id'].values, lens),
    "provider_id": np.repeat(trips['provider_id'].values, lens),
    "device_id": np.repeat(trips['device_id'].values, lens),
    "vehicle_type": np.repeat(trips['vehicle_type'].values, lens),
    "propulsion_type": np.repeat(trips['propulsion_type'].values, lens),
    "feature": np.concatenate(trips['route'].values)
})

route_df['timestamp'] = route_df.feature.map(
    lambda x: x['properties']['timestamp'])
route_df['coordinates'] = route_df.feature.map(
    lambda x: x['geometry']['coordinates'])
route_df['geometry'] = route_df.feature.map(
    lambda x: Point(x['geometry']['coordinates']))

kmeans = joblib.load('./kmeans.pkl')

coord_array = []
route_df.coordinates.map(lambda x: coord_array.append([x[0], x[1]]))
coord_array = np.array(coord_array)
route_df['cluster'] = kmeans.predict(coord_array)

route_df = gpd.GeoDataFrame(route_df.sort_values(
    by=['trip_id', 'timestamp'], ascending=True
).reset_index(drop=True).copy())
route_df.crs = {'init': 'epsg:4326'}
route_df['datetime'] = route_df.timestamp.map(
    lambda x: datetime.fromtimestamp(x / 1000).astimezone(timezone('US/Pacific')))
route_df['datetime'] = route_df.datetime.dt.round('L')
route_df['datetime'] = route_df.datetime.map(
    lambda x: datetime.replace(x, tzinfo=None))
route_df['date_key'] = route_df.datetime.map(
    lambda x: int(x.strftime('%Y%m%d')))
# Generate a hash to aid in merge operations
route_df['hash'] = route_df.apply(lambda x: hashlib.md5((
    f'{x.trip_id}{x.device_id}{x.provider_id}{x.timestamp}'
).encode('utf-8')).hexdigest(), axis=1)
route_df['datetime'] = route_df.datetime.map(
    lambda x: x.strftime('%Y-%m-%d %H:%M:%S.%f')[:-3])
route_df = route_df.drop_duplicates(subset='hash')



In [159]:
route_df = route_df.to_crs(epsg=3857)

route_df['x'] = route_df.geometry.map(lambda g: g.x)
route_df['y'] = route_df.geometry.map(lambda g: g.y)

route_by_trip = route_df.groupby(['trip_id'])

route_df['nt'] = route_by_trip.timestamp.shift(-1)
route_df['nx'] = route_by_trip.x.shift(-1)
route_df['ny'] = route_by_trip.y.shift(-1)

# drop destination
route_df = route_df.dropna().copy()

route_df['dx'] = route_df.nx - route_df.x
route_df['dy'] = route_df.ny - route_df.y
route_df['dt'] = (route_df.nt - route_df.timestamp) / 1000

def find_bearing(hit):
    return atan2(hit.dx, hit.dy) / pi * 180

def find_speed(hit):
    if hit['dt'] <= 0:
        return 0

    d = sqrt(pow((hit.dx), 2) + pow((hit.dy), 2))

    return d / hit['dt']

route_df['bearing'] = route_df.apply(find_bearing, axis=1)
route_df['speed'] = route_df.apply(find_speed, axis=1)

route_df = route_df.to_crs(epsg=4326)

  return _prepare_from_string(" ".join(pjargs))
  return _prepare_from_string(" ".join(pjargs))


In [160]:
route_df.head()

Unnamed: 0,trip_id,provider_id,device_id,vehicle_type,propulsion_type,feature,timestamp,coordinates,geometry,cluster,...,x,y,nt,nx,ny,dx,dy,dt,bearing,speed
0,00d5d8a0-e30f-11e9-bce6-8f8fb8519f77,b2adaf60-dd65-410b-b78a-6581501fe988,b7a70640-c964-11e9-aae5-0b94de534dd5,scooter,electric,"{'type': 'Feature', 'properties': {'timestamp'...",1569798918000,"[-122.65753346200975, 45.53147996910991]",POINT (-122.65753 45.53148),11,...,-13654170.0,5705584.0,1569800000000.0,-13655820.0,5704098.0,-1645.998938,-1486.209183,637.0,-132.079585,3.481453
2,04d8e164-fe1b-48a3-9a67-33c218cf6841,70aa475d-1fcd-4504-b69c-2eeb2107f7be,1fa7e7c8-1441-4b67-bbd2-d997a57f1d96,scooter,electric,"{'type': 'Feature', 'geometry': {'type': 'Poin...",1569801043734,"[-122.6815478, 45.5277024]",POINT (-122.68155 45.52770),429,...,-13656850.0,5704984.0,1569801000000.0,-13656820.0,5705002.0,29.81136,18.686629,9.507,57.919315,3.700842
3,04d8e164-fe1b-48a3-9a67-33c218cf6841,70aa475d-1fcd-4504-b69c-2eeb2107f7be,1fa7e7c8-1441-4b67-bbd2-d997a57f1d96,scooter,electric,"{'type': 'Feature', 'geometry': {'type': 'Poin...",1569801053241,"[-122.68128, 45.52782]",POINT (-122.68128 45.52782),55,...,-13656820.0,5705002.0,1569801000000.0,-13656820.0,5705002.0,0.0,0.0,4.864,0.0,0.0
4,04d8e164-fe1b-48a3-9a67-33c218cf6841,70aa475d-1fcd-4504-b69c-2eeb2107f7be,1fa7e7c8-1441-4b67-bbd2-d997a57f1d96,scooter,electric,"{'type': 'Feature', 'geometry': {'type': 'Poin...",1569801058105,"[-122.68128, 45.52782]",POINT (-122.68128 45.52782),55,...,-13656820.0,5705002.0,1569801000000.0,-13656820.0,5705002.0,0.0,0.0,5.073,0.0,0.0
5,04d8e164-fe1b-48a3-9a67-33c218cf6841,70aa475d-1fcd-4504-b69c-2eeb2107f7be,1fa7e7c8-1441-4b67-bbd2-d997a57f1d96,scooter,electric,"{'type': 'Feature', 'geometry': {'type': 'Poin...",1569801063178,"[-122.68128, 45.52782]",POINT (-122.68128 45.52782),55,...,-13656820.0,5705002.0,1569801000000.0,-13656820.0,5705002.0,0.0,0.0,4.975,0.0,0.0


In [132]:
len(route_df)

22835

In [154]:
gpd.GeoDataFrame(
    route_df[[
        'hash',
        'trip_id',
        'speed',
        'bearing',
        'geometry'
    ]]
).to_file('../.data/shst_method_test_points.geojson', driver='GeoJSON')

In [13]:
session.headers.update({"Content-Type": "application/json"})
session.headers.update({"Accept": "application/json"})
def _request(session, url, data):
    """
    Internal helper for sending requests.

    Returns payload(s).
    """
    retries = 0
    res = None

    while res is None:
        try:
            res = session.post(url, data=data)
            res.raise_for_status()
        except Exception as err:
            res = None
            retries = retries + 1
            if retries > 3:
                print(
                    f"Unable to retrieve response from {url} after 3 tries.  Aborting...")
                return res

            print(
                f"Error while retrieving: {err}. Retrying in 10 seconds... (retry {retries}/3)")
            time.sleep(10)

    return res

In [14]:
shst_df = route_df[['hash', 'geometry', 'cluster']].groupby('cluster').apply(
    lambda x: {
        'type': 'FeatureCollection',
        'features': x.apply(lambda x: {
            'type': 'Feature',
            'properties': {
                'hash': x.hash
            },
            'geometry': {
                'type': x.geometry.geom_type,
                'coordinates': np.array(x.geometry).tolist()
            }
        }, axis=1).values.tolist()
    })

In [15]:
executor = ThreadPoolExecutor(max_workers=cpu_count()*4)

In [16]:
shst_points = shst_df.map(lambda x: executor.submit(
    _request, session, 'http://sharedstreets:3000/api/v1/match/point/bike/20', json.dumps(x)))

Error while retrieving: ('Connection aborted.', ConnectionResetError(104, 'Connection reset by peer')). Retrying in 10 seconds... (retry 1/3)
Error while retrieving: ('Connection aborted.', ConnectionResetError(104, 'Connection reset by peer')). Retrying in 10 seconds... (retry 1/3)Error while retrieving: ('Connection aborted.', ConnectionResetError(104, 'Connection reset by peer')). Retrying in 10 seconds... (retry 1/3)

Error while retrieving: ('Connection aborted.', ConnectionResetError(104, 'Connection reset by peer')). Retrying in 10 seconds... (retry 1/3)
Error while retrieving: ('Connection aborted.', ConnectionResetError(104, 'Connection reset by peer')). Retrying in 10 seconds... (retry 1/3)
Error while retrieving: ('Connection aborted.', ConnectionResetError(104, 'Connection reset by peer')). Retrying in 10 seconds... (retry 1/3)
Error while retrieving: ('Connection aborted.', ConnectionResetError(104, 'Connection reset by peer')). Retrying in 10 seconds... (retry 1/3)
Error 

In [20]:
shst_points.map(lambda x: 1 if x.done() else 0).sum() / len(shst_points) * 100

100.0

In [21]:
def safe_result(x):
    try:
        return x.result().json()
    except:
        return { 'features': [] }

In [161]:
shst_df = shst_points.map(safe_result)

shst_df = pd.DataFrame({
    'feature': np.concatenate(shst_df.map(lambda x: x['features']).values)
})

shst_df['hash'] = shst_df.feature.map(lambda x: x['properties']['hash'])
shst_df['candidates'] = shst_df.feature.map(
    lambda x: x['properties']['shstCandidates'])

In [162]:
route_df = route_df.merge(shst_df, on='hash')

lens = [len(item) for item in route_df['candidates']]
route_df = pd.DataFrame({
    'date_key': np.repeat(route_df['date_key'].values, lens),
    'hash': np.repeat(route_df['hash'].values, lens),
    'datetime': np.repeat(route_df['datetime'].values, lens),
    'trip_id': np.repeat(route_df['trip_id'].values, lens),
    'provider_id': np.repeat(route_df['provider_id'].values, lens),
    'vehicle_type': np.repeat(route_df['vehicle_type'].values, lens),
    'propulsion_type': np.repeat(route_df['propulsion_type'].values, lens),
    'bearing': np.repeat(route_df['bearing'].values, lens),
    'speed': np.repeat(route_df['speed'].values, lens),
    'geometry': np.repeat(route_df['geometry'].values, lens),
    'candidate': np.concatenate(route_df['candidates'].values),
})

In [163]:
route_df['nearby_shst_geometry_id'] = route_df.candidate.map(
    lambda x: x['geometryId'])
route_df['nearby_shst_reference_id'] = route_df.candidate.map(
    lambda x: x['referenceId'])
route_df['nearby_shst_classification'] = route_df.candidate.map(
    lambda x: x['roadClass'])
route_df['nearby_shst_bearing'] = route_df.candidate.map(
    lambda x: x['bearing'])
route_df['nearby_shst_geometry_distance'] = route_df.candidate.map(
    lambda x: x['score'] if 'score' in x else 0)

def normalizeAngle(angle):
    if angle > 180:
        angle = angle - 360
    return angle

def getDifference(b1, b2):
	r = (b2 - b1) % 360.0
	# Python modulus has same sign as divisor, which is positive here,
	# so no need to consider negative case
	if r >= 180.0:
		r -= 360.0
	return r

route_df['nearby_bearing_diff'] = route_df.apply(lambda x:fabs(getDifference(x.bearing, fabs(normalizeAngle(x.nearby_shst_bearing)))), axis=1)

In [140]:
len(route_df)

45940

In [164]:
gpd.GeoDataFrame(
    route_df.drop_duplicates(
        subset=['trip_id', 'nearby_shst_geometry_id'],
        keep='last'
    )[[
        'hash',
        'trip_id',
        'nearby_shst_geometry_id',
        'nearby_shst_reference_id',
        'nearby_shst_classification',
        'bearing',
        'nearby_shst_geometry_distance',
        'nearby_bearing_diff',
        'speed',
    ]].merge(
        shst_segments[['geometry_id', 'geometry']],
        left_on=['nearby_shst_geometry_id'],
        right_on=['geometry_id']
    )
).to_file('../.data/shst_method_test_all_nearby_candidates.geojson', driver='GeoJSON')

In [165]:
gpd.GeoDataFrame(
    route_df.sort_values(
        by=['datetime', 'nearby_bearing_diff', 'nearby_shst_geometry_distance'],
        ascending=[True, True, True]
    ).drop_duplicates(
        subset=['hash']
    ).drop_duplicates(
        subset=['trip_id', 'nearby_shst_geometry_id'],
        keep='last'
    )[[
        'hash',
        'trip_id',
        'nearby_shst_geometry_id',
        'nearby_shst_reference_id',
        'nearby_shst_classification',
        'bearing',
        'nearby_shst_geometry_distance',
        'nearby_bearing_diff',
        'speed',
    ]].merge(
        shst_segments[['geometry_id', 'geometry']],
        left_on=['nearby_shst_geometry_id'],
        right_on=['geometry_id']
    )
).to_file('../.data/shst_method_test_closest_match.geojson', driver='GeoJSON')

In [28]:
def line_from_route(route):
    if len(route) > 1:
        points = [p.get('geometry') for p in sorted(
            route, key=lambda p: p.get('properties').get('timestamp')
        )]
        return LineString([p.get('coordinates') for p in points])

trips['geometry'] = trips.route.apply(line_from_route)
trips['start_minute'] = trips.start_time.map(
    lambda x: datetime.fromtimestamp(x / 1000).astimezone(timezone('US/Pacific'))).dt.minute
shst_df = trips[~trips.geometry.isnull()][['trip_id', 'provider_id', 'start_minute', 'geometry']].groupby(['provider_id', 'start_minute']).apply(
        lambda x: {
            'type': 'FeatureCollection',
            'features': x.apply(lambda x: {
                'type': 'Feature',
                'properties': {
                    'hash': x.trip_id
                },
                'geometry': {
                    'type': x.geometry.geom_type,
                    'coordinates': np.array(x.geometry).tolist()
                }
            }, axis=1).values.tolist()
        })
shst_lines = shst_df.map(lambda x: executor.submit(
    _request, session, 'http://sharedstreets:3000/api/v1/match/line/bike/20', json.dumps(x)))

In [35]:
shst_lines.map(lambda x: 1 if x.done() else 0).sum() / len(shst_lines) * 100

100.0

In [166]:
shst_df = shst_lines.map(safe_result)

shst_df = pd.DataFrame({
    'feature': np.concatenate(shst_df.map(lambda x: x['features']).values)
})

shst_df['trip_id'] = shst_df.feature.map(
    lambda x: x['properties']['hash'])
shst_df['candidate'] = shst_df.feature.map(
    lambda x: x['properties'].get('shstCandidate'))

shst_df = shst_df[~shst_df.candidate.isnull()]

shst_df['confidence'] = shst_df.candidate.map(lambda x: x['confidence'])
shst_df['segments'] = shst_df.candidate.map(lambda x: x['segments'])

In [167]:
lens = [len(item) for item in shst_df['segments']]
shst_df = pd.DataFrame({
    'trip_id': np.repeat(shst_df['trip_id'].values, lens),
    'confidence': np.repeat(shst_df['confidence'].values, lens),
    'candidate': np.concatenate(shst_df['segments'].values),
})

shst_df['shst_geometry_id'] = shst_df['candidate'].map(
    lambda x: x.get('geometryId'))
shst_df['shst_reference_id'] = shst_df['candidate'].map(
    lambda x: x.get('referenceId'))

In [157]:
gpd.GeoDataFrame(
    shst_df[[
        'trip_id',
        'shst_geometry_id',
        'shst_reference_id',
    ]].merge(
        shst_segments[[
            'geometry_id',
            'geometry'
        ]],
        left_on=['shst_geometry_id'],
        right_on=['geometry_id']
    )
).to_file('../.data/shst_method_test_matched_line_segments.geojson', driver='GeoJSON')

In [168]:
len(shst_df)

6519

In [189]:
mega_df = gpd.GeoDataFrame(
    route_df.merge(
        shst_df[[
            'trip_id',
            'shst_geometry_id',
            'shst_reference_id',
        ]].merge(
            shst_segments[[
                'geometry_id',
                'geometry'
            ]],
            left_on=['shst_geometry_id'],
            right_on=['geometry_id']
        ),
        on='trip_id'
    )
)
mega_df.crs = {'init': 'epsg:4326'}

In [190]:
mega_df.set_geometry('geometry_x').to_crs(epsg=3857).head()

Unnamed: 0,date_key,hash,datetime,trip_id,provider_id,vehicle_type,propulsion_type,bearing,speed,geometry_x,...,nearby_shst_geometry_id,nearby_shst_reference_id,nearby_shst_classification,nearby_shst_bearing,nearby_shst_geometry_distance,nearby_bearing_diff,shst_geometry_id,shst_reference_id,geometry_id,geometry_y
0,20190929,2e921b9bc8eddeec8164fbe85b955288,2019-09-29 16:15:18.000,00d5d8a0-e30f-11e9-bce6-8f8fb8519f77,b2adaf60-dd65-410b-b78a-6581501fe988,scooter,electric,-132.079585,3.481453,POINT (-13654174.167 5705583.828),...,7006aed75df617484d65ed83b278011a,6afb329309f64868d65c898199f7fbd2,4.0,90.275189,1.893086,137.645227,6d566ee4238ad5070d592694eed4fdb5,90b6aa70c52a8de45bb2ef77e58c56f0,6d566ee4238ad5070d592694eed4fdb5,"LINESTRING (-122.67337 45.52255, -122.67336 45..."
1,20190929,2e921b9bc8eddeec8164fbe85b955288,2019-09-29 16:15:18.000,00d5d8a0-e30f-11e9-bce6-8f8fb8519f77,b2adaf60-dd65-410b-b78a-6581501fe988,scooter,electric,-132.079585,3.481453,POINT (-13654174.167 5705583.828),...,7006aed75df617484d65ed83b278011a,6afb329309f64868d65c898199f7fbd2,4.0,90.275189,1.893086,137.645227,ccd5290a525cb17c69daf98eaf7ed9f8,0f8856d3a62543833d20d3fdfe0e6d1d,ccd5290a525cb17c69daf98eaf7ed9f8,"LINESTRING (-122.66578 45.53079, -122.66567 45..."
2,20190929,2e921b9bc8eddeec8164fbe85b955288,2019-09-29 16:15:18.000,00d5d8a0-e30f-11e9-bce6-8f8fb8519f77,b2adaf60-dd65-410b-b78a-6581501fe988,scooter,electric,-132.079585,3.481453,POINT (-13654174.167 5705583.828),...,7006aed75df617484d65ed83b278011a,6afb329309f64868d65c898199f7fbd2,4.0,90.275189,1.893086,137.645227,f1d87e64765880fd1b5effcddc8e673b,213ab1b29d3a839f795d57567b1fede4,f1d87e64765880fd1b5effcddc8e673b,"LINESTRING (-122.66567 45.53080, -122.66553 45..."
3,20190929,2e921b9bc8eddeec8164fbe85b955288,2019-09-29 16:15:18.000,00d5d8a0-e30f-11e9-bce6-8f8fb8519f77,b2adaf60-dd65-410b-b78a-6581501fe988,scooter,electric,-132.079585,3.481453,POINT (-13654174.167 5705583.828),...,7006aed75df617484d65ed83b278011a,6afb329309f64868d65c898199f7fbd2,4.0,90.275189,1.893086,137.645227,fad20e30334f7008868c2727588efe50,bb1e23af59268644271b8e6427ef696c,fad20e30334f7008868c2727588efe50,"LINESTRING (-122.66472 45.53079, -122.66463 45..."
4,20190929,2e921b9bc8eddeec8164fbe85b955288,2019-09-29 16:15:18.000,00d5d8a0-e30f-11e9-bce6-8f8fb8519f77,b2adaf60-dd65-410b-b78a-6581501fe988,scooter,electric,-132.079585,3.481453,POINT (-13654174.167 5705583.828),...,7006aed75df617484d65ed83b278011a,6afb329309f64868d65c898199f7fbd2,4.0,90.275189,1.893086,137.645227,ba7924d9c0f7008e5b3341d1d35465bf,2b3aa27a12f96e39b09b1ce03a03e33d,ba7924d9c0f7008e5b3341d1d35465bf,"LINESTRING (-122.66372 45.53079, -122.66363 45..."


In [182]:
mega_df['geometry_x'] = mega_df.to_crs(epsg=3857)
mega_df['geometry_y'] = mega_df.geometry_y.to_crs(epsg=3857)
mega_df['routed_shst_geometry_distance'] = mega_df.apply(
    lambda x: x.geometry_x.distance(x.geometry_y) if x.geometry_y else np.nan,
    axis=1
)

AttributeError: 'Series' object has no attribute 'to_crs'

In [181]:
gpd.GeoDataFrame(
    mega_df[[
        'hash',
        'trip_id',
        'shst_geometry_id',
        'shst_reference_id',
        'routed_shst_geometry_distance'
    ]].sort_values(
        by=['hash', 'routed_shst_geometry_distance']
    ).drop_duplicates(
        subset='hash'
    ).merge(
        shst_segments[[
            'geometry_id',
            'geometry'
        ]],
        left_on=['shst_geometry_id'],
        right_on=['geometry_id']
    )
).to_file('../.data/shst_method_test_closest_routed_segment.geojson', driver='GeoJSON')

In [36]:
shst_segments = gpd.read_file('../.data/shst.out.geojson')

In [37]:
shst_segments = shst_segments.rename(index=str, columns={
    'id': 'geometry_id'
})

In [136]:
gpd.GeoDataFrame(
    route_df.sort_values(
        by=['datetime', 'bearing_diff', 'confidence', 'distance'],
        ascending=[True, True, False, True]
    ).drop_duplicates(
        subset=['hash'],
        keep='first'
    ).drop_duplicates(
        subset=['trip_id', 'shst_geometry_id'],
        keep='last'
    ).groupby(
        'shst_geometry_id'
    ).count()[
        'hash'
    ].reset_index().merge(
        shst_segments[[
            'geometry_id',
            'geometry'
        ]],
        left_on=['shst_geometry_id'],
        right_on=['geometry_id']
    )
).to_file('../.data/shst_method_test_counts.geojson', driver='GeoJSON')

In [115]:
route_df[[
    'route_shst_geometry_id',
    'route_reference_id',
    'route_confidence',
    'route_geometry'
]] = route_df.merge(
    shst_df[['trip_id', 'shst_geometry_id',
             'shst_reference_id', 'confidence', 'geometry']],
    on=['trip_id'],
    how='left'
)[[
    'shst_geometry_id',
    'shst_reference_id',
    'confidence',
    'geometry_y'
]]

In [122]:
route_df['route_distance'] = route_df.apply(
    lambda x: x.geometry.distance(x.route_geometry), axis=1)
route_df['same_geometry_id'] = route_df.apply(
    lambda x: x.nearby_shst_geometry_id == x.route_shst_geometry_id, axis=1)

In [124]:
route_df[route_df.same_geometry_id].head()

Unnamed: 0,date_key,hash,datetime,trip_id,provider_id,vehicle_type,propulsion_type,bearing,speed,geometry,...,nearby_shst_classification,nearby_shst_geometry_distance,shstBearing,nearby_bearing_diff,route_shst_geometry_id,route_reference_id,route_confidence,route_geometry,route_distance,same_geometry_id
2339,20190929,6cf23a2a035027b71e4826ba11b5bad5,2019-09-29 16:41:42.140,10be4cb8-f5ec-4acf-b122-3d3789084814,63f13c48-34ff-49d2-aca7-cf6a5b6171c3,scooter,"electric,human",0.0,0.0,POINT (-122.67663 45.51855),...,3.0,5.434932,20.783464,20.783464,044c948cf591e2f80b433ed5bf1248c1,2361f6327a03b8e0ab4579c341ee436c,0.001469,"LINESTRING (-122.67660 45.51847, -122.67658 45...",6.6e-05,True
8542,20190929,9dd1f57c696f1d3e0a08522334a20741,2019-09-29 16:29:33.278,362a257c-9f0e-4d9b-bae5-64a020d9ff50,63f13c48-34ff-49d2-aca7-cf6a5b6171c3,scooter,"electric,human",0.0,0.0,POINT (-122.67655 45.51835),...,3.0,0.0,20.731011,20.731011,d32562b9f0a3ade7f69e97e739285867,246adba94415cb6457c0696c6e40b831,0.001469,"LINESTRING (-122.67690 45.51792, -122.67660 45...",0.0001,True
8642,20190929,466676b12b1509f8c1b4c69b22e96162,2019-09-29 16:32:27.834,362a257c-9f0e-4d9b-bae5-64a020d9ff50,63f13c48-34ff-49d2-aca7-cf6a5b6171c3,scooter,"electric,human",0.0,0.0,POINT (-122.67253 45.51771),...,,3.295514,61.730702,61.730702,be5c03dbcf7fe06fe46764a0f5001c1b,73a7e75a9921f2151954eb4fed4b45ad,0.001469,"LINESTRING (-122.67264 45.51772, -122.67253 45...",3.1e-05,True
8752,20190929,06bd79134edae24ce983286bb4b88836,2019-09-29 16:35:12.122,362a257c-9f0e-4d9b-bae5-64a020d9ff50,63f13c48-34ff-49d2-aca7-cf6a5b6171c3,scooter,"electric,human",0.0,0.0,POINT (-122.67106 45.52454),...,4.0,5.143051,91.382852,91.382852,da9aff6b44acbec9158862a45e48a83d,33a55bc38ba43cfbf1bf9755a9c9192f,0.001469,"LINESTRING (-122.67040 45.52460, -122.67122 45...",4.6e-05,True


In [126]:
len(shst_df)

6519

In [None]:
route_df.sort_values(
    by=['trip_id','datetime', 'bearing_diff', 'confidence', 'distance'],
    ascending=[True, True, True, False, True]
).drop_duplicates(
    subset=['hash'],
    keep='first'
).drop_duplicates(
    subset=['trip_id', 'shst_geometry_id'],
    keep='last'
).