In [None]:
!pip install fuzzymatcher

In [None]:
# Imports
import json
from math import radians

import fuzzymatcher
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import haversine_distances

from google.colab import drive

# Let pandas display up to 999 rows before truncating a frame.
pd.set_option('display.max_rows', 999)

# Mount Google Drive at /gdrive; every CSV path in this notebook lives under it.
drive.mount('/gdrive')

In [None]:
# Read the processed trips and keep only the columns used by the mapping steps.
trips_df = pd.read_csv(
    '/gdrive/My Drive/Tigyore/data/nyc_trips/nyc_trips_data/trips_processed.csv'
)[['id', 'trip_seq_no', 'type', 'service', 'lat', 'long', 'location']]
trips_df.head(20)

# Map Subway Routes

In [None]:
# Keep the rows whose type is 'subway', compared case-insensitively.
subway_trips_df = trips_df[trips_df['type'].str.lower().eq('subway')]
subway_trips_df.head()

In [None]:
# One row per distinct subway service label, most frequent first
# (value_counts excludes missing labels).
# The frame is built from the value_counts index instead of reset_index():
# since pandas 2.0 reset_index() names the columns 'service'/'count' rather
# than 'index'/'service', so the former drop(['service']) + rename('index')
# discarded the labels and left only the counts.
unique_services_df = pd.DataFrame({'service': subway_trips_df['service'].value_counts().index})
unique_services_df

In [None]:
# Reference table of subway services; its route_id column is the join target below.
subway_services_df = pd.read_csv(
    '/gdrive/My Drive/Tigyore/data/nyc/nycSubwayServices.csv'
)
subway_services_df

In [None]:
# Fuzzy-join each trip service label to a route_id from the subway services table.
subway_route_mapping_df = fuzzymatcher.fuzzy_left_join(
    unique_services_df,
    subway_services_df,
    left_on="service",
    right_on="route_id",
)
subway_route_mapping_df

In [None]:
# Attach the fuzzy-matched route_id to every subway trip row; the left join
# keeps trips whose service found no match.
final_subway_trips_df = subway_trips_df.merge(
    subway_route_mapping_df.loc[:, ['service', 'route_id']],
    how='left',
    on='service',
)
# Where no route_id was matched, fall back to the raw service label.
# A vectorised fillna replaces the previous row-wise apply and avoids one
# Python call per row.
final_subway_trips_df['route_id'] = final_subway_trips_df['route_id'].fillna(final_subway_trips_df['service'])

final_subway_trips_df.head()

In [None]:
# Shapes before and after the route merge, shown side by side.
(
    subway_trips_df.shape,
    final_subway_trips_df.shape,
)

In [None]:
# Spot-check: display the mapped rows of the trip with id 20.
final_subway_trips_df[final_subway_trips_df['id'].eq(20)]

# Map Train Stations

In [None]:
# Subway reference table; the mapping loop below reads its route_id, stop_id,
# stop_name, stop_lat and stop_lon columns.
train_stations_df = pd.read_csv(
    '/gdrive/My Drive/Tigyore/data/nyc/nycSubwayEdges.csv'
)
train_stations_df.head()

In [None]:
# Map every station name seen in the subway trips to a stop of the same route.
# Names are fuzzy-matched first; rows with a negative or missing match score
# fall back to the geographically nearest stop of that route.
stop_columns = ['route_id', 'stop_id', 'stop_name', 'stop_lat', 'stop_lon']
route_mappings = []  # one mapping frame per route, concatenated once after the loop
total_unique = 0     # running count of distinct station names across routes
for route_id in final_subway_trips_df['route_id'].dropna().unique():
    unique_stations_df = final_subway_trips_df.loc[final_subway_trips_df['route_id'] == route_id, ['location', 'lat', 'long']] \
                            .drop_duplicates('location') \
                            .rename({'location': 'station_name'}, axis=1)
    total_unique += unique_stations_df.shape[0]
    unique_stops_df = train_stations_df.loc[train_stations_df['route_id'] == route_id, stop_columns].reset_index(drop=True)

    # A route_id that is absent from the reference table (e.g. an unmatched
    # service label used as fallback) has nothing to match against; without this
    # guard the nearest-stop search below would call argmin on an empty array.
    # The bus stop mapping applies the same check.
    if unique_stations_df.empty or unique_stops_df.empty:
        continue

    mapping_df = fuzzymatcher.fuzzy_left_join(unique_stations_df, unique_stops_df, left_on="station_name", right_on="stop_name")

    # Nearest-stop fallback for negative / missing match scores.
    # haversine_distances expects [lat, lon] pairs in radians.
    lat_lon = np.radians(unique_stops_df[['stop_lat', 'stop_lon']].to_numpy(dtype=float))
    needs_fallback = (mapping_df['best_match_score'] < 0) | (mapping_df['best_match_score'].isna())
    for row_idx, row in mapping_df.loc[needs_fallback].iterrows():
        lat, lng = row['lat'], row['long']
        if pd.isna(lat) or pd.isna(lng):
            # No usable coordinates: leave the row as returned by the fuzzy join.
            continue
        # Pass the query point as X and the stops as Y so only a 1 x n row of
        # distances is computed, not the full pairwise matrix.
        distances = haversine_distances([[radians(lat), radians(lng)]], lat_lon)[0]
        idx = int(np.argmin(distances))
        nearest_stop = unique_stops_df.iloc[idx]
        for column in stop_columns:
            mapping_df.loc[row_idx, column] = nearest_stop[column]

    route_mappings.append(mapping_df)

# Concatenate once; growing a frame with pd.concat inside the loop re-copies
# all previously collected rows on every iteration.
subway_station_mapping_df = pd.concat(route_mappings) if route_mappings else pd.DataFrame()

subway_station_mapping_df.sort_values(['best_match_score'])

In [None]:
# Attach the mapped stop_id to every subway trip via (route_id, station_name).
# A stop_id column left over from an earlier run of this cell is dropped first,
# so re-running the cell does not create stop_id_x / stop_id_y and then fail.
final_subway_trips_df = final_subway_trips_df.rename({'location': 'station_name'}, axis=1) \
    .drop(columns=['stop_id'], errors='ignore') \
    .merge(subway_station_mapping_df.loc[:, ['route_id', 'station_name', 'stop_id']], how='left', on=['route_id', 'station_name'])
# Trips without a mapped stop keep their station name as the identifier.
# Vectorised fillna replaces the previous row-wise apply.
final_subway_trips_df['stop_id'] = final_subway_trips_df['stop_id'].fillna(final_subway_trips_df['station_name'])
final_subway_trips_df.head()

In [None]:
# Row counts after and before mapping, shown side by side.
len(final_subway_trips_df), len(subway_trips_df)

In [None]:
# Write the mapped subway trips to Drive without the row index.
final_subway_trips_df.to_csv(
    '/gdrive/My Drive/Tigyore/data/nyc_trips/nyc_trips_mapped/nyc_trips_subway.csv',
    index=False,
)

# Map Bus Routes

In [None]:
# Keep the rows whose type is 'bus', compared case-insensitively.
bus_trips_df = trips_df[trips_df['type'].str.lower().eq('bus')]
bus_trips_df.head()

In [None]:
# One row per distinct bus service label, most frequent first
# (value_counts excludes missing labels).
# The frame is built from the value_counts index instead of reset_index():
# since pandas 2.0 reset_index() names the columns 'service'/'count' rather
# than 'index'/'service', so the former drop(['service']) + rename('index')
# discarded the labels and left only the counts.
unique_services_df = pd.DataFrame({'service': bus_trips_df['service'].value_counts().index})
unique_services_df.head()

In [None]:
# Reference table of bus services; its bus_svc column is the join target below.
all_routes_df = pd.read_csv(
    '/gdrive/My Drive/Tigyore/data/nyc/bus_svc.csv'
)
all_routes_df.head()

In [None]:
# Fuzzy-join each trip service label to a bus_svc value from the bus services table.
bus_route_mapping_df = fuzzymatcher.fuzzy_left_join(
    unique_services_df,
    all_routes_df,
    left_on="service",
    right_on="bus_svc",
)
# Lowest scores first, so the weakest matches are at the top.
bus_route_mapping_df.sort_values(by='best_match_score')

In [None]:
# Attach the fuzzy-matched bus_svc to every bus trip row; the left join keeps
# trips whose service found no match.
final_bus_trips_df = bus_trips_df.merge(
    bus_route_mapping_df.loc[:, ['service', 'bus_svc']],
    how='left',
    on='service',
)
# Where no bus_svc was matched, fall back to the raw service label.
# A vectorised fillna replaces the previous row-wise apply and avoids one
# Python call per row.
final_bus_trips_df['bus_svc'] = final_bus_trips_df['bus_svc'].fillna(final_bus_trips_df['service'])

final_bus_trips_df.head()

# Map Bus Stops

In [None]:
# Load the bus routes table and keep only its first seven columns.
bus_stops_df = pd.read_csv('/gdrive/My Drive/Tigyore/data/nyc/bus_routes.csv').iloc[:, :7]
bus_stops_df.head()

In [None]:
# Map every stop name seen in the bus trips to an origin stop of the same route.
# Names are fuzzy-matched first; rows with a negative or missing match score
# fall back to the geographically nearest origin stop of that route.
origin_columns = ['route_id', 'origin_stop_id', 'origin_stop_name', 'origin_lat', 'origin_lon']
route_mappings = []  # one mapping frame per route, concatenated once after the loop

for bus_svc in final_bus_trips_df['bus_svc'].dropna().unique():
    unique_stops_df = final_bus_trips_df.loc[final_bus_trips_df['bus_svc'] == bus_svc, ['location', 'lat', 'long']] \
                            .drop_duplicates('location') \
                            .rename({'location': 'stop_name'}, axis=1)
    all_stops_df = bus_stops_df.loc[bus_stops_df['route_id'] == bus_svc, origin_columns].reset_index(drop=True)

    # Nothing to match when either side is empty (e.g. a service with no rows
    # in the bus routes table).
    if unique_stops_df.empty or all_stops_df.empty:
        continue

    mapping_df = fuzzymatcher.fuzzy_left_join(unique_stops_df, all_stops_df, left_on="stop_name", right_on="origin_stop_name")

    # Nearest-stop fallback for negative / missing match scores.
    # haversine_distances expects [lat, lon] pairs in radians.
    lat_lon = np.radians(all_stops_df[['origin_lat', 'origin_lon']].to_numpy(dtype=float))
    needs_fallback = (mapping_df['best_match_score'] < 0) | (mapping_df['best_match_score'].isna())
    for row_idx, row in mapping_df.loc[needs_fallback].iterrows():
        lat, lng = row['lat'], row['long']
        if pd.isna(lat) or pd.isna(lng):
            # No usable coordinates: leave the row as returned by the fuzzy join.
            continue
        # Pass the query point as X and the stops as Y so only a 1 x n row of
        # distances is computed, not the full pairwise matrix.
        distances = haversine_distances([[radians(lat), radians(lng)]], lat_lon)[0]
        idx = int(np.argmin(distances))
        nearest_stop = all_stops_df.iloc[idx]
        for column in origin_columns:
            mapping_df.loc[row_idx, column] = nearest_stop[column]

    route_mappings.append(mapping_df)

# Concatenate once; growing a frame with pd.concat inside the loop re-copies
# all previously collected rows on every iteration.
bus_stop_mapping_df = pd.concat(route_mappings) if route_mappings else pd.DataFrame()

bus_stop_mapping_df.sort_values(['best_match_score']).head(100)

In [None]:
# Attach the mapped origin_stop_id to every bus trip via (route_id, stop_name).
# An origin_stop_id column left over from an earlier run of this cell is dropped
# first, so re-running the cell does not create origin_stop_id_x / _y and fail.
final_bus_trips_df = final_bus_trips_df.rename({'location': 'stop_name', 'bus_svc': 'route_id'}, axis=1) \
                        .drop(columns=['origin_stop_id'], errors='ignore') \
                        .merge(bus_stop_mapping_df.loc[:, ['route_id', 'stop_name', 'origin_stop_id']], how='left', on=['route_id', 'stop_name'])
# Trips without a mapped stop keep their stop name as the identifier.
# Vectorised fillna replaces the previous row-wise apply.
final_bus_trips_df['origin_stop_id'] = final_bus_trips_df['origin_stop_id'].fillna(final_bus_trips_df['stop_name'])
final_bus_trips_df.head()

In [None]:
# Write the mapped bus trips to Drive without the row index.
final_bus_trips_df.to_csv(
    '/gdrive/My Drive/Tigyore/data/nyc_trips/nyc_trips_mapped/nyc_trips_bus.csv',
    index=False,
)