In [1]:
# Mount Google Drive only when running on Colab. Outside Colab the google.colab
# package is not installed; the data-directory cell below already falls back to a
# local '../Data' folder, so a missing package must not abort the notebook.
try:
    from google.colab import drive
except ImportError:
    drive = None
else:
    drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import os
import json
import pandas as pd
import geopandas as gpd
from shapely.geometry import Point
from geopy.distance import geodesic

In [7]:
# Use the mounted Drive data folder when it exists; otherwise resolve '../Data'
# relative to the current working directory.
_drive_data_dir = '/content/drive/My Drive/Capstone/Data'
base_dir = (
    _drive_data_dir
    if os.path.exists(_drive_data_dir)
    else os.path.abspath(os.path.join(os.getcwd(), '..', 'Data'))
)

In [8]:
def add_route_usage_counts(routes):
    """
    Adds a 'pair_count' column to a GeoDataFrame that indicates how many times each route
    segment (defined by start and end stop names) appears in the data.

    Rows whose start or end name is missing are counted as a group of their own instead
    of being dropped by the group-by, so every row receives a whole-number count rather
    than NaN. If the input already carries a 'pair_count' column it is recomputed, which
    makes the function safe to call again on its own output.

    Args:
        routes (GeoDataFrame): GeoDataFrame containing 'start_name' and 'end_name' columns
                               representing route segments.

    Returns:
        GeoDataFrame: A new frame with the input rows in their original order plus a
                      'pair_count' column. The input index is not kept; the merge
                      produces a fresh integer index.
    """
    pair_keys = ['start_name', 'end_name']

    # A stale count column would otherwise collide in the merge and come back as
    # 'pair_count_x' / 'pair_count_y'.
    routes = routes.drop(columns='pair_count', errors='ignore')

    # dropna=False keeps segments with a missing name as real groups; merge matches
    # null keys to each other, so those rows get their count back as well.
    line_counts = (
        routes.groupby(pair_keys, dropna=False)
        .size()
        .reset_index(name='pair_count')
    )

    return routes.merge(line_counts, on=pair_keys, how='left')

In [9]:
# Read the scraped route segments and annotate each row with its segment usage count.
full_routes_path = os.path.join(base_dir, "yandex_scraped/full_routes.geojson")
routes = gpd.read_file(full_routes_path)
routes_with_counts = add_route_usage_counts(routes)

In [10]:
# Keep one row per (start_name, end_name) pair, busiest pairs first, and preview the top 10.
routes_with_counts_upd = (
    routes_with_counts
    .sort_values(by='pair_count', ascending=False)
    .drop_duplicates(subset=['start_name', 'end_name'])
)
routes_with_counts_upd.head(10)

Unnamed: 0,transport_id,transport_name,line_id,start_id,start_name,end_id,end_name,geometry,pair_count
5548,4049075598,42,3598836544,4939954886,Garegin Nzhdeh Square,1724546640,Garegin Nzhdeh Square,"LINESTRING (44.48462 40.15022, 44.48452 40.150...",28
2008,3871424950,36,1704866580,1543190261,Khachatur Abovyan Square,1734372701,Agricultural University,"LINESTRING (44.52742 40.19109, 44.52666 40.190...",25
2754,4047037038,30,1733293487,1733131921,Medical University,1738680271,Khachatur Abovyan Square,"LINESTRING (44.52473 40.1886, 44.52506 40.1888...",24
5226,4050526288,77,4044547438,1735486131,Raffi / Sebastia,1727402401,Fair Malatia,"LINESTRING (44.45714 40.17405, 44.45704 40.174...",21
528,3862871300,53,1704882064,1543191068,Raffi / Zoravar Andranik,2061842924,Raffi / Jivani,"LINESTRING (44.44692 40.17413, 44.44867 40.174...",21
3247,4045542728,2,1704917592,1727402401,Fair Malatia,1543191067,Raffi / Zoravar Andranik,"LINESTRING (44.45228 40.17477, 44.45226 40.174...",21
2058,3871452010,36,1704866580,1755790566,Mashtots / Amiryan,1543190426,Mashtots / Tumanyan,"LINESTRING (44.50835 40.1814, 44.50897 40.1819...",20
1711,3865932510,22,1704863994,1543190436,Tigran Mets / Bypass road Saralanch,1543190456,Railway station,"LINESTRING (44.51174 40.15972, 44.51158 40.159...",19
1498,3990779700,42,1704877176,1733136141,Yeritasardakan,1733131921,Medical University,"LINESTRING (44.52284 40.18702, 44.52302 40.187...",19
1846,3862942550,5,1924975351,1543189969,Hayk Nahapet,1543189990,Park named after Tatul Krpeyan,"LINESTRING (44.56486 40.20341, 44.56428 40.204...",19


In [11]:
# Saves the full annotated table (routes_with_counts), not the de-duplicated
# routes_with_counts_upd preview.
routes_counts_path = os.path.join(base_dir, "routes_stops/full_routes_line_count.geojson")
routes_with_counts.to_file(routes_counts_path, driver='GeoJSON')

In [12]:
def match_stops(stops_gdf, max_distance=150):
    """
    Pairs nearby stops by matching each stop with its closest unmatched stop within a
    maximum distance threshold.

    Matching is greedy and order-dependent: stops are visited in index order, each
    still-unmatched stop claims its nearest still-unmatched neighbour that lies within
    max_distance, and both are then excluded from later matches. Stops that find no
    free neighbour in range keep None in 'pair_name' and 'pair_geom'.

    Distances are measured in a UTM projection estimated from the data rather than in
    EPSG:3857: Web Mercator stretches lengths by roughly 1 / cos(latitude), so a
    threshold applied there is not a threshold in ground metres.

    Args:
        stops_gdf (GeoDataFrame): GeoDataFrame of transport stops with 'stop_id',
                                  'stop_name' and 'geometry' columns and a CRS set.
                                  Index labels are used for lookups and are assumed
                                  to be unique.
        max_distance (int): Maximum distance in meters to consider a pair match.

    Returns:
        GeoDataFrame: Columns 'stop_id', 'stop_name', 'pair_name', 'stop_geom' and
                      'pair_geom'. 'stop_geom' is the active geometry column and holds
                      each stop's original (unprojected) geometry; 'pair_geom' holds
                      the matched partner's original geometry.
    """
    # estimate_utm_crs needs actual coordinates; for an empty frame any projected CRS
    # will do because the loop below never runs.
    metric_crs = stops_gdf.estimate_utm_crs() if len(stops_gdf) else "EPSG:3857"
    stops_proj = stops_gdf.to_crs(metric_crs).copy()
    stops_proj["pair_name"] = None
    stops_proj["pair_geom"] = None

    projected = stops_proj.geometry
    used = set()

    for idx, stop_point in projected.items():
        if idx in used:
            continue
        # A stop without coordinates cannot be measured against anything.
        if stop_point is None or stop_point.is_empty:
            continue

        candidates = projected[~projected.index.isin(used | {idx})]
        if candidates.empty:
            break  # every other stop is already paired

        distances = candidates.distance(stop_point)
        in_range = distances[distances <= max_distance]
        if in_range.empty:
            continue

        # Nearest candidate, not merely the first one found inside the radius.
        closest_idx = in_range.idxmin()
        used.update({idx, closest_idx})

        # Record the pairing symmetrically, using names/geometries from the
        # original (unprojected) frame.
        for own_idx, other_idx in ((idx, closest_idx), (closest_idx, idx)):
            stops_proj.at[own_idx, "pair_name"] = stops_gdf.loc[other_idx, "stop_name"]
            stops_proj.at[own_idx, "pair_geom"] = stops_gdf.loc[other_idx, "geometry"]

    # Expose the original geometry as the active geometry of the result.
    stops_proj["stop_geom"] = stops_gdf["geometry"]

    stops_proj = stops_proj.set_geometry("stop_geom")
    return stops_proj[["stop_id", "stop_name", "pair_name", "stop_geom", "pair_geom"]]

In [13]:
# Load the stops of line 29 and pair them up using the default matching distance.
line29_stops_path = os.path.join(base_dir, "line_29/29_stops.geojson")
line29_stops = gpd.read_file(line29_stops_path)
line29_stops_pairs = match_stops(line29_stops)

In [14]:
# Write the paired stops to disk, then display the table.
line29_pairs_path = os.path.join(base_dir, "routes_stops/line29_stops_pairs.geojson")
line29_stops_pairs.to_file(line29_pairs_path, driver='GeoJSON')
line29_stops_pairs

Unnamed: 0,stop_id,stop_name,pair_name,stop_geom,pair_geom
0,5402422056,Frunze Dovlatyan Street,Frunze Dovlatyan Street,POINT (44.4831 40.13896),POINT (44.483104285 40.13895635)
1,5402420976,Nerkin Shengavit,Nerkin Shengavit,POINT (44.48089 40.13938),POINT (44.480760349 40.139194375)
2,4049882988,Shahamiryanner Street,Shahamiryanner Street,POINT (44.4829 40.1421),POINT (44.482915007 40.14219242)
3,4315747258,Taronts Street,Taronts Street,POINT (44.48423 40.14348),POINT (44.484097296 40.143472345)
4,4045345298,Taronts Street,Taronts Street,POINT (44.48548 40.14611),POINT (44.485410181 40.146172609)
...,...,...,...,...,...
83,4045345198,Taronts Street,Taronts Street,POINT (44.48541 40.14617),POINT (44.485479918 40.1461088)
84,4315747308,Taronts Street,Taronts Street,POINT (44.4841 40.14347),POINT (44.484231407 40.143476462)
85,4049882898,Shahamiryanner Street,Shahamiryanner Street,POINT (44.48292 40.14219),POINT (44.482896232 40.14209773)
86,4049878778,Nerkin Shengavit,Nerkin Shengavit,POINT (44.48076 40.13919),POINT (44.480885073 40.139384704)


In [15]:
# Stops left without a pair (their pair_name is missing).
unpaired_mask = line29_stops_pairs['pair_name'].isna()
line29_stops_pairs[unpaired_mask]

Unnamed: 0,stop_id,stop_name,pair_name,stop_geom,pair_geom
8,1787749186,Hayreniq,,POINT (44.49898 40.15277),
16,1788128356,Chess House,,POINT (44.52377 40.18021),
17,1985580091,Chamber music,,POINT (44.52431 40.18469),
19,1733131921,Medical University,,POINT (44.5248 40.18856),
21,1788095726,Bellagio,,POINT (44.5343 40.19333),
31,1543189843,Marshal Babajanyan / Almaty,,POINT (44.57774 40.2204),
35,1543189800,"Nver Safaryan Street, 14",,POINT (44.58432 40.21334),
51,1543189854,Marshal Babajanyan / Ashxabad,,POINT (44.57602 40.22047),
52,3457701755,"Marshal Babajanyan Street, 56",,POINT (44.57428 40.22041),
55,1543189960,Mental Health Center,,POINT (44.56719 40.21997),
