In [1]:
import requests
import re
from bs4 import BeautifulSoup
import json
import urllib
import pandas as pd
import geopandas as gpd
import shapely
from shapely.geometry import Point, LineString, MultiLineString
from tqdm.notebook import tqdm
import time
from datetime import datetime

In [2]:
def find_js(lnk):
    """Download a page and return the JSON embedded in one of its <script> tags.

    The first ``<script type="application/json">`` element is used. When the
    page has no such element, the first ``<script>`` of any type is tried
    instead (same fallback as before).

    Parameters
    ----------
    lnk : str
        URL of the page to fetch.

    Returns
    -------
    object
        Whatever ``json.loads`` produces for the selected script text.

    Raises
    ------
    ValueError
        If the page contains no ``<script>`` element at all, or (as
        ``json.JSONDecodeError``, a ``ValueError`` subclass) if the selected
        script text is not valid JSON.
    """
    # The response is closed as soon as the markup has been parsed.
    with requests.get(lnk, stream=True, timeout=25) as req:
        bs = BeautifulSoup(req.text, 'html.parser')

    all_scrpt = bs.find_all("script")
    if not all_scrpt:
        # Previously this case crashed with UnboundLocalError on `sc_ind`.
        raise ValueError("No <script> element found at {}".format(lnk))

    json_scrpt = next(
        (scr for scr in all_scrpt if scr.get('type') == "application/json"),
        all_scrpt[0],
    )
    return json.loads(json_scrpt.text)

In [3]:
def get_stop_url(lst_stop_ids,city_id,city_name):
    """Build one stop-page URL per entry of ``lst_stop_ids``.

    Only the first element of each entry is used as the stop id; the URLs
    are returned in the same order as the input entries.
    """
    url_template = 'https://yandex.ru/maps/{}/{}/stops/{}'
    return [
        url_template.format(city_id, city_name, entry[0])
        for entry in lst_stop_ids
    ]

In [4]:
def get_ids_all(js):
    """Extract the distinct route descriptors listed under a stop's JSON.

    Reads ``js['masstransitStop']['transports']`` and turns every entry that
    has a ``'lineId'`` into ``[lineId, name, type]``.  ``name`` and ``type``
    are both ``None`` when either key is missing from the entry.  Entries
    without ``'lineId'`` (or that are not subscriptable by key) are skipped,
    and exact duplicates are dropped, keeping first-seen order.

    Parameters
    ----------
    js : object
        Parsed page JSON.  Anything that lacks the expected keys, or whose
        ``transports`` value is not a list/tuple, yields an empty result.

    Returns
    -------
    list of list
        ``[[lineId, name, type], ...]``.
    """
    # Only lookup failures are treated as "no data"; the previous bare
    # `except:` also swallowed KeyboardInterrupt and hid real bugs.
    lookup_errors = (KeyError, TypeError, IndexError)

    try:
        lst_tsp = js['masstransitStop']['transports']
    except lookup_errors:
        lst_tsp = []
    if not isinstance(lst_tsp, (list, tuple)):
        # e.g. `"transports": null` used to crash in len().
        lst_tsp = []

    lst_rts = []
    for meta_veh in lst_tsp:
        try:
            lineid = meta_veh['lineId']
        except lookup_errors:
            continue

        try:
            transp_type = meta_veh['type']
            name_rt = meta_veh['name']
        except lookup_errors:
            transp_type, name_rt = None, None

        one_lst = [lineid, name_rt, transp_type]
        if one_lst not in lst_rts:
            lst_rts.append(one_lst)

    return lst_rts

In [5]:
def get_rts_in_stop(lst_stops_url):
    """Gather the distinct route descriptors found on a set of stop pages.

    Every URL is loaded with ``find_js`` and passed to ``get_ids_all``; each
    descriptor that has not been seen yet is kept, in first-seen order.
    """
    unique_routes = []
    for stop_url in lst_stops_url:
        page_json = find_js(stop_url)
        for route in get_ids_all(page_json):
            if route in unique_routes:
                continue
            unique_routes.append(route)
    return unique_routes

In [6]:
def get_stop(dct):
    """Flatten a stop record into ``[id, name, lat, lon]``.

    ``dct['coordinates'][0]`` is taken as the longitude and
    ``dct['coordinates'][1]`` as the latitude, so the two values are swapped
    in the returned list.
    """
    stop_id, name = dct['id'], dct['name']
    stop_lon, stop_lat = dct['coordinates'][0], dct['coordinates'][1]
    return [stop_id, name, stop_lat, stop_lon]

In [7]:
def get_dir_data(one_dir,direction,lnkid):
    """Split one direction of a route into its stop rows and its geometry.

    Items of ``one_dir`` that contain ``'id'`` are treated as stops and
    numbered from 1 in order of appearance; items that contain ``'points'``
    contribute their points, in order, to a single ``LineString``.

    Returns
    -------
    tuple
        ``(stops, line)`` where ``stops`` is a list of
        ``[lnkid, direction, sequence, stop_id, name, lat, lon]`` rows and
        ``line`` is ``[lnkid, direction, LineString]``.
    """
    stop_rows = []
    path_points = []
    stop_no = 0

    for feature in one_dir:
        if 'id' in feature:
            stop_no += 1
            stop_rows.append([lnkid, direction, stop_no] + get_stop(feature))

        if 'points' in feature:
            path_points.extend(feature['points'])

    return stop_rows, [lnkid, direction, LineString(path_points)]

In [8]:
def get_rt_data(city_id, city_name, lnkid):
    """Scrape one route page and return its registry fields, stops and lines.

    The page JSON is read from ``['masstransitLine']['features']``.  The
    first feature is direction 0; when there are exactly two features the
    second one is direction 1 and the route is flagged as non-circular,
    otherwise only direction 0 is used and the route is flagged as circular.

    Parameters
    ----------
    city_id, city_name
        Values inserted into the route URL.
    lnkid
        Line id of the route; also stored in every returned row.

    Returns
    -------
    tuple
        ``(for_reestr, stop_seq, line_data)``:

        * ``for_reestr`` - always two items, ``[route_long_name, is_circle]``.
          Items that could not be determined because scraping failed are
          ``None``, so callers can always build fixed-width rows.
        * ``stop_seq`` - stop rows as produced by ``get_dir_data``.
        * ``line_data`` - one ``[lnkid, direction, LineString]`` per direction.

    Notes
    -----
    Failures are reported with ``print`` and whatever was collected before
    the failure is still returned (best effort).  Only ``Exception`` is
    caught, so Ctrl-C / kernel interrupt still stops a long scraping run.
    """
    lnk_rt = 'https://yandex.ru/maps/{}/{}/routes/{}'.format(city_id, city_name, lnkid)
    for_reestr = []
    line_data = []
    stop_seq = []

    try:
        js_rt = find_js(lnk_rt)
        one_rt = js_rt['masstransitLine']['features']

        stop_seq1, line_data1 = get_dir_data(one_rt[0]['features'], 0, lnkid)
        line_data.append(line_data1)

        essential = one_rt[0]['properties']['ThreadMetaData']['EssentialStops']
        stp_frw = essential[0]['name']
        try:
            stp_bckw = essential[1]['name']
        except (IndexError, KeyError, TypeError):
            # Only one essential stop: reuse it for both ends of the name.
            stp_bckw = stp_frw
        for_reestr.append(str(stp_frw) + "_" + str(stp_bckw))

        if len(one_rt) == 2:
            stop_seq2, line_data2 = get_dir_data(one_rt[1]['features'], 1, lnkid)
            stop_seq = stop_seq1 + stop_seq2
            if line_data2 not in line_data:
                line_data.append(line_data2)
            is_circle = False
        else:
            stop_seq = stop_seq1
            is_circle = True

        for_reestr.append(is_circle)
    except Exception as exc:
        print("Error in getting data from lineID:", lnkid, repr(exc))

    # Pad to [route_long_name, is_circle]; a failure used to leave 0 or 1
    # items here, which produced ragged registry rows downstream.
    for_reestr.extend([None] * (2 - len(for_reestr)))

    return for_reestr, stop_seq, line_data

In [9]:
def get_length(gdf_line, df_reestr, metric_crs='epsg:32637'):
    """Add forward / backward route lengths (in km) to the route registry.

    Parameters
    ----------
    gdf_line : GeoDataFrame
        Route geometries with ``line_id`` and ``direction`` columns
        (``0`` = forward, ``1`` = backward).  It is not modified.
    df_reestr : DataFrame
        Route registry with a ``line_id`` column.
    metric_crs : str, optional
        Projected CRS used for measuring.  The default, EPSG:32637
        (WGS 84 / UTM zone 37N), is the value that used to be hard-coded;
        it is only accurate around 36-42 degrees E, so pass a CRS that fits
        the city being processed when it lies elsewhere.

    Returns
    -------
    DataFrame
        ``df_reestr`` left-joined with two new columns: ``lenght_frw`` and
        ``length_bckw``, both ``geometry.length / 1000`` rounded to 2
        decimals.  A missing backward length becomes ``0``; a missing
        forward length stays ``NaN``.

    Notes
    -----
    The ``lenght_frw`` spelling is kept on purpose: it is part of the output
    schema and renaming it would break consumers of the exported files.
    """
    # to_crs returns a new frame, so no explicit copy of the input is needed.
    projected = gdf_line.to_crs(metric_crs)
    lengths_km = [round(geom.length / 1000, 2) for geom in projected.geometry]
    projected = projected.assign(length=lengths_km)

    for dir_code, column in ((0, 'lenght_frw'), (1, 'length_bckw')):
        one_way = projected.loc[projected['direction'] == dir_code, ['line_id', 'length']]
        df_reestr = df_reestr.merge(one_way, how='left', on=['line_id'])
        # Move the merged values under their final name and drop the temp column.
        df_reestr[column] = df_reestr.pop('length')

    df_reestr['length_bckw'] = df_reestr['length_bckw'].fillna(0)

    return df_reestr

In [10]:
def make_df_from_url(city_id, city_name, lst_rts):
    """Scrape every route in ``lst_rts`` and assemble the three result tables.

    Each item of ``lst_rts`` is read as ``[line_id, route_number, type]``.

    Returns
    -------
    tuple
        ``(df_reestr, gdf_line, df_stop_seq)``: the route registry (with the
        length columns added by ``get_length``), the per-direction route
        geometries in EPSG:4326, and the ordered stops of every route.
    """
    registry_rows = []
    stop_rows = []
    line_rows = []

    for route in lst_rts:
        lnkid, rt_nbr, rt_tp = route[0], route[1], route[2]
        for_reestr, stop_seq, line_data = get_rt_data(city_id, city_name, lnkid)
        registry_rows.append([lnkid, rt_nbr, rt_tp] + for_reestr)
        stop_rows.extend(stop_seq)
        line_rows.extend(line_data)

    # Registry: one row per route.
    df_reestr = pd.DataFrame(
        data=registry_rows,
        columns=['line_id','route_number','type_ts','route_long_name','is_circle'],
    )

    # Geometry: one row per route direction.
    gdf_line = gpd.GeoDataFrame(
        data=line_rows,
        columns=['line_id','direction','geometry'],
    )
    gdf_line.crs='epsg:4326'

    # Stop sequence: one row per stop visit.
    df_stop_seq = pd.DataFrame(
        data=stop_rows,
        columns=['line_id','direction','stop_sequence','stop_id','stop_name','lat','lon'],
    )

    # Route lengths are derived from the geometry table.
    df_reestr = get_length(gdf_line,df_reestr)

    return df_reestr,gdf_line,df_stop_seq

In [30]:
# Stop catalogue: semicolon-separated, Windows-1251 encoded.
data_stops2 = pd.read_csv(
    "water_only.csv",
    encoding='windows-1251',
    sep=';',
)

In [31]:
# Quick look at the first three rows of the loaded catalogue.
data_stops2.head(n=3)

Unnamed: 0,city_id,city_name,stop_id,stop_type,stop_name,url
0,51,samara,1543171260,причал,1-й причал,https://yandex.ru/maps/51/samara/stops/1543171...
1,51,samara,1829520131,причал,База отдыха Ладья,https://yandex.ru/maps/51/samara/stops/1829520...
2,51,samara,1543170860,причал,База отдыха Чайка,https://yandex.ru/maps/51/samara/stops/1543170...


In [41]:
# Keep only the stops whose stop_type is 'жд' and report how many remain.
data_stops = data_stops2.loc[data_stops2['stop_type'] == 'жд']
print(data_stops.shape[0])

4


In [42]:
# i=0
# Scrape city by city: collect the routes served by the city's stops, then
# download every route.  Per-city tables are gathered in lists and joined
# once at the end: DataFrame.append was removed in pandas 2.0, and appending
# inside the loop re-copied all accumulated rows on every iteration.
lst_ct_ids = list(data_stops.city_id.unique())

reestr_parts = []
line_parts = []
stop_seq_parts = []

for ct_id in tqdm(lst_ct_ids):
    one_ct = data_stops[data_stops.city_id == ct_id].reset_index(drop=True)
    ct_name = one_ct.city_name[0]
    lst_stops_url = list(one_ct.url)
    big_lst_id = get_rts_in_stop(lst_stops_url)
    df_reestr,gdf_line,df_stop_seq = make_df_from_url(ct_id,ct_name,big_lst_id)

    reestr_parts.append(df_reestr)
    line_parts.append(gdf_line)
    stop_seq_parts.append(df_stop_seq)

# pd.concat refuses an empty list, so fall back to the empty frames the
# original code started from when there was no city to process.
df_reestr_all = pd.concat(reestr_parts, ignore_index=True) if reestr_parts else pd.DataFrame()
gdf_line_all = pd.concat(line_parts, ignore_index=True) if line_parts else gpd.GeoDataFrame()
df_stop_seq_all = pd.concat(stop_seq_parts, ignore_index=True) if stop_seq_parts else pd.DataFrame()

HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))




In [43]:
# Drop repeated rows from the combined tables: the registry keeps one row per
# line, the geometries one per (line, direction), and the stop sequence one
# per (line, direction, position).
df_reestr_all = (
    df_reestr_all
    .drop_duplicates(subset=['line_id'])
    .reset_index(drop=True)
)
gdf_line_all = (
    gdf_line_all
    .drop_duplicates(subset=['line_id', 'direction'])
    .reset_index(drop=True)
)
df_stop_seq_all = (
    df_stop_seq_all
    .drop_duplicates(subset=['line_id', 'direction', 'stop_sequence'])
    .reset_index(drop=True)
)

In [44]:
# Labels that get embedded in the names of the exported files.
city, tp = "СТН", "rail"

In [45]:
# Write the three result tables; file names carry the city and transport labels.
df_reestr_all.to_csv(
    f"df_reestr_{city}_{tp}.csv",
    sep=';',
    encoding='utf-8-sig',
    index=False,
)
gdf_line_all.to_file(
    f"gdf_line_{city}_{tp}.json",
    driver="GeoJSON",
    encoding='utf-8-sig',
)
df_stop_seq_all.to_csv(
    f"df_stop_seq_{city}_{tp}.csv",
    sep=';',
    encoding='utf-8-sig',
    index=False,
)