In [1]:
import json
import os
import re
import time
import urllib
import urllib.parse
from datetime import datetime

import geopandas as gpd
import pandas as pd
import requests
import shapely
from bs4 import BeautifulSoup
from shapely.geometry import Point, LineString, MultiLineString, Polygon, MultiPolygon
from tqdm.notebook import tqdm

# Способ поиска всех маршрутов в городе по названию

In [2]:
def find_js(lnk):
    """Download a page and return the JSON embedded in one of its script tags.

    The first script tag whose ``type`` is ``application/json`` is used. When
    the page has no such tag, the first script tag of any type is parsed
    instead (same fallback as before).

    Parameters
    ----------
    lnk : str
        URL of the page to fetch (25 second timeout).

    Returns
    -------
    The object produced by ``json.loads`` from the chosen script text.

    Raises
    ------
    ValueError
        If the page contains no script tags at all, or the chosen script text
        is not valid JSON (``json.JSONDecodeError`` subclasses ``ValueError``).

    Errors raised by ``requests.get`` are not caught here.
    """
    with requests.get(lnk, stream=True, timeout=25) as req:
        bs = BeautifulSoup(req.text, 'html.parser')

    all_scrpt = bs.find_all("script")
    if not all_scrpt:
        # Previously this case failed with an UnboundLocalError on sc_ind.
        raise ValueError("No script tags found at {}".format(lnk))

    json_scr = next(
        (scr for scr in all_scrpt if scr.get('type') == "application/json"),
        all_scrpt[0],
    )
    return json.loads(json_scr.text)

In [26]:
def get_bbox_n_border(place):
    """Search Yandex Maps for ``place`` and return its centre and outline.

    The first search result is used. Its ``address`` is printed so the caller
    can see what was actually matched.

    Parameters
    ----------
    place : str
        Free-text place name; it is URL-quoted before being sent.

    Returns
    -------
    str_point : str
        Query fragment ``'?ll=<c0>%2C<c1>'`` built from the first two values
        of the result's ``coordinates`` (presumably lon, lat - TODO confirm).
    gdf_poly : geopandas.GeoDataFrame
        One polygon per entry of ``displayGeometry.geometries`` (built from
        the first ring of each entry), CRS ``epsg:4326``. An empty
        GeoDataFrame when the outline is missing or cannot be built.

    Raises
    ------
    KeyError, IndexError
        If the page JSON has no search results or the first result lacks
        ``address`` / ``coordinates``.
    """
    str_qt = urllib.parse.quote(place)
    srch_ct = "https://yandex.ru/maps/?text={}".format(str_qt)

    js = find_js(srch_ct)
    item_one = js['searchPreloadedResults']['items'][0]

    found_place = item_one['address']
    print('Found place:',found_place)

    centr_point = item_one['coordinates']
    str_point = '?ll=' + str(centr_point[0]) + '%2C' + str(centr_point[1])

    try:
        lst_poly = [
            Polygon(one_geo['coordinates'][0])
            for one_geo in item_one['displayGeometry']['geometries']
        ]
        gdf_poly = gpd.GeoDataFrame(geometry=lst_poly)
        gdf_poly.crs = 'epsg:4326'
    except Exception:
        # The outline is optional: fall back to an empty frame, but let
        # KeyboardInterrupt / SystemExit through (a bare except hid them).
        gdf_poly = gpd.GeoDataFrame()

    return str_point,gdf_poly

In [4]:
def get_url_from_city(place,city,str_point):
    """Find the Yandex Maps city id and slug for ``city`` and build its transport URL.

    The search page for ``"<place>, <city>"`` is downloaded and the first
    anchor whose class list contains ``catalog-entry-point`` is taken. Its
    href is expected to look like ``.../maps/<city_id>/<city_name>/catalog...``.

    Parameters
    ----------
    place : str
        Region prefix of the search text.
    city : str
        City name; also used in the "not found" message.
    str_point : str
        Query fragment inserted right after ``/maps/`` (the ``'?ll=...'``
        string returned by ``get_bbox_n_border``).

    Returns
    -------
    (lnk_tp, city_id, city_name)
        ``city_id`` and ``city_name`` are ``None`` when no suitable link was
        found; ``lnk_tp`` then contains the text ``None/None``.
    """
    str_qt = urllib.parse.quote(place + ', ' + city)
    srch_ct = "https://yandex.ru/maps/{}&mode=search&text={}".format(str_point,str_qt)

    # Same 25 s timeout as find_js so a stalled connection cannot hang the
    # notebook; the context manager releases the connection.
    with requests.get(srch_ct, timeout=25) as req:
        bs = BeautifulSoup(req.text, 'html.parser')

    hrf = None
    for aa in bs.find_all('a'):
        cls = aa.get("class")
        if cls is not None and 'catalog-entry-point' in cls:
            hrf = aa.get("href")
            break

    try:
        # Exactly two path segments are expected between "maps/" and "/catalog".
        city_id,city_name = hrf.split("maps/")[1].split("/catalog")[0].split("/")
    except (AttributeError, IndexError, ValueError):
        # No link found (hrf is None) or the href has an unexpected shape.
        city_id,city_name = None,None
        print("Error, not found {}".format(city))

    lnk_tp = "https://yandex.ru/maps/"+ str(city_id) + "/" + str(city_name) + "/transport"

    return lnk_tp,city_id,city_name

In [5]:
def get_list_stop_ids(city_id,city_name):
    """Collect the public-transport stops listed on a city's stop-category page.

    Parameters
    ----------
    city_id, city_name
        Values substituted into
        ``https://yandex.ru/maps/<city_id>/<city_name>/category/public_transport_stop/``.

    Returns
    -------
    list
        Unique ``[stop_id, coordinates]`` pairs in first-seen order, taken
        from the ``stops`` entry of every search-result item.

    Raises
    ------
    KeyError
        If the page JSON has no ``searchPreloadedResults.items``.
    """
    stop_url = "https://yandex.ru/maps/{}/{}/category/public_transport_stop/".format(city_id,
                                                                                    city_name)
    js_stp = find_js(stop_url)
    lst_items = js_stp['searchPreloadedResults']['items']

    lst_stop_ids = []
    for one_grp in lst_items:
        try:
            for one_stop in one_grp['stops']:
                grp = [one_stop['id'],one_stop['coordinates']]
                if grp not in lst_stop_ids:
                    lst_stop_ids.append(grp)
        except (KeyError, TypeError, IndexError):
            # Item without a usable "stops" list: skip the rest of this item.
            # Only data-shape errors are ignored; anything else propagates.
            continue

    return lst_stop_ids

In [6]:
def get_list_stop_ids2(city_id,city_name,pnt):
    """Collect public-transport stops around a map point at zoom level 13.

    Variant of ``get_list_stop_ids`` that appends the ``pnt`` query fragment
    and ``&z=13`` to the stop-category URL.

    Parameters
    ----------
    city_id, city_name
        Values substituted into the Yandex Maps category URL.
    pnt : str
        Query fragment placed right after the category path (callers pass the
        ``'?ll=...'`` string from ``get_bbox_n_border``).

    Returns
    -------
    list
        Unique ``[stop_id, coordinates]`` pairs in first-seen order.

    Raises
    ------
    KeyError
        If the page JSON has no ``searchPreloadedResults.items``.
    """
    stop_url = "https://yandex.ru/maps/{}/{}/category/public_transport_stop/{}&z=13"\
    .format(city_id,city_name,pnt)
    js_stp = find_js(stop_url)
    lst_items = js_stp['searchPreloadedResults']['items']

    lst_stop_ids = []
    for one_grp in lst_items:
        try:
            for one_stop in one_grp['stops']:
                grp = [one_stop['id'],one_stop['coordinates']]
                if grp not in lst_stop_ids:
                    lst_stop_ids.append(grp)
        except (KeyError, TypeError, IndexError):
            # Item without a usable "stops" list: skip the rest of this item.
            # Only data-shape errors are ignored; anything else propagates.
            continue

    return lst_stop_ids

In [7]:
def get_stop_url(lst_stop_ids,city_id,city_name):
    """Build the Yandex Maps stop-page URL for every stop in ``lst_stop_ids``.

    Only the first element of each entry (the stop id) is used. The result
    keeps the input order and length.
    """
    url_template = 'https://yandex.ru/maps/{}/{}/stops/{}'
    return [
        url_template.format(city_id, city_name, stop_entry[0])
        for stop_entry in lst_stop_ids
    ]

In [8]:
def get_ids_all(js):
    """Extract the routes serving a stop from the stop page JSON.

    Parameters
    ----------
    js : dict
        Parsed stop page; routes are read from
        ``js['masstransitStop']['transports']``.

    Returns
    -------
    list
        Unique ``[line_id, name, type]`` triples in first-seen order.
        Transports without ``lineId`` are skipped. When either ``type`` or
        ``name`` is missing both are reported as ``None``. Routes whose type
        is ``'suburban'`` or ``'water'`` are left out. A missing or empty
        transports list yields ``[]``.
    """
    try:
        lst_tsp = js['masstransitStop']['transports']
    except (KeyError, TypeError, IndexError):
        lst_tsp = []

    lstno = ('suburban', 'water')
    lst_rts = []
    # "or []" also covers an explicit null transports value, which used to
    # raise TypeError from len().
    for meta_veh in lst_tsp or []:
        try:
            lineid = meta_veh['lineId']
        except (KeyError, TypeError, IndexError):
            continue

        try:
            transp_type = meta_veh['type']
            name_rt = meta_veh['name']
        except (KeyError, TypeError, IndexError):
            # Kept from the original: one missing field blanks both.
            transp_type,name_rt = None,None

        one_lst = [lineid, name_rt, transp_type]
        if transp_type not in lstno and one_lst not in lst_rts:
            lst_rts.append(one_lst)

    return lst_rts

In [9]:
def get_rts_in_stop(lst_stops_url, retry_delay=30):
    """Download every stop page and collect the distinct routes found on them.

    Parameters
    ----------
    lst_stops_url : list of str
        Stop page URLs.
    retry_delay : float, optional
        Seconds to wait before the single retry of a failed download
        (default 30, the previously hard-coded value).

    Returns
    -------
    list
        Unique route entries as produced by ``get_ids_all``, in first-seen order.

    Raises
    ------
    Exception
        Whatever ``find_js`` raises when the retry fails as well.
    """
    big_lst_id = []
    for one_url in lst_stops_url:
        try:
            js = find_js(one_url)
        except Exception:
            # One retry after a pause. "except Exception" (not a bare except)
            # so that interrupting the kernel stops the loop instead of
            # triggering the sleep and another request.
            time.sleep(retry_delay)
            js = find_js(one_url)

        for one_rt in get_ids_all(js):
            if one_rt not in big_lst_id:
                big_lst_id.append(one_rt)

    return big_lst_id

In [10]:
def get_stop(dct):
    """Flatten one stop record into ``[id, name, lat, lon]``.

    ``dct['coordinates']`` holds the value returned as ``lon`` first and the
    value returned as ``lat`` second; the output lists ``lat`` before ``lon``.
    """
    stop_id, stop_name = dct['id'], dct['name']
    coords = dct['coordinates']
    stop_lon, stop_lat = coords[0], coords[1]
    return [stop_id, stop_name, stop_lat, stop_lon]

In [11]:
def get_dir_data(one_dir,direction,lnkid,trip_id):
    """Split one route thread into its stop rows and its geometry row.

    Parameters
    ----------
    one_dir : sequence of dict
        Features of the thread. An entry with an ``'id'`` key is treated as a
        stop; an entry with a ``'points'`` key is treated as a line segment.
    direction, lnkid, trip_id
        Values copied into every produced row.

    Returns
    -------
    all_stops : list
        One ``[lnkid, trip_id, direction, sequence_no, *get_stop(entry)]`` row
        per stop; ``sequence_no`` starts at 1.
    line_data : list
        ``[lnkid, trip_id, direction, MultiLineString]`` with one LineString
        per ``'points'`` entry.
    """
    stop_rows = []
    segments = []
    stop_no = 0

    for feature in one_dir:
        if 'id' in feature:
            stop_no += 1
            stop_rows.append([lnkid, trip_id, direction, stop_no] + get_stop(feature))

        if 'points' in feature:
            segments.append(LineString(feature['points']))

    line_row = [lnkid, trip_id, direction, MultiLineString(segments)]
    return stop_rows, line_row

In [12]:
def get_rt_data(city_id, city_name, lnkid):
    """Download one route page and split it into registry, stop and line data.

    Parameters
    ----------
    city_id, city_name, lnkid
        Values substituted into
        ``https://yandex.ru/maps/<city_id>/<city_name>/routes/<lnkid>``.

    Returns
    -------
    for_reestr : list
        ``[route_long_name, trip_ids, is_circle]`` where ``route_long_name``
        is ``"<first essential stop>_<second essential stop>"`` of the first
        thread (the first stop is repeated when there is no second one),
        ``trip_ids`` is a comma-separated string of thread ids and
        ``is_circle`` is ``True`` when the route has at most one thread.
        Empty when anything failed.
    stop_seq : list
        Stop rows from ``get_dir_data`` for all threads.
    line_data : list
        Distinct line rows from ``get_dir_data``.

    Threads are numbered direction 0, 1, 0, 1, ... in page order. Failures
    are best-effort: an error message is printed and whatever was collected
    before the failure is returned together with an empty ``for_reestr``.
    """
    lnk_rt = 'https://yandex.ru/maps/{}/{}/routes/{}'.format(city_id, city_name, lnkid)
    for_reestr = []
    line_data = []
    stop_seq = []

    try:
        js_rt = find_js(lnk_rt)
        one_rt = js_rt['masstransitLine']['features']

        trip_ids = []
        for j, one_thread in enumerate(one_rt):
            trip_id = one_thread['properties']['ThreadMetaData']['id']
            trip_ids.append(str(trip_id))
            dirctn = j % 2  # alternate 0 / 1 in page order
            stop_seq2,line_data2 = get_dir_data(one_thread['features'],dirctn,lnkid,trip_id)
            stop_seq.extend(stop_seq2)
            if line_data2 not in line_data:
                line_data.append(line_data2)

        is_circle = len(one_rt) <= 1

        essential = one_rt[0]['properties']['ThreadMetaData']['EssentialStops']
        stp_frw = essential[0]['name']
        try:
            stp_bckw = essential[1]['name']
        except (IndexError, KeyError, TypeError):
            stp_bckw = stp_frw

        for_reestr = [str(stp_frw) + "_" + str(stp_bckw), ','.join(trip_ids), is_circle]
    except Exception:
        # Best effort per route; KeyboardInterrupt is no longer swallowed.
        print("Error in getting data from lineID:", lnkid)
    return for_reestr, stop_seq, line_data

In [13]:
def get_length(gdf_line,df_reestr, metric_crs='epsg:32637'):
    """Add forward and backward route lengths (km) to the route registry.

    Parameters
    ----------
    gdf_line : geopandas.GeoDataFrame
        Route geometries with ``line_id`` and ``direction`` columns.
    df_reestr : pandas.DataFrame
        Route registry with a ``line_id`` column.
    metric_crs : str, optional
        Projected CRS used to measure lengths. Defaults to the previously
        hard-coded ``'epsg:32637'``.
        NOTE(review): EPSG:32637 is UTM zone 37N, while the cities scraped in
        this notebook are in Samara Oblast, further east; a closer zone
        (e.g. ``'epsg:32639'``) would presumably give more accurate lengths -
        confirm before changing the default.

    Returns
    -------
    pandas.DataFrame
        ``df_reestr`` with two extra columns: ``lenght_frw`` (direction 0,
        NaN when absent) and ``length_bckw`` (direction 1, 0 when absent).
        The spelling ``lenght_frw`` is kept because the column is written to
        the exported CSV files.
    """
    # to_crs returns a new frame, so the caller's gdf_line is not modified.
    metric = gdf_line.to_crs(metric_crs)
    metric['length'] = [round(line.length/1000,2) for line in metric.geometry]

    def _longest_per_line(direction):
        # Remove duplicates: keep the longest geometry for every route.
        part = metric[metric.direction == direction]
        part = part.sort_values(by=['line_id','length'], ascending=False)
        return part.drop_duplicates(subset=['line_id'])[['line_id', 'length']]

    df_reestr = df_reestr.merge(_longest_per_line(0), how='left',on=['line_id'])
    df_reestr = df_reestr.rename(columns={'length': 'lenght_frw'})
    df_reestr = df_reestr.merge(_longest_per_line(1), how='left',on=['line_id'])
    df_reestr = df_reestr.rename(columns={'length': 'length_bckw'})
    df_reestr['length_bckw'] = df_reestr['length_bckw'].fillna(0)

    return df_reestr

In [14]:
def make_df_from_url(city_id, city_name, lst_rts):
    """Download every route in ``lst_rts`` and assemble the three result tables.

    Parameters
    ----------
    city_id, city_name
        Passed through to ``get_rt_data``.
    lst_rts : list
        Route entries ``[line_id, route_number, type]`` (as produced by
        ``get_ids_all``).

    Returns
    -------
    df_reestr : pandas.DataFrame
        One row per route: ``line_id, route_number, type_ts,
        route_long_name, trip_id, is_circle`` plus the length columns added
        by ``get_length``. Routes whose page could not be read keep their
        first three values and get ``None`` for the rest.
    gdf_line : geopandas.GeoDataFrame
        ``line_id, trip_id, direction, geometry`` in ``epsg:4326``.
    df_stop_seq : pandas.DataFrame
        ``line_id, trip_id, direction, stop_sequence, stop_id, stop_name,
        lat, lon``.
    """
    reestr_columns = ['line_id','route_number','type_ts','route_long_name','trip_id','is_circle']

    big_reestr=[]
    big_seq=[]
    big_line=[]
    for one_rt in lst_rts:
        lnkid, rt_nbr, rt_tp = one_rt[0], one_rt[1], one_rt[2]
        for_reestr, stop_seq, line_data = get_rt_data(city_id, city_name, lnkid)
        for_r_2 = [lnkid,rt_nbr,rt_tp] + list(for_reestr)
        # get_rt_data returns an empty registry part on failure. Pad the row,
        # otherwise a run in which every route fails produces only 3-value
        # rows and pd.DataFrame rejects them against the 6 column names.
        for_r_2 += [None] * (len(reestr_columns) - len(for_r_2))
        big_reestr.append(for_r_2)
        big_seq.extend(stop_seq)
        big_line.extend(line_data)

    # registry with all routes
    df_reestr = pd.DataFrame(data=big_reestr,columns=reestr_columns)

    # geometry of route
    gdf_line = gpd.GeoDataFrame(data=big_line,columns=['line_id','trip_id','direction','geometry'])
    gdf_line.crs='epsg:4326'

    # stop_sequence
    seq_columns=['line_id','trip_id','direction','stop_sequence','stop_id','stop_name','lat','lon']
    df_stop_seq = pd.DataFrame(data=big_seq,columns=seq_columns)

    # get line.length for the registry
    df_reestr = get_length(gdf_line,df_reestr)

    return df_reestr,gdf_line,df_stop_seq

In [15]:
def get_new_stops(df_stop_seq,lnk_tp,lst_stop_ids):
    """Find stops seen on routes that are inside the city box but not yet scanned.

    Parameters
    ----------
    df_stop_seq : pandas.DataFrame
        Stop sequence table with ``stop_id``, ``lon`` and ``lat`` columns.
    lnk_tp : str
        City transport page URL; its JSON provides ``mapRegion.bounds`` as
        two corner points ``[[x0, y0], [x1, y1]]``.
    lst_stop_ids : list
        Already known ``[stop_id, [lon, lat]]`` entries to leave out.

    Returns
    -------
    list
        Unique ``[stop_id, [lon, lat]]`` entries, in first-seen order, whose
        point lies within the bounds box and which are not in
        ``lst_stop_ids``. An empty stop table yields ``[]``.
    """
    # Unique [stop_id, [lon, lat]] candidates in first-seen order.
    lst_new_ids = []
    for stop_id, lon, lat in zip(df_stop_seq['stop_id'], df_stop_seq['lon'], df_stop_seq['lat']):
        one_id = [stop_id,[lon,lat]]
        if one_id not in lst_new_ids:
            lst_new_ids.append(one_id)

    js = find_js(lnk_tp)
    lst_bbox = js['mapRegion']['bounds']
    box1 = shapely.geometry.box(lst_bbox[0][0],lst_bbox[0][1],lst_bbox[1][0],lst_bbox[1][1])

    # A direct "within" test replaces the former single-polygon spatial join:
    # same predicate, but it does not depend on the sjoin keyword that changed
    # between GeoPandas versions, and it also works for an empty stop table.
    lst_new_stops_good = []
    for one_id in lst_new_ids:
        if one_id in lst_stop_ids:
            continue
        if Point(one_id[1]).within(box1):
            lst_new_stops_good.append(one_id)

    return lst_new_stops_good

In [16]:
def check_list_dupl(lst_dupl,lst_exmpl):
    """Return the items of ``lst_dupl`` that are new, without repeats.

    An item is kept when it has not been kept already and does not occur in
    ``lst_exmpl``. The first-seen order of ``lst_dupl`` is preserved.
    """
    kept = []
    for candidate in lst_dupl:
        if candidate in kept:
            continue
        if candidate in lst_exmpl:
            continue
        kept.append(candidate)
    return kept

In [24]:
def _print_stop_scan_estimate(n_stops):
    """Print the start time and a rough duration estimate for scanning ``n_stops`` stop pages."""
    rts_min = round((n_stops / 2.3 / 60),1)
    rts_max = round(rts_min*1.5,1)
    str_date = "{:%H:%M:%S}".format(datetime.now())
    print("Slowest part, time start:",str_date)
    print("Time to find routes on stops: {} to {} minutes".format(rts_min,rts_max))


def get_city_transp_info(place, city):
    """Scrape routes, route geometries and stop sequences for one city.

    The work is done in two passes:

    1. stops listed on the city's stop-category page -> routes on those
       stops -> route data;
    2. stops that appear on those routes but were not in the first list
       (``get_new_stops``) -> routes on them that are not known yet -> route
       data.

    When ``get_url_from_city`` finds no city id, the hard-coded region
    ``11131`` / ``samara-oblast`` is used together with the point-based stop
    search (``get_list_stop_ids2``).

    Parameters
    ----------
    place : str
        Region prefix passed to ``get_url_from_city``.
    city : str
        City name; used for the place lookup and the city search.

    Returns
    -------
    (df_reestr_all, gdf_line_all, df_stop_seq_all)
        Results of both passes concatenated, with a fresh 0..n-1 index.
    """
    str_point,_gdf_poly = get_bbox_n_border(city)

    lnk_tp,city_id,city_name = get_url_from_city(place,city,str_point)

    if city_id is not None:
        # first list of stops
        lst_stop_ids = get_list_stop_ids(city_id,city_name)
    else:
        city_id = '11131'
        city_name = 'samara-oblast'
        lst_stop_ids = get_list_stop_ids2(city_id,city_name,str_point)
        lnk_tp = "https://yandex.ru/maps/"+ str(city_id) + "/" + str(city_name) + "/transport"

    print("Part 1 start.")
    str_date = "{:%H:%M:%S}".format(datetime.now())
    print("Time start:",str_date)
    _print_stop_scan_estimate(len(lst_stop_ids))

    # first pass: stop urls -> line ids -> data
    lst_stops_url = get_stop_url(lst_stop_ids,city_id,city_name)
    big_lst_id = get_rts_in_stop(lst_stops_url)
    df_reestr,gdf_line,df_stop_seq = make_df_from_url(city_id,city_name,big_lst_id)
    print("Part 1 end.")

    ####### second part ######
    print("Part 2 start.")
    time.sleep(2)
    # second pass: stops discovered on the routes of the first pass
    lst_stop_ids2 = get_new_stops(df_stop_seq,lnk_tp,lst_stop_ids)
    lst_stops_url2 = get_stop_url(lst_stop_ids2,city_id,city_name)
    _print_stop_scan_estimate(len(lst_stop_ids2))

    # keep only line ids that the first pass did not return
    big_lst_id2 = get_rts_in_stop(lst_stops_url2)
    big_lst_id2 = check_list_dupl(big_lst_id2,big_lst_id)

    df_reestr2,gdf_line2,df_stop_seq2 = make_df_from_url(city_id,city_name,big_lst_id2)

    ##### finale #####
    # pd.concat instead of DataFrame.append, which pandas 2.0 removed.
    df_reestr_all = pd.concat([df_reestr, df_reestr2]).reset_index(drop=True)
    gdf_line_all = pd.concat([gdf_line, gdf_line2]).reset_index(drop=True)
    df_stop_seq_all = pd.concat([df_stop_seq, df_stop_seq2]).reset_index(drop=True)

    str_date = "{:%H:%M:%S}".format(datetime.now())
    print("Time end:",str_date)

    return df_reestr_all,gdf_line_all,df_stop_seq_all

In [25]:
def get_city_transp_info2(place, city):
    """Single-pass scrape for places searched inside the hard-coded region.

    Looks up ``city`` with ``get_bbox_n_border``, then collects stops around
    the returned point within region ``11131`` / ``samara-oblast``, the
    routes on those stops and the route data. ``place`` is accepted for
    signature parity with ``get_city_transp_info`` but is not used.

    Returns ``(df_reestr_all, gdf_line_all, df_stop_seq_all)`` exactly as
    produced by ``make_df_from_url``.
    """
    region_id, region_slug = '11131', 'samara-oblast'

    centre_query, _outline = get_bbox_n_border(city)
    stop_entries = get_list_stop_ids2(region_id, region_slug, centre_query)

    print("Part 1 start.")
    print("Time start:", datetime.now().strftime("%H:%M:%S"))

    # Rough duration estimate for downloading every stop page.
    est_low = round((len(stop_entries) / 2.3 / 60), 1)
    est_high = round(est_low * 1.5, 1)
    print("Slowest part, time start:", datetime.now().strftime("%H:%M:%S"))
    print("Time to find routes on stops: {} to {} minutes".format(est_low, est_high))

    # stop pages -> line ids -> route tables
    stop_pages = get_stop_url(stop_entries, region_id, region_slug)
    line_ids = get_rts_in_stop(stop_pages)
    tables = make_df_from_url(region_id, region_slug, line_ids)
    print("Part 1 end.")

    registry, lines, stop_sequence = tables
    return registry, lines, stop_sequence

In [61]:
# city = 'Новокуйбышевск'
# df_reestr_all,gdf_line_all,df_stop_seq_all = get_city_transp_info(city)

Part 1 start.
Time start: 14:18:39
Slowest part, time start: 14:18:39
Time to find routes on stops: 0.5 to 0.8 minutes
Part 1 end.
Part 2 start.
Slowest part, time start: 14:19:30
Time to find routes on stops: 1.3 to 2.0 minutes
Time end: 14:20:51


In [19]:
# Full list of place names to scrape (22 entries).
lst_cities = [
    'Жигулёвск',
    'Кинель',
    'Нефтегорск',
    'Октябрьск',
    'Отрадный',
    'Похвистнево',
    'Сызрань',
    'Чапаевск',
    'Алексеевка',
    'Балашейка',
    'Безенчук',
    'Волжский',
    'Междуреченск',
    'Мирный',
    'Новосемейкино',
    'Осинки',
    'Петра Дубрава',
    'Рощинский',
    'Смышляевка',
    'Стройкерамика',
    'Суходол',
    'Усть-Кинельский',
]

In [29]:
# Rebinds lst_cities to a five-name subset; running this cell replaces
# whatever list an earlier cell assigned to the same name.
lst_cities = [
    'Алексеевка',
    'Волжский',
    'Междуреченск',
    'Мирный',
    'Рощинский',
]

In [31]:
# Rebinds lst_cities to a single place; running this cell replaces whatever
# list an earlier cell assigned to the same name.
lst_cities = [
    'Осинки',
]

In [None]:
# Scrape every place in lst_cities with the single-pass region search and
# write three files per place into ./data_SO:
#   df_reestr_<city>.csv, gdf_line_<city>.json (GeoJSON), df_stop_seq_<city>.csv

# Create the output folder up front so a long scrape is not lost because the
# first to_csv call fails on a missing directory.
os.makedirs("./data_SO", exist_ok=True)

place = 'Самарская область ,'
for i, city in enumerate(tqdm(lst_cities)):
    # The search text is the region prefix followed by the place name.
    city1 = place + city
    print()
    print(i,city)

    df_reestr_all,gdf_line_all,df_stop_seq_all = get_city_transp_info2(place, city1)

    # Nothing to save when no routes were found.
    ln = 0 if df_reestr_all is None else len(df_reestr_all)
    if ln == 0:
        continue

    df_reestr_all.to_csv("./data_SO/df_reestr_{}.csv".format(city),
                         encoding='utf-8-sig',sep=';',index=False)
    gdf_line_all.to_file("./data_SO/gdf_line_{}.json".format(city),
                         driver="GeoJSON",encoding='utf-8-sig')
    df_stop_seq_all.to_csv("./data_SO/df_stop_seq_{}.csv".format(city),
                           encoding='utf-8-sig',sep=';',index=False)

In [None]:
# Debug scratch cell: evaluates three names; in a notebook only the last
# expression (pnt) is echoed as the cell output.
# NOTE(review): none of the cells visible here assigns city_id, city_name or
# pnt at notebook top level (they only occur as function parameters/locals),
# so on a fresh kernel this cell presumably raises NameError - confirm, then
# define the values explicitly or remove the cell.
city_id
city_name
pnt

In [None]:
# Debug scratch cell: the body of get_list_stop_ids2(city_id, city_name, pnt)
# pasted at top level so the intermediate values can be inspected.
# It reads city_id, city_name and pnt from the notebook namespace and leaves
# stop_url, js_stp, lst_items and lst_stop_ids behind as notebook globals.
# NOTE(review): the three inputs are not assigned at top level in the cells
# visible here - confirm where they come from before relying on this cell.

# Stop-category page around the point `pnt`, zoom level 13.
stop_url = "https://yandex.ru/maps/{}/{}/category/public_transport_stop/{}&z=13"\
.format(city_id,city_name,pnt)
# Parse the JSON embedded in the page and take the search-result items.
js_stp = find_js(stop_url)
lst_items = js_stp['searchPreloadedResults']['items']

# Collect unique [stop_id, coordinates] pairs in first-seen order.
lst_stop_ids = []
for i in range(len(lst_items)):
    one_grp = lst_items[i]
    try:
        grp_stops = one_grp['stops']
        for j in range(len(grp_stops)):
            one_stop = grp_stops[j]
            grp = [one_stop['id'],one_stop['coordinates']]
            if grp not in lst_stop_ids:
                lst_stop_ids.append(grp)
        # end of the per-item stop loop
    except:
        # Any error while reading an item (e.g. no 'stops' key) is ignored;
        # stops appended before the error are kept.
        pass
# end of the item loop

# The function version ends with: return lst_stop_ids