In [76]:
# import libraries
import urllib3
import urllib.parse
from bs4 import BeautifulSoup
import csv
import re

In [90]:
base_url = 'http://www.uz.gov.ua/passengers/timetable/'
station_url_pre = '?station='
station_url_post = '&by_station='+ urllib.parse.quote('Пошук', safe='')
train_url_pre = '?ntrain='
train_url_post = '&by_id=1'
stations = [['47548,23092,23081,23215,36921,23200', 'Львів'], ['23020', 'Ковель'], 
            ['23300', 'Тернопіль'], ['23600,23629', 'Одеса'], ['22423', 'Шевченкове'], 
            ['23390', 'Знам\'янка'], ['23536', 'Миколаїв'], ['22700', 'Дніпро'], 
            ['22680,22900,22840,22747', 'Кривий Ріг'], ['22730,22800', 'Запоріжжя'], 
            ['739,47125,22080,47140,47175,22298,22000,47190', 'Київ'], ['22070', 'Ніжин'], 
            ['22270', 'Жмеринка'], ['22302', 'Козятин-1'], ['22171', 'Козятин-2'], 
            ['22100,22446,22410', 'Харків'], ['22430', 'Лозова'], 
            ['22580,22590', 'Полтава'], ['22450', 'Суми']]

In [91]:
def processStation(station_id, station_name):
    print('Processing station: '+station_name+' - start')
    http = urllib3.PoolManager()
    station_url_full = base_url + station_url_pre + station_id + station_url_post
    print('GET url: '+station_url_full)
    request = http.request('GET', station_url_full)
    soup = BeautifulSoup(request.data, 'html5lib')
    
    trains = []
    trains_div = soup.find('div', attrs={'class':'c-trains-by-station'})
    trains_table = trains_div.find('table')
    trains_table_body = trains_table.find('tbody')
    train_rows = trains_table_body.find_all('tr')
    for row in train_rows:
        cols = row.find_all('td')
        a = cols[0].find_all('a', href=True)
        match = re.match('\?ntrain=(.+)&by_id=1', a[0]['href'])
        train_id = match.group(1)
        train_name = a[0].text.strip()
        trains.append([train_id, train_name])
    print(len(trains),' trains parsed')
    csv_file_name = 'node_stations\'+station_name+'.csv'
    with open(csv_file_name, 'w', newline='', encoding = "utf-8") as csv_file:        
        writer = csv.writer(csv_file)
        for row in trains:
            writer.writerow(row)
    print('Trains table saved to file: '+csv_file_name)
    print('Processing station: '+station_name+' - done')
    return trains

In [93]:
all_trains = []

for station in stations:
    trains = processStation(station[0], station[1])
    all_trains.extend(trains)

Processing station: Львів - start
GET url: http://www.uz.gov.ua/passengers/timetable/?station=47548,23092,23081,23215,36921,23200&by_station=%D0%9F%D0%BE%D1%88%D1%83%D0%BA
214  trains parsed
Trains table saved to file: Львів.csv
Processing station: Львів - done
Processing station: Ковель - start
GET url: http://www.uz.gov.ua/passengers/timetable/?station=23020&by_station=%D0%9F%D0%BE%D1%88%D1%83%D0%BA
26  trains parsed
Trains table saved to file: Ковель.csv
Processing station: Ковель - done
Processing station: Тернопіль - start
GET url: http://www.uz.gov.ua/passengers/timetable/?station=23300&by_station=%D0%9F%D0%BE%D1%88%D1%83%D0%BA
84  trains parsed
Trains table saved to file: Тернопіль.csv
Processing station: Тернопіль - done
Processing station: Одеса - start
GET url: http://www.uz.gov.ua/passengers/timetable/?station=23600,23629&by_station=%D0%9F%D0%BE%D1%88%D1%83%D0%BA
90  trains parsed
Trains table saved to file: Одеса.csv
Processing station: Одеса - done
Processing station: Шевч

In [96]:
dup_n = 0
all_trains_dict = {}
for train in all_trains:
    key = train[0].strip()
    value = train[1].strip()
    if key in all_trains_dict:
        dup_n = dup_n + 1
    else:
      all_trains_dict[key] = value

print(dup_n, ' duplications of trains in node stations, ', len(all_trains_dict.keys()), 'unique trains')

1219  duplications of trins in node stations,  481 unique trains


All trains table saved to file: trains_all.csv


In [107]:
def processTrain(train_id, train_name):
    print('Processing train: '+train_name+' - start')
    http = urllib3.PoolManager()
    train_url_full = base_url + train_url_pre + train_id + train_url_post
    print('GET url: '+train_url_full)
    request = http.request('GET', train_url_full)
    soup = BeautifulSoup(request.data, 'html5lib')
    
    train_div = soup.find('div', attrs={'class':'c-trains-by-number'})
    train_table = train_div.find_all('table')[0]
    cols = train_table.find_all('td')
    train = [train_id,cols[1].text.strip(),cols[0].text.strip(),cols[2].text.strip(),cols[3].text.strip()]
    
    stops = []
    stops_table = train_div.find_all('table')[1]    
    stops_table_body = stops_table.find('tbody')
    stops_rows = stops_table_body.find_all('tr')
    for row in stops_rows:
        cols = row.find_all('td')
        
        a = cols[0].find_all('a', href=True)
        match = re.match('\?station=(.+)&by_station=1', a[0]['href'])
        station_id = match.group(1)
        station_name = a[0].text.strip()
        time_arrive = cols[1].text.strip()
        time_departure = cols[2].text.strip()
        stops.append([station_id, station_name, time_arrive, time_departure])
    print(len(trains),' trains parsed')
    csv_file_name = 'trains/'+train_id+'.csv'
    with open(csv_file_name, 'w', newline='', encoding = "utf-8") as csv_file:        
        writer = csv.writer(csv_file)
        for row in stops:
            writer.writerow(row)
    print('Trains table saved to file: '+csv_file_name)
    print('Processing train: '+train_name+' - done')
    return train, stops

In [112]:
all_trains = []
all_stops_dict = {}
for key in all_trains_dict.keys():
    train, stops = processTrain(key, all_trains_dict[key])
    all_trains.append(train)
    all_stops_dict[train[0]] = stops
    

Processing train: 43 - start
GET url: http://www.uz.gov.ua/passengers/timetable/?ntrain=56927&by_id=1
43  trains parsed
Trains table saved to file: trains/56927.csv
Processing train: 43 - done
Processing train: 133 - start
GET url: http://www.uz.gov.ua/passengers/timetable/?ntrain=1668&by_id=1
43  trains parsed
Trains table saved to file: trains/1668.csv
Processing train: 133 - done
Processing train: 250 - start
GET url: http://www.uz.gov.ua/passengers/timetable/?ntrain=57110&by_id=1
43  trains parsed
Trains table saved to file: trains/57110.csv
Processing train: 250 - done
Processing train: 749  (ІНТЕРСІТІ) - start
GET url: http://www.uz.gov.ua/passengers/timetable/?ntrain=54178&by_id=1
43  trains parsed
Trains table saved to file: trains/54178.csv
Processing train: 749  (ІНТЕРСІТІ) - done
Processing train: 7 - start
GET url: http://www.uz.gov.ua/passengers/timetable/?ntrain=429&by_id=1
43  trains parsed
Trains table saved to file: trains/429.csv
Processing train: 7 - done
Processing 

In [113]:
csv_file_name = 'trains_all.csv'
with open(csv_file_name, 'w', newline='', encoding = "utf-8") as csv_file:        
    writer = csv.writer(csv_file)
    writer.writerows(all_trains)
    print('All trains table saved to file: '+csv_file_name)

All trains table saved to file: trains_all.csv


In [114]:
stations_dup_n = 0
all_stops_list = []
all_stations_dict = {}
for key in all_stops_dict.keys():
    for stop in all_stops_dict[key]:
        station_id = stop[0]
        station_name = stop[1]
        all_stops_list.append([key, station_id, stop[2], stop[3]])
        if station_id in all_stations_dict:
            stations_dup_n = stations_dup_n + 1
        else:
            all_stations_dict[station_id] = station_name
print(stations_dup_n, ' duplications of stations in all trains, ', 
      len(all_stations_dict.keys()), 'unique stations')

6872  duplications of stations in all trains,  810 unique stations


In [115]:
csv_file_name = 'stations_all.csv'
with open(csv_file_name, 'w', newline='', encoding = "utf-8") as csv_file:        
    writer = csv.writer(csv_file)
    for key in all_stations_dict:
        writer.writerow([key, all_stations_dict[key]])
    print('All stations table saved to file: '+csv_file_name)

All stations table saved to file: stations_all.csv


In [116]:
csv_file_name = 'stops_all.csv'
with open(csv_file_name, 'w', newline='', encoding = "utf-8") as csv_file:        
    writer = csv.writer(csv_file)
    writer.writerows(all_stops_list)
    print('All stops table saved to file: '+csv_file_name)

All stops table saved to file: stops_all.csv
