In [1]:
import http.client
import os

import pandas as pd
import requests
from bs4 import BeautifulSoup

In [2]:
import numpy as np
import time

def get_address(id: int) -> tuple: # len 4
    """Scrape the breadcrumb address of one SaveEcoBot station page.

    Parameters
    ----------
    id : int
        Station id, substituted into
        ``https://www.saveecobot.com/en/station/{id}``.

    Returns
    -------
    tuple
        Always a 4-tuple ``(country, region, city, address)``:

        * the breadcrumb texts when the page has 4 or 5 breadcrumb items
          (``region`` is ``np.nan`` for the 4-item form);
        * four ``None`` values when the request itself failed (returned
          after a 10 second pause);
        * four ``np.nan`` values for a non-200 response, or for a page whose
          breadcrumb has any other number of items.
    """
    missing = (np.nan, np.nan, np.nan, np.nan)
    try:
        # A timeout keeps one stalled connection from hanging the whole crawl.
        resp = requests.get("https://www.saveecobot.com/en/station/{}".format(id), timeout=60)
    except (requests.ConnectionError, requests.Timeout):
        print("\nConnection error, id: {} :(\n".format(id))
        time.sleep(10)
        return (None, None, None, None)
    except http.client.RemoteDisconnected:
        print("\nRemoteDisconnected, id: {} :(\n".format(id))
        time.sleep(10)
        return (None, None, None, None)

    if resp.status_code != 200:
        # 404 is the expected answer for an unused id, so it is not reported.
        if resp.status_code != 404:
            print("\n{}: code {}\n".format(id, resp.status_code))
        return missing

    soup = BeautifulSoup(resp.text, 'html5lib')
    place = [item.get_text() for item in soup.select('li.breadcrumb-item')]
    if len(place) == 4:
        # 4-item breadcrumb carries no region level.
        return (place[1], np.nan, place[2], place[3])
    if len(place) == 5:
        return (place[1], place[2], place[3], place[4])

    # Any other breadcrumb length used to fall through and return None, which
    # is not a 4-tuple; report it and return an all-missing row instead.
    print("\n{}: unexpected breadcrumb length {}\n".format(id, len(place)))
    return missing

# Smoke test: look up the address of a single station id.
get_address(12939)

('Ukraine', nan, 'Kyiv', 'vulytsia Borshchahivska 13')

In [3]:
from tqdm import tqdm
from datetime import datetime
import http

def parse_saveecobot(id_from_inc: int, id_to_exc: int) -> pd.DataFrame:
    """Look up the address of every station id in ``[id_from_inc, id_to_exc)``.

    Each id is passed to ``get_address`` (with a tqdm progress bar); the
    resulting 4-tuples become the rows of a DataFrame indexed by station id
    with columns ``country``, ``region``, ``city`` and ``address``.
    """
    station_ids = range(id_from_inc, id_to_exc)
    records = []
    for station_id in tqdm(station_ids):
        records.append(get_address(station_id))
    return pd.DataFrame.from_records(
        records,
        index=station_ids,
        columns=['country', 'region', 'city', 'address'],
    )

def iterative_parse_saveecobot(id_from_inc: int = 6501, id_to_exc: int = 16000, step: int = 100):
    """Crawl station ids in chunks, writing one CSV per chunk.

    Parameters
    ----------
    id_from_inc : int, default 6501
        First station id to process.
    id_to_exc : int, default 16000
        Upper bound for chunk *starts* (``range(id_from_inc, id_to_exc, step)``).
        The last chunk still spans a full ``step`` ids, so it may reach past
        this value, exactly as with the previously hard-coded range.
    step : int, default 100
        Number of ids per chunk / per output file.

    Each chunk ``[i, i + step)`` is parsed with ``parse_saveecobot`` and saved
    to ``./saveecobot_{i}-{i + step - 1}-{timestamp}.csv`` in the current
    directory. Calling the function with no arguments behaves as before.
    """
    for i in range(id_from_inc, id_to_exc, step):
        last_id = i + step - 1  # inclusive upper id, used only for display/file name
        print(" Processing {}-{}...".format(i, last_id))
        parse_saveecobot(i, i + step).to_csv(
            './saveecobot_{}-{}-{}.csv'.format(i, last_id, datetime.now().strftime('%Y-%m-%dT%H-%M')))

# Start the chunked crawl; every chunk is written to its own CSV file.
iterative_parse_saveecobot()

  0%|          | 0/100 [00:00<?, ?it/s] Processing 6501-6600...
100%|██████████| 100/100 [00:46<00:00,  2.13it/s]
  0%|          | 0/100 [00:00<?, ?it/s] Processing 6601-6700...
100%|██████████| 100/100 [00:45<00:00,  2.20it/s]
  0%|          | 0/100 [00:00<?, ?it/s] Processing 6701-6800...
100%|██████████| 100/100 [00:45<00:00,  2.21it/s]
  0%|          | 0/100 [00:00<?, ?it/s] Processing 6801-6900...
100%|██████████| 100/100 [00:46<00:00,  2.15it/s]
  0%|          | 0/100 [00:00<?, ?it/s] Processing 6901-7000...
100%|██████████| 100/100 [00:46<00:00,  2.16it/s]
  0%|          | 0/100 [00:00<?, ?it/s] Processing 7001-7100...
100%|██████████| 100/100 [00:46<00:00,  2.17it/s]
  0%|          | 0/100 [00:00<?, ?it/s] Processing 7101-7200...
100%|██████████| 100/100 [00:47<00:00,  2.09it/s]
  0%|          | 0/100 [00:00<?, ?it/s] Processing 7201-7300...
100%|██████████| 100/100 [00:48<00:00,  2.06it/s]
  0%|          | 0/100 [00:00<?, ?it/s] Processing 7301-7400...
100%|██████████| 100/100

KeyboardInterrupt: 

In [3]:
import pandas as pd

def read_points_dir(dir: str) -> pd.DataFrame:
    """Concatenate every CSV file found directly inside ``dir``.

    Parameters
    ----------
    dir : str
        Directory holding the per-chunk CSV files; the first column of each
        file is used as the index.

    Returns
    -------
    pd.DataFrame
        All files stacked row-wise, in sorted file-name order.

    Raises
    ------
    ValueError
        Raised by ``pd.concat`` when the directory contains no CSV files.
    """
    # os.listdir returns entries in arbitrary order and also lists
    # sub-directories and unrelated files (e.g. ``.ipynb_checkpoints``);
    # keep only regular ``.csv`` files and sort them for a reproducible result.
    csv_names = sorted(
        name for name in os.listdir(dir)
        if name.lower().endswith('.csv') and os.path.isfile(os.path.join(dir, name))
    )
    return pd.concat([pd.read_csv(os.path.join(dir, name), index_col=0) for name in csv_names])

# Load every crawled chunk and collect the ids of the stations whose city is Kyiv.
res = read_points_dir('all-points')
kyiv_ids = res.loc[res.city == 'Kyiv'].index.tolist()
print(kyiv_ids)

[28, 30, 43, 47, 108, 109, 902, 914, 915, 968, 1004, 1041, 1061, 1062, 1063, 1064, 1065, 1066, 1067, 1068, 1069, 1070, 1071, 1072, 1083, 1098, 1105, 1107, 1115, 1126, 1127, 1128, 1152, 1160, 1161, 1193, 1208, 1274, 1278, 1290, 1294, 1306, 1330, 1331, 1353, 1372, 1373, 1376, 1387, 1394, 1415, 1424, 1425, 1437, 1516, 1540, 1544, 1549, 1553, 1556, 1651, 2741, 2762, 2781, 2792, 2806, 2809, 2827, 2867, 2878, 2903, 2924, 2925, 2947, 2973, 3016, 3018, 3070, 3084, 3111, 3112, 3231, 3239, 3306, 3396, 3398, 3401, 3422, 3432, 3483, 3492, 3497, 3498, 3504, 3514, 3535, 3541, 3547, 3572, 3573, 3583, 3600, 3601, 3603, 3619, 3620, 3621, 3622, 3641, 3652, 3653, 3658, 3665, 3669, 3678, 3682, 3684, 3685, 3686, 3687, 3688, 3689, 3690, 3692, 3693, 3694, 3702, 3709, 4042, 4053, 4063, 4068, 4139, 4169, 4191, 4193, 4201, 4211, 4219, 4230, 4232, 4252, 11785, 12829, 12905, 12907, 12909, 12910, 12911, 12912, 12914, 12917, 12918, 12920, 12921, 12923, 12924, 12925, 12926, 12927, 12928, 12929, 12930, 12931, 12932, 

In [20]:
# Distinct values present in the region column.
res.region.unique()

array(['Dnipropetrovsk region', nan, 'Ivano-Frankivsk region',
       'Ternopil region', 'Kiev region', 'Rivne region', 'Odessa region',
       'Zaporozhye region', 'Lviv region', 'Donetsk region',
       'Kharkiv region', 'Tbilisi', 'Mtskheta-Mtianeti',
       'Khmelnytsky region', 'Vinnytsia region', 'Chernivtsi region',
       'Sumy region', 'Poltava region', 'Chernihiv region', 'California',
       'Kirovograd region', 'Volyn region', 'Zhytomyr region', 'Oregon',
       'District of Columbia', 'Colorado', 'Texas', 'Washington',
       'Indiana', 'New York', 'New Hampshire', 'Oklahoma', 'Florida',
       'Nevada', 'Arkansas', 'New Jersey', 'Massachusetts', 'Michigan',
       'Maryland', 'Virginia', 'Arizona', 'Transcarpathian region',
       'Lugansk region', 'Kherson region', 'Cherkasy region',
       'Vlaanderen', 'Bruxelles', 'Brussels Hoofdstedelijk Gewest',
       'Région Flamande', 'Wallonie', 'Flanders', 'Waals Gewest',
       'Imereti', 'Adjara', 'Kvemo Kartli', 'Flämische R

In [26]:
import numpy as np

# Row counts: non-empty rows, rows in Ukraine, rows in Kyiv, and rows where
# 'Kiev region' appears either as the region or (mis-filed) as the city.
print(int(res.country.notna().sum())) # non empty
print(int((res.country == 'Ukraine').sum()))
print(int((res.city == 'Kyiv').sum()))
print(int((res.region == 'Kiev region').sum()) + int((res.city == 'Kiev region').sum()))

12610
942
213
134


Unnamed: 0,country,region,city,address
12949,Ukraine,,Kiev region,vulytsia Shyroka 13
12950,Ukraine,,Kiev region,"vulytsia Urlivska, 23B"


Unnamed: 0,country,region,city,address
27,Ukraine,Kiev region,Sofiivska Borshchahivka,"vulytsia Soborna, 114"
76,Ukraine,Kiev region,Vasylkiv,"vulytsia Hoholia, 32"
77,Ukraine,Kiev region,Boryspil,"vulytsia Kyivskyi Shliakh ,72"
78,Ukraine,Kiev region,Bohuslav,"vulytsia Polova, 40"
79,Ukraine,Kiev region,Vyshhorod,"vulytsia Kyivska, 10B"
...,...,...,...,...
13662,Ukraine,Kiev region,Obukhiv,vulytsia Kashtanova 25
13786,Ukraine,Kiev region,Boryspil,"vulytsia Aivazovskoho, 8"
13791,Ukraine,Kiev region,Brovary,"vulytsia Anatoliia Lutsenka, 26"
13847,Ukraine,Kiev region,Vyshhorod,"vulytsia Kyivska, 8"


In [37]:
from datetime import datetime
import http
from tqdm import tqdm

def load_page(id: int):
    """Download one station page and save it as an HTML file.

    Parameters
    ----------
    id : int
        Station id, substituted into
        ``https://www.saveecobot.com/en/station/{id}``.

    On a 200 response the page text is written to
    ``./{id}-{timestamp}.html`` in the current directory. Any other status
    code is printed. When the request itself fails, the error is printed, the
    function sleeps 10 seconds and returns without writing anything.

    Returns
    -------
    None
    """
    try:
        # A timeout keeps one stalled connection from hanging the whole download loop.
        resp = requests.get("https://www.saveecobot.com/en/station/{}".format(id), timeout=60)
    except (requests.ConnectionError, requests.Timeout):
        print("\nConnection error, id: {} :(\n".format(id))
        time.sleep(10)
        # Without this return the code below would touch the unbound `resp`.
        return
    except http.client.RemoteDisconnected:
        print("\nRemoteDisconnected, id: {} :(\n".format(id))
        time.sleep(10)
        return

    if resp.status_code == 200:
        # `with` guarantees the file is closed even if the write fails.
        with open('./{}-{}.html'.format(id, datetime.now().strftime('%Y-%m-%dT%H-%M')), 'w') as f:
            f.write(resp.text)
    else:
        print("\n{}: code {}\n".format(id, resp.status_code))

def load_kyiv_pages():
    """Download the page of every station id listed in the global ``kyiv_ids``.

    Progress is shown with tqdm; each id is handed to ``load_page``.
    """
    progress = tqdm(kyiv_ids)
    for station_id in progress:
        load_page(station_id)

# Download the page of every id in kyiv_ids.
load_kyiv_pages()

100%|██████████| 213/213 [03:21<00:00,  1.05it/s]


In [22]:
import pandas as pd
from bs4 import BeautifulSoup
import re

def process_p(data: str) -> list:
    """Delete every run of two or more spaces from ``data`` and split it into lines.

    Single spaces are kept; the split is on ``'\\n'`` only.
    """
    without_padding = re.compile(' {2,}').sub('', data)
    return without_padding.split('\n')

def filter_p(data: list) -> list:
    """Keep only current measurement lines.

    First every line without a ``':'`` is discarded. Then, for every remaining
    line that is fully wrapped in parentheses (a timestamp such as
    ``(30 September 2020, 13:48)`` attached to an outdated reading, as seen
    for id 30), both that line and the line right before it are dropped.
    A parenthesised line in the very first position is not examined.
    The input list is not modified.
    """
    entries = [line for line in data if ':' in line]
    dropped = set()
    for pos in range(1, len(entries)):
        candidate = entries[pos]
        if candidate.startswith('(') and candidate.endswith(')'):
            # drop the timestamp and the outdated reading it belongs to
            dropped.update((pos - 1, pos))
    return [line for pos, line in enumerate(entries) if pos not in dropped]

def get_items(data: str):
    """Extract the measurement lines from a station page.

    The page is parsed with html5lib; the last ``<p>`` inside the first
    ``div.col-md-6`` is taken as the measurements paragraph. Its text is run
    through ``process_p`` and ``filter_p`` and the resulting list is returned.
    A warning is printed when the number of returned lines differs from the
    count derived from the markup (``<br>`` count + 1, minus ``<small>`` count).
    """
    last_p = BeautifulSoup(data, 'html5lib').select('div.col-md-6')[0].select('p')[-1]
    # registered items: one more than the number of <br> tags
    registered = 1 + len(last_p.select('br'))
    # outdated items: one <small> tag each
    outdated = len(last_p.select('small'))

    entries = filter_p(process_p(last_p.get_text()))
    if registered - outdated != len(entries):
        print("Warning: registered {} items ({} outdated), got {}".format(registered, outdated, len(entries)))
    return entries

def format_item(item: str):
    """Return the part of ``item`` between its first and its last space.

    Everything up to and including the first space, and everything from the
    last space onward, is cut off.
    """
    start = item.find(' ') + 1
    stop = item.rfind(' ')
    return item[slice(start, stop)]

def to_df(data: str) -> pd.DataFrame:
    """Turn one station page into a single-row DataFrame of measurements.

    Every line returned by ``get_items`` is trimmed with ``format_item`` and
    split on ``': '``; the resulting pairs become the column names and the
    (string) values of the row.
    """
    pairs = []
    for raw_item in get_items(data):
        pairs.append(tuple(format_item(raw_item).split(': ')))
    return pd.DataFrame.from_records([dict(pairs)])

# Parse one archived station page to check to_df().
# Other sample files that were tried here:
#   'test-kyiv-12939.txt'
#   'archive-2020-11-02/30-2020-11-02T19-41.html'
# The context manager closes the file even if reading fails.
with open('archive-2020-11-02/968-2020-11-02T19-41.html') as f:
    data = f.read()

to_df(data)

Unnamed: 0,PM2.5,PM10,Temperature,Relative humidity,Atmospheric pressure,Carbon dioxide (CO₂)
0,14.8,31.4,20.5,86.5,996.2,1355.02


In [23]:
def process_file(dir: str, file: str) -> pd.DataFrame:
    """Parse one archived station page into a single-row measurements frame.

    Parameters
    ----------
    dir : str
        Directory containing the archived page.
    file : str
        File name; the first run of digits followed by ``-`` is taken as the
        station id (e.g. ``'968-2020-11-02T19-41.html'`` -> ``'968'``).

    Returns
    -------
    pd.DataFrame
        The frame produced by ``to_df`` with an ``id`` column (the id as a
        string) inserted in first position.

    Raises
    ------
    ValueError
        If no station id can be extracted from ``file``.
    """
    print("Processing {}...".format(file))
    # `with` closes the handle even when reading raises.
    with open(os.path.join(dir, file)) as f:
        text = f.read()
    df = to_df(text)

    # Raw string: "\d" in a plain literal is an invalid escape sequence.
    grps = re.search(r"(\d+)-", file)
    if grps is None:
        raise ValueError("Cannot extract station id from file name: {!r}".format(file))
    df.insert(loc=0, column='id', value=grps.group(1))
    return df

# Sample archive file names; no use of this list is visible in this notebook.
test_lst = ['28-2020-11-02T19-41.html', '915-2020-11-02T19-41.html', '1065-2020-11-02T19-41.html']
def process_achive(dir: str):
    """Parse every file in ``dir`` with ``process_file`` and stack the rows.

    Files are visited in ``os.listdir`` order. The combined frame is indexed
    by the ``id`` column that ``process_file`` adds.
    """
    frames = []
    for name in os.listdir(dir):
        frames.append(process_file(dir, name))
    combined = pd.concat(frames)
    return combined.set_index('id')

# NOTE: rebinds `res`, which an earlier cell used for the frame read from 'all-points'.
res = process_achive('archive-2020-11-02')

Processing 28-2020-11-02T19-41.html...
Processing 30-2020-11-02T19-41.html...
Processing 43-2020-11-02T19-41.html...
Processing 47-2020-11-02T19-41.html...
Processing 108-2020-11-02T19-41.html...
Processing 109-2020-11-02T19-41.html...
Processing 902-2020-11-02T19-41.html...
Processing 914-2020-11-02T19-41.html...
Processing 915-2020-11-02T19-41.html...
Processing 968-2020-11-02T19-41.html...
Processing 1004-2020-11-02T19-41.html...
Processing 1041-2020-11-02T19-41.html...
Processing 1061-2020-11-02T19-41.html...
Processing 1062-2020-11-02T19-41.html...
Processing 1063-2020-11-02T19-41.html...
Processing 1064-2020-11-02T19-41.html...
Processing 1065-2020-11-02T19-41.html...
Processing 1066-2020-11-02T19-41.html...
Processing 1067-2020-11-02T19-41.html...
Processing 1068-2020-11-02T19-41.html...
Processing 1069-2020-11-02T19-41.html...
Processing 1070-2020-11-02T19-41.html...
Processing 1071-2020-11-02T19-41.html...
Processing 1072-2020-11-02T19-41.html...
Processing 1083-2020-11-02T19-

In [74]:
# Display the current contents of res.
res

Unnamed: 0_level_0,PM2.5,PM10,Temperature,Relative humidity,Atmospheric pressure,Carbon dioxide (CO₂),HECA – Temperature,HECA – Relative Humidity,Nitrogen dioxide (NO₂),Carbon monoxide (CO),Formaldehyde (CH₂O),PM1,Ozone (O₃),Sulfur dioxide (SO₂)
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
28,8.6,13.9,8,99.9,,,,,,,,,,
30,15.6,20.1,13.6,99.9,,,,,,,,,,
43,11.5,23.1,7.8,99.9,,,,,,,,,,
47,15.5,50.1,6.6,99.9,,,,,,,,,,
108,3.9,5.7,8.3,99.9,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13802,25,28.3,8.7,100,1008.1,,,,,,,17.7,,
13803,25,25,9.7,100,1000.9,,,,,,,15,,
13811,30,34.2,7.3,100,1006.7,,,,,,,19,,
13820,11.6,17.6,12.7,59,1002.2,,,,,,,,,


In [72]:
# Per-row presence of the value in positional column 3, followed by the number of rows that have it.
print(res.iloc[:, 3].notna())
print(res.iloc[:, 3].notna().sum())

id
28        True
30        True
43        True
47        True
108       True
         ...  
13802     True
13803     True
13811     True
13820     True
13853    False
Name: Relative humidity, Length: 213, dtype: bool
206


In [73]:
# Rows that have a value in both positional column 7 and positional column 3.
res[res.iloc[:, 7].notnull() & res.iloc[:, 3].notnull()]

Unnamed: 0_level_0,PM2.5,PM10,Temperature,Relative humidity,Atmospheric pressure,Carbon dioxide (CO₂),HECA – Temperature,HECA – Relative Humidity,Nitrogen dioxide (NO₂),Carbon monoxide (CO),Formaldehyde (CH₂O),PM1,Ozone (O₃),Sulfur dioxide (SO₂)
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
1004,5.0,8.5,10.9,100.0,1006.0,,-128.0,-1.0,,,,,,
1274,8.0,14.3,10.8,62.0,1003.2,,11.93,62.58,,,,,,
1278,11.1,15.9,9.3,100.0,1007.5,,12.96,61.95,,,,,,
1294,11.3,18.8,6.7,100.0,1009.0,,12.95,63.99,,,,,,
1306,7.3,8.6,12.9,60.5,1007.3,,13.3,59.44,,,,,,
1372,8.2,11.1,7.3,97.0,990.5,,12.34,60.58,,,,,,
1373,5.8,8.5,-142.4,100.0,,,19.86,56.35,,,,,,
1376,14.2,29.5,5.5,73.4,1014.0,,9.31,60.65,,,,,,
1387,16.4,23.4,10.8,88.0,996.9,,12.32,77.94,,,,,,
1394,17.0,28.1,8.2,79.6,1007.4,,14.43,58.35,,,,,,


In [24]:
# Write res to 'agg-2020-11-02-kyiv.csv' in the current directory.
res.to_csv('agg-2020-11-02-kyiv.csv')

In [1]:
# Load one archived station page into `data`; the context manager closes the
# file even if reading fails.
with open('archive-2020-11-02/30-2020-11-02T19-41.html') as f:
    data = f.read()

In [20]:
# get coordinates

from bs4 import BeautifulSoup
import re
import pandas as pd

def get_coordinates(data: str):
    """Extract the map-centre coordinates from a station page.

    The text of the first ``div.col-maps`` element is searched for the first
    bracketed pair ``[a, b]``; in the sample below that is the ``center``
    entry of ``seb.mapOptions``.

    Parameters
    ----------
    data : str
        HTML of a station page.

    Returns
    -------
    pd.DataFrame
        One row with columns ``x`` (first number) and ``y`` (second number),
        both kept as the strings found in the page.
        NOTE(review): for the sample values the pair looks like
        (latitude, longitude) - confirm before relying on the x/y naming.

    Raises
    ------
    ValueError
        If the page has no ``div.col-maps`` element or no bracketed pair.
    """
    soup = BeautifulSoup(data, 'html5lib')
    map_nodes = soup.select('div.col-maps')
    if not map_nodes:
        raise ValueError("No 'div.col-maps' element found in the page")
    col_map_text = map_nodes[0].text
    # col_map_text example:
    #     seb.mapOptions = {
    #         center: [50.4340, 30.4320],
    #         zoom : 16,
    #         fullMode: false
    #     };
    #     seb.deviceId = '30';
    # Raw string: "\[" in a plain literal is an invalid escape sequence.
    grps = re.search(r"\[(.*), (.*)\]", col_map_text)
    if grps is None:
        raise ValueError("No '[x, y]' coordinate pair found in the map block")
    row = {'x': [grps.group(1)], 'y': [grps.group(2)]}
    return pd.DataFrame.from_dict(row)

def process_file_coordinates(dir: str, file: str) -> pd.DataFrame:
    """Parse one archived station page into a single-row coordinates frame.

    Parameters
    ----------
    dir : str
        Directory containing the archived page.
    file : str
        File name; the first run of digits followed by ``-`` is taken as the
        station id (e.g. ``'30-2020-11-02T19-41.html'`` -> ``'30'``).

    Returns
    -------
    pd.DataFrame
        The frame produced by ``get_coordinates`` with an ``id`` column (the
        id as a string) inserted in first position.

    Raises
    ------
    ValueError
        If no station id can be extracted from ``file``.
    """
    print("Processing {}...".format(file))
    # `with` closes the handle even when reading raises.
    with open(os.path.join(dir, file)) as f:
        text = f.read()
    df = get_coordinates(text)

    # Raw string: "\d" in a plain literal is an invalid escape sequence.
    grps = re.search(r"(\d+)-", file)
    if grps is None:
        raise ValueError("Cannot extract station id from file name: {!r}".format(file))
    df.insert(loc=0, column='id', value=grps.group(1))
    return df

# def insert_id(df: pd.DataFrame, id):
#     df.insert(loc=0, column='id', value=id, inplace=True)

# Sample archive file names; no use of this list is visible in this notebook.
test_lst = ['28-2020-11-02T19-41.html', '915-2020-11-02T19-41.html', '1065-2020-11-02T19-41.html']
def process_achive_coordinates(dir: str):
    """Collect the coordinates of every archived page in ``dir``.

    Each entry returned by ``os.listdir`` is parsed with
    ``process_file_coordinates``; the rows are concatenated and indexed by
    the ``id`` column.
    """
    frames = []
    for name in os.listdir(dir):
        frames.append(process_file_coordinates(dir, name))
    combined = pd.concat(frames)
    return combined.set_index('id')

# NOTE: rebinds `res`, replacing whatever frame it held before this cell ran.
res = process_achive_coordinates('archive-2020-11-02')
res

Processing 28-2020-11-02T19-41.html...
Processing 30-2020-11-02T19-41.html...
Processing 43-2020-11-02T19-41.html...
Processing 47-2020-11-02T19-41.html...
Processing 108-2020-11-02T19-41.html...
Processing 109-2020-11-02T19-41.html...
Processing 902-2020-11-02T19-41.html...
Processing 914-2020-11-02T19-41.html...
Processing 915-2020-11-02T19-41.html...
Processing 968-2020-11-02T19-41.html...
Processing 1004-2020-11-02T19-41.html...
Processing 1041-2020-11-02T19-41.html...
Processing 1061-2020-11-02T19-41.html...
Processing 1062-2020-11-02T19-41.html...
Processing 1063-2020-11-02T19-41.html...
Processing 1064-2020-11-02T19-41.html...
Processing 1065-2020-11-02T19-41.html...
Processing 1066-2020-11-02T19-41.html...
Processing 1067-2020-11-02T19-41.html...
Processing 1068-2020-11-02T19-41.html...
Processing 1069-2020-11-02T19-41.html...
Processing 1070-2020-11-02T19-41.html...
Processing 1071-2020-11-02T19-41.html...
Processing 1072-2020-11-02T19-41.html...
Processing 1083-2020-11-02T19-

Unnamed: 0_level_0,x,y
id,Unnamed: 1_level_1,Unnamed: 2_level_1
28,50.4440,30.5400
30,50.4340,30.4320
43,50.411719,30.618949
47,50.472938,30.508250
108,50.362592,30.442744
...,...,...
13802,50.4935132,30.5062436
13803,50.4373968,30.5956655
13811,50.4128801,30.6078559
13820,50.49908358179,30.57750999928


In [22]:
# Export res to an Excel workbook.
# NOTE(review): presumably meant to run right after the coordinates cell, so that res holds the x/y frame - confirm.
res.to_excel("coordinates.xlsx")