# The Initial notebook

In [76]:
! wget https://storage.yandexcloud.net/invitro/invitro_train.zip
! unzip invitro_train.zip

--2021-06-26 05:16:54--  https://storage.yandexcloud.net/invitro/invitro_train.zip
Resolving storage.yandexcloud.net (storage.yandexcloud.net)... 213.180.193.243, 2a02:6b8::1d9
Connecting to storage.yandexcloud.net (storage.yandexcloud.net)|213.180.193.243|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1564644 (1.5M) [application/zip]
Saving to: ‘invitro_train.zip’


2021-06-26 05:16:55 (258 MB/s) - ‘invitro_train.zip’ saved [1564644/1564644]

Archive:  invitro_train.zip
  inflating: invitro_train.csv       


In [754]:
import numpy as np
import pandas as pd
import requests
import json
import re
from sklearn.metrics import mean_absolute_error
from tqdm.notebook import tqdm
from operator import itemgetter 
from dadata import Dadata

In [231]:
%pip install dadata

Defaulting to user installation because normal site-packages is not writeable
Collecting dadata
  Downloading dadata-20.7.0-py3-none-any.whl (10 kB)
Collecting httpx
  Downloading httpx-0.18.2-py3-none-any.whl (76 kB)
[K     |████████████████████████████████| 76 kB 1.5 MB/s 
[?25hCollecting sniffio
  Downloading sniffio-1.2.0-py3-none-any.whl (10 kB)
Collecting httpcore<0.14.0,>=0.13.3
  Downloading httpcore-0.13.6-py3-none-any.whl (58 kB)
[K     |████████████████████████████████| 58 kB 2.4 MB/s 
[?25hCollecting rfc3986[idna2008]<2,>=1.3
  Downloading rfc3986-1.5.0-py2.py3-none-any.whl (31 kB)
Collecting anyio==3.*
  Downloading anyio-3.2.1-py3-none-any.whl (75 kB)
[K     |████████████████████████████████| 75 kB 2.0 MB/s 
[?25hCollecting h11<0.13,>=0.11
  Downloading h11-0.12.0-py3-none-any.whl (54 kB)
[K     |████████████████████████████████| 54 kB 1.9 MB/s 
Installing collected packages: sniffio, rfc3986, h11, anyio, httpcore, httpx, dadata
Successfully installed anyio-3.2.1 d

In [78]:
df = pd.read_csv('invitro_train.csv')
df['Дата взятия'] = pd.to_datetime(df['Дата взятия'])
df

Unnamed: 0,Имя теста,Код теста,Значение,Регион,Дата взятия
0,Borrelia burgdorferi s.l (кач) ДНК,27Д,ОБНАРУЖ.,Челябинская обл,2019-05-25 06:02:00
1,Borrelia burgdorferi s.l (кач) ДНК,27Д,ОБНАРУЖ.,Челябинская обл,2019-05-27 09:04:00
2,Вирус клещевого энцефалита(кач) РНК,35Д,НЕ ОБНАР,Челябинская обл,2019-05-27 09:04:00
3,Borrelia burgdorferi s.l (кач) ДНК,27Д,ОБНАРУЖ.,Челябинская обл,2019-05-27 09:14:00
4,Вирус клещевого энцефалита(кач) РНК,35Д,НЕ ОБНАР,Челябинская обл,2019-05-27 09:14:00
...,...,...,...,...,...
308540,Вирус клещевого энцефалита(кач) РНК,35Д,НЕ ОБНАР,Воронежская обл,2021-05-16 09:37:00
308541,Borrelia burgdorferi s.l (кач) ДНК,27Д,НЕ ОБНАР,Москва,2021-05-16 09:26:00
308542,Вирус клещевого энцефалита(кач) РНК,35Д,НЕ ОБНАР,Москва,2021-05-16 09:26:00
308543,Borrelia burgdorferi s.l (кач) ДНК,27Д,НЕ ОБНАР,Москва,2021-05-17 12:00:00


In [79]:
df['Дата взятия'].min(), df['Дата взятия'].max()

(Timestamp('2019-02-13 07:37:00'), Timestamp('2021-06-08 23:01:00'))

In [176]:
df['Значение'].unique()

array(['ОБНАРУЖ.', 'НЕ ОБНАР', 'Б/П', 'ОБНАРУЖ', 'Is not detected',
       'СМ.КОММ.', 'СМ. КОММ', 'Detected'], dtype=object)

In [106]:
def baseline(df):
    df['date_day'] = df['Дата взятия'].dt.round('1d')
    df['is_pos'] = (df['Значение'] != 'НЕ ОБНАР').astype(int)
    train = df.groupby(['Регион', 'Имя теста', 'date_day'])[['is_pos']].agg(['sum', 'count'])
    test_names = sorted(df['Имя теста'].unique())
    test_names_count = [f'{t} count' for t in test_names]
    test_names_pos_perc = [f'{t} pos perc' for t in test_names]
    
    regions = sorted(df['Регион'].unique())
    
    # test date range 
    date = pd.date_range('2021.06.09', '2021.06.27', freq='1d')
    
    submission = pd.DataFrame()
    
    for region in regions:
        df_reg_test = pd.DataFrame(index=date)
        df_reg_test['region'] = region # !! fixed region order
        for t, t_count, t_pos in zip(test_names, test_names_count, test_names_pos_perc):
            # add daily mean by region
            try:
                df_reg_test[t_count] = train.loc[(region, t), ('is_pos', 'count')].mean()
                df_reg_test[t_pos] = (train.loc[(region, t), ('is_pos', 'sum')].mean() / df_reg_test[t_count] * 100)
            except:
                df_reg_test[t_count] = 0
                df_reg_test[t_pos] = 0
        submission = submission.append(df_reg_test)
    return submission.round()


def validate_submission(submission):
    if isinstance(submission, str):
        submission = pd.read_csv(submission, index_col=0)
        
    assert submission.shape == (1273, 11), 'invalid shape'
    true_cols = [
       'region',
       'Anaplasma Phagocytophillum(кач) ДНК count',
       'Anaplasma Phagocytophillum(кач) ДНК pos perc',
       'Borrelia burgdorferi s.l (кач) ДНК count',
       'Borrelia burgdorferi s.l (кач) ДНК pos perc',
       'Borrelia miyamotoi (кач) ДНК count',
       'Borrelia miyamotoi (кач) ДНК pos perc',
       'Ehrlichia muris/chaffeеnsis(кач) ДНК count',
       'Ehrlichia muris/chaffeеnsis(кач) ДНК pos perc',
       'Вирус клещевого энцефалита(кач) РНК count',
       'Вирус клещевого энцефалита(кач) РНК pos perc']
    
    assert np.array_equal(submission.columns, true_cols), 'invalid cols'
    assert submission.isnull().sum().sum() == 0, 'null values'
    assert (submission.iloc[:, 1:] < 0).sum().sum() == 0, 'negative values'
    print('validation success')

def metric(df_true, submit):
    assert df_true.shape == submit.shape, 'invalid shape'
    mae = 0
    for c in range(1, df_true.shape[1]):
        mae += mean_absolute_error(df_true.iloc[:, c], submit.iloc[:, c])
    return mae

In [111]:
submission = baseline(df)
submission_path = 'submission_example.csv'
submission.to_csv(submission_path)

In [None]:
validate_submission(submission_path)

# just to test metric
true_submit = submission.copy()
true_submit[:] = 0
score = metric(true_submit, submission)
print('score example mae', score)

In [50]:
def submit_file(submission_path):
    files = {'file': (submission_path, open(submission_path,'rb'), 'text/x-spam')}
    TOKEN = ''
    values = {'token': TOKEN}
    return requests.post('http://130.193.54.199:23030/', files=files, data=values)

res = submit_file(submission_path)
res.content

b'Invalid token'

# Adding T data

In [498]:
df['Дата взятия'].min(), df['Дата взятия'].max()

(Timestamp('2019-02-13 07:37:00'), Timestamp('2021-06-08 23:01:00'))

In [180]:
df['date_day'] = df['Дата взятия'].dt.round('1d')
df['is_pos'] = (df['Значение'] != 'НЕ ОБНАР').astype(int)
train = df.groupby(['Регион', 'Имя теста', 'date_day'])[['is_pos']].agg(['sum', 'count'])

In [504]:
regions = df['Регион'].unique()

weather = pd.read_csv('weather.txt', sep="\t|\s{1,}", 
                      names = ['station', 'y', 'm', 'd', 'x1', 't_min', 'x2', 't_mean', 'x3', 't_max', 
                               'x4', 'mm_rain', 'x5', 'x6']) #.drop([['x1', 'x2', 'x3', 'x4', 'x5', 'x6']])
weather = weather.drop(columns=['x1', 'x2', 'x3', 'x4', 'x5', 'x6'])
weather['y'].max()

  # DO NOT PUT IMPORTS BEFORE THIS LINE


2020

In [552]:
weather[weather['d']==28]

Unnamed: 0,station,y,m,d,t_min,t_mean,t_max,mm_rain


In [553]:
weather[weather['d']==12]

Unnamed: 0,station,y,m,d,t_min,t_mean,t_max,mm_rain


In [554]:
weather[weather['d']==27].head()

Unnamed: 0,station,y,m,d,t_min,t_mean,t_max,mm_rain
14,20046,2019,2,27,-28.9,-25.6,-23.1,2.9
29,20046,2019,3,27,-23.2,-21.7,-17.6,0.0
44,20046,2019,4,27,-14.9,-12.5,-11.0,0.4
59,20046,2019,5,27,-5.3,-3.9,-1.6,1.9
74,20046,2019,6,27,-1.4,-0.5,0.3,0.8


In [532]:
meteostations = pd.DataFrame(columns=['station', 'place', 'y', 'x', 'h', 'country'])

file1 = open('meteostations.txt', 'r')
Lines = file1.readlines()

count = 0
row = []
for line in Lines:
    count += 1
    row.append(line.strip('\t\t').strip('\n'))
    if count%6==0:
        # print(row)
        meteostations.loc[len(meteostations)] = row
        row = []
        count = 0
        
meteostations_rus = meteostations[(meteostations['country'] == 'Russian Federation') |
                                  (meteostations['country'] == 'Российская Федерация')]

In [533]:
meteostations_rus.head()

Unnamed: 0,station,place,y,x,h,country
2628,20026,Виктория Остров,802,368,9,Российская Федерация
2629,20046,"Остров Хейса Обсерватория Им,Кренкеля",806,581,22,Российская Федерация
2630,20049,Тихая Бухта,804,529,46,Российская Федерация
2631,20066,Ушакова Остров,808,797,47,Российская Федерация
2632,20069,Остров Визе,795,77,10,Российская Федерация


In [734]:
token = "a87a44ca90a14e91d683a798f18c5a69a959b8a7"
secret = "50df242ff0010f79fd62c5123d4d23736dc4af16"
dadata = Dadata(token, secret)



In [735]:
res = dadata.clean(name="address", source="Остров Хейса Обсерватория Им,Кренкеля")
res['region_with_type']



In [261]:
res = dadata.clean(name="address", source="Татарстан Респ")



In [262]:
res['region_with_type']

'Респ Татарстан'

In [263]:
redion_dict = {}

In [267]:
for region in regions:
    reg_standard = dadata.clean(name="address", source=region)['region_with_type']
    redion_dict[reg_standard] = region



In [277]:
locations = list(meteostations_rus['place'])

In [541]:
meteostations_rus.head()

Unnamed: 0,station,place,y_coor,x_coor,h,country,region
2629,20046,"Остров Хейса Обсерватория Им,Кренкеля",806,581,22,Российская Федерация,Татарстан Респ
2630,20049,Тихая Бухта,804,529,46,Российская Федерация,Забайкальский край
2631,20066,Ушакова Остров,808,797,47,Российская Федерация,Орловская обл
2632,20069,Остров Визе,795,77,10,Российская Федерация,Красноярский край
2633,20087,Остров Голомянный,796,906,8,Российская Федерация,Красноярский край


In [279]:
locations_regions = []
for location in locations:
    try:
        res = dadata.clean(name="address", source=location)['region_with_type']
        res_custom = redion_dict[res]
        locations_regions.append(res_custom)
    except:
        locations_regions.append(None)



In [534]:
meteostations_rus['station'] = meteostations_rus['station'].astype(np.int64)
weather['station'] = weather['station'].astype(np.int64)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  import sys


In [535]:
meteostations_rus['region'] = locations_regions

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  import sys


In [536]:
meteostations_rus = meteostations_rus.rename(columns={"y": "y_coor", "x": "x_coor"})
meteostations_rus = meteostations_rus.dropna(axis=0, subset=['region'])

temps_train = weather.merge(meteostations_rus, on='station', how='inner').drop(columns=['place', 'country', 'station'])
temps_train['x_coor'] = temps_train['x_coor'].str.replace(',', '.').astype(float)
temps_train['y_coor'] = temps_train['y_coor'].str.replace(',', '.').astype(float)
temps_train = temps_train.groupby(['y', 'm', 'd', 'region'])[['t_min', 't_mean', 't_max', 'mm_rain',
                                               'y_coor', 'x_coor', 'h']].agg(['mean'])

In [537]:
temps_train

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,t_min,t_mean,t_max,mm_rain,y_coor,x_coor
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,mean,mean,mean,mean,mean,mean
y,m,d,region,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2
2019,2,13,Алтай Респ,-29.350000,-23.000000,-14.775000,0.125000,52.950000,102.675000
2019,2,13,Алтайский край,-26.722222,-19.966667,-12.488889,0.133333,53.633333,92.711111
2019,2,13,Астраханская обл,-7.950000,-3.600000,2.100000,0.000000,47.250000,47.300000
2019,2,13,Башкортостан Респ,-18.912500,-12.000000,-3.537500,0.000000,53.412500,60.175000
2019,2,13,Белгородская обл,-2.200000,0.133333,2.033333,2.333333,50.823333,38.716667
...,...,...,...,...,...,...,...,...,...
2020,6,27,Челябинская обл,5.480000,12.360000,18.940000,0.080000,56.060000,78.980000
2020,6,27,Чеченская Респ,12.600000,20.400000,27.300000,0.000000,43.300000,45.900000
2020,6,27,Чувашская Респ,9.266667,14.333333,20.333333,1.433333,59.326667,63.443333
2020,6,27,Ямало-Ненецкий АО,11.400000,13.800000,16.937500,4.075000,65.975000,75.125000


In [444]:
temps_train.index

MultiIndex([(2019, 2, 13,          'Алтай Респ'),
            (2019, 2, 13,      'Алтайский край'),
            (2019, 2, 13,    'Астраханская обл'),
            (2019, 2, 13,   'Башкортостан Респ'),
            (2019, 2, 13,    'Белгородская обл'),
            (2019, 2, 13,        'Брянская обл'),
            (2019, 2, 13,   'Волгоградская обл'),
            (2019, 2, 13,     'Воронежская обл'),
            (2019, 2, 13,       'Дагестан Респ'),
            (2019, 2, 13,        'Еврейская АО'),
            ...
            (2020, 6, 27,     'Удмуртская Респ'),
            (2020, 6, 27,     'Ульяновская обл'),
            (2020, 6, 27,    'Хабаровский край'),
            (2020, 6, 27,        'Хакасия Респ'),
            (2020, 6, 27, 'Ханты-Мансийский АО'),
            (2020, 6, 27,    'Челябинская обл '),
            (2020, 6, 27,      'Чеченская Респ'),
            (2020, 6, 27,      'Чувашская Респ'),
            (2020, 6, 27,   'Ямало-Ненецкий АО'),
            (2020, 6, 27,     'Яро

In [448]:
temps_train.loc[(2019, 6, 14, 'Ярославская обл')]

t_min    mean     8.55
t_mean   mean    11.80
t_max    mean    17.65
mm_rain  mean    10.15
y_coor   mean    57.40
x_coor   mean    38.75
Name: (2019, 6, 14, Ярославская обл), dtype: float64

In [538]:
temp_test = temps_train.copy()
tmp = temp_test[(temp_test.index.get_level_values(0)==2020) & (temp_test.index.get_level_values(1)==6)]
tmp.to_csv('temps_test.csv')

In [539]:
tmp

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,t_min,t_mean,t_max,mm_rain,y_coor,x_coor
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,mean,mean,mean,mean,mean,mean
y,m,d,region,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2
2020,6,13,Алтай Респ,7.900000,16.275000,25.125000,1.400000,52.950000,102.675000
2020,6,13,Алтайский край,12.000000,21.300000,30.188889,1.144444,53.633333,92.711111
2020,6,13,Астраханская обл,22.150000,31.250000,39.400000,0.000000,47.250000,47.300000
2020,6,13,Башкортостан Респ,9.612500,15.000000,21.775000,2.337500,53.412500,60.175000
2020,6,13,Белгородская обл,21.366667,25.800000,31.166667,1.500000,50.823333,38.716667
2020,6,...,...,...,...,...,...,...,...
2020,6,27,Челябинская обл,5.480000,12.360000,18.940000,0.080000,56.060000,78.980000
2020,6,27,Чеченская Респ,12.600000,20.400000,27.300000,0.000000,43.300000,45.900000
2020,6,27,Чувашская Респ,9.266667,14.333333,20.333333,1.433333,59.326667,63.443333
2020,6,27,Ямало-Ненецкий АО,11.400000,13.800000,16.937500,4.075000,65.975000,75.125000


In [672]:
temps_train_fin = temps_train.reset_index(level=[0,1,2,3]).rename(columns={"('t_min', 'mean')": "t_min", 
                                                "('t_mean', 'mean')": "t_mean",
                                                "('t_max', 'mean')": "t_max",
                                                "('mm_rain', 'mean')": 'mm_rain',
                                                "('y_coor', 'mean')": 'y_coor',
                                                "('x_coor', 'mean')": 'x_coor'}).drop(columns=
                                                                                     ['t_mean', 'mm_rain'])

In [674]:
temps_train_fin.to_csv('temps_train.csv')

In [673]:
temps_train_fin

Unnamed: 0,y,m,d,region,t_min,t_max,y_coor,x_coor
0,2019,2,13,Алтай Респ,-29.350000,-14.775000,52.950000,102.675000
1,2019,2,13,Алтайский край,-26.722222,-12.488889,53.633333,92.711111
2,2019,2,13,Астраханская обл,-7.950000,2.100000,47.250000,47.300000
3,2019,2,13,Башкортостан Респ,-18.912500,-3.537500,53.412500,60.175000
4,2019,2,13,Белгородская обл,-2.200000,2.033333,50.823333,38.716667
...,...,...,...,...,...,...,...,...
8695,2020,6,27,Челябинская обл,5.480000,18.940000,56.060000,78.980000
8696,2020,6,27,Чеченская Респ,12.600000,27.300000,43.300000,45.900000
8697,2020,6,27,Чувашская Респ,9.266667,20.333333,59.326667,63.443333
8698,2020,6,27,Ямало-Ненецкий АО,11.400000,16.937500,65.975000,75.125000


# Adding Humidity data

In [562]:
hum = pd.read_csv('humidity.txt', sep="\t|\s{1,}", 
                      names = ['station', 'y', 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]).drop(columns=[1])

  


In [567]:
hum_train = hum.merge(meteostations_rus, on='station', how='inner').drop(columns=['place', 'country', 'station'])

In [570]:
hum_data = []
for index, row in hum_train.iterrows():
    for month in range(2,13):
        hum_data.append([2019, month, row[month], row['y_coor'], row['x_coor'], row['h'], row['region']])

In [572]:
hum_train1 = pd.DataFrame(hum_data, columns=['y', 'm', 'humidity', 'y_coor', 'x_coor', 'h', 'region'])

In [576]:
hum_train1.groupby(['y', 'm', 'region'])[['humidity', 'y_coor', 'x_coor', 'h']].agg(['mean'])

In [659]:
# hum_train2 = hum_train2.reset_index(level=[0,1,2]).rename(columns={"('humidity', 'mean')": "humidity", })

IndexError: Too many levels: Index has only 1 level, not 2

In [660]:
hum_train2

Unnamed: 0,y,m,region,humidity
0,2019,2,Алтай Респ,64.250000
1,2019,2,Алтайский край,70.222222
2,2019,2,Астраханская обл,82.500000
3,2019,2,Башкортостан Респ,77.125000
4,2019,2,Белгородская обл,83.333333
...,...,...,...,...
633,2019,12,Челябинская обл,75.500000
634,2019,12,Чеченская Респ,88.000000
635,2019,12,Чувашская Респ,82.333333
636,2019,12,Ямало-Ненецкий АО,81.166667


In [661]:
hum_train2.to_csv('humidity_train.csv')

# T + humidity; test

In [586]:
len(regions)

67

In [587]:
regions

array(['Челябинская обл ', 'Москва', 'Новосибирская обл',
       'Московская область', 'Липецкая обл', 'Свердловская обл',
       'Владимирская обл', 'Калининградская обл', 'Курская обл',
       'Смоленская обл', 'Красноярский край', 'Северная Осетия Респ',
       'Тульская обл', 'Ярославская обл', 'Калужская обл',
       'Орловская обл', 'Ростовская обл', 'Ставропольский край',
       'Оренбургская обл', 'Рязанская обл', 'Костромская обл',
       'Самарская обл', 'Нижегородская обл', 'Хакасия Респ',
       'Саратовская обл', 'Курганская обл', 'Ульяновская обл',
       'Тюменская обл ', 'Белгородская обл', 'Воронежская обл',
       'Брянская обл', 'Ханты-Мансийский АО', 'Ивановская обл',
       'Пензенская обл', 'Чувашская Респ', 'Тамбовская обл', 'Коми Респ',
       'Кемеровская обл', 'Татарстан Респ', 'Томская обл',
       'Краснодарский край', 'Башкортостан Респ', 'Хабаровский край',
       'Алтайский край', 'Тыва Респ', 'Забайкальский край',
       'Удмуртская Респ', 'Волгоградская

In [794]:
r = requests.get('https://pogoda.mail.ru/prognoz/moskva/june-2021/')
parsed = r.text
days = parsed.split('day__date__more')[1:]
res = []

for i in range(30):
    day=i+1
    ex = re.split('"day__temperature ">|&deg|"day__temperature__night">|Влажность: |%">\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t<span class="icon icon_humidity'
         ,days[i])
    t_up, t_down, humidity = itemgetter(*[1,3,5])(ex)
    res.append([2021, 6, day, 'Москва', int(t_down), int(t_up), int(humidity)])

IndexError: list index out of range

In [789]:
parsed

'{"captchaPath":"/captcha/img/6","isRlimitedAgain":false}\n'

In [651]:
meteo_test = pd.DataFrame(res, columns=['y', 'm', 'd', 'region', 't_down', 't_up', 'humidity'])

In [652]:
meteo_test

Unnamed: 0,y,m,d,region,t_down,t_up,humidity
0,2021,6,1,Москва,4,15,49
1,2021,6,2,Москва,7,18,44
2,2021,6,3,Москва,7,19,39
3,2021,6,4,Москва,9,20,37
4,2021,6,5,Москва,10,21,44
5,2021,6,6,Москва,12,21,60
6,2021,6,7,Москва,14,22,64
7,2021,6,8,Москва,13,23,91
8,2021,6,9,Москва,14,20,91
9,2021,6,10,Москва,12,19,86


In [683]:
cities_pd = pd.read_csv('cities.csv')

In [684]:
cities = list(cities_pd['0'])

In [688]:
def transliterate(name):
    name = name.lower()
    slovar = {'а':'a','б':'b','в':'v','г':'g','д':'d','е':'e','ё':'yo',
      'ж':'zh','з':'z','и':'i','й':'i','к':'k','л':'l','м':'m','н':'n',
      'о':'o','п':'p','р':'r','с':'s','т':'t','у':'u','ф':'f','х':'h',
      'ц':'c','ч':'ch','ш':'sh','щ':'sch','ъ':'','ы':'y','ь':'','э':'e',
      'ю':'u','я':'ya', 'А':'A','Б':'B','В':'V','Г':'G','Д':'D','Е':'E','Ё':'YO',
      'Ж':'ZH','З':'Z','И':'I','Й':'I','К':'K','Л':'L','М':'M','Н':'N',
      'О':'O','П':'P','Р':'R','С':'S','Т':'T','У':'U','Ф':'F','Х':'H',
      'Ц':'C','Ч':'CH','Ш':'SH','Щ':'SCH','Ъ':'','Ы':'y','Ь':'','Э':'E',
      'Ю':'U','Я':'YA',',':'','?':'',' ':'_','~':'','!':'','@':'','#':'',
      '$':'','%':'','^':'','&':'','*':'','(':'',')':'','-':'-','=':'','+':'',
      ':':'',';':'','<':'','>':'','\'':'','"':'','\\':'','/':'','№':'',
      '[':'',']':'','{':'','}':'','ґ':'','ї':'', 'є':'','Ґ':'g','Ї':'i',
      'Є':'e', '—':''}
    for key in slovar:
        name = name.replace(key, slovar[key])
    return name

In [737]:
cities_new = []
for name in cities:
    cities_new.append(transliterate(name))

In [741]:
dict_city_inv_region = {}
for city in cities_new:
    res = dadata.clean(name="address", source=city)
    dadata_name = res['region_with_type']
    try:
        reg_invitro = redion_dict[dadata_name]
        dict_city_inv_region[city] = reg_invitro
    except:
        pass



In [778]:
len(list(dict_city_inv_region.keys()))

386

In [796]:
import time

res = []
found = 0
not_found = 0

for city in tqdm(list(dict_city_inv_region.keys())):
    
    time.sleep(5)
    
    region = dict_city_inv_region[city]
    url = 'https://pogoda.mail.ru/prognoz/' + city + '/june-2021/'
    
    r = requests.get(url)
    parsed = r.text
    
    if not ('страница не найдена' in parsed):
        found += 1
    
        days = parsed.split('day__date__more')[1:]

        for i in range(30):
            day=i+1
            ex = re.split('"day__temperature ">|&deg|"day__temperature__night">|Влажность: |%">\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t<span class="icon icon_humidity'
                 ,days[i])
            t_up, t_down, humidity = itemgetter(*[1,3,5])(ex)
            res.append([2021, 6, day, region, city, int(t_down), int(t_up), int(humidity)])
            
    else: 
        not_found += 1

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=386.0), HTML(value='')))




In [799]:
meteo_test_df = pd.DataFrame(res, columns=['y', 'm', 'd', 'region', 'city', 't_down', 't_up', 'humidity'])

In [801]:
meteo_test_df = meteo_test_df.drop(columns='city')

In [804]:
meteo_test_df1 = meteo_test_df.groupby(['y', 'm', 'd', 'region'])[['t_down', 't_up', 'humidity']].agg(['mean'])

In [805]:
meteo_test_df1

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,t_down,t_up,humidity
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,mean,mean,mean
y,m,d,region,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
2021,6,1,Алтайский край,13.000000,26.000000,36.000000
2021,6,1,Башкортостан Респ,9.166667,20.833333,60.166667
2021,6,1,Белгородская обл,7.800000,16.200000,71.000000
2021,6,1,Брянская обл,4.000000,17.000000,49.666667
2021,6,1,Владимирская обл,4.714286,15.000000,51.000000
2021,6,...,...,...,...,...
2021,6,30,Ханты-Мансийский АО,12.000000,24.750000,41.750000
2021,6,30,Челябинская обл,18.647059,31.176471,44.588235
2021,6,30,Чувашская Респ,20.000000,26.000000,68.000000
2021,6,30,Ямало-Ненецкий АО,10.000000,19.000000,46.000000


In [None]:
"('t_mean', 'mean')": "t_mean"

In [839]:
meteo_test_df2 = meteo_test_df1.reset_index(level=[0,1,2,3])

In [840]:
meteo_test_df2.columns = ['y', 'm', 'd', 'region', 't_down', 't_up', 'humidity']

In [841]:
meteo_test_df2

Unnamed: 0,y,m,d,region,t_down,t_up,humidity
0,2021,6,1,Алтайский край,13.000000,26.000000,36.000000
1,2021,6,1,Башкортостан Респ,9.166667,20.833333,60.166667
2,2021,6,1,Белгородская обл,7.800000,16.200000,71.000000
3,2021,6,1,Брянская обл,4.000000,17.000000,49.666667
4,2021,6,1,Владимирская обл,4.714286,15.000000,51.000000
...,...,...,...,...,...,...,...
1705,2021,6,30,Ханты-Мансийский АО,12.000000,24.750000,41.750000
1706,2021,6,30,Челябинская обл,18.647059,31.176471,44.588235
1707,2021,6,30,Чувашская Респ,20.000000,26.000000,68.000000
1708,2021,6,30,Ямало-Ненецкий АО,10.000000,19.000000,46.000000


In [837]:
len(regions)

67

In [838]:
1710/30

57.0

In [832]:
meteo_test_df2.to_csv('meteo_test.csv')