# Подключим необходимые библиотеки

In [None]:
!pip install -U lightautoml
!pip install catboost
!pip install Prophet

In [None]:
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm
import matplotlib.pyplot as plt
from sklearn.metrics import *
from collections import defaultdict
from sklearn.linear_model import LinearRegression
import catboost as cb
from prophet import Prophet
import os, shutil, pickle
import torch 
from lightautoml.automl.presets.tabular_presets import TabularUtilizedAutoML
from lightautoml.tasks import Task

import warnings
warnings.filterwarnings("ignore")

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
BASE = '/content/drive/MyDrive/AIML/hackathon_digital_breakthrough/'

# Посмотрим на данные и обработаем их

In [None]:
df = pd.read_csv(BASE + 'data_processed.csv')
df.isna().sum()

call_date                0
call_number              0
patient_age              0
call_initiator       48076
patient_address      72599
call_reason          13592
call_order             361
call_type             7428
patient_diagnosis     3842
call_result             99
hospitalized_to         85
substation              13
call_time               90
arrival_time            92
dtype: int64

In [None]:
def preproc(df): # первичная обработка данных
    full_dt = []
    df = df[df['call_time'].notna()]
    df = df[df['substation'].notna()]
    df = df[df['hospitalized_to'].notna()]
    ct_time = df['call_time']
    ct_date = df['call_date']
    ct_num = df['call_number']
    for i in tqdm(range(len(df))):
        cur_id = ct_num.iloc[i]
        if '1970' not in str(ct_time.iloc[i])[:-5]:
            full_dt.append(str(ct_date.iloc[i]) + ' ' + str(ct_time.iloc[i])[:-5] + '00:00')
        else:
            full_dt.append(np.nan)
        df['call_number'].iloc[i] = cur_id.split('(')[0]
    df = df.drop(columns=['call_time', 'call_date'])
    df['date_time'] = full_dt
    df = df[df['date_time'].notna()]
    df['date_time'] = pd.to_datetime(df['date_time'])
    return df

df = preproc(df)
df.head()

  0%|          | 0/163875 [00:00<?, ?it/s]

Unnamed: 0,call_number,patient_age,call_initiator,patient_address,call_reason,call_order,call_type,patient_diagnosis,call_result,hospitalized_to,substation,arrival_time,date_time
0,8,26 лет,"родственник, проживающий совместно с больным",,"Температура, взрослый",Первичный,неотложное состояние,ОРВИ. Гипертермический синдром,"оказана помощь, больной оставлен на месте","ГБУЗ НО ""Городская поликлиника №7"" (ул. Верхне...",ПСМП №7,09:52:23,2020-01-01 09:00:00
1,16,68 лет,больной или пострадавший,"г.Нижний Новгород, ш. Казанское, д.9, кв***",Болит живот (взрослый),Первичный,неотложное состояние,Люмбалгия,"оказана помощь, больной оставлен на месте","ГБУЗ НО ""Городская поликлиника №7"" (ул. Верхне...",ПСМП №7,10:58:40,2020-01-01 10:00:00
2,21,1 лет,больной или пострадавший,"г.Нижний Новгород, ул. Родионова, д.167, корп....",,Первичный,неотложное состояние,Острый бронхит,"оказана помощь, больной оставлен на месте","ГБУЗ НО ""Детская городская поликлиника №22"" (у...",ПСМП №7,12:17:40,2020-01-01 12:00:00
3,28,8 лет,"родственник, проживающий совместно с больным",,Болит живот (ребенок),Первичный,внезапное заболевание,Другие болезни желчевыводящих путей,"оказана помощь, больной оставлен на месте","ГБУЗ НО ""Детская городская поликлиника №22"" (у...",ПСМП №7,13:53:13,2020-01-01 13:00:00
4,29,79 лет,больной или пострадавший,"г.Нижний Новгород, ул. Богдановича, д.1, кв.***",Болит живот (взрослый),Первичный,неотложное состояние,Гастрит,"оказана помощь, больной оставлен на месте","ГБУЗ НО ""Городская поликлиника №7"" (ул. Верхне...",ПСМП №7,14:08:03,2020-01-01 13:00:00


In [None]:
def preproc2(df): # подготовка данных для обучения
    enc_pds = {x: i for i, x in enumerate(pd.unique(df['substation']))}
    times = defaultdict(list)
    min_time = None
    max_time = None
    for i in tqdm(range(len(df))):
        cur_time = df['date_time'].iloc[i]
        if min_time is None:
            min_time = cur_time
            max_time = cur_time
        if min_time > cur_time:
            min_time = cur_time
        if max_time < cur_time:
            max_time = cur_time
    while min_time <= max_time:
        times[min_time] = [0 for _ in range(len(enc_pds))]
        min_time += pd.to_timedelta(1, unit='h')
    for i in tqdm(range(len(df))):
        cur_time = df['date_time'].iloc[i]
        cur_pds = df['substation'].iloc[i]
        times[cur_time][enc_pds[cur_pds]] += 1
    times = [[x, *times[x]] for x in times]
    times = pd.DataFrame(times, columns = ['date'] + [x for x in enc_pds])
    
    return times

ft = preproc2(df)

  0%|          | 0/163871 [00:00<?, ?it/s]

  0%|          | 0/163871 [00:00<?, ?it/s]

In [None]:
sms = []
for i in range(len(ft)):
    cur = ft.iloc[i].drop('date').sum()
    sms.append(cur)
print(np.mean(sms), np.max(sms), np.min(sms)) # посмотрим на среднее кол-во вызовов в час

7.811192144525478 39 0


# Обучим модели

In [None]:
# выделим категориальные фичи для катбуста
CAT_COLUMNS = ['hour', 'day', 'month', 'day_of_week', 'is_weekend', 'is_winter', 'is_spring', 'is_summer', 'is_autumn', 'is_afternoon', \
               'is_early_morning', 'is_morning', 'is_noon', 'is_eve', 'is_night', 'is_late_night'] 

# дополнительные фичи
def get_funcs():
    res = []
    for i in range(1, 6):
        res.append(lambda x: np.sin(i*x))
        res.append(lambda x: np.cos(i*x))
        res.append(lambda x: np.tanh(i*x))
        res.append(lambda x: np.sin(x/i))
        res.append(lambda x: np.cos(x/i))
        res.append(lambda x: np.tanh(x/i))
    return res

# выделение фичей
def make_features(df):
    df['date'] = pd.to_datetime(df['date'])
    df['hour'] = df['date'].dt.hour
    df['day'] = df['date'].dt.day
    df['month'] = df['date'].dt.month
    df['day_of_week'] = df['date'].dt.dayofweek

    df['is_weekend'] = (df['day_of_week'] > 4).astype('int16') 

    df['is_winter'] = ((df['month'] >=12) & (df['month'] <= 2)).astype('int16')
    df['is_spring'] = ((df['month'] > 2) & (df['month'] <= 5)).astype('int16')
    df['is_summer'] = ((df['month'] > 5) & (df['month'] <= 8)).astype('int16')
    df['is_autumn'] = ((df['month'] > 8) & (df['month'] <= 11)).astype('int16')    

    df['is_afternoon'] = (df['hour'] > 12).astype('int16') 
    df['is_early_morning'] = ((df['hour'] > 4) & (df['hour'] <= 8)).astype('int16') 
    df['is_morning'] = ((df['hour'] > 8) & (df['hour'] <= 12)).astype('int16') 
    df['is_noon'] = ((df['hour'] > 12) & (df['hour'] <= 17)).astype('int16')
    df['is_eve'] = ((df['hour'] > 18) & (df['hour'] <= 21)).astype('int16') 
    df['is_night'] = ((df['hour'] > 21) & (df['hour'] <= 24)).astype('int16')
    df['is_late_night'] = (df['hour'] <= 4).astype('int16') 

    funcs = get_funcs()
    for func in funcs:
        for col in ['hour', 'day', 'month']:
            df[f"{func}_{col}"] = func(df[col])
    for col in ['hour', 'day', 'month']:
       df[col] = df[col].astype(object)
    return df.drop(columns=['date'])

In [None]:
# параметры для lama
N_THREADS = 40
N_FOLDS = 3
RANDOM_STATE = 56
TEST_SIZE = 0.2
TIMEOUT = 100


np.random.seed(RANDOM_STATE)
torch.set_num_threads(N_THREADS)

task = Task('reg')

In [None]:
def rmse(y_true, y_pred):
    return mean_squared_error(y_true, y_pred) ** 0.5


# функция для обучения и подсчета метрики катбуста
def fit_predict_catboost(features):
    scores = []
    test_size = int(0.04 * len(features))
    for col in tqdm(features.columns[1:]):
        train = features[['date', col]]
        train = make_features(train)
        train, test = train.iloc[:-test_size], train.iloc[-test_size:]
        X_train, y_train = train[[x for x in train if x !=col]], train[col]
        X_val, y_val = test[[x for x in train if x != col]], test[col]
        
        seeds = [0, 42, 56, 337, 7575]
        models = []
        for sd in seeds:
                model = cb.CatBoostRegressor(random_seed=sd,
                                  iterations=200,
                                  verbose=0,
                                  max_depth=5,
                                  eval_metric='RMSE',
                                  cat_features=CAT_COLUMNS,
                                  
                                )
                model.fit(X_train, y_train,
                eval_set=(X_val, y_val)
                )
                models.append(model)
        preds = np.mean([model.predict(X_val) for model in models], axis=0)
        scores.append(rmse(y_val, preds))

        os.mkdir(f"models_new/{col}")
        for j, model in enumerate(models):
            with open(f"models_new/{col}/model_{j}.pkl", 'wb') as f:
                pickle.dump(model, f)
        print(scores[-1])
    return scores

# функция для обучения и подсчета метрики prophet
def fit_predict_prophet(features):
      scores = []
      test_size = int(0.04 * len(features))
      for col in tqdm(features.columns[1:]):
          train = features[['date', col]]
          nw = []
          for x in train['date']:
              q = str(x).split()
              q[0] = q[0].split('.')
              q[0] = '-'.join(q[0][::-1])
              q = ' '.join(q)
              nw.append(q)
          train['date'] = nw
          train.columns = ['ds', 'y']
          train, test = train.iloc[:-test_size], train.iloc[-test_size:]
          model = Prophet()
          model.fit(train)
          future = model.make_future_dataframe(periods=test_size, freq='H')
          forecast = model.predict(future)
          scores.append(rmse(test['y'], forecast['yhat'].iloc[-test_size:]))
          print(scores[-1])
      return scores
          
# функция для обучения и подсчета метрики lama
def fit_predict_lama(features):
    scores = []
    test_size = int(0.04 * len(features))
    for col in tqdm(features.columns[1:]):
        train = features[['date', col]]
        train = make_features(train)
        train, test = train.iloc[:-test_size], train.iloc[-test_size:]
        X_train, y_train = train[[x for x in train if x !=col]], train[col]
        X_val, y_val = test[[x for x in train if x != col]], test[col]
        

        TARGET_NAME = col
        roles = {
            'target': TARGET_NAME,
        }
        automl = TabularUtilizedAutoML(
            task = task, 
            timeout = TIMEOUT,
            cpu_limit = N_THREADS,
            reader_params = {'n_jobs': N_THREADS, 'cv': N_FOLDS, 'random_state': RANDOM_STATE},
            
        )
        oof_pred = automl.fit_predict(train, roles=roles, verbose=False)
        preds = automl.predict(X_val)
        scores.append(rmse(y_val, preds.data[:, 0]))
        print(scores[-1])
    return scores

In [None]:
os.mkdir(f"models_new") # создадим папку для моделей
scores_catboost = fit_predict_catboost(ft)


  0%|          | 0/54 [00:00<?, ?it/s]

0.75883271822414
0.569990163312291
0.670342602871509
0.9412104474352041
0.60741639024224
0.7342986708069221
0.6928076187395168
0.7803253035699413
0.8302250081423628
0.0031614896592125304
0.001347746639204048
0.5352100005070768
0.5647946362568785
0.6418301701930218
0.0004511811919690136
0.5527158717932015
0.29093134384097374
0.39967744322021836
0.7042329867756021
0.2153998806719148
0.2846830357399681
0.22832037559343957
0.31665937664858723
0.7430932424465583
0.24019587716587984
0.048629280138317625
0.030145398825419122
0.137255451006595
0.0768808254191057
0.0689490889969925
0.034514701911313755
0.0027786868317629166
0.37887149010282517
0.13734580563269097
0.048767558009954595
0.10889421597464882
0.007551253183073001
0.32488428981719364
0.005739910305009322
0.0003873175225191109
0.4388120442692702
0.00629128073206788
0.4192334452423986
0.1532579451651918
0.004220413364217484
0.00535585067782069
0.3286676130401485
0.000535013238769141
0.002444480990877313
0.9200989835295892
0.181509313014

In [None]:
scores_prophet = fit_predict_prophet(ft)

  0%|          | 0/54 [00:00<?, ?it/s]

0.7660973489741744
0.5626061634300432
0.6788353826235204
0.9683829332712266
0.6086614094222214
0.7484834368986149
0.6956259301744121
0.8085554002263436
0.8554341890064328
0.0013746994610753563
0.0012287209795199304
0.5131806762684753
0.5922470930113792
0.6438505624338038
0.0007414223425163512
0.5262283325462533
0.2971202088992978
0.40046899176447365
0.6940611334209704
0.24620168811471954
0.2777650440786868
0.23283988382339296
0.3168081801017809
0.6782045529575986
0.24463970492931675
0.049725116132697776
0.010483107130584906
0.1366960435315175
0.07686959062261228
0.06945003845383059
0.03493790719245176
0.005602637072901784
0.3671221019367607
0.1377793328415437
0.04957754750879077
0.10846893193648222
0.026446835556738782
0.3221263551442079
0.00032363383015340976
0.0002099073471579904
0.4185532973588835
0.0006160989653475241
0.4046418247297087
0.15517327083232463
0.0006681834589354824
0.0024044952281034094
0.326482136496066
0.0001968128267295641
0.00018931611816337
0.8325460874464278
0.18

In [None]:
scores_lama = fit_predict_lama(ft)

  0%|          | 0/54 [00:00<?, ?it/s]

0.7651390499910542
0.5898775697146192
0.6724698970756512
0.9503362455988925
0.609061199512967
0.740015453042807
0.6937820703260078
0.7846694735937241
0.8384717572389563
0.0014129332411979949
6.527585337525539e-05
0.5470324861566044
0.5701409442889733
0.668091984492073
0.0018927896329853447
0.5725129292623444
0.29231924568728646
0.4048731744851454
0.700912569642926
0.22739868747067446
0.28605335668012954
0.22995850386879205
0.322692595457714
0.8384822134191531
0.24113342841780844
0.04873048665251819
0.007304620096186802
0.137700034714273
0.07700676400141893
0.06886578900758723
0.034817285687511176
0.0016312669113304292
0.38442277042478284
0.13764345572064965
0.04878284366333614
0.10897116028864026
0.003483237794226075
0.334183190722213
0.00012325546237373816
4.6365174796108856e-05
0.44574397421671835
0.00024254737544548736
0.4269361670645184
0.15328795892968566
0.00013346003885558074
0.0010960768846072775
0.3382026925132848
4.46576447045614e-05


Trial 0 failed because of the following error: CatBoostError('catboost/libs/metrics/metric.cpp:6431: All train targets are equal')
Traceback (most recent call last):
  File "/usr/local/lib/python3.7/dist-packages/optuna/study/_optimize.py", line 213, in _run_trial
    value_or_values = func(trial)
  File "/usr/local/lib/python3.7/dist-packages/lightautoml/ml_algo/tuning/optuna.py", line 213, in objective
    output_dataset = _ml_algo.fit_predict(train_valid_iterator=train_valid_iterator)
  File "/usr/local/lib/python3.7/dist-packages/lightautoml/ml_algo/base.py", line 271, in fit_predict
    model, pred = self.fit_predict_single_fold(train, valid)
  File "/usr/local/lib/python3.7/dist-packages/lightautoml/ml_algo/boost_cb.py", line 306, in fit_predict_single_fold
    model.fit(cb_train, eval_set=cb_valid, log_cout=LoggerStream(logger, verbose_eval=100))
  File "/usr/local/lib/python3.7/dist-packages/catboost/core.py", line 2430, in fit
    save_snapshot, snapshot_file, snapshot_interva

3.278039655869787e-05
0.9284336435977918
0.182321415595749


Trial 0 failed because of the following error: CatBoostError('catboost/libs/metrics/metric.cpp:6431: All train targets are equal')
Traceback (most recent call last):
  File "/usr/local/lib/python3.7/dist-packages/optuna/study/_optimize.py", line 213, in _run_trial
    value_or_values = func(trial)
  File "/usr/local/lib/python3.7/dist-packages/lightautoml/ml_algo/tuning/optuna.py", line 213, in objective
    output_dataset = _ml_algo.fit_predict(train_valid_iterator=train_valid_iterator)
  File "/usr/local/lib/python3.7/dist-packages/lightautoml/ml_algo/base.py", line 271, in fit_predict
    model, pred = self.fit_predict_single_fold(train, valid)
  File "/usr/local/lib/python3.7/dist-packages/lightautoml/ml_algo/boost_cb.py", line 306, in fit_predict_single_fold
    model.fit(cb_train, eval_set=cb_valid, log_cout=LoggerStream(logger, verbose_eval=100))
  File "/usr/local/lib/python3.7/dist-packages/catboost/core.py", line 2430, in fit
    save_snapshot, snapshot_file, snapshot_interva

3.359951397811872e-05
3.305800902679087e-05
0.09737806966889818


In [None]:
print(np.mean(scores_catboost))

0.30143545121873166


In [None]:
print(np.mean(scores_prophet))

0.2996447741607458


In [None]:
print(np.mean(scores_lama))

0.3058597863319171


RMSE CATBOOST 0.30143545121873166

RMSE Prophet 0.2996447741607458

RMSE Lama 0.3058597863319171


In [None]:
import shutil
shutil.make_archive('models', 'zip', '/content/models')

'/content/models.zip'