In [5]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import datetime as dt
import json
import requests
import optuna
import shap
import joblib
import yaml

from lightgbm import LGBMRegressor
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV, KFold
from typing import Dict
from dateutil.parser import parse
from scipy import stats
from optuna.visualization import plot_optimization_history

# 1. Import

In [12]:
# Load the project configuration. The file is opened in a context manager so
# the handle is closed as soon as parsing finishes.
# NOTE(review): yaml.FullLoader is kept for compatibility; if the config holds
# only plain scalars/lists/mappings, yaml.safe_load would be the safer choice.
with open('../config/params.yml') as config_file:
    params = yaml.load(config_file, Loader=yaml.FullLoader)

# Content of the JSON file stored at `uniq_features_path`.
with open(params['preprocessing']['uniq_features_path']) as json_file:
    column_sequence = json.load(json_file)

In [7]:
# Read the dataset to score; `date` is parsed to datetime because the
# preprocessing code uses the `.dt` accessor on it.
df = pd.read_csv(params['evaluate']['predict_path'], parse_dates=['date'])

In [8]:
# Preview the first two rows of the loaded data.
df.head(2)

Unnamed: 0,date,hour,count_order
0,2023-01-01,0,18
1,2023-01-01,1,14


# 2. Preprocessing

In [9]:
# Download the holiday calendar and add one boolean column per calendar key,
# marking the rows whose `date` is listed under that key.
# The request gets a timeout and an explicit status check so that a hung
# connection or an HTTP error page is reported instead of being parsed as data.
response = requests.get(params['preprocessing']['holidays_calendar'], timeout=30)
response.raise_for_status()
js = json.loads(response.text)
for x in js:
    calendar_dates = pd.to_datetime(
        pd.DataFrame(js[x], columns=['date'])['date'], format="%Y-%m-%d"
    )
    df[x] = df['date'].isin(calendar_dates)

In [10]:
def uniq_features(
        data: pd.DataFrame, drop_col: list, target_col: str, uniq_features_path: str) -> None:
    """
    Save a JSON mapping of every feature name to its list of unique values.

    :param data: input dataframe
    :param drop_col: names of the columns to leave out
    :param target_col: name of the target column (left out as well)
    :param uniq_features_path: path of the JSON file to write
    :return: None
    """
    excluded = drop_col + [target_col]
    # Columns that are not present in the frame are skipped silently.
    features = data.drop(columns=excluded, errors="ignore")

    unique_by_feature = {}
    for name in features.columns:
        unique_by_feature[name] = features[name].unique().tolist()

    with open(uniq_features_path, "w") as file:
        json.dump(unique_by_feature, file)

def map_values(data: pd.DataFrame, map_columns: dict) -> pd.DataFrame:
    """
    Replace values in the dataframe according to a mapping.

    :param data: input DataFrame
    :param map_columns: mapping handed to ``DataFrame.replace`` as ``to_replace``
    :return: new DataFrame with the values replaced
    """
    replaced = data.replace(to_replace=map_columns)
    return replaced

def astype_col(data: pd.DataFrame, map_type_columns: dict) -> pd.DataFrame:
    """
    Cast dataframe columns to the requested data types.

    :param data: input DataFrame
    :param map_type_columns: mapping of column name to target dtype
    :return: new DataFrame with the columns cast
    """
    converted = data.astype(dtype=map_type_columns)
    return converted

def check_columns(data: pd.DataFrame, values_path: str) -> pd.DataFrame:
    """
    Check that the dataframe has exactly the features saved for train.

    :param data: test dataframe
    :param values_path: path to the JSON file whose keys are the train feature names
    :return: test dataframe with its columns ordered like the keys of the JSON file
    :raises AssertionError: if the set of columns differs from the saved features
    """
    with open(values_path) as json_file:
        # Iterating the loaded mapping yields its keys in file order.
        feature_order = list(json.load(json_file))

    same_features = set(feature_order) == set(data.columns)
    assert same_features, "Не соответствуют признаки"
    return data[feature_order]

In [22]:
def pipline(data: pd.DataFrame, flag_eval: bool=True, **kwargs) -> pd.DataFrame:
    """
    Preprocess a dataframe for training or prediction.

    Steps: flag the dates listed in the downloaded holiday calendar, derive
    year/month/day/day_name from ``date``, apply the value and dtype mappings,
    add ``period_day`` and ``season``, drop the configured columns, then either
    check the columns against the saved train features (``flag_eval=True``) or
    save the features (``flag_eval=False``), and finally cast object columns
    to ``category``.

    :param data: input dataframe; ``date`` (datetime) and ``hour`` columns are read
    :param flag_eval: True - prepare data for prediction, False - for training
    :param kwargs: preprocessing settings; the keys read are ``holidays_calendar``,
        ``map_col``, ``map_type_col``, ``period_day``, ``season``, ``drop_columns``,
        ``uniq_features_path`` and, when ``flag_eval`` is False, ``target_column``
    :return: dataframe ready for training/prediction
    :raises requests.HTTPError: if the holiday calendar request returns an error status
    """
    # Work on a copy so the caller's frame is left untouched and every feature
    # is written to the frame that was actually passed in (not to a global).
    data = data.copy()

    # Timeout and status check: a hung connection or an HTTP error page must
    # not be parsed as if it were the calendar.
    response = requests.get(kwargs['holidays_calendar'], timeout=30)
    response.raise_for_status()
    calendar = json.loads(response.text)
    for day_type in calendar:
        day_dates = pd.to_datetime(
            pd.DataFrame(calendar[day_type], columns=['date'])['date'], format="%Y-%m-%d"
        )
        data[day_type] = data['date'].isin(day_dates)

    data['year'] = data['date'].dt.year
    data['month'] = data['date'].dt.month
    data['day'] = data['date'].dt.day
    data['day_name'] = data['date'].dt.day_name()

    data = map_values(
        data=data, map_columns=kwargs['map_col']
    )
    data = astype_col(
        data=data, map_type_columns=kwargs['map_type_col']
    )
    # These lookups run after the value/type mappings, so the mapping keys must
    # match the already-mapped `hour` / `month` values.
    data['period_day'] = data.hour.map(kwargs['period_day'])
    data['season'] = data.month.map(kwargs['season'])

    data = data.drop(kwargs['drop_columns'], axis=1, errors="ignore")

    if flag_eval:
        data = check_columns(
            data=data, values_path=kwargs['uniq_features_path']
        )
    else:
        # Keyword names must match the uniq_features signature.
        uniq_features(
            data=data,
            drop_col=kwargs['drop_columns'],
            target_col=kwargs['target_column'],
            uniq_features_path=kwargs['uniq_features_path'],
        )

    cat_cols = data.select_dtypes('object').columns
    data[cat_cols] = data[cat_cols].astype("category")

    return data

In [29]:
# Build the feature frame for prediction. `flag_eval` is left at its default
# (True), so the columns are checked against the saved train features.
df_test = pipline(data=df, **params['preprocessing'])

# 3. Evaluate

In [21]:
# Preview the first two rows of the preprocessed features.
df_test.head(2)

Unnamed: 0,hour,holidays,preholidays,nowork,year,month,day,day_name,period_day,season
0,0,1,0,0,2023,January,1,Sunday,hours_night,winter
1,1,1,0,0,2023,January,1,Sunday,hours_night,winter


In [31]:
# Load the trained model and preview its rounded predictions.
# NOTE(review): joblib.load unpickles the file, so `model_path` must point to
# a trusted artifact.
model = joblib.load(params['train']['model_path'])
model.predict(df_test).round()

array([23., 16., 12., ..., 72., 54., 35.])

In [32]:
# Store the rounded predictions next to the features. An existing `predict`
# column is excluded from the model input so that re-running this cell does
# not feed earlier predictions back to the model as an extra feature.
df_test['predict'] = model.predict(
    df_test.drop(columns='predict', errors='ignore')
).round()

In [33]:
# Show the features together with the `predict` column.
df_test

Unnamed: 0,hour,holidays,preholidays,nowork,year,month,day,day_name,period_day,season,predict
0,0,1,0,0,2023,January,1,Sunday,hours_night,winter,23.0
1,1,1,0,0,2023,January,1,Sunday,hours_night,winter,16.0
2,2,1,0,0,2023,January,1,Sunday,hours_night,winter,12.0
3,3,1,0,0,2023,January,1,Sunday,hours_night,winter,10.0
4,4,1,0,0,2023,January,1,Sunday,hours_morning,winter,8.0
...,...,...,...,...,...,...,...,...,...,...,...
8000,19,0,0,0,2023,November,30,Thursday,hours_evening,autumn,81.0
8001,20,0,0,0,2023,November,30,Thursday,hours_evening,autumn,81.0
8002,21,0,0,0,2023,November,30,Thursday,hours_evening,autumn,72.0
8003,22,0,0,0,2023,November,30,Thursday,hours_evening,autumn,54.0
