In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import datetime as dt
import json
import optuna
import shap
import joblib
import yaml

from catboost import CatBoostRegressor
from lightgbm import LGBMRegressor
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV, KFold
from typing import Dict
from dateutil.parser import parse
from scipy import stats

In [2]:
# Project configuration: paths and preprocessing/training/evaluation settings.
config_path = '../config/params.yml'
# Use a context manager so the config file handle is closed after parsing
# (a bare open() passed straight to yaml.load is never closed explicitly).
# NOTE(review): yaml.safe_load would be enough if the config only holds plain
# scalars/lists/dicts — confirm before switching loaders.
with open(config_path) as config_file:
    config = yaml.load(config_file, Loader=yaml.FullLoader)

# Per-stage sections of the config used throughout the notebook.
preproc = config['preprocessing']
training = config['train']
evaluate = config['evaluate']

# JSON saved at train time; check_columns_evaluate uses its keys as the
# expected set and order of columns.
column_sequence_path = preproc['unique_values_path']
with open(column_sequence_path) as json_file:
    column_sequence = json.load(json_file)

# 1. Import

In [7]:
# Load the dataset to score from the path configured under evaluate -> predict_path.
data_test = pd.read_csv(evaluate['predict_path'])

In [9]:
# Quick look at the first two raw rows.
data_test.head(2)

Unnamed: 0,AZS_NAME,PROD_NAME,cnt_ks,cnt_tank,cnt_trk,cnt_hand,region,location,year,month,day,day_of_week
0,AZS 557,gasoline_ai_92,2,1,4,4,Vologda_region,federal_highway,2022,4,2,Saturday
1,AZS 557,gasoline_ai_95,2,1,4,4,Vologda_region,federal_highway,2022,4,2,Saturday


In [8]:
# Preview of the frame without the configured drop_columns.
# The result is only displayed, not assigned, so data_test itself is unchanged.
data_test.drop(preproc["drop_columns"], axis=1, errors="ignore")

Unnamed: 0,AZS_NAME,PROD_NAME,cnt_ks,cnt_tank,cnt_trk,region,location,year,month,day,day_of_week
0,AZS 557,gasoline_ai_92,2,1,4,Vologda_region,federal_highway,2022,4,2,Saturday
1,AZS 557,gasoline_ai_95,2,1,4,Vologda_region,federal_highway,2022,4,2,Saturday
2,AZS 557,diesel_fuel,2,1,4,Vologda_region,federal_highway,2022,4,2,Saturday
3,AZS 557,diesel_fuel_taneco,2,1,4,Vologda_region,federal_highway,2022,4,2,Saturday
4,AZS 557,gasoline_ai_92,3,1,4,Vologda_region,federal_highway,2022,4,3,Sunday
...,...,...,...,...,...,...,...,...,...,...,...
8000,AZS 573,gasoline_ai_95,2,1,4,Arkhangelsk_region,"city_0,25_0,5",2022,12,30,Friday
8001,AZS 573,diesel_fuel,2,1,4,Arkhangelsk_region,"city_0,25_0,5",2022,12,30,Friday
8002,AZS 573,gasoline_ai_92,2,2,4,Arkhangelsk_region,"city_0,25_0,5",2022,12,31,Saturday
8003,AZS 573,gasoline_ai_95,2,1,4,Arkhangelsk_region,"city_0,25_0,5",2022,12,31,Saturday


In [4]:
# Column dtypes and non-null counts of the raw frame.
data_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8005 entries, 0 to 8004
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   AZS_NAME     8005 non-null   object
 1   PROD_NAME    8005 non-null   object
 2   cnt_ks       8005 non-null   int64 
 3   cnt_tank     8005 non-null   int64 
 4   cnt_trk      8005 non-null   int64 
 5   cnt_hand     8005 non-null   int64 
 6   region       8005 non-null   object
 7   location     8005 non-null   object
 8   year         8005 non-null   int64 
 9   month        8005 non-null   int64 
 10  day          8005 non-null   int64 
 11  day_of_week  8005 non-null   object
dtypes: int64(7), object(5)
memory usage: 750.6+ KB


# 2. Preprocessing  

In [125]:
def replace_values(data: pd.DataFrame, map_change_columns: dict) -> pd.DataFrame:
    """
    Replace values in the dataset according to a mapping.

    :param data: input dataset (left unmodified; a new frame is returned)
    :param map_change_columns: mapping passed to ``DataFrame.replace`` as ``to_replace``
    :return: dataset with the replacements applied
    """
    replaced = data.replace(to_replace=map_change_columns)
    return replaced


def get_bins(data: float, first_val: float, second_val: float) -> str:
    """
    Map a single numeric value to one of three bin labels.

    The labels are fixed strings and do not depend on the thresholds passed in:
    ``"2_and_less"`` when ``data <= first_val``, ``"3_and_4"`` when
    ``first_val < data <= second_val``, and ``"more 4"`` otherwise.
    A NaN value fails both comparisons and therefore gets ``"more 4"``.

    :param data: scalar value to bin (Python or NumPy integer/float)
    :param first_val: upper bound (inclusive) of the first bin
    :param second_val: upper bound (inclusive) of the second bin
    :return: bin label
    :raises AssertionError: if ``data`` is not a numeric scalar
    """
    # np.integer / np.floating are accepted as well: NumPy integer scalars
    # (e.g. np.int64) are not instances of the built-in int.
    assert isinstance(
        data, (int, float, np.integer, np.floating)
    ), "Проблема с типом данных в признаке"
    if data <= first_val:
        return "2_and_less"
    if data <= second_val:
        return "3_and_4"
    return "more 4"


def check_columns_evaluate(data: pd.DataFrame, unique_values_path: str) -> pd.DataFrame:
    """
    Check that the dataset has exactly the train features and reorder its columns
    to match the train order.

    :param data: test dataset
    :param unique_values_path: path to a JSON object whose keys are the train
        feature names, in train order
    :return: test dataset with columns ordered as in the JSON file
    :raises AssertionError: if the column sets differ; the message lists the
        missing and the unexpected columns
    """
    with open(unique_values_path) as json_file:
        unique_values = json.load(json_file)

    # Materialize the keys so the frame is indexed with a plain list.
    column_sequence = list(unique_values.keys())

    expected, actual = set(column_sequence), set(data.columns)
    # key=str keeps sorting safe if column labels have mixed types.
    missing = sorted(expected - actual, key=str)
    unexpected = sorted(actual - expected, key=str)
    assert not missing and not unexpected, (
        f"Разные признаки: missing={missing}, unexpected={unexpected}"
    )
    return data[column_sequence]


def transform_types(data: pd.DataFrame, change_type_columns: dict) -> pd.DataFrame:
    """
    Cast columns to the requested dtypes.

    :param data: input dataset
    :param change_type_columns: mapping of column name to target dtype
    :return: dataset with the listed columns converted; a failed conversion
        raises, because ``errors="raise"`` is passed to ``astype``
    """
    converted = data.astype(dtype=change_type_columns, errors="raise")
    return converted

In [126]:
# Calendar of weekends and public holidays (the loading code below is currently disabled)
# file = open(preproc['calendar_path'])
# text = file.read()
# text = json.loads(text)

# holidays = pd.json_normalize(text, 'holidays')
# preholidays = pd.json_normalize(text, 'preholidays')
# nowork = pd.json_normalize(text, 'nowork')

# nowork['nowork'] = 1
# nowork.rename(columns={0: 'FL_EVBDATE'}, inplace=True)
# nowork['FL_EVBDATE'] = pd.to_datetime(nowork['FL_EVBDATE'], format='%Y-%m-%d')
# nowork.set_index('FL_EVBDATE', inplace=True)
# preholidays['preholidays'] = 1
# preholidays.rename(columns={0: 'FL_EVBDATE'}, inplace=True)
# preholidays['FL_EVBDATE'] = pd.to_datetime(preholidays['FL_EVBDATE'], format='%Y-%m-%d')
# preholidays.set_index('FL_EVBDATE', inplace=True)
# holidays['holidays'] = 1
# holidays.rename(columns={0: 'FL_EVBDATE'}, inplace=True)
# holidays['FL_EVBDATE'] = pd.to_datetime(holidays['FL_EVBDATE'], format='%Y-%m-%d')
# holidays.set_index('FL_EVBDATE', inplace=True)

In [127]:
def pipeline_preprocess(data: pd.DataFrame, flg_evaluate: bool = True, **kwargs) -> pd.DataFrame:
    """
    Data preprocessing pipeline.

    Steps, in order: drop the configured columns; validate/reorder columns against
    the saved train features (evaluate mode) or save the train features (train mode);
    replace values; add the ``type_oil`` feature; add ``<column>_bins`` features;
    cast all object columns to ``category``.

    :param data: input dataset; it is not modified, work is done on the frame
        returned by ``drop``
    :param flg_evaluate: True to validate against the saved train features,
        False to save them
    :param kwargs: preprocessing settings. Keys read here: ``drop_columns``,
        ``unique_values_path``, ``map_change_columns``, ``map_bins_columns``
        (dict: column -> [first threshold, second threshold]), ``type_oil``
        (mapping PROD_NAME -> fuel type) and, when ``flg_evaluate`` is False,
        ``target_column``
    :return: preprocessed dataset
    :raises AssertionError: if ``map_bins_columns`` is not a dict
    """
    # drop columns
    data = data.drop(kwargs["drop_columns"], axis=1, errors="ignore")

    # Either check the dataset against the train features,
    # or save the unique values of the train features.
    if flg_evaluate:
        data = check_columns_evaluate(
            data=data, unique_values_path=kwargs["unique_values_path"]
        )
    else:
        # NOTE(review): save_unique_train_data is not defined in the visible part
        # of this notebook; this branch raises NameError unless it is defined or
        # imported elsewhere.
        save_unique_train_data(
            data=data,
            drop_columns=kwargs["drop_columns"],
            target_column=kwargs["target_column"],
            unique_values_path=kwargs["unique_values_path"],
        )

    # replace values
    data = replace_values(data=data, map_change_columns=kwargs["map_change_columns"])

    # Add the fuel type. Prefer the mapping passed in with the settings; fall back
    # to the notebook-level `preproc` config only for callers that omit `type_oil`.
    type_oil_map = kwargs["type_oil"] if "type_oil" in kwargs else preproc["type_oil"]
    data["type_oil"] = data["PROD_NAME"].map(type_oil_map)

    assert isinstance(
        kwargs["map_bins_columns"], dict
    ), "Подайте тип данных для бинаризации в формате dict"
    # bins: thresholds are read once per column instead of once per row
    for column, thresholds in kwargs["map_bins_columns"].items():
        first_val, second_val = thresholds[0], thresholds[1]
        data[f"{column}_bins"] = data[column].apply(
            lambda value: get_bins(value, first_val=first_val, second_val=second_val)
        )

    # change category types
    dict_category = {key: "category" for key in data.select_dtypes(["object"]).columns}
    data = transform_types(data=data, change_type_columns=dict_category)
    return data

In [134]:
# Run preprocessing with the settings from the config; flg_evaluate keeps its
# default (True), so columns are checked against the saved train features.
data_proc_test = pipeline_preprocess(data=data_test, **preproc)

In [135]:
# Display the raw frame again after preprocessing.
data_test

Unnamed: 0,AZS_NAME,PROD_NAME,cnt_ks,cnt_tank,cnt_trk,cnt_hand,region,location,year,month,day,day_of_week
0,AZS 557,gasoline_ai_92,2,1,4,4,Vologda_region,federal_highway,2022,4,2,Saturday
1,AZS 557,gasoline_ai_95,2,1,4,4,Vologda_region,federal_highway,2022,4,2,Saturday
2,AZS 557,diesel_fuel,2,1,4,4,Vologda_region,federal_highway,2022,4,2,Saturday
3,AZS 557,diesel_fuel_taneco,2,1,4,4,Vologda_region,federal_highway,2022,4,2,Saturday
4,AZS 557,gasoline_ai_92,3,1,4,4,Vologda_region,federal_highway,2022,4,3,Sunday
...,...,...,...,...,...,...,...,...,...,...,...,...
8000,AZS 573,gasoline_ai_95,2,1,4,4,Arkhangelsk_region,"city_0,25_0,5",2022,12,30,Friday
8001,AZS 573,diesel_fuel,2,1,4,4,Arkhangelsk_region,"city_0,25_0,5",2022,12,30,Friday
8002,AZS 573,gasoline_ai_92,2,2,4,8,Arkhangelsk_region,"city_0,25_0,5",2022,12,31,Saturday
8003,AZS 573,gasoline_ai_95,2,1,4,4,Arkhangelsk_region,"city_0,25_0,5",2022,12,31,Saturday


# 3. Evaluate  

In [136]:
# First two preprocessed rows.
data_proc_test.head(2)

Unnamed: 0,AZS_NAME,PROD_NAME,cnt_ks,cnt_tank,cnt_trk,region,location,year,month,day,day_of_week,type_oil,cnt_trk_bins
0,AZS 557,gasoline_ai_92,more_1_kassa,1,4,Vologda_region,highway,2022,4,2,Saturday,gasoline,3_and_4
1,AZS 557,gasoline_ai_95,more_1_kassa,1,4,Vologda_region,highway,2022,4,2,Saturday,gasoline,3_and_4


In [137]:
# joblib.load unpickles the file, so only load model artifacts from a trusted source.
model = joblib.load(training['model_path'])
# Drop a 'predict' column left over from a previous run of this cell, so the model
# always receives the same feature columns and the cell is safe to re-run.
features_test = data_proc_test.drop(columns=['predict'], errors='ignore')
data_proc_test['predict'] = model.predict(features_test)

In [138]:
# First three rows, now including the 'predict' column.
data_proc_test.head(3)

Unnamed: 0,AZS_NAME,PROD_NAME,cnt_ks,cnt_tank,cnt_trk,region,location,year,month,day,day_of_week,type_oil,cnt_trk_bins,predict
0,AZS 557,gasoline_ai_92,more_1_kassa,1,4,Vologda_region,highway,2022,4,2,Saturday,gasoline,3_and_4,2.770172
1,AZS 557,gasoline_ai_95,more_1_kassa,1,4,Vologda_region,highway,2022,4,2,Saturday,gasoline,3_and_4,3.697468
2,AZS 557,diesel_fuel,more_1_kassa,1,4,Vologda_region,highway,2022,4,2,Saturday,diesel,3_and_4,1.168033
