# Климат (кейс МППО 2023-24)


# Библиотеки, Функции, Классы

In [None]:
import re
import os
import optuna
import prophet
import warnings
import lightgbm
import numpy as np
import pandas as pd
import dill as pickle
import matplotlib.pyplot as plt
import optuna.integration.lightgbm as lgb
warnings.filterwarnings('ignore')
pd.set_option('display.max_columns', None)

from tqdm import tqdm
from prophet import Prophet
from typing import Optional
from itertools import product
from lightgbm import LGBMRegressor
# from etna.datasets import TSDataset
from datetime import datetime, timedelta
from holidays.holiday_base import HolidayBase
from statsmodels.tsa.arima.model import ARIMA
from sklearn.metrics import mean_squared_error, mean_absolute_error, mean_absolute_percentage_error


def _split_fixed_width(line, widths):
    """Cut one fixed-width record into a comma-separated string.

    The first field is ``line[:widths[0]]``. Every following field skips its
    first character (presumably a separator column - TODO confirm against the
    data format) and the last entry of ``widths`` is never consumed.
    """
    fields = [line[:widths[0]]]
    position = widths[0]
    for width in widths[1:-1]:
        fields.append(line[position + 1:position + width])
        position += width
    return ','.join(fields)


def create_new_data(
    src_dir='/Users/andreyboriskin/PycharmProjects/predprof/dat_files',
    dst_dir='/Users/andreyboriskin/PycharmProjects/predprof/new_dat_files/',
):
    """Convert every fixed-width file in ``src_dir`` into a CSV file in ``dst_dir``.

    Each output file is named after the first five characters of the source
    file name plus ``.csv``. A progress line is rewritten in place on stdout.

    Args:
        src_dir: Directory holding the fixed-width source files.
        dst_dir: Directory that receives the generated CSV files.
    """
    # Field widths of one record.
    sep_param = [5, 5, 3, 3, 3, 5, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 5, 2, 2,
                 3, 2, 3, 2, 3, 2, 4, 2, 3, 2, 2, 3, 2, 2, 7, 2, 6, 2, 6, 2, 6, 2, 6, 2, 6, 2, 6, 2, 6, 2, 2, 6, 2,
                 6, 2, 6, 2, 6, 2, 6, 2, 2, 4, 2, 8, 2, 2, 6, 2, 7, 2, 7, 2, 3, 2, 5, 2, 2]

    file_names = os.listdir(src_dir)
    total = len(file_names)
    for count, name_file in enumerate(file_names, start=1):
        src_path = os.path.join(src_dir, name_file)
        dst_path = os.path.join(dst_dir, f'{name_file[:5]}.csv')
        # Context managers close both handles even if a line fails to convert.
        with open(src_path, encoding='utf-8', errors='ignore') as src, \
                open(dst_path, 'w', encoding='utf-8') as dst:
            for line in src:
                dst.write(_split_fixed_width(line, sep_param) + '\n')
        # '\r' returns the cursor to the line start so the progress text is overwritten.
        print(f'\r{count}/{total}, {name_file[:5]}.csv создан', end='')

        
def create_df(file_name: str) -> pd.core.frame.DataFrame:
    """Read a header-less converted CSV and name its columns from the DDL table.

    Column ``i`` (for the first 90 entries) is renamed to ``df1.comment[i]``;
    ``df1`` is the module-level frame parsed from the DDL description and must
    exist before this function is called.
    """
    csv_path = '/Users/andreyboriskin/PycharmProjects/predprof/new_dat_files/' + file_name
    frame = pd.read_csv(csv_path, sep=',', header=None, low_memory=False)

    column_names = {idx: df1.comment[idx] for idx in range(len(df1.comment[:90]))}
    return frame.rename(columns=column_names)


def list_remove(ls, ls_remove):
    """Remove the first occurrence of each item of ``ls_remove`` from ``ls``.

    ``ls`` is modified in place and the same object is returned. ``ValueError``
    propagates if an item is not present.
    """
    for unwanted in ls_remove:
        ls.remove(unwanted)

    return ls


def root_mean_squared_error(act, pred):
    """Return the square root of the mean squared difference between ``pred`` and ``act``.

    Both arguments must support element-wise subtraction and power, and the
    result must expose ``.mean()`` (e.g. NumPy arrays or pandas Series).
    """
    squared_errors = (pred - act) ** 2

    return np.sqrt(squared_errors.mean())


def assign_ethnicity(df):
    """Build one ``"a-b-c"`` string per row from the columns at positions 1, 2 and 3.

    Despite its name, the function only joins three columns with dashes; the
    callers in this file feed the result to ``pd.to_datetime``, so the columns
    are presumably year, month and day.

    Columns and rows are both addressed by position, so the function works
    regardless of the column labels (string or integer) and of the row index.

    Args:
        df: DataFrame with at least four columns.

    Returns:
        list[str]: one dash-joined string per row, in row order.
    """
    first, second, third = (df.iloc[:, position] for position in (1, 2, 3))

    return [f"{a}-{b}-{c}" for a, b, c in zip(first, second, third)]


def create_massive_df(filename: str) -> pd.DataFrame:
    """Load one converted station file and keep a fixed subset of its columns.

    Only files with exactly 166552 rows are processed; for any other row count
    the function returns ``None``. Rows whose column 4 differs from 15 are
    dropped, the index is renumbered from zero, and a dash-joined string built
    by ``assign_ethnicity`` is stored in a new column 90.

    Returns:
        The columns ``[0, 90, 41, 47, 59, 75]`` of the filtered frame, or ``None``.
    """
    raw = pd.read_csv(f'/Users/andreyboriskin/PycharmProjects/predprof/new_dat_files/{filename}', header=None)

    if raw.shape[0] != 166552:
        return None

    kept = raw.drop(raw.loc[raw[4] != 15].index)
    kept.index = range(kept.shape[0])
    kept[90] = assign_ethnicity(kept)

    return kept.loc[:, [0, 90, 41, 47, 59, 75]]


class suppress_stdout_stderr:
    """Context manager that silences file descriptors 1 (stdout) and 2 (stderr).

    The descriptors themselves are pointed at ``os.devnull``, so output written
    at the descriptor level (not only through ``sys.stdout``) is discarded too.

    All descriptors opened in ``__init__`` are closed in ``__exit__``, so an
    instance can be used for a single ``with`` block only. ``__enter__``
    returns ``None``.
    """

    def __init__(self):
        # Two handles on the null device, one per standard stream.
        self.null_fds = [os.open(os.devnull, os.O_RDWR) for _ in range(2)]
        # Duplicates of the real stdout/stderr so they can be restored later.
        self.save_fds = [os.dup(std_fd) for std_fd in (1, 2)]

    def __enter__(self):
        # Point stdout and stderr at the null device.
        for null_fd, std_fd in zip(self.null_fds, (1, 2)):
            os.dup2(null_fd, std_fd)

    def __exit__(self, *_):
        # Put the saved descriptors back on 1 and 2.
        for saved_fd, std_fd in zip(self.save_fds, (1, 2)):
            os.dup2(saved_fd, std_fd)
        # Release every descriptor this object opened.
        for fd in (*self.null_fds, *self.save_fds):
            os.close(fd)
            
            

class OptunaLGBMRegressor:
    """LightGBM regressor trained through Optuna's LightGBM tuner (``lgb.tuner.train``)."""

    def __init__(
        self,
        n_estimators: int,
        learning_rate: float = 0.01,
        metric: str = 'rmse',
        cat_columns: str = 'auto',
        seed: int = 42
    ):
        """Store the fixed LightGBM parameters; no model is built until ``fit``.

        Args:
            n_estimators: passed to LightGBM as ``n_estimators``.
            learning_rate: passed to LightGBM as ``learning_rate``.
            metric: evaluation metric name passed to LightGBM.
            cat_columns: forwarded as ``categorical_feature`` to every Dataset.
            seed: passed to LightGBM as ``random_state``.
        """
        self.params = dict(
            n_estimators=n_estimators,
            objective="regression",
            verbosity=-1,
            metric=metric,
            learning_rate=learning_rate,
            boosting_type='gbdt',
            random_state=seed,
        )
        self.cat_columns = cat_columns
        self.model = None
        self.features = None
        self.is_fitted_ = False

    def _to_datasets(
        self, x_train: pd.DataFrame, y_train: np.ndarray, x_val: pd.DataFrame, y_val: np.ndarray
    ) -> (lightgbm.Dataset, lightgbm.Dataset):
        """Wrap the train/validation frames into LightGBM Datasets.

        The training column order is remembered in ``self.features`` and the
        validation frame is reordered/subset to the same columns.
        """
        self.features = list(x_train.columns)
        aligned_val = x_val[self.features].copy()
        shared = {'categorical_feature': self.cat_columns}
        train_set = lightgbm.Dataset(x_train, label=y_train, **shared)
        val_set = lightgbm.Dataset(aligned_val, label=y_val, **shared)

        return train_set, val_set

    def fit(self, X_train: pd.DataFrame, y_train: np.ndarray, X_val: pd.DataFrame, y_val: np.ndarray) -> None:
        """Run the Optuna tuner on the given split and keep the resulting booster in ``self.model``."""
        train_set, val_set = self._to_datasets(X_train, y_train, X_val, y_val)

        # Both sets are passed as validation sets, training set first.
        self.model = lgb.tuner.train(
            self.params,
            train_set,
            valid_sets=[train_set, val_set],
        )

        self.is_fitted_ = True

    def predict(self, X_test: pd.DataFrame) -> np.ndarray:
        """Predict for ``X_test`` using only the columns seen in training, at the best iteration."""
        assert self.is_fitted_, 'Model is not fitted!'
        feature_frame = X_test[self.features]
        return self.model.predict(feature_frame, num_iteration=self.model.best_iteration)


class ProphetsEnsemble:
    """A set of Prophet models: one on the base series and one per resampled view.

    A "level" is a string ``"<freq>_<agg>"`` (for example ``"W_mean"``) built
    from every combination of ``levels`` and ``agg_fn``. ``fit`` trains one
    Prophet model per level plus one on the untouched base frequency, and
    ``forecast`` joins all their outputs into a single wide frame keyed by the
    base-frequency timestamps.
    """

    def __init__(self, freq: str, levels: list, agg_fn: list, holidays_getter: Optional["HolidayBase"] = None):
        """Store the configuration; no model is created until ``fit``.

        Args:
            freq: frequency alias of the input series (used as the base level).
            levels: frequency aliases to resample to.
            agg_fn: aggregation names; see ``_resample`` for the supported ones.
            holidays_getter: optional object whose ``get(date)`` returns a
                holiday name or ``None``.
        """
        self.freq = freq
        self.levels = ['_'.join(x) for x in product(levels, agg_fn)]
        self.h_getter = holidays_getter
        self.prophets_ = dict()
        self.is_fitted_ = False

    @staticmethod
    def _resample(data: pd.DataFrame, freq: str, how: str) -> pd.DataFrame:
        """Resample ``data`` (indexed by its ``ds`` column) to ``freq`` with aggregation ``how``.

        Raises:
            NotImplementedError: if ``how`` is not median, mean or sum.
        """
        if how not in ['median', 'mean', 'sum']:
            raise NotImplementedError(f'Unknown function {how}. Only [median, mean, sum] are supported.')
        return data.set_index('ds').resample(freq).agg(how).reset_index(drop=False)

    @staticmethod
    def _merge_key_gen(x, level: str) -> str:
        """Return the merge key of timestamp ``x`` at the granularity of ``level``.

        The frequency is the part of ``level`` before the first underscore with
        digits removed. Hourly (``H``/``h``), daily (``D``), weekly (``W``, ISO
        year and week) and monthly (``M``/``ME``/``MS``) are supported.

        Raises:
            NotImplementedError: for any other frequency.
        """
        # Raw string: '\d' in a normal literal is an invalid escape sequence.
        freq = re.sub(r'\d', '', level.split('_')[0])
        if freq in ('H', 'h'):
            return f'{x.year}-{x.month}-{x.day}-{x.hour}'
        if freq == 'D':
            return f'{x.year}-{x.month}-{x.day}'
        if freq in ('M', 'ME', 'MS'):
            return f'{x.year}-{x.month}'
        if freq == 'W':
            iso = x.isocalendar()
            return f'{iso.year}-{iso.week}'
        raise NotImplementedError(f'Only [H, D, W, M] are supported. {freq} was recieved as input!')

    def _get_holidays(self, data: pd.DataFrame) -> Optional[pd.DataFrame]:
        """Return the rows of ``data['ds']`` that the holiday getter recognises, or ``None`` without a getter."""
        if self.h_getter is None:
            return None
        holidays = data[['ds']].copy()
        holidays['holiday'] = holidays['ds'].apply(self.h_getter.get)
        # Dates for which the getter returned None are not holidays.
        return holidays.dropna()

    def _fit_level(self, data: pd.DataFrame, level: str) -> None:
        """Fit one Prophet model for ``level`` and store it in ``self.prophets_``."""
        # The base level is fitted on the data as given; other levels are resampled first.
        if level != self.freq:
            resampled = self._resample(data, *level.split('_'))
        else:
            resampled = data.copy()
        fb = Prophet(holidays=self._get_holidays(resampled))
        with suppress_stdout_stderr():
            fb.fit(resampled)
        self.prophets_[level] = fb

    def _predict_level(self, periods: int, level: str) -> pd.DataFrame:
        """Forecast ``periods`` steps with the model of ``level``; every column gets a ``_<level>`` suffix."""
        fb = self.prophets_[level]
        future = fb.make_future_dataframe(periods=periods, freq=level.split('_')[0])
        forecasts = fb.predict(future)
        forecasts.columns = [f'{x}_{level}' for x in forecasts.columns]
        return forecasts

    def _combine_levels(self, base_df: pd.DataFrame, data: pd.DataFrame, level: str) -> pd.DataFrame:
        """Left-join the forecast of ``level`` onto ``base_df`` through the level's merge key."""
        def key(x):
            return self._merge_key_gen(x, level)

        keyed_base = base_df.assign(key=base_df['ds'].apply(key))
        keyed_level = data.assign(key=data[f'ds_{level}'].apply(key))
        merged = keyed_base.merge(keyed_level, on='key', how='left')
        return merged.drop(['key', f'ds_{level}'], axis=1)

    @staticmethod
    def _drop_redundant(data: pd.DataFrame) -> pd.DataFrame:
        """Drop constant columns, always keeping ``ds`` and every column whose name contains ``yhat``."""
        redundant = [
            col for col in data.columns
            if col != 'ds' and 'yhat' not in col and len(data[col].unique()) == 1
        ]
        return data.drop(redundant, axis=1)

    def fit(self, data: pd.DataFrame) -> None:
        """Fit the base model and one model per aggregation level on ``data`` (columns ``ds`` and ``y``)."""
        for level in tqdm([self.freq] + self.levels, 'Fitting prophets...'):
            self._fit_level(data, level)
        self.is_fitted_ = True

    def forecast(self, periods: int) -> pd.DataFrame:
        """Forecast ``periods`` steps on every level and merge the results on the base timeline.

        Raises:
            AssertionError: if ``fit`` has not been called.
        """
        # Raised explicitly so the check also runs under ``python -O``.
        if not self.is_fitted_:
            raise AssertionError('Model is not fitted')
        forecasts = [self._predict_level(periods, level) for level in tqdm([self.freq] + self.levels, 'Forecasting...')]

        # The base-level forecast provides the 'ds' column every other level is joined onto.
        forecast = forecasts[0].rename(columns={f'ds_{self.freq}': 'ds'})
        for level, fore in zip(self.levels, forecasts[1:]):
            forecast = self._combine_levels(forecast, fore, level)

        return self._drop_redundant(forecast)

# Создание общего DataFrame

In [167]:
# filelink = 'Srok8c.ddl'
# ddl_data = open(filelink,'r', encoding='cp1251').read()
# # Сохраняем в кодировке utf-8
# fn = open('Srok8c.ddl','w', encoding='utf-8')
# fn.write(ddl_data)
# fn.close()

In [2]:
# Parse the field descriptions of Srok8c.ddl (from the first 'KEY' up to the
# last newline) into df1 with the columns keys / name / fa / pc / comment.
with open('Srok8c.ddl', 'r', encoding='utf-8') as ddl_file:
    dt1 = ddl_file.read()
dt1 = dt1[dt1.find('KEY'):dt1.rfind('\n')].split('\n')
dt2 = []
for ddl_line in dt1:
    # Skip blank lines and lines with a '//' comment marker starting in one of the first three columns.
    if not ddl_line.strip() or '//' in ddl_line[:4]:
        continue
    tmp = ddl_line.split()
    # An 'NA;' token in fifth position is removed; ';' is appended to the fourth
    # token so the fixed [3:-2] slice below strips the same number of characters.
    if tmp[4] == 'NA;':
        tmp[3] += ';'
        tmp.pop(4)
    # Drop the 3-character prefix and the trailing character; when several
    # comma-separated numbers are given only the first one is kept.
    fa = tmp[2][3:-1].split(',')[0]
    pc = tmp[3][3:-2]
    # The fifth token is discarded; everything after it is the comment text.
    comment = ' '.join(tmp[5:])
    dt2.append([tmp[0], tmp[1], fa, pc, comment])
df1 = pd.DataFrame(dt2, columns=['keys', 'name', 'fa', 'pc', 'comment'])
df1.fa = df1.fa.astype('int')

In [3]:
# Load the converted file 27612.csv and discard the 'Признак качества' column(s).
df = create_df('27612.csv')
df = df.drop(columns=['Признак качества'])

In [4]:
# Keep only rows whose 'Срок по Гринвичу' value is 15, renumber the rows from
# zero and add a datetime column built from the dash-joined strings.
off_term = df['Срок по Гринвичу'] != 15
df = df.drop(df.index[off_term])
df.index = range(df.shape[0])
df['date'] = assign_ethnicity(df)
df['date'] = pd.to_datetime(df['date'])

# Создание временного ряда с помощью модели улучшенной Prophet 

## Температура

In [64]:
# Temperature target: rename the source column to 'y', drop the rows whose
# value is the 5-space blank placeholder and cast the rest to float.
df_te = df.copy().rename(columns={'Температура воздуха по сухому терм-ру': 'y'})
blank_rows = df_te.index[df_te['y'] == '     ']
df_te = df_te.drop(index=blank_rows)
df_te['y'] = df_te['y'].astype('float64')
# Two-column frame in the ds / y layout that Prophet is fitted on below.
data = pd.DataFrame({'ds': df_te['date'], 'y': df_te['y']})

In [65]:
# The whole history is used for training; the hold-out split is disabled.
train_series = data

# Forecast horizon: one row per day from start_date (inclusive) to end_date (exclusive).
start_date = '2022-12-31'
end_date = '2023-01-07'
start = datetime.strptime(start_date, '%Y-%m-%d')
end = datetime.strptime(end_date, '%Y-%m-%d')
n_days = (end - start).days

daterange = [(start + timedelta(days=offset)).strftime('%Y-%m-%d') for offset in range(n_days)]
new_test_series = pd.DataFrame(pd.to_datetime(daterange), columns=['ds'])

In [66]:
# Fit a default Prophet model on the temperature series with stdout/stderr silenced.
fb_te = prophet.Prophet()
with suppress_stdout_stderr():
    fb_te.fit(train_series)

# Extend the training timeline by one daily step per requested date and predict over all of it.
horizon = len(new_test_series)
predictions = fb_te.make_future_dataframe(periods=horizon, freq='D')
forecast = fb_te.predict(predictions)

# v_fb_df = test_series.copy()
# v_fb_df = v_fb_df.merge(forecast[['ds', 'yhat']], on='ds', how='left')

20:34:09 - cmdstanpy - INFO - Chain [1] start processing
20:34:10 - cmdstanpy - INFO - Chain [1] done processing


In [67]:
# Persist the fitted Prophet model with dill (imported as pickle); the file is
# a pickle despite its extension.
filename = 'fb_model_te.h5py'
model_path = '/Users/andreyboriskin/PycharmProjects/predprof/models/model_te/' + filename
with open(model_path, 'wb') as file:
    pickle.dump(fb_te, file)

In [69]:
from sklearn.model_selection import train_test_split

# Attach every Prophet forecast column to the observed series; these columns
# are the features of the LightGBM stage.
gbt_data = train_series.merge(forecast, on='ds', how='left')
# NOTE(review): train_test_split shuffles rows, so validation rows are
# interleaved in time with training rows.
train_gbt, val_gbt = train_test_split(gbt_data, test_size=0.15, random_state=42)

lgbm_te = OptunaLGBMRegressor(n_estimators=300, learning_rate=0.01, metric='mape', seed=42)

non_features = ['ds', 'y']
lgbm_te.fit(
    train_gbt.drop(columns=non_features),
    train_gbt['y'].values,
    val_gbt.drop(columns=non_features),
    val_gbt['y'].values,
)

# Forecast rows for the requested dates; there is no 'y' here, so only 'ds' is dropped.
test_gbt = new_test_series.merge(forecast, on='ds', how='left')
preds = lgbm_te.predict(test_gbt.drop(columns=['ds']))

# forecast_df['gbt_yhat'] holds the model prediction.
forecast_df = test_gbt[['ds']].copy()
forecast_df['gbt_yhat'] = preds

[I 2024-02-06 20:35:36,508] A new study created in memory with name: no-name-e421168c-2826-4988-a4eb-faa35c48140d

  0%|                                                     | 0/7 [00:00<?, ?it/s][A
feature_fraction, val_score: inf:   0%|                   | 0/7 [00:00<?, ?it/s][A
feature_fraction, val_score: 0.822806:   0%|              | 0/7 [00:00<?, ?it/s][A
feature_fraction, val_score: 0.822806:  14%|8     | 1/7 [00:00<00:03,  1.53it/s][A[I 2024-02-06 20:35:37,164] Trial 0 finished with value: 0.8228058789572678 and parameters: {'feature_fraction': 0.6}. Best is trial 0 with value: 0.8228058789572678.

feature_fraction, val_score: 0.822806:  14%|8     | 1/7 [00:00<00:03,  1.53it/s][A
feature_fraction, val_score: 0.822132:  14%|8     | 1/7 [00:01<00:03,  1.53it/s][A
feature_fraction, val_score: 0.822132:  29%|#7    | 2/7 [00:01<00:03,  1.47it/s][A[I 2024-02-06 20:35:37,861] Trial 1 finished with value: 0.8221324983878934 and parameters: {'feature_fraction': 0.7}. Best is tria

In [70]:
# File name under which the tuned temperature LightGBM wrapper is pickled in the next cell.
filename = 'model_te.h5py'

In [71]:
# Pickle the tuned temperature LightGBM wrapper under the current `filename`.
model_path = '/Users/andreyboriskin/PycharmProjects/predprof/models/model_te/' + filename
with open(model_path, 'wb') as file:
    pickle.dump(lgbm_te, file)

In [None]:
# Load the pickled LightGBM wrapper from models/model_te/, the directory the
# dump cell writes it to.
# NOTE: unpickling executes arbitrary code - only load files you created yourself.
with open('/Users/andreyboriskin/PycharmProjects/predprof/models/model_te/' + filename, 'rb') as f:
    loaded_model = pickle.load(f)

In [61]:
# Predict with the reloaded model. Assuming it is an OptunaLGBMRegressor, predict()
# selects the training feature columns itself, so the extra 'ds' column is ignored.
loaded_model.predict(test_gbt)

array([-1.2584451 , -1.37954306, -1.22268945, -2.03356601, -1.26723486,
       -2.82087916, -2.62727146])

## Осадки

In [72]:
# Precipitation target: rename the source column to 'y', drop the rows whose
# value is the 6-space blank placeholder and cast the rest to float.
df_os = df.copy().rename(columns={'Сумма осадков': 'y'})
blank_rows = df_os.index[df_os['y'] == '      ']
df_os = df_os.drop(index=blank_rows)
df_os['y'] = df_os['y'].astype('float64')
# Two-column frame in the ds / y layout that Prophet is fitted on below.
data = pd.DataFrame({'ds': df_os['date'], 'y': df_os['y']})

In [73]:
# The whole history is used for training; the hold-out split is disabled.
train_series = data

# Forecast horizon: one row per day from start_date (inclusive) to end_date (exclusive).
start_date = '2022-12-31'
end_date = '2023-01-07'
start = datetime.strptime(start_date, '%Y-%m-%d')
end = datetime.strptime(end_date, '%Y-%m-%d')
n_days = (end - start).days

daterange = [(start + timedelta(days=offset)).strftime('%Y-%m-%d') for offset in range(n_days)]
new_test_series = pd.DataFrame(pd.to_datetime(daterange), columns=['ds'])

In [74]:
# Fit a default Prophet model on the precipitation series with stdout/stderr silenced.
fb_os = prophet.Prophet()
with suppress_stdout_stderr():
    fb_os.fit(train_series)

# Extend the training timeline by one daily step per requested date and predict over all of it.
horizon = len(new_test_series)
predictions = fb_os.make_future_dataframe(periods=horizon, freq='D')
forecast = fb_os.predict(predictions)

# v_fb_df = test_series.copy()
# v_fb_df = v_fb_df.merge(forecast[['ds', 'yhat']], on='ds', how='left')

20:37:56 - cmdstanpy - INFO - Chain [1] start processing
20:37:58 - cmdstanpy - INFO - Chain [1] done processing


In [75]:
# Persist the fitted Prophet model with dill (imported as pickle); the file is
# a pickle despite its extension.
filename = 'fb_model_os.h5py'
model_path = '/Users/andreyboriskin/PycharmProjects/predprof/models/model_os/' + filename
with open(model_path, 'wb') as file:
    pickle.dump(fb_os, file)

In [76]:
from sklearn.model_selection import train_test_split

# Attach every Prophet forecast column to the observed series; these columns
# are the features of the LightGBM stage.
gbt_data = train_series.merge(forecast, on='ds', how='left')
# NOTE(review): train_test_split shuffles rows, so validation rows are
# interleaved in time with training rows.
train_gbt, val_gbt = train_test_split(gbt_data, test_size=0.15, random_state=42)

lgbm_os = OptunaLGBMRegressor(n_estimators=300, learning_rate=0.01, metric='mape', seed=42)

non_features = ['ds', 'y']
lgbm_os.fit(
    train_gbt.drop(columns=non_features),
    train_gbt['y'].values,
    val_gbt.drop(columns=non_features),
    val_gbt['y'].values,
)

# Forecast rows for the requested dates; there is no 'y' here, so only 'ds' is dropped.
test_gbt = new_test_series.merge(forecast, on='ds', how='left')
preds = lgbm_os.predict(test_gbt.drop(columns=['ds']))

# forecast_df['gbt_yhat'] holds the model prediction.
forecast_df = test_gbt[['ds']].copy()
forecast_df['gbt_yhat'] = preds

[I 2024-02-06 20:38:00,763] A new study created in memory with name: no-name-cf8b3281-0246-4300-aeef-798fbfb14282

  0%|                                                     | 0/7 [00:00<?, ?it/s][A
feature_fraction, val_score: inf:   0%|                   | 0/7 [00:00<?, ?it/s][A
feature_fraction, val_score: 0.470755:   0%|              | 0/7 [00:00<?, ?it/s][A
feature_fraction, val_score: 0.470755:  14%|8     | 1/7 [00:00<00:02,  2.05it/s][A[I 2024-02-06 20:38:01,255] Trial 0 finished with value: 0.4707550201916411 and parameters: {'feature_fraction': 1.0}. Best is trial 0 with value: 0.4707550201916411.

feature_fraction, val_score: 0.470755:  14%|8     | 1/7 [00:00<00:02,  2.05it/s][A
feature_fraction, val_score: 0.470755:  14%|8     | 1/7 [00:00<00:02,  2.05it/s][A
feature_fraction, val_score: 0.470755:  29%|#7    | 2/7 [00:00<00:02,  2.10it/s][A[I 2024-02-06 20:38:01,722] Trial 1 finished with value: 0.4719807070947092 and parameters: {'feature_fraction': 0.8}. Best is tria

In [77]:
# File name under which the tuned precipitation LightGBM wrapper is pickled in the next cell.
filename = 'model_os.h5py'

In [78]:
# Pickle the tuned precipitation LightGBM wrapper under the current `filename`.
model_path = '/Users/andreyboriskin/PycharmProjects/predprof/models/model_os/' + filename
with open(model_path, 'wb') as file:
    pickle.dump(lgbm_os, file)

## Влажность

In [79]:
# Humidity target: rename the source column to 'y', drop the rows whose value
# is the 3-space blank placeholder and cast the rest to integer.
df_wl = df.copy().rename(columns={'Относительная влажность воздуха': 'y'})
blank_rows = df_wl.index[df_wl['y'] == '   ']
df_wl = df_wl.drop(index=blank_rows)
df_wl['y'] = df_wl['y'].astype('int64')
# Two-column frame in the ds / y layout that Prophet is fitted on below.
data = pd.DataFrame({'ds': df_wl['date'], 'y': df_wl['y']})

In [80]:
# The whole history is used for training; the hold-out split is disabled.
train_series = data

# Forecast horizon: one row per day from start_date (inclusive) to end_date (exclusive).
start_date = '2022-12-31'
end_date = '2023-01-07'
start = datetime.strptime(start_date, '%Y-%m-%d')
end = datetime.strptime(end_date, '%Y-%m-%d')
n_days = (end - start).days

daterange = [(start + timedelta(days=offset)).strftime('%Y-%m-%d') for offset in range(n_days)]
new_test_series = pd.DataFrame(pd.to_datetime(daterange), columns=['ds'])

In [82]:
# Fit a default Prophet model on the humidity series with stdout/stderr silenced.
fb_wl = prophet.Prophet()
with suppress_stdout_stderr():
    fb_wl.fit(train_series)

# Extend the training timeline by one daily step per requested date and predict over all of it.
horizon = len(new_test_series)
predictions = fb_wl.make_future_dataframe(periods=horizon, freq='D')
forecast = fb_wl.predict(predictions)

# v_fb_df = test_series.copy()
# v_fb_df = v_fb_df.merge(forecast[['ds', 'yhat']], on='ds', how='left')

20:41:16 - cmdstanpy - INFO - Chain [1] start processing
20:41:16 - cmdstanpy - INFO - Chain [1] done processing


In [83]:
# Persist the fitted Prophet model with dill (imported as pickle); the file is
# a pickle despite its extension.
filename = 'fb_model_wl.h5py'
model_path = '/Users/andreyboriskin/PycharmProjects/predprof/models/model_wl/' + filename
with open(model_path, 'wb') as file:
    pickle.dump(fb_wl, file)

In [84]:
from sklearn.model_selection import train_test_split

# Attach every Prophet forecast column to the observed series; these columns
# are the features of the LightGBM stage.
gbt_data = train_series.merge(forecast, on='ds', how='left')
# NOTE(review): train_test_split shuffles rows, so validation rows are
# interleaved in time with training rows.
train_gbt, val_gbt = train_test_split(gbt_data, test_size=0.15, random_state=42)

lgbm_wl = OptunaLGBMRegressor(n_estimators=300, learning_rate=0.01, metric='mape', seed=42)

non_features = ['ds', 'y']
lgbm_wl.fit(
    train_gbt.drop(columns=non_features),
    train_gbt['y'].values,
    val_gbt.drop(columns=non_features),
    val_gbt['y'].values,
)

# Forecast rows for the requested dates; there is no 'y' here, so only 'ds' is dropped.
test_gbt = new_test_series.merge(forecast, on='ds', how='left')
preds = lgbm_wl.predict(test_gbt.drop(columns=['ds']))

# forecast_df['gbt_yhat'] holds the model prediction.
forecast_df = test_gbt[['ds']].copy()
forecast_df['gbt_yhat'] = preds

[I 2024-02-06 20:41:19,816] A new study created in memory with name: no-name-09ec0aa4-d230-4bf8-90db-6bed6ddf4e75

  0%|                                                     | 0/7 [00:00<?, ?it/s][A
feature_fraction, val_score: inf:   0%|                   | 0/7 [00:00<?, ?it/s][A
feature_fraction, val_score: 0.208844:   0%|              | 0/7 [00:00<?, ?it/s][A
feature_fraction, val_score: 0.208844:  14%|8     | 1/7 [00:00<00:03,  1.54it/s][A[I 2024-02-06 20:41:20,470] Trial 0 finished with value: 0.20884399567835749 and parameters: {'feature_fraction': 0.6}. Best is trial 0 with value: 0.20884399567835749.

feature_fraction, val_score: 0.208844:  14%|8     | 1/7 [00:00<00:03,  1.54it/s][A
feature_fraction, val_score: 0.208822:  14%|8     | 1/7 [00:01<00:03,  1.54it/s][A
feature_fraction, val_score: 0.208822:  29%|#7    | 2/7 [00:01<00:03,  1.48it/s][A[I 2024-02-06 20:41:21,160] Trial 1 finished with value: 0.20882228634284475 and parameters: {'feature_fraction': 1.0}. Best is t

In [85]:
# File name under which the tuned humidity LightGBM wrapper is pickled in the next cell.
filename = 'model_wl.h5py'

In [86]:
# Pickle the tuned humidity LightGBM wrapper under the current `filename`.
model_path = '/Users/andreyboriskin/PycharmProjects/predprof/models/model_wl/' + filename
with open(model_path, 'wb') as file:
    pickle.dump(lgbm_wl, file)

## Ветер

In [87]:
# Wind target: rename the source column to 'y', drop the rows whose value is
# the 2-space blank placeholder and cast the rest to integer.
df_vt = df.copy().rename(columns={'Средняя скорость ветра': 'y'})
blank_rows = df_vt.index[df_vt['y'] == '  ']
df_vt = df_vt.drop(index=blank_rows)
df_vt['y'] = df_vt['y'].astype('int64')
# Two-column frame in the ds / y layout that Prophet is fitted on below.
data = pd.DataFrame({'ds': df_vt['date'], 'y': df_vt['y']})

In [88]:
# The whole history is used for training; the hold-out split is disabled.
train_series = data

# Forecast horizon: one row per day from start_date (inclusive) to end_date (exclusive).
start_date = '2022-12-31'
end_date = '2023-01-07'
start = datetime.strptime(start_date, '%Y-%m-%d')
end = datetime.strptime(end_date, '%Y-%m-%d')
n_days = (end - start).days

daterange = [(start + timedelta(days=offset)).strftime('%Y-%m-%d') for offset in range(n_days)]
new_test_series = pd.DataFrame(pd.to_datetime(daterange), columns=['ds'])

In [89]:
# Fit a default Prophet model on the wind-speed series with stdout/stderr silenced.
fb_vt = prophet.Prophet()
with suppress_stdout_stderr():
    fb_vt.fit(train_series)

# Extend the training timeline by one daily step per requested date and predict over all of it.
horizon = len(new_test_series)
predictions = fb_vt.make_future_dataframe(periods=horizon, freq='D')
forecast = fb_vt.predict(predictions)

# v_fb_df = test_series.copy()
# v_fb_df = v_fb_df.merge(forecast[['ds', 'yhat']], on='ds', how='left')

20:43:19 - cmdstanpy - INFO - Chain [1] start processing
20:43:20 - cmdstanpy - INFO - Chain [1] done processing
num_leaves, val_score: 0.749178:  55%|#####5    | 11/20 [08:19<06:48, 45.40s/it]


In [90]:
# Persist the fitted Prophet model with dill (imported as pickle); the file is
# a pickle despite its extension.
filename = 'fb_model_vt.h5py'
model_path = '/Users/andreyboriskin/PycharmProjects/predprof/models/model_vt/' + filename
with open(model_path, 'wb') as file:
    pickle.dump(fb_vt, file)

In [91]:
from sklearn.model_selection import train_test_split

# Attach every Prophet forecast column to the observed series; these columns
# are the features of the LightGBM stage.
gbt_data = train_series.merge(forecast, on='ds', how='left')
# NOTE(review): train_test_split shuffles rows, so validation rows are
# interleaved in time with training rows.
train_gbt, val_gbt = train_test_split(gbt_data, test_size=0.15, random_state=42)

lgbm_vt = OptunaLGBMRegressor(n_estimators=300, learning_rate=0.01, metric='mape', seed=42)

non_features = ['ds', 'y']
lgbm_vt.fit(
    train_gbt.drop(columns=non_features),
    train_gbt['y'].values,
    val_gbt.drop(columns=non_features),
    val_gbt['y'].values,
)

# Forecast rows for the requested dates; there is no 'y' here, so only 'ds' is dropped.
test_gbt = new_test_series.merge(forecast, on='ds', how='left')
preds = lgbm_vt.predict(test_gbt.drop(columns=['ds']))

# forecast_df['gbt_yhat'] holds the model prediction.
forecast_df = test_gbt[['ds']].copy()
forecast_df['gbt_yhat'] = preds

[I 2024-02-06 20:43:23,545] A new study created in memory with name: no-name-f4316c2f-6a20-4680-9efc-a2d48da4fbfd
feature_fraction, val_score: 0.548547:  14%|8     | 1/7 [00:00<00:03,  1.60it/s][I 2024-02-06 20:43:24,170] Trial 0 finished with value: 0.548546924616191 and parameters: {'feature_fraction': 0.7}. Best is trial 0 with value: 0.548546924616191.
feature_fraction, val_score: 0.548314:  29%|#7    | 2/7 [00:01<00:03,  1.56it/s][I 2024-02-06 20:43:24,820] Trial 1 finished with value: 0.5483135361609164 and parameters: {'feature_fraction': 0.8}. Best is trial 1 with value: 0.5483135361609164.
feature_fraction, val_score: 0.548314:  43%|##5   | 3/7 [00:01<00:02,  1.67it/s][I 2024-02-06 20:43:25,369] Trial 2 finished with value: 0.5483808766156537 and parameters: {'feature_fraction': 0.4}. Best is trial 1 with value: 0.5483135361609164.
feature_fraction, val_score: 0.548247:  57%|###4  | 4/7 [00:02<00:01,  1.67it/s][I 2024-02-06 20:43:25,970] Trial 3 finished with value: 0.54824669

In [None]:
# Pickle the tuned wind-speed LightGBM wrapper.
filename = 'model_vt.h5py'

model_path = '/Users/andreyboriskin/PycharmProjects/predprof/models/model_vt/' + filename
with open(model_path, 'wb') as file:
    pickle.dump(lgbm_vt, file)