In [114]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import xgboost as xgb
from prophet import Prophet

from sklearn.metrics import mean_squared_error
from sklearn import preprocessing
from sklearn.multioutput import MultiOutputRegressor
from sklearn.model_selection import TimeSeriesSplit

import os
os.chdir('..')
import gc
import sys


import datetime
from utils import utils, models

In [3]:
daily_df = pd.read_csv('data/daily_data.csv')

In [17]:
daily_df.tail()

Unnamed: 0,station,date,temp_max,temp_mean,temp_min,rainfall,snow
76667,PALH0,2023-11-18,-6.7,-10.191667,-13.3,False,False
76668,PALH0,2023-11-19,-10.6,-13.8125,-16.1,False,False
76669,PALH0,2023-11-20,-10.6,-16.1625,-18.9,False,False
76670,PALH0,2023-11-21,-4.4,-9.5125,-12.8,False,False
76671,PALH0,2023-11-22,-3.9,-4.911765,-6.4,True,True


In [18]:
# test_date = str(datetime.date.today() - datetime.timedelta(days=4))
test_date = '2023-11-16'
daily = True

In [20]:
daily_df["date"] = pd.to_datetime(daily_df["date"])


In [21]:
train_data_pd, test_data_pd = models.train_test_split(daily_df, test_date, daily)

In [22]:
train_data_pd.tail()

Unnamed: 0,station,date,temp_max,temp_mean,temp_min,rainfall,snow
76660,PALH0,2023-11-11,-0.5,-3.529167,-7.2,True,True
76661,PALH0,2023-11-12,-2.5,-6.104167,-10.0,False,False
76662,PALH0,2023-11-13,-2.2,-4.766667,-6.1,True,True
76663,PALH0,2023-11-14,-3.0,-6.720833,-10.0,False,False
76664,PALH0,2023-11-15,2.2,-3.370833,-6.0,True,True


In [23]:
test_data_pd

Unnamed: 0,station,date,temp_max,temp_mean,temp_min,rainfall,snow
3646,72202,2023-11-16,27.0,23.775000,21.0,True,False
3647,72202,2023-11-17,28.3,25.033333,23.0,True,False
3648,72202,2023-11-18,26.1,23.812500,22.0,True,False
3649,72202,2023-11-19,29.0,24.333333,21.0,False,False
3650,72202,2023-11-20,29.4,25.304167,20.6,False,False
...,...,...,...,...,...,...,...
76667,PALH0,2023-11-18,-6.7,-10.191667,-13.3,False,False
76668,PALH0,2023-11-19,-10.6,-13.812500,-16.1,False,False
76669,PALH0,2023-11-20,-10.6,-16.162500,-18.9,False,False
76670,PALH0,2023-11-21,-4.4,-9.512500,-12.8,False,False


# Prophet training

### First for one station 

In [56]:
train_1 = train_data_pd.loc[train_data_pd.station == '72202']
test_1 = test_data_pd.loc[test_data_pd.station == '72202']

In [57]:
train_1 = pd.DataFrame(train_1.rename(columns={'date': 'ds', 'temp_min': 'y'}))
test_1 = pd.DataFrame(test_1.rename(columns={'date': 'ds', 'temp_min': 'y'}))

### Adding regressors

In [26]:
model = Prophet()

In [27]:
train_1.columns

Index(['station', 'ds', 'temp_max', 'temp_mean', 'y', 'rainfall', 'snow'], dtype='object')

In [28]:
model.add_regressor(name='temp_max')
model.add_regressor(name='temp_mean')
model.add_regressor(name='rainfall')
model.add_regressor(name='snow')

<prophet.forecaster.Prophet at 0x7f8b43a2ed40>

In [29]:
model.fit(train_1[['ds', 'y', 'temp_max', 'temp_mean', 'rainfall', 'snow']])

10:27:27 - cmdstanpy - INFO - Chain [1] start processing
10:27:27 - cmdstanpy - INFO - Chain [1] done processing


<prophet.forecaster.Prophet at 0x7f8b43a2ed40>

In [31]:
future = model.make_future_dataframe(periods=7)
future = future.merge(test_1[['ds', 'temp_max', 'temp_mean', 'rainfall', 'snow']], on='ds') 

In [32]:
forecast_1 = model.predict(future)

In [33]:
eval = (pd.DataFrame(test_1[['ds', 'y']])
              .merge(forecast_1[['ds', 'yhat']], on='ds')
              )

In [35]:
eval.head()

Unnamed: 0,ds,y,yhat
0,2023-11-16,21.0,20.86413
1,2023-11-17,23.0,22.177245
2,2023-11-18,22.0,21.341177
3,2023-11-19,21.0,20.522439
4,2023-11-20,20.6,21.985756


In [37]:
np.sqrt(mean_squared_error(eval.y, eval.yhat))

0.7565998817695939

#### Avg temp

In [58]:
train_1 = pd.DataFrame(train_1.rename(columns={'date': 'ds', 'y': 'temp_min'})).rename(columns={'temp_mean': 'y'})
test_1 = pd.DataFrame(test_1.rename(columns={'date': 'ds', 'y': 'temp_min'})).rename(columns={'temp_mean': 'y'})

In [62]:
model = Prophet()
model.add_regressor(name='temp_max')
model.add_regressor(name='temp_min')
model.add_regressor(name='rainfall')
model.add_regressor(name='snow')

model.fit(train_1[['ds', 'y', 'temp_max', 'temp_min', 'rainfall', 'snow']])

future = model.make_future_dataframe(periods=7)
future = future.merge(test_1[['ds', 'temp_max', 'temp_min', 'rainfall', 'snow']], on='ds')
forecast_1 = model.predict(future)


10:38:52 - cmdstanpy - INFO - Chain [1] start processing
10:38:53 - cmdstanpy - INFO - Chain [1] done processing


In [63]:
eval = (pd.DataFrame(test_1[['ds', 'y']])
              .merge(forecast_1[['ds', 'yhat']], on='ds')
              )

eval.head()
np.sqrt(mean_squared_error(eval.y, eval.yhat))

0.376564034237821

In [64]:
eval.head()

Unnamed: 0,ds,y,yhat
0,2023-11-16,23.775,23.860366
1,2023-11-17,25.033333,25.458914
2,2023-11-18,23.8125,24.021066
3,2023-11-19,24.333333,24.755379
4,2023-11-20,25.304167,24.657423


In [65]:
train_1 = pd.DataFrame(train_1.rename(columns={'date': 'ds', 'y': 'temp_mean'})).rename(columns={'temp_max': 'y'})
test_1 = pd.DataFrame(test_1.rename(columns={'date': 'ds', 'y': 'temp_mean'})).rename(columns={'temp_max': 'y'})

In [66]:
model = Prophet()
model.add_regressor(name='temp_mean')
model.add_regressor(name='temp_min')
model.add_regressor(name='rainfall')
model.add_regressor(name='snow')

model.fit(train_1[['ds', 'y', 'temp_mean', 'temp_min', 'rainfall', 'snow']])

future = model.make_future_dataframe(periods=7)
future = future.merge(test_1[['ds', 'temp_mean', 'temp_min', 'rainfall', 'snow']], on='ds')

forecast_1 = model.predict(future)

eval = (test_1
        .merge(forecast_1[['ds', 'yhat', 'yhat_lower','yhat_upper']], on='ds')
              )

np.sqrt(mean_squared_error(eval.y, eval.yhat))

10:39:44 - cmdstanpy - INFO - Chain [1] start processing
10:39:45 - cmdstanpy - INFO - Chain [1] done processing


0.5742560326199894

In [67]:
eval

Unnamed: 0,station,ds,y,temp_mean,temp_min,rainfall,snow,yhat,yhat_lower,yhat_upper
0,72202,2023-11-16,27.0,23.775,21.0,True,False,27.12319,26.009474,28.065607
1,72202,2023-11-17,28.3,25.033333,23.0,True,False,27.816811,26.668746,28.825116
2,72202,2023-11-18,26.1,23.8125,22.0,True,False,26.547128,25.420279,27.591037
3,72202,2023-11-19,29.0,24.333333,21.0,False,False,28.039737,26.975185,29.071069
4,72202,2023-11-20,29.4,25.304167,20.6,False,False,29.809738,28.747841,30.854994
5,72202,2023-11-21,28.3,26.345833,24.4,False,False,29.033973,27.979343,30.055078
6,72202,2023-11-22,28.9,25.5,23.3,False,False,28.419257,27.475474,29.479997


In [68]:
train_1 = pd.DataFrame(train_1.rename(columns={'date': 'ds', 'y': 'temp_max'})).rename(columns={'snow': 'y'})
test_1 = pd.DataFrame(test_1.rename(columns={'date': 'ds', 'y': 'temp_max'})).rename(columns={'snow': 'y'})


model = Prophet()
model.add_regressor(name='temp_mean')
model.add_regressor(name='temp_min')
model.add_regressor(name='rainfall')
model.add_regressor(name='temp_max')

model.fit(train_1[['ds', 'y', 'temp_mean', 'temp_min', 'rainfall', 'temp_max']])

future = model.make_future_dataframe(periods=7)
future = future.merge(test_1[['ds', 'temp_mean', 'temp_min', 'rainfall', 'temp_max']], on='ds')
forecast_1 = model.predict(future)
eval = (test_1
        .merge(forecast_1[['ds', 'yhat', 'yhat_lower','yhat_upper']], on='ds')
              )

np.sqrt(mean_squared_error(eval.y, eval.yhat))

0.0

## Fit a model on all stations?

In [70]:
train_data_pd.head()

Unnamed: 0,station,date,temp_max,temp_mean,temp_min,rainfall,snow
0,72202,2013-11-22,26.7,25.957143,24.4,True,False
1,72202,2013-11-23,27.2,25.033333,23.9,True,False
2,72202,2013-11-24,28.3,24.620833,21.7,False,False
3,72202,2013-11-25,25.6,23.179167,21.7,True,False
4,72202,2013-11-26,27.2,25.016667,22.8,True,False


In [72]:
train_df = pd.get_dummies(train_data_pd, columns=['station'])
test_df = pd.get_dummies(test_data_pd, columns=['station'])

In [78]:
train_df = pd.DataFrame(train_df.rename(columns={'date': 'ds', 'temp_min' : 'y'}))
test_df = pd.DataFrame(test_df.rename(columns={'date': 'ds',  'temp_min': 'y'}))

In [73]:
model = Prophet()
model.add_regressor(name='temp_mean')
model.add_regressor(name='snow')
model.add_regressor(name='rainfall')
model.add_regressor(name='temp_max')

<prophet.forecaster.Prophet at 0x7f8b41ab93c0>

In [75]:
for col in test_df.columns:
    if 'station' in col:
        model.add_regressor(name=col)

In [80]:

model.fit(train_df)

10:48:43 - cmdstanpy - INFO - Chain [1] start processing
10:49:36 - cmdstanpy - INFO - Chain [1] done processing


<prophet.forecaster.Prophet at 0x7f8b41ab93c0>

In [82]:
future = model.make_future_dataframe(periods=7)
future = future.merge(test_df.drop('y', axis=1), on='ds')

forecast_1 = model.predict(future)

In [83]:
eval_model = (test_df
        .merge(forecast_1[['ds', 'yhat']], on='ds')
              )

In [88]:
np.sqrt(mean_squared_error(eval_model.y, eval_model.yhat))

11.80841528875813

In [89]:
station_ids = daily_df.station.unique()

In [92]:
stations = dict(zip(range(len(station_ids)), station_ids))

In [96]:
models_min = dict()

In [102]:
def model_predictor(y: str, X: list, train: pd.DataFrame, 
                    test: pd.DataFrame, stations: dict, days: int = 7):
    
    models = dict()
    for i, station in stations.items():
        train_1 = train.loc[train.station == station]
        test_1 = test.loc[test.station == station]

        train_1 = pd.DataFrame(train_1.rename(columns={'date': 'ds', y: 'y'}))
        test_1 = pd.DataFrame(test_1.rename(columns={'date': 'ds', y: 'y'}))

        models[i] = Prophet()
        for x in X:
            models[i].add_regressor(name=x)

        models[i].fit(train_1[['ds', 'y'] + X])

        future = models[i].make_future_dataframe(periods=days)
        future = future.merge(test_1[['ds'] + X], on='ds') 

        forecast_1 = models[i].predict(future)

        eval_m = (pd.DataFrame(test_1[['ds', 'y']])
                    .merge(forecast_1[['ds', 'yhat']], on='ds')
                    )

        print(f"MSE = {np.sqrt(mean_squared_error(eval_m.y, eval_m.yhat))}")


    return models

In [103]:
model_min = model_predictor('temp_min', 
                            ['temp_max', 'temp_mean', 'rainfall', 'snow'], 
                            train_data_pd, test_data_pd, stations)

11:06:22 - cmdstanpy - INFO - Chain [1] start processing
11:06:22 - cmdstanpy - INFO - Chain [1] done processing
11:06:22 - cmdstanpy - INFO - Chain [1] start processing
11:06:23 - cmdstanpy - INFO - Chain [1] done processing
11:06:23 - cmdstanpy - INFO - Chain [1] start processing
11:06:23 - cmdstanpy - INFO - Chain [1] done processing
11:06:24 - cmdstanpy - INFO - Chain [1] start processing
11:06:25 - cmdstanpy - INFO - Chain [1] done processing
11:06:25 - cmdstanpy - INFO - Chain [1] start processing
11:06:25 - cmdstanpy - INFO - Chain [1] done processing
11:06:26 - cmdstanpy - INFO - Chain [1] start processing
11:06:26 - cmdstanpy - INFO - Chain [1] done processing
11:06:26 - cmdstanpy - INFO - Chain [1] start processing
11:06:27 - cmdstanpy - INFO - Chain [1] done processing
11:06:27 - cmdstanpy - INFO - Chain [1] start processing
11:06:28 - cmdstanpy - INFO - Chain [1] done processing
11:06:28 - cmdstanpy - INFO - Chain [1] start processing
11:06:28 - cmdstanpy - INFO - Chain [1]

MSE = 0.7022818047205439


In [97]:
for i, station in stations.items():
    train_1 = train_data_pd.loc[train_data_pd.station == station]
    test_1 = test_data_pd.loc[test_data_pd.station == station]

    train_1 = pd.DataFrame(train_1.rename(columns={'date': 'ds', 'temp_min': 'y'}))
    test_1 = pd.DataFrame(test_1.rename(columns={'date': 'ds', 'temp_min': 'y'}))

    models_min[i] = Prophet()
    models_min[i].add_regressor(name='temp_max')
    models_min[i].add_regressor(name='temp_mean')
    models_min[i].add_regressor(name='rainfall')
    models_min[i].add_regressor(name='snow')

    models_min[i].fit(train_1[['ds', 'y', 'temp_max', 'temp_mean', 'rainfall', 'snow']])

    future = models_min[i].make_future_dataframe(periods=7)
    future = future.merge(test_1[['ds', 'temp_max', 'temp_mean', 'rainfall', 'snow']], on='ds') 

    forecast_1 = models_min[i].predict(future)

    eval_m = (pd.DataFrame(test_1[['ds', 'y']])
                .merge(forecast_1[['ds', 'yhat']], on='ds')
                )
                        
    models_min[station] = np.sqrt(mean_squared_error(eval_m.y, eval_m.yhat))

10:57:38 - cmdstanpy - INFO - Chain [1] start processing
10:57:38 - cmdstanpy - INFO - Chain [1] done processing
10:57:39 - cmdstanpy - INFO - Chain [1] start processing
10:57:39 - cmdstanpy - INFO - Chain [1] done processing
10:57:39 - cmdstanpy - INFO - Chain [1] start processing
10:57:40 - cmdstanpy - INFO - Chain [1] done processing
10:57:40 - cmdstanpy - INFO - Chain [1] start processing
10:57:41 - cmdstanpy - INFO - Chain [1] done processing
10:57:41 - cmdstanpy - INFO - Chain [1] start processing
10:57:41 - cmdstanpy - INFO - Chain [1] done processing
10:57:42 - cmdstanpy - INFO - Chain [1] start processing
10:57:42 - cmdstanpy - INFO - Chain [1] done processing
10:57:42 - cmdstanpy - INFO - Chain [1] start processing
10:57:43 - cmdstanpy - INFO - Chain [1] done processing
10:57:43 - cmdstanpy - INFO - Chain [1] start processing
10:57:44 - cmdstanpy - INFO - Chain [1] done processing
10:57:44 - cmdstanpy - INFO - Chain [1] start processing
10:57:44 - cmdstanpy - INFO - Chain [1]

# Xgboost

In [121]:
def xgb_features(df, label=None):
    """
    Creates time series features
    """

    if not daily_df.date.dtype == 'datetime64[ns]' :
        df['date'] = pd.to_datetime(df['date'])
    
    
    df['month'] = df['date'].dt.month
    df['year'] = df['date'].dt.year
    df['dayofyear'] = df['date'].dt.dayofyear
    df['dayofmonth'] = df['date'].dt.day
    
    X = df[['month','year',
           'dayofyear','dayofmonth']]
    if label:
        y = df[label]
        X = df.drop(label, axis=1)
        return X, y
    return df

In [122]:
xgb_data = xgb_features(daily_df)

In [126]:
xgb_data  = xgb_data.sort_values(by='date', ascending=True)

In [134]:
xgb_data['station'] = pd.Categorical(xgb_data['station'])

In [132]:
tss = TimeSeriesSplit(n_splits=5, test_size=1)

In [149]:
def dict_configs(d):
    for vcomb in itertools.product(*d.values()):
        yield dict(zip(d.keys(), vcomb))

In [184]:
xgb_data.drop([y,'date'], axis=1).columns

Index(['station', 'temp_max', 'temp_mean', 'rainfall', 'snow', 'month', 'year',
       'dayofyear', 'dayofmonth'],
      dtype='object')

In [199]:
xgb_data = pd.get_dummies(xgb_data, columns=['station'])

In [201]:
param_grid = {
    'max_depth': np.arange(2, 10, 2),
    'n_estimators': np.arange(100, 1200, 300),
}

params =  {'learning_rate': 0.1,
    'objective': 'reg:squarederror',
    'booster': 'gbtree',
    'n_jobs': 4,
    # 'enable_categorical': True,
    'random_state': 42}

fold = 0
# preds = []
scores = []
y = "temp_min"
xgb_data.dropna(inplace=True)
features = xgb_data.drop([y,'date'], axis=1).columns
grid = dict_configs(param_grid)
for train_idx, val_idx in tss.split(xgb_data):
    train = xgb_data.iloc[train_idx]
    test = xgb_data.iloc[val_idx]
    cv_results = dict()
    for cv_params in grid:
        X_train = train[features]
        y_train = train[y]

        X_test = test[features]
        y_test = test[y]

        params.update(cv_params)
        print(params)
        reg = xgb.XGBRegressor(**params)
        reg.fit(X_train, y_train,
                eval_set=[(X_train, y_train), (X_test, y_test)],
                verbose=100)

        y_pred = reg.predict(X_test)
#       preds.append(y_pred)
        cv_results[str(cv_params)] = np.sqrt(mean_squared_error(y_test, y_pred))
        scores.append(cv_results)

{'learning_rate': 0.1, 'objective': 'reg:squarederror', 'booster': 'gbtree', 'n_jobs': 4, 'random_state': 42, 'max_depth': 2, 'n_estimators': 100}
[0]	validation_0-rmse:12.89368	validation_1-rmse:2.98321




[99]	validation_0-rmse:1.63893	validation_1-rmse:3.75741
{'learning_rate': 0.1, 'objective': 'reg:squarederror', 'booster': 'gbtree', 'n_jobs': 4, 'random_state': 42, 'max_depth': 2, 'n_estimators': 400}
[0]	validation_0-rmse:12.89368	validation_1-rmse:2.98321




[100]	validation_0-rmse:1.63696	validation_1-rmse:3.80329
[200]	validation_0-rmse:1.34520	validation_1-rmse:4.16506
[300]	validation_0-rmse:1.26057	validation_1-rmse:4.21645
[399]	validation_0-rmse:1.21214	validation_1-rmse:4.22393
{'learning_rate': 0.1, 'objective': 'reg:squarederror', 'booster': 'gbtree', 'n_jobs': 4, 'random_state': 42, 'max_depth': 2, 'n_estimators': 700}




[0]	validation_0-rmse:12.89368	validation_1-rmse:2.98321




[100]	validation_0-rmse:1.63696	validation_1-rmse:3.80329
[200]	validation_0-rmse:1.34520	validation_1-rmse:4.16506
[300]	validation_0-rmse:1.26057	validation_1-rmse:4.21645
[400]	validation_0-rmse:1.21200	validation_1-rmse:4.21453
[500]	validation_0-rmse:1.18274	validation_1-rmse:4.22332
[600]	validation_0-rmse:1.16355	validation_1-rmse:4.17758
[699]	validation_0-rmse:1.15107	validation_1-rmse:4.15451
{'learning_rate': 0.1, 'objective': 'reg:squarederror', 'booster': 'gbtree', 'n_jobs': 4, 'random_state': 42, 'max_depth': 2, 'n_estimators': 1000}




[0]	validation_0-rmse:12.89368	validation_1-rmse:2.98321




[100]	validation_0-rmse:1.63696	validation_1-rmse:3.80329
[200]	validation_0-rmse:1.34520	validation_1-rmse:4.16506
[300]	validation_0-rmse:1.26057	validation_1-rmse:4.21645
[400]	validation_0-rmse:1.21200	validation_1-rmse:4.21453
[500]	validation_0-rmse:1.18274	validation_1-rmse:4.22332
[600]	validation_0-rmse:1.16355	validation_1-rmse:4.17758
[700]	validation_0-rmse:1.15090	validation_1-rmse:4.15430
[800]	validation_0-rmse:1.14128	validation_1-rmse:4.14280
[900]	validation_0-rmse:1.13363	validation_1-rmse:4.12746
[999]	validation_0-rmse:1.12765	validation_1-rmse:4.11672
{'learning_rate': 0.1, 'objective': 'reg:squarederror', 'booster': 'gbtree', 'n_jobs': 4, 'random_state': 42, 'max_depth': 4, 'n_estimators': 100}




[0]	validation_0-rmse:12.82417	validation_1-rmse:3.13025




[99]	validation_0-rmse:1.19899	validation_1-rmse:4.02462
{'learning_rate': 0.1, 'objective': 'reg:squarederror', 'booster': 'gbtree', 'n_jobs': 4, 'random_state': 42, 'max_depth': 4, 'n_estimators': 400}




[0]	validation_0-rmse:12.82417	validation_1-rmse:3.13025




[100]	validation_0-rmse:1.19585	validation_1-rmse:4.06644
[200]	validation_0-rmse:1.11087	validation_1-rmse:4.21440
[300]	validation_0-rmse:1.08346	validation_1-rmse:4.17895
[399]	validation_0-rmse:1.06875	validation_1-rmse:3.94289
{'learning_rate': 0.1, 'objective': 'reg:squarederror', 'booster': 'gbtree', 'n_jobs': 4, 'random_state': 42, 'max_depth': 4, 'n_estimators': 700}




[0]	validation_0-rmse:12.82417	validation_1-rmse:3.13025




[100]	validation_0-rmse:1.19585	validation_1-rmse:4.06644
[200]	validation_0-rmse:1.11087	validation_1-rmse:4.21440
[300]	validation_0-rmse:1.08346	validation_1-rmse:4.17895
[400]	validation_0-rmse:1.06872	validation_1-rmse:3.94291
[500]	validation_0-rmse:1.05853	validation_1-rmse:3.90732
[600]	validation_0-rmse:1.05037	validation_1-rmse:3.87196
[699]	validation_0-rmse:1.04302	validation_1-rmse:3.77094
{'learning_rate': 0.1, 'objective': 'reg:squarederror', 'booster': 'gbtree', 'n_jobs': 4, 'random_state': 42, 'max_depth': 4, 'n_estimators': 1000}




[0]	validation_0-rmse:12.82417	validation_1-rmse:3.13025




[100]	validation_0-rmse:1.19585	validation_1-rmse:4.06644
[200]	validation_0-rmse:1.11087	validation_1-rmse:4.21440
[300]	validation_0-rmse:1.08346	validation_1-rmse:4.17895
[400]	validation_0-rmse:1.06872	validation_1-rmse:3.94291
[500]	validation_0-rmse:1.05853	validation_1-rmse:3.90732
[600]	validation_0-rmse:1.05037	validation_1-rmse:3.87196
[700]	validation_0-rmse:1.04301	validation_1-rmse:3.77092
[800]	validation_0-rmse:1.03595	validation_1-rmse:3.50296
[900]	validation_0-rmse:1.02952	validation_1-rmse:3.42059
[999]	validation_0-rmse:1.02338	validation_1-rmse:3.39251
{'learning_rate': 0.1, 'objective': 'reg:squarederror', 'booster': 'gbtree', 'n_jobs': 4, 'random_state': 42, 'max_depth': 6, 'n_estimators': 100}




[0]	validation_0-rmse:12.80867	validation_1-rmse:3.40534




[99]	validation_0-rmse:1.07014	validation_1-rmse:4.22771
{'learning_rate': 0.1, 'objective': 'reg:squarederror', 'booster': 'gbtree', 'n_jobs': 4, 'random_state': 42, 'max_depth': 6, 'n_estimators': 400}




[0]	validation_0-rmse:12.80867	validation_1-rmse:3.40534




[100]	validation_0-rmse:1.06976	validation_1-rmse:4.22663
[200]	validation_0-rmse:1.03151	validation_1-rmse:4.26331
[300]	validation_0-rmse:1.00665	validation_1-rmse:4.03161
[399]	validation_0-rmse:0.98592	validation_1-rmse:3.92171
{'learning_rate': 0.1, 'objective': 'reg:squarederror', 'booster': 'gbtree', 'n_jobs': 4, 'random_state': 42, 'max_depth': 6, 'n_estimators': 700}




[0]	validation_0-rmse:12.80867	validation_1-rmse:3.40534




[100]	validation_0-rmse:1.06976	validation_1-rmse:4.22663
[200]	validation_0-rmse:1.03151	validation_1-rmse:4.26331
[300]	validation_0-rmse:1.00665	validation_1-rmse:4.03161
[400]	validation_0-rmse:0.98583	validation_1-rmse:3.92173
[500]	validation_0-rmse:0.96383	validation_1-rmse:3.84499
[600]	validation_0-rmse:0.94519	validation_1-rmse:3.73794
[699]	validation_0-rmse:0.92829	validation_1-rmse:3.57214
{'learning_rate': 0.1, 'objective': 'reg:squarederror', 'booster': 'gbtree', 'n_jobs': 4, 'random_state': 42, 'max_depth': 6, 'n_estimators': 1000}




[0]	validation_0-rmse:12.80867	validation_1-rmse:3.40534




[100]	validation_0-rmse:1.06976	validation_1-rmse:4.22663
[200]	validation_0-rmse:1.03151	validation_1-rmse:4.26331
[300]	validation_0-rmse:1.00665	validation_1-rmse:4.03161
[400]	validation_0-rmse:0.98583	validation_1-rmse:3.92173
[500]	validation_0-rmse:0.96383	validation_1-rmse:3.84499
[600]	validation_0-rmse:0.94519	validation_1-rmse:3.73794
[700]	validation_0-rmse:0.92806	validation_1-rmse:3.56616
[800]	validation_0-rmse:0.91088	validation_1-rmse:3.46387
[900]	validation_0-rmse:0.89447	validation_1-rmse:3.40117
[999]	validation_0-rmse:0.88068	validation_1-rmse:3.35970
{'learning_rate': 0.1, 'objective': 'reg:squarederror', 'booster': 'gbtree', 'n_jobs': 4, 'random_state': 42, 'max_depth': 8, 'n_estimators': 100}




[0]	validation_0-rmse:12.80393	validation_1-rmse:3.28254




[99]	validation_0-rmse:0.97210	validation_1-rmse:3.56742
{'learning_rate': 0.1, 'objective': 'reg:squarederror', 'booster': 'gbtree', 'n_jobs': 4, 'random_state': 42, 'max_depth': 8, 'n_estimators': 400}




[0]	validation_0-rmse:12.80393	validation_1-rmse:3.28254




[100]	validation_0-rmse:0.97194	validation_1-rmse:3.56742
[200]	validation_0-rmse:0.92758	validation_1-rmse:3.23465
[300]	validation_0-rmse:0.88707	validation_1-rmse:3.08126
[399]	validation_0-rmse:0.85081	validation_1-rmse:3.04132
{'learning_rate': 0.1, 'objective': 'reg:squarederror', 'booster': 'gbtree', 'n_jobs': 4, 'random_state': 42, 'max_depth': 8, 'n_estimators': 700}




[0]	validation_0-rmse:12.80393	validation_1-rmse:3.28254
[100]	validation_0-rmse:0.97194	validation_1-rmse:3.56742
[200]	validation_0-rmse:0.92758	validation_1-rmse:3.23465
[300]	validation_0-rmse:0.88707	validation_1-rmse:3.08126
[400]	validation_0-rmse:0.85049	validation_1-rmse:3.03922
[500]	validation_0-rmse:0.81939	validation_1-rmse:3.08713
[600]	validation_0-rmse:0.78812	validation_1-rmse:3.10330
[699]	validation_0-rmse:0.76033	validation_1-rmse:2.97322
{'learning_rate': 0.1, 'objective': 'reg:squarederror', 'booster': 'gbtree', 'n_jobs': 4, 'random_state': 42, 'max_depth': 8, 'n_estimators': 1000}




[0]	validation_0-rmse:12.80393	validation_1-rmse:3.28254




[100]	validation_0-rmse:0.97194	validation_1-rmse:3.56742
[200]	validation_0-rmse:0.92758	validation_1-rmse:3.23465
[300]	validation_0-rmse:0.88707	validation_1-rmse:3.08126
[400]	validation_0-rmse:0.85049	validation_1-rmse:3.03922
[500]	validation_0-rmse:0.81939	validation_1-rmse:3.08713
[600]	validation_0-rmse:0.78812	validation_1-rmse:3.10330
[700]	validation_0-rmse:0.75986	validation_1-rmse:2.96709
[800]	validation_0-rmse:0.73372	validation_1-rmse:2.97732
[900]	validation_0-rmse:0.70971	validation_1-rmse:2.85505
[999]	validation_0-rmse:0.68353	validation_1-rmse:2.85611




# XGB Lagged Feature


In [176]:

params =  {'learning_rate': 0.1,
    'objective': 'reg:squarederror',
    'booster': 'gbtree',
    'n_jobs': 4,
    'enable_categorical': True,
    'random_state': 42}

In [180]:
params.update({'max_depth': 5,
 'n_estimators': 4})

In [181]:
params

{'learning_rate': 0.1,
 'objective': 'reg:squarederror',
 'booster': 'gbtree',
 'n_jobs': 4,
 'enable_categorical': True,
 'random_state': 42,
 'max_depth': 5,
 'n_estimators': 4}

In [None]:
xgb_data["temp_mean"].ewm(alpha=.9, adjust=False).mean().shape

# XGB Restrict Months
#### only include months November and December 