In [32]:
from numpy import mean
from pandas import Series
from sklearn.base import RegressorMixin


class RollingMeanRegressor(RegressorMixin):
    def __init__(self, win: int = 3):
        super().__init__()
        self.win_size = win
        self.memory: list = []

    def fit(self, X: Series):
        self.memory = X.iloc[-self.win_size :]
        # print(self.memory)
        return

    def predict(self, X: Series):
        estimations = self.memory.tolist()
        for i in range(len(X)):
            new_value = mean(estimations[len(estimations) - self.win_size - i :])
            estimations.append(new_value)
        prd_series: Series = Series(estimations[self.win_size :])
        print(prd_series)
        prd_series.index = X.index
        return prd_series

In [33]:
from dslabs_functions import FORECAST_MEASURES, DELTA_IMPROVE, plot_line_chart


def rolling_mean_study(train: Series, test: Series, measure: str = "R2"):
    # win_size = (3, 5, 10, 15, 20, 25, 30, 40, 50)
    win_size = (12, 24, 48, 96, 192, 384, 768)
    flag = measure == "R2" or measure == "MAPE"
    best_model = None
    best_params: dict = {"name": "Rolling Mean", "metric": measure, "params": ()}
    best_performance: float = -100000

    yvalues = []
    for w in win_size:
        pred = RollingMeanRegressor(win=w)
        pred.fit(train)
        prd_tst = pred.predict(test)

        eval: float = FORECAST_MEASURES[measure](test, prd_tst)
        # print(w, eval)
        if eval > best_performance and abs(eval - best_performance) > DELTA_IMPROVE:
            best_performance: float = eval
            best_params["params"] = (w,)
            best_model = pred
        yvalues.append(eval)

    print(f"Rolling Mean best with win={best_params['params'][0]:.0f} -> {measure}={best_performance}")
    plot_line_chart(
        win_size, yvalues, title=f"Rolling Mean ({measure})", xlabel="window size", ylabel=measure, percentage=flag
    )

    return best_model, best_params

In [34]:
from pandas import read_csv, DataFrame
from matplotlib.pyplot import figure, savefig
from dslabs_functions import series_train_test_split, plot_forecasting_eval, plot_forecasting_series, HEIGHT

filename: str = "/home/mina/Documents/portugal/dataScience/set_1_diff_twice.csv"
file_tag: str = "Set 1"
target: str = "Manhattan"
timecol: str = "Date"
measure: str = "R2"

data: DataFrame = read_csv(filename, index_col=timecol, sep=",", decimal=".", parse_dates=True)
series: Series = data[target]

train, test = series_train_test_split(data, trn_pct=0.90)

fig = figure(figsize=(HEIGHT, HEIGHT))
best_model, best_params = rolling_mean_study(train, test)
savefig(f"/home/mina/Documents/portugal/dataScience/Data_science_project/MODELS’ EVALUATION/images_rolling/{file_tag}_rollingmean_{measure}_study.png")

0    -12.4
1    -12.4
2    -12.4
3    -12.4
4    -12.4
5    -12.4
6    -12.4
7    -12.4
8    -12.4
9    -12.4
10   -12.4
11   -12.4
12   -12.4
13   -12.4
14   -12.4
15   -12.4
16   -12.4
17   -12.4
18   -12.4
dtype: float64
0    -4.3625
1    -4.3625
2    -4.3625
3    -4.3625
4    -4.3625
5    -4.3625
6    -4.3625
7    -4.3625
8    -4.3625
9    -4.3625
10   -4.3625
11   -4.3625
12   -4.3625
13   -4.3625
14   -4.3625
15   -4.3625
16   -4.3625
17   -4.3625
18   -4.3625
dtype: float64
0    -2.24375
1    -2.24375
2    -2.24375
3    -2.24375
4    -2.24375
5    -2.24375
6    -2.24375
7    -2.24375
8    -2.24375
9    -2.24375
10   -2.24375
11   -2.24375
12   -2.24375
13   -2.24375
14   -2.24375
15   -2.24375
16   -2.24375
17   -2.24375
18   -2.24375
dtype: float64
0    -2.10625
1    -2.10625
2    -2.10625
3    -2.10625
4    -2.10625
5    -2.10625
6    -2.10625
7    -2.10625
8    -2.10625
9    -2.10625
10   -2.10625
11   -2.10625
12   -2.10625
13   -2.10625
14   -2.10625
15   -2.10625
16   -2.1

ValueError: Length mismatch: Expected axis has 0 elements, new values have 19 elements

<Figure size 600x600 with 0 Axes>