# <p style="padding:15px; background-color:#fffaf6; font-family:JetBrains Mono; font-weight:bold; color:#3E3F4C; font-size:100%; letter-spacing: 2px; text-align:center; border-radius: 10px 10px">Weather Forecast - EDA &amp; ARIMA &amp; RNN</p>

In [1]:
# %load ../initial_settings.py
import os
import shutil
import subprocess
import sys
from pathlib import Path

import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import seaborn as sns
import tensorflow as tf
from tensorflow import keras
from colorama import Fore, Style
from IPython.display import display_html
from matplotlib.patches import PathPatch

# =============================================================================
# Indicates whether the notebook is running on Kaggle or not.
ON_KAGGLE = os.getenv("KAGGLE_KERNEL_RUN_TYPE") is not None

# Here we store models if these are interesting.
MODELS_PATH = Path("models")
MODELS_PATH.mkdir(exist_ok=True)  # If directory exists, do nothing.

CLR = (Style.BRIGHT + Fore.BLACK) if ON_KAGGLE else (Style.BRIGHT + Fore.WHITE)
RED = Style.BRIGHT + Fore.RED
BLUE = Style.BRIGHT + Fore.BLUE
CYAN = Style.BRIGHT + Fore.CYAN
RESET = Style.RESET_ALL

# Matplotlib & Seaborn settings.
FONT_COLOR = "#141B4D"
BACKGROUND_COLOR = "#F6F5F5"
DF_CMAP = sns.light_palette("#2a357d", as_cmap=True)
MY_RC = {
    "axes.labelcolor": FONT_COLOR,
    "axes.labelsize": 10,
    "axes.labelpad": 15,
    "axes.labelweight": "bold",
    "axes.titlesize": 14,
    "axes.titleweight": "bold",
    "axes.titlepad": 15,
    "xtick.labelsize": 10,
    "xtick.color": FONT_COLOR,
    "ytick.labelsize": 10,
    "ytick.color": FONT_COLOR,
    "figure.titlesize": 14,
    "figure.titleweight": "bold",
    "figure.facecolor": BACKGROUND_COLOR,
    "figure.edgecolor": BACKGROUND_COLOR,
    "figure.dpi": 72,  # Locally Seaborn uses 72, meanwhile Kaggle 96.
    "font.size": 10,
    "font.family": "Serif",
    "text.color": FONT_COLOR,
}

sns.set_theme(rc=MY_RC)

notebook_palette = {
    "graphite_blue": "#26344E",
    "purple_blue1": "#2A357D",
    "purple_blue2": "#454D82",
    "dark_blue1": "#141B4D",
    "dark_blue2": "#0F173B",
    "light_white": "#F2F2F0",
    "light_beige1": "#F6F5F5",
    "light_beige2": "#FFFAF6",
    "orange": "#C73C1A",
    "graphite": "#3E3F4C",
}

pd.set_option("display.precision", 2)


<blockquote style="margin-right:auto; margin-left:auto; background-color:#fffaf6; padding: 15px; border-radius: 10px 10px">
    <b><span style="color:#3E3F4C; font-size:20px; font-family:JetBrains Mono; margin-left: 10px;">
        Project Description 📜
    </span></b>
    <p style="color:#3E3F4C; font-size:16px;font-family: JetBrains Mono; margin-left: 25px; margin-right: 25px; margin-top: 15px; margin-bottom: 20px">
        In this notebook, our main focus is on the temperature variations that have occurred in Warsaw, Poland over the past 30 years. We will be examining a time series dataset and utilizing visualizations to better understand the data. Additionally, we will be employing ARIMA and Recurrent Neural Networks to predict the weather patterns for the year 2023.
    </p>
    <b><span style="color:#3E3F4C;font-size:20px;font-family:JetBrains Mono; margin-left: 10px;">
        This Notebook Covers 📔
    </span></b>
    <ul style="color:#3E3F4C; font-size:16px;font-family: JetBrains Mono;  margin-left: 10px; margin-right: 15px; margin-top: 15px; margin-bottom: 20px">
        <li>General informations about dataset.</li>
        <li>Time series EDA.</li>
        <li>Annual predictions with ARIMA.</li>
        <li>Annual predictions with Recurrent Neural Networks.</li>
    </ul>
</blockquote>

# <p style="padding:15px; background-color:#fffaf6; font-family:JetBrains Mono; font-weight:bold; color:#3E3F4C; font-size:100%; letter-spacing: 2px; text-align:center; border-radius: 10px 10px">Dataset General Informations</p>

In [2]:
weather = pd.read_csv("warsaw.csv", parse_dates=["DATE"], index_col="DATE")
weather.head()


Unnamed: 0_level_0,STATION,NAME,LATITUDE,LONGITUDE,ELEVATION,PRCP,SNWD,TAVG,TMAX,TMIN
DATE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1993-01-01,PLM00012375,"OKECIE, PL",52.17,20.97,110.3,0.0,10.0,-8.3,,
1993-01-02,PLM00012375,"OKECIE, PL",52.17,20.97,110.3,,10.0,-14.9,,
1993-01-03,PLM00012375,"OKECIE, PL",52.17,20.97,110.3,0.0,10.0,-13.6,-9.7,
1993-01-04,PLM00012375,"OKECIE, PL",52.17,20.97,110.3,0.0,10.0,-10.5,-6.5,-13.3
1993-01-05,PLM00012375,"OKECIE, PL",52.17,20.97,110.3,0.0,10.0,-12.0,-8.9,-14.1


In [3]:
weather.info()


<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 10954 entries, 1993-01-01 to 2022-12-31
Data columns (total 10 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   STATION    10954 non-null  object 
 1   NAME       10954 non-null  object 
 2   LATITUDE   10954 non-null  float64
 3   LONGITUDE  10954 non-null  float64
 4   ELEVATION  10954 non-null  float64
 5   PRCP       9158 non-null   float64
 6   SNWD       1423 non-null   float64
 7   TAVG       10954 non-null  float64
 8   TMAX       7463 non-null   float64
 9   TMIN       5870 non-null   float64
dtypes: float64(8), object(2)
memory usage: 941.4+ KB


<blockquote style="margin-right:auto; margin-left:auto; background-color:#fffaf6; padding: 15px; border-radius: 10px 10px">
    <b><span style="color:#3E3F4C; font-size:20px; font-family:JetBrains Mono; margin-left: 10px;">
        Datset Description 📜
    </span></b>
    <p style="color:#3E3F4C; font-size:16px;font-family: JetBrains Mono; margin-left: 25px; margin-right: 25px; margin-top: 15px; margin-bottom: 20px">
        The dataset comes from <a href="https://www.ncdc.noaa.gov/cdo-web/">Climate Data Online</a> website and includes the last 30 years (1993-2022) of daily weather measurements in Warsaw, Poland. Available attributes are as follows:
        <ul style="color:#3E3F4C; font-size:16px;font-family: JetBrains Mono;  margin-left: 10px; margin-right: 15px; margin-top: 15px; margin-bottom: 20px">
            <li><code>DATE</code> - Measurement date in the form: year-month-day.</li>
            <li><code>STATION</code> - Station ID.</li>
            <li><code>NAME</code> - Station name.</li>
            <li><code>LATITUDE</code> - Station latitude.</li>
            <li><code>LONGITUDE</code> - Station longitude.</li>
            <li><code>ELEVATION</code> - Station elevation.</li>
            <li><code>PRCP</code> - Precipitation.</li>
            <li><code>SNWD</code> - Snow depth.</li>
            <li><code>TAVG</code> - Average temperature</li>
            <li><code>TMAX</code> - Maximum temperature.</li>
            <li><code>TMIN</code> - Minimum temperature.</li>
        </ul>
    </p>
    <p style="color:#3E3F4C; font-size:16px;font-family: JetBrains Mono; margin-left: 25px; margin-right: 25px; margin-top: 15px; margin-bottom: 20px">
        As you can see, only the <code>TAVG</code> measurements are available each day. Thus we will focus on this.
    </p>
    <b><span style="color:#3E3F4C;font-size:20px;font-family:JetBrains Mono; margin-left: 10px;">
        How to Acquire Data 🕵
    </span></b>
    <p style="color:#3E3F4C; font-size:16px;font-family: JetBrains Mono; margin-left: 25px; margin-right: 25px; margin-top: 15px; margin-bottom: 20px">
        Go to <a href="https://www.ncdc.noaa.gov/cdo-web/">Climate Data Online</a> and choose "Browse Datasets". Then you can select which data you are looking for (in this notebook, it's "Daily Summaries"). Click on "Search Tool" and choose an interesting period, station, city, country, etc. After you place an order, the dataset should be sent to your email.
    </p>
</blockquote>

# <p style="padding:15px; background-color:#fffaf6; font-family:JetBrains Mono; font-weight:bold; color:#3E3F4C; font-size:100%; letter-spacing: 2px; text-align:center; border-radius: 10px 10px">Time Series EDA</p>

<p style="font-size:20px; font-family:JetBrains Mono; color:#3E3F4C; border-bottom: 3px solid #2a357d">What's next? 🕵</p>
<blockquote style="margin-right:auto; margin-left:auto; background-color:#fffaf6; padding: 15px; border-radius: 10px 10px">
    <ul style="font-size:16px; font-family:JetBrains Mono; color:#3E3F4C; margin-right: 10px; margin-top: 2px; margin-bottom: 2px">
        <li>We will start with the most obvious steps - summary values and time plots.</li>
    </ul>
</blockquote>

In [4]:
temp = weather[["TAVG"]].copy()
temp.describe().T.drop("count", axis=1).rename(columns=str.title)


Unnamed: 0,Mean,Std,Min,25%,50%,75%,Max
TAVG,9.11,8.67,-22.3,2.4,9.3,16.2,29.1


In [5]:
temp.query("TAVG == -22.3")

Unnamed: 0_level_0,TAVG
DATE,Unnamed: 1_level_1
2006-01-23,-22.3


In [6]:
temp.query("TAVG == 29.1")

Unnamed: 0_level_0,TAVG
DATE,Unnamed: 1_level_1
2015-08-07,29.1
2015-08-08,29.1


<p style="font-size:20px; font-family:JetBrains Mono; color:#3E3F4C; border-bottom: 3px solid #2a357d">Observations 📜</p>
<blockquote style="margin-right:auto; margin-left:auto; background-color:#fffaf6; padding: 15px; border-radius: 10px 10px">
    <ul style="font-size:16px; font-family:JetBrains Mono; color:#3E3F4C; margin-right: 10px; margin-top: 2px; margin-bottom: 2px">
        <li>There was no relevant difference (virtually, there is no difference at all) between the mean and median temperature, being approximately $9.1$ and $9.3$ degrees Celsius respectively.</li>
        <li>The coldest day in the past 30 years was 2006-01-23, with an average temperature equal to around $-22.3$ degrees Celsius.</li>
        <li>On the other hand, 2015-08-07 and 2015-08-08 were the hottest days in Warsaw in the last 30 years, with an average temperature of around $29.1$ degrees Celsius.</li>
    </ul>
</blockquote>

In [7]:
fig = px.line(
    temp,
    y="TAVG",
    labels={"DATE": "Date", "TAVG": "Average Temperature (\u2103)"},  # Celsius degree.
    title="Average Daily Temperatures in Warsaw, Poland in 1993-2022 (Last 30 Years)",
    height=420,
    width=840,
)
fig.update_layout(
    font_color=FONT_COLOR,
    plot_bgcolor=BACKGROUND_COLOR,
    paper_bgcolor=BACKGROUND_COLOR,
)
fig.update_traces(
    line=dict(width=1.0, color="#2A357D"),
    opacity=0.5,
)
fig.show()


In [8]:
fig = px.area(
    temp.loc["2022"],
    y="TAVG",
    labels={"DATE": "Date", "TAVG": "Average Temperature (\u2103)"},  # Celsius degree.
    title="Average Daily Temperatures in Warsaw, Poland in 2022",
    height=420,
    width=840,
)
fig.update_layout(
    font_color=FONT_COLOR,
    plot_bgcolor=BACKGROUND_COLOR,
    paper_bgcolor=BACKGROUND_COLOR,
)
fig.update_traces(line=dict(width=1.5, color="#2A357D"))
fig.show()


<p style="font-size:20px; font-family:JetBrains Mono; color:#3E3F4C; border-bottom: 3px solid #2a357d">Observations 📜</p>
<blockquote style="margin-right:auto; margin-left:auto; background-color:#fffaf6; padding: 15px; border-radius: 10px 10px">
    <ul style="font-size:16px; font-family:JetBrains Mono; color:#3E3F4C; margin-right: 10px; margin-top: 2px; margin-bottom: 2px">
        <li>The weather obviously has a seasonality pattern, such as temperatures being low at the beginning/end of the year and high in the middle of the year.</li>
        <li>Since plots contain daily temperatures, there are a little bit noisy, but we can see there are rather small changes between consecutive days.</li>
    </ul>
</blockquote>

<p style="font-size:20px; font-family:JetBrains Mono; color:#3E3F4C; border-bottom: 3px solid #2a357d">What's next? 🕵</p>
<blockquote style="margin-right:auto; margin-left:auto; background-color:#fffaf6; padding: 15px; border-radius: 10px 10px">
    <ul style="font-size:16px; font-family:JetBrains Mono; color:#3E3F4C; margin-right: 10px; margin-top: 2px; margin-bottom: 2px">
        <li>Let's create the probability density distribution with a boxplot of average temperatures. Additionally, we can draw kernel density estimation (KDE), but <code>plotly</code> doesn't allow us to do this all automatically, so we need to calculate estimated values manually.</li>
    </ul>
</blockquote>

In [9]:
from scipy.stats import gaussian_kde


tavg_kde = gaussian_kde(temp.TAVG)
tavg_range = np.linspace(temp.TAVG.min(), temp.TAVG.max(), len(temp))
kde_estimated = tavg_kde.evaluate(tavg_range)

fig = px.histogram(
    temp,
    x="TAVG",
    marginal="box",
    histnorm="probability density",
    title="Probability Density of Daily Temperatures (Based on 30 Years of Measurements)",
    color_discrete_sequence=["#2A357D"],
    nbins=100,
    height=600,
    width=840,
)
fig.add_scatter(
    x=tavg_range,
    y=kde_estimated,
    showlegend=False,
    text="Average Temperature KDE",
    line=dict(dash="solid", color="#C73C1A", width=4),
)
fig.update_layout(
    font_color=FONT_COLOR,
    plot_bgcolor=BACKGROUND_COLOR,
    paper_bgcolor=BACKGROUND_COLOR,
    bargap=0.25,
    xaxis_title_text="Average Temperature (\u2103)",
    yaxis_title_text="Probability Density",
)
fig.show()


<p style="font-size:20px; font-family:JetBrains Mono; color:#3E3F4C; border-bottom: 3px solid #2a357d">Observations 📜</p>
<blockquote style="margin-right:auto; margin-left:auto; background-color:#fffaf6; padding: 15px; border-radius: 10px 10px">
    <ul style="font-size:16px; font-family:JetBrains Mono; color:#3E3F4C; margin-right: 10px; margin-top: 2px; margin-bottom: 2px">
        <li>Here we have the probability density of daily temperatures.</li>
        <li>The interquartile range contains 50% of all measured temperatures. We can easily see that the Q1-Q3 range includes temperatures of approximately $2-16$ degrees Celsius.</li>
        <li>It is noticeable that extremely low temperatures were considered as anomalies. In fact, in Poland, experiencing temperatures as low as $-20$ degrees Celsius is highly uncommon in winter, whereas temperatures around $25-30$ degrees Celsius are quite frequent in summer.</li>
        <li>Also, we can spot that distribution resembles the Gaussian one. However, there is no single peak but two and a slight depression between them. Such a distribution is called a bimodal distribution. This means that the data is grouped around two different values or modes instead of being grouped around a single value. It indicates that the city has two distinct periods of weather separated by a temporary period.</li>
    </ul>
</blockquote>

<p style="font-size:20px; font-family:JetBrains Mono; color:#3E3F4C; border-bottom: 3px solid #2a357d">What's next? 🕵</p>
<blockquote style="margin-right:auto; margin-left:auto; background-color:#fffaf6; padding: 15px; border-radius: 10px 10px">
    <ul style="font-size:16px; font-family:JetBrains Mono; color:#3E3F4C; margin-right: 10px; margin-top: 2px; margin-bottom: 2px">
        <li>Let's see how mean average temperatures are distributed over months.</li>
    </ul>
</blockquote>

In [10]:
tavg_monthly = (
    temp.groupby(temp.index.month_name(), sort=False).mean(numeric_only=True).reset_index()
)

fig = px.bar(
    tavg_monthly,
    x="DATE",
    y="TAVG",
    labels={"TAVG": "Mean Average Temperature (\u2103)", "DATE": "Month"},
    title="Mean Average Temperature by Month (Based on 30 Years of Measurements)",
    text_auto=".2f",
    color="TAVG",
    color_continuous_scale=px.colors.sequential.Cividis,
    height=500,
    width=840,
)
fig.update_layout(
    font_color=FONT_COLOR,
    plot_bgcolor=BACKGROUND_COLOR,
    paper_bgcolor=BACKGROUND_COLOR,
    coloraxis_colorbar_title_text="Temperature (\u2103)",
)
fig.show()


<p style="font-size:20px; font-family:JetBrains Mono; color:#3E3F4C; border-bottom: 3px solid #2a357d">Observations 📜</p>
<blockquote style="margin-right:auto; margin-left:auto; background-color:#fffaf6; padding: 15px; border-radius: 10px 10px">
    <ul style="font-size:16px; font-family:JetBrains Mono; color:#3E3F4C; margin-right: 10px; margin-top: 2px; margin-bottom: 2px">
        <li>The two hottest months in Poland are July and August (the holiday period). On the other hand, January is the coldest month, where the average temperature drops below $0$ degrees Celsius.</li>
    </ul>
</blockquote>

<p style="font-size:20px; font-family:JetBrains Mono; color:#3E3F4C; border-bottom: 3px solid #2a357d">What's next? 🕵</p>
<blockquote style="margin-right:auto; margin-left:auto; background-color:#fffaf6; padding: 15px; border-radius: 10px 10px">
    <ul style="font-size:16px; font-family:JetBrains Mono; color:#3E3F4C; margin-right: 10px; margin-top: 2px; margin-bottom: 2px">
        <li>Since we have a basic knowledge of the weather in Poland, it's good to examine it more profoundly. To do this, we'll use <code>seasonal_decompose</code> function from the <code>statsmodels</code> package.</li>
        <li>Seasonal decomposition using moving averages allows us to decompose a general signal into several components:</li>
        <ul style="font-size:16px; font-family:JetBrains Mono; color:#3E3F4C; margin-right: 10px; margin-top: 2px; margin-bottom: 2px">
            <li><code>level</code> - the mean value in the time series,</li>
            <li><code>seasonality</code> - the recurring cycle in the time series,</li>
            <li><code>trend</code> - a long-term component liable for increasing or decreasing in the data,</li>
            <li><code>noise</code> - random variation in the data.</li>
        </ul>
        <li>We will use an additive model (the above components account for a sum) within a moving average of over 365 days (one year).</li>
    </ul>
</blockquote>

In [11]:
from statsmodels.tsa.seasonal import seasonal_decompose


decomposition = seasonal_decompose(temp.TAVG, model="additive", period=365)

fig = make_subplots(
    rows=4,
    cols=1,
    shared_xaxes=True,
    vertical_spacing=0.1,
    x_title="Date",
    y_title="Temperature (\u2103)",
    subplot_titles=["Observed Values", "Trend", "Seasonality", "Residuals"],
)

observed = go.Scatter(
    x=decomposition.observed.index,
    y=decomposition.observed,
    name="Observed Temperature",
    line=dict(width=0.7, color="#2A357D"),
)
trend = go.Scatter(
    x=decomposition.trend.index,
    y=decomposition.trend,
    name="Trend",
    line=dict(color="#2A357D"),
)
seasonal = go.Scatter(
    x=decomposition.seasonal.index,
    y=decomposition.seasonal,
    name="Seasonality",
    line=dict(color="#2A357D"),
)
residuals = go.Scatter(
    x=decomposition.resid.index,
    y=decomposition.resid,
    name="Residuals",
    mode="markers",
    marker_size=1,
    line=dict(color="#2A357D"),
)

fig.add_trace(observed, row=1, col=1)
fig.add_trace(trend, row=2, col=1)
fig.add_trace(seasonal, row=3, col=1)
fig.add_trace(residuals, row=4, col=1)

fig.update_annotations(font_size=14)
fig.update_layout(
    font_color=FONT_COLOR,
    plot_bgcolor=BACKGROUND_COLOR,
    paper_bgcolor=BACKGROUND_COLOR,
    title_text="Average Daily Temperatures - Seasonal Decomposition",
    showlegend=False,
    height=800,
    width=840,
)
fig.show()


<p style="font-size:20px; font-family:JetBrains Mono; color:#3E3F4C; border-bottom: 3px solid #2a357d">Observations 📜</p>
<blockquote style="margin-right:auto; margin-left:auto; background-color:#fffaf6; padding: 15px; border-radius: 10px 10px">
    <ul style="font-size:16px; font-family:JetBrains Mono; color:#3E3F4C; margin-right: 10px; margin-top: 2px; margin-bottom: 2px">
        <li>Noteworthy is the trend. As you can see, it's increasing, which can indicate global warming.</li>
        <li>After extracting the seasonality component, we see it as a perfect sinusoidal signal with annual frequency.</li>
        <li>The residual component may be slightly confusing, but it actually represents the signal with the removed trend and seasonal components.</li>
    </ul>
</blockquote>

<p style="font-size:20px; font-family:JetBrains Mono; color:#3E3F4C; border-bottom: 3px solid #2a357d">What's next? 🕵</p>
<blockquote style="margin-right:auto; margin-left:auto; background-color:#fffaf6; padding: 15px; border-radius: 10px 10px">
    <ul style="font-size:16px; font-family:JetBrains Mono; color:#3E3F4C; margin-right: 10px; margin-top: 2px; margin-bottom: 2px">
        <li>Another essential thing we should do working with time series data is autocorrelation analysis.</li>
        <li>Autocorrelation is similar to correlation (which measures linear dependency between two variables), but autocorrelation measures the linear relationship between lagged time series values. For example $r_1$ measures relationship between $y_t$ and $y_{t-1}$, $r_2$ measures relationship between $y_t$ and $y_{t-2}$, and so on.</li>
        <li>We can use a dedicated function from the <code>statsmodels</code> library to make the <code>ACF</code> plot, but I'd like to use <code>plotly</code>, so we draw it by hand.</li>
    </ul>
</blockquote>

In [12]:
from statsmodels.tsa.stattools import acf


def draw_acf(series, n_lags, marker_size=12):
    corr_array = acf(series, alpha=0.05, nlags=n_lags)
    corr_values = corr_array[0]
    lags = np.arange(len(corr_values))
    lower_y = corr_array[1][:, 0] - corr_array[0]
    upper_y = corr_array[1][:, 1] - corr_array[0]

    fig = go.Figure()

    for l in lags:
        fig.add_scatter(
            x=(l, l), y=(0, corr_values[l]), mode="lines", line_color="black"
        )

    fig.add_scatter(
        x=lags,
        y=corr_values,
        mode="markers",
        marker_color="#2A357D",
        marker_size=marker_size,
        name="ACF",
    )
    fig.add_scatter(x=lags, y=upper_y, mode="lines", line_color="rgba(255,255,255,0)")
    fig.add_scatter(
        x=lags,
        y=lower_y,
        mode="lines",
        fillcolor="rgba(32, 146, 230, 0.3)",
        fill="tonexty",
        line_color="rgba(255, 255, 255, 0)",
    )
    fig.update_traces(showlegend=False)
    fig.update_xaxes(range=[-1, n_lags + 1])
    fig.update_yaxes(zerolinecolor="black")
    fig.update_layout(
        font_color=FONT_COLOR,
        plot_bgcolor=BACKGROUND_COLOR,
        paper_bgcolor=BACKGROUND_COLOR,
        title_text="Autocorrelation (ACF)",
        xaxis_title="Lag (Months)",
        yaxis_title="ACF",
        height=500,
        width=840,
    )
    fig.show()


In [13]:
df_monthly = temp.resample("M").mean(numeric_only=True)
draw_acf(df_monthly, n_lags=12)


In [14]:
draw_acf(df_monthly, n_lags=120, marker_size=6)

<p style="font-size:20px; font-family:JetBrains Mono; color:#3E3F4C; border-bottom: 3px solid #2a357d">Observations 📜</p>
<blockquote style="margin-right:auto; margin-left:auto; background-color:#fffaf6; padding: 15px; border-radius: 10px 10px">
    <ul style="font-size:16px; font-family:JetBrains Mono; color:#3E3F4C; margin-right: 10px; margin-top: 2px; margin-bottom: 2px">
        <li>Let's see the first plot. We have autocorrelation on the y-axis and lag on the x-axis. Therefore, if $x=1$, we observe the correlation between January and February, January and December, and so on. If the $x=2$, we see the correlation between January and March, January and November, etc. As we remember, the data has a strong seasonality, which is also confirmed here. For example, January and June have a strong negative correlation ($x=6$).</li>
        <li>As for the second, it shows trend and seasonality in ACF. When data is seasonal, autocorrelations will be higher for the seasonal lags (or lower if there is a negative correlation). When data has a trend, ACF of trended time series tend to have positive values that slowly decrease and negative values that slowly increase as the lags increase.</li>
    </ul>
</blockquote>

# <p style="padding:15px; background-color:#fffaf6; font-family:JetBrains Mono; font-weight:bold; color:#3E3F4C; font-size:100%; letter-spacing: 2px; text-align:center; border-radius: 10px 10px">Annual Predictions - Simple Approach</p>

<p style="font-size:20px; font-family:JetBrains Mono; color:#3E3F4C; border-bottom: 3px solid #2a357d">General remarks 📔</p>
<blockquote style="margin-right:auto; margin-left:auto; background-color:#fffaf6; padding: 15px; border-radius: 10px 10px">
    <ul style="font-size:16px; font-family:JetBrains Mono; color:#3E3F4C; margin-right: 10px; margin-top: 2px; margin-bottom: 2px">
        <li>Here we move on to forecasting annual temperature in Warsaw, Poland. In the notebook, we will see different forecasting methods, and it's good to set some periods we want to forecast. Since forecasting daily temperatures may be too complex and not exact, I decided to predict weekly temperatures. Therefore we need to acquire $52$ values in each method.</li>
        <li>In general, there are some methods that are extremely simple but effective simultaneously. These are, for example, the average method and the naive method.</li>
        <li>To train a model, it's reasonable to use at least three last years of historical data, but rather no more than ten years. It should provide enough data to capture the underlying pattern and trends, preventing it from becoming too complex. Let's say we'll take the last five years.</li>
    </ul>
</blockquote>

<p style="font-size:20px; font-family:JetBrains Mono; color:#3E3F4C; border-bottom: 3px solid #2a357d">What's next? 🕵</p>
<blockquote style="margin-right:auto; margin-left:auto; background-color:#fffaf6; padding: 15px; border-radius: 10px 10px">
    <ul style="font-size:16px; font-family:JetBrains Mono; color:#3E3F4C; margin-right: 10px; margin-top: 2px; margin-bottom: 2px">
        <li>We will start with the average method. Here the forecast of all future values is based on the average of historical measurements. Therefore the forecast for the first week of the new year consists of the mean of the first week of the last five years.</li>
        <li>For the naive forecast, we suppose that these are the latest observed values. Thus, we simply take observations from the previous year.</li>
    </ul>
</blockquote>

In [17]:
train_series_weekly = temp.loc["2017":"2021"].resample("7D").mean()
train_series_daily = temp.loc["2017":"2021"]

avg_forecast_weekly = (
    train_series_daily.groupby(train_series_daily.index.isocalendar().week)
    .mean()
    .set_index(pd.date_range("2022-01-01", periods=53, freq="7D"))
)

train_test_series_weekly = temp.loc["2017":"2022"].resample("7D").mean()
train_with_test_forecast = pd.concat([train_series_weekly, avg_forecast_weekly])

fig = px.line(
    train_test_series_weekly,
    y="TAVG",
    labels={"DATE": "Date", "TAVG": "Average Temperature (\u2103)"},  # Celsius degree.
    title="Average Weekly Temperatures & Average Forecast",
    height=420,
    width=840,
)
fig.update_traces(line=dict(width=1.5, color="#2A357D"), opacity=0.7)
fig.add_scatter(
    x=avg_forecast_weekly.index,
    y=avg_forecast_weekly.TAVG,
    mode="markers+lines",
    marker=dict(symbol="x", size=6),
    line=dict(color="#C73C1A", width=1.5),
    text="Temperature Forecast",
    name="Average Forecast",
)
fig.update_layout(
    font_color=FONT_COLOR,
    plot_bgcolor=BACKGROUND_COLOR,
    paper_bgcolor=BACKGROUND_COLOR,
    legend=dict(orientation="h", yanchor="bottom", xanchor="right", y=1.02, x=1),
)
fig.show()


In [24]:
from sklearn.metrics import mean_absolute_error

actual_weekly = temp.loc["2022"].resample("7D").mean()
avg_method_mae = mean_absolute_error(actual_weekly, avg_forecast_weekly)
print(CLR + "Average Method - Mean Absolute Error: ", f"{RED}{avg_method_mae:.2f}{RESET}")


[1m[37mAverage Method - Mean Absolute Error:  [1m[31m2.11[0m


In [25]:
naive = temp.loc["2021"].resample("7D").mean()
actual = temp.loc["2022"].resample("7D").mean()
naive_method_mae = mean_absolute_error(actual, naive)
print(CLR + "Naive Method - Mean Absolute Error: ", f"{RED}{naive_method_mae:.2f}{RESET}")


[1m[37mNaive Method - Mean Absolute Error:  [1m[31m2.82[0m


<p style="font-size:20px; font-family:JetBrains Mono; color:#3E3F4C; border-bottom: 3px solid #2a357d">Observations 📜</p>
<blockquote style="margin-right:auto; margin-left:auto; background-color:#fffaf6; padding: 15px; border-radius: 10px 10px">
    <ul style="font-size:16px; font-family:JetBrains Mono; color:#3E3F4C; margin-right: 10px; margin-top: 2px; margin-bottom: 2px">
        <li>Predictions from the average approach seem to be smoother than the actual measurements, but it is understandable. Averaging gets rid of the noise in a certain way.</li>
        <li>In the average method, we've got $MAE=2.11$, which means we make an error of around 2 degrees Celsius each week. Is that a significant error? It's hard to say, but it is not terrible at first glance.</li>
        <li>On the other hand, in the naive method, we've got $MAE=2.82$, which is slightly worse</li>
    </ul>
</blockquote>

# <p style="padding:15px; background-color:#fffaf6; font-family:JetBrains Mono; font-weight:bold; color:#3E3F4C; font-size:100%; letter-spacing: 2px; text-align:center; border-radius: 10px 10px">Annual Predictions - ARIMA Approach</p>