# 7. Gas price and temperature correlations

In [None]:
#imports
import pandas as pd
import os
import numpy as np
import plotly.graph_objects as go
import plotly.express as px
from datetime import datetime
from datetime import timedelta

# folder that holds the input csv files (relative to the notebook)
data_folder = "data"

# silence every warning for the rest of the notebook
# (this also hides pandas deprecation / future warnings)
import warnings
warnings.simplefilter("ignore")

In [None]:
# colour palettes used by the plots
plt_style_c = px.colors.sequential.haline  # "complex" palette
plt_style_s = px.colors.diverging.Portland  # "simple" palette

# default plot size in pixels
# NOTE(review): this dict is not referenced by any code visible in this section
size = dict(width=1500, height=750)

#function for plotting
def scale_show(fig, width=1500, height=750):
    """Apply the notebook's common font sizes and figure size, then show the figure.

    Args:
        fig: figure object exposing ``update_layout(**kwargs)`` and ``show()``
            (the plotly figures created in this notebook).
        width (int, optional): figure width in pixels. Defaults to 1500.
        height (int, optional): figure height in pixels. Defaults to 750.

    Returns:
        None. The figure is modified in place and displayed.
    """

    # one layout update for fonts and size
    fig.update_layout(
        font=dict(size=16),
        title_font=dict(size=20),
        xaxis_title_font=dict(size=18),
        yaxis_title_font=dict(size=18),
        width=width,
        height=height,
    )

    #show
    fig.show()

## 7.1 Data understanding (gathering, cleaning)
- source: https://www.eia.gov/dnav/ng/hist/rngwhhdD.htm

In [None]:
# load the main dataset and keep only the date and the two temperature columns
main_csv_path = os.path.join(data_folder, "df_main.csv")
df = pd.read_csv(main_csv_path)[["date", "t2m", "t2m_t1_cat"]]
df.head()

In [None]:
# load the raw gas price download
raw_gas_csv_path = os.path.join(data_folder, "raw_gas", "raw_download.csv")
df_gas_raw = pd.read_csv(raw_gas_csv_path)
df_gas_raw.tail()

In [None]:
# NOTE: plain assignment - df_gas is the same object as df_gas_raw, no copy is made
df_gas = df_gas_raw
# number of missing values per column
df_gas.isna().sum()

In [None]:
# fill gaps in the spot price by linear interpolation; assign the result back explicitly,
# because an inplace call on a selected column is not guaranteed to update the DataFrame
df_gas["gas_usd_spot"] = df_gas["gas_usd_spot"].interpolate(method = "linear")

In [None]:
# df_gas again refers to the same object as df_gas_raw (no copy is made)
df_gas = df_gas_raw
# re-check the number of missing values per column after the interpolation cell
df_gas.isna().sum()

In [None]:
# line chart of the gas spot price over time
fig = px.line(
    df_gas,
    x="date",
    y="gas_usd_spot",
    color_discrete_sequence=plt_style_s,
    title="Natural gas price, spot (USD per Million Btu)",
)
scale_show(fig)

In [None]:
# keep only the dates present in both the gas price data and the temperature data
df = df_gas.merge(df, on="date", how="inner")
df

## 7.2 Data preparation

In [None]:
#calculate percentage change versus the previous row
# NOTE(review): a percentage change of a temperature depends on the zero point of its unit - confirm the unit of t2m
df["t2m_change"] = (((df["t2m"] - df["t2m"].shift(1)) / df["t2m"].shift(1)) * 100).round(2)
df["gas_usd_spot_change"] = (((df["gas_usd_spot"] - df["gas_usd_spot"].shift(1)) / df["gas_usd_spot"].shift(1)) * 100).round(2)
df["date"] = pd.to_datetime(df["date"], format = "%Y-%m-%d")

# a previous value of exactly 0 gives +/-inf, which dropna() would keep and which would
# turn the mean / std of the column into inf / NaN later on; convert it to NaN first
change_cols = ["t2m_change", "gas_usd_spot_change"]
df[change_cols] = df[change_cols].replace([np.inf, -np.inf], np.nan)

# drop rows with missing values (at least the first row, which has no previous value)
df.dropna(inplace = True)
df

In [None]:
# standardize (z-score) each column so that the series are comparable on one scale
columns_to_scale = ["gas_usd_spot", "t2m", "t2m_change", "gas_usd_spot_change"]
for name in columns_to_scale:
    centered = df[name] - df[name].mean()
    df[name] = centered / df[name].std()

In [None]:
# temperature change vs. gas price change, with an OLS trend line in red
fig = px.scatter(
    df,
    x="t2m_change",
    y="gas_usd_spot_change",
    trendline="ols",
    trendline_color_override="red",
    title="Gas price change and temperature change comparison",
    color_discrete_sequence=plt_style_s,
)
scale_show(fig)

In [None]:
# standardized temperature vs. standardized gas price (levels, not changes),
# with an OLS trend line in red
fig = px.scatter(
    data_frame = df,
    x = "t2m",
    y = "gas_usd_spot",
    color_discrete_sequence = plt_style_s,
    title = "Gas price and temperature comparison",
    trendline = "ols",
    trendline_color_override = "red",
)

scale_show(fig)

In [None]:
# same comparison of levels (not changes), restricted to October-March
# and split into one facet per t2m_t1_cat value
fig = px.scatter(
    data_frame = df.loc[df["date"].dt.month.isin([10,11,12,1,2,3])],
    x = "t2m",
    y = "gas_usd_spot",
    color_discrete_sequence= plt_style_s,
    title = "Gas price and temperature comparison",
    trendline = "ols",
    trendline_color_override = "red",
    facet_col = "t2m_t1_cat",
)

scale_show(fig)

In [None]:
# change comparison restricted to October-March, one facet per t2m_t1_cat value
winter_rows = df["date"].dt.month.isin([10, 11, 12, 1, 2, 3])
fig = px.scatter(
    df.loc[winter_rows],
    x="t2m_change",
    y="gas_usd_spot_change",
    facet_col="t2m_t1_cat",
    trendline="ols",
    trendline_color_override="red",
    title="Gas price change and temperature change comparison",
    color_discrete_sequence=plt_style_s,
)
scale_show(fig)

In [None]:
# both change series for the last 200 rows
fig = px.line(
    df.tail(200),
    x="date",
    y=["t2m_change", "gas_usd_spot_change"],
    color_discrete_sequence=plt_style_s,
    title="Change over time",
)
scale_show(fig)

In [None]:
# standardized temperature and gas price for (at most) the last 40000 rows
fig = px.line(
    data_frame = df.iloc[-40000:],
    x = "date",
    y = ["t2m", "gas_usd_spot"],
    title = "Standardized values over time",
    color_discrete_sequence = plt_style_s,
)

scale_show(fig)

In [None]:
#apply z score peaks
#source: https://stackoverflow.com/questions/22583391/peak-signal-detection-in-realtime-timeseries-data/43512887#43512887

def thresholding_algo(y, lag, threshold, influence):
    """Robust peak detection algorithm (using z-scores)

    A value is flagged when it deviates from the moving average of the previous
    ``lag`` filtered values by more than ``threshold`` moving standard deviations.
    Flagged values enter the moving window damped by ``influence``.

    Args:
        y (array-like): y_vector / time series. It is read positionally, so a
            pandas Series is handled by position regardless of its index labels.
        lag (int): the lag of the moving window
        threshold (float): the z-score at which the algorithm signals
        influence (float): the influence (between 0 and 1) of new signals on the
            mean and standard deviation

    Returns:
        dict: {
            signals: array with 1 (upper peak), -1 (lower peak) or 0 per position
            avgFilter: moving average of the filtered series
            stdFilter: moving standard deviation of the filtered series
        }

    Raises:
        IndexError: if ``lag`` is larger than the number of values in ``y``.
    """

    # positional float copy: avoids label-based lookups on a pandas Series and
    # integer truncation of the damped values for integer input
    values = np.asarray(y, dtype=float)
    n_values = len(values)

    signals = np.zeros(n_values)
    filteredY = values.copy()
    avgFilter = np.zeros(n_values)
    stdFilter = np.zeros(n_values)

    # initialise the filters from the first `lag` values
    avgFilter[lag - 1] = np.mean(values[0:lag])
    stdFilter[lag - 1] = np.std(values[0:lag])

    for i in range(lag, n_values):
        deviation = values[i] - avgFilter[i - 1]

        if abs(deviation) > threshold * stdFilter[i - 1]:
            signals[i] = 1 if deviation > 0 else -1
            # a peak only partially enters the moving window
            filteredY[i] = influence * values[i] + (1 - influence) * filteredY[i - 1]

        # update the moving statistics over the last `lag` filtered values
        window = filteredY[(i - lag + 1):i + 1]
        avgFilter[i] = np.mean(window)
        stdFilter[i] = np.std(window)

    return dict(signals = signals,
                avgFilter = avgFilter,
                stdFilter = stdFilter)

In [None]:
#get score
for col in ["t2m", "gas_usd_spot"]:

    # hand over a plain array so the values are read by position; the index of df
    # no longer starts at 0 once rows have been dropped
    peak_signals = thresholding_algo(y = df[col].to_numpy(), lag = 360, threshold = 2.5, influence = 0.1)["signals"]

    # keep the value at peak positions and NaN everywhere else, so that the
    # column can be drawn as scatter markers
    df[f"{col}_zscore"] = df[col].where(peak_signals != 0)


In [None]:
# use the same rows for the line and for the peak markers
plot_df = df.iloc[-10000:]

fig = px.line(
    data_frame = plot_df,
    x = "date",
    y = "t2m",
    title = "Standardized values over time",
    color_discrete_sequence = plt_style_s,
)

# peaks found by the moving z-score algorithm, drawn as markers on top of the line
fig.add_scatter(
    x = plot_df["date"],
    y = plot_df["t2m_zscore"],
    mode = "markers",
    name = "local z score peaks t2m",
    marker = {"size" : 10},

)

scale_show(fig)

In [None]:
def fixed_lower_th(y, th = -2.4):
    """Flag the values that are at or below a fixed lower threshold.

    Args:
        y: numpy array or pandas Series of values
        th: lower threshold, a value ``<= th`` is flagged

    Returns:
        integer array / Series like ``y`` with -1 for flagged values and 0 otherwise
    """

    is_below = y <= th

    return -is_below.astype(int)

In [None]:
# mark the values at or below the fixed threshold of -2.4
for col in ["t2m", "gas_usd_spot"]:
    th_col = f"{col}_th"
    flags = fixed_lower_th(y=df[col], th=-2.4)

    # multiplying twice by the 0 / -1 flag keeps the value where flagged and gives 0 elsewhere
    df[th_col] = df[col] * flags * flags

    # zeros become NaN so that only the flagged points are drawn as scatter markers
    df.loc[df[th_col] == 0, th_col] = None

In [None]:
# use the same rows for the line and for the markers
plot_df = df.iloc[-10000:]

fig = px.line(
    data_frame = plot_df,
    x = "date",
    y = "t2m",
    title = "Standardized values over time",
    color_discrete_sequence = plt_style_s,
)

# values at or below the fixed lower threshold, drawn as markers on top of the line
fig.add_scatter(
    x = plot_df["date"],
    y = plot_df["t2m_th"],
    mode = "markers",
    name = "fixed threshold peaks t2m",
    marker = {"size" : 10},

)

scale_show(fig)

In [None]:
# recompute the raw peak signals (-1 / 0 / 1) with a shorter window and a stricter threshold;
# a plain array is passed so the values are read by position, not by index label
df["t2m_zscore"] = thresholding_algo(y = df["t2m"].to_numpy(), lag = 60, threshold = 3.5, influence = 0.5)["signals"]

In [None]:
# turn both peak columns into two-level indicators: 10 = peak, -10 = no peak
# NOTE: this overwrites the columns, so running the cell a second time gives different values
has_th_peak = df["t2m_th"].notna()
df.loc[has_th_peak, "t2m_th"] = 10
df.loc[~has_th_peak, "t2m_th"] = -10

# |signal| is 0 or 1, which maps to -10 or 10
df["t2m_zscore"] = 20 * df["t2m_zscore"].abs() - 10

In [None]:
# gas price together with both temperature peak indicators
fig = px.line(
    df,
    x="date",
    y=["gas_usd_spot", "t2m_th", "t2m_zscore"],
    color_discrete_sequence=plt_style_s,
    title="Gas price",
)

# fixed y range: the -10 level of the indicators lies below the visible area
fig.update_layout(yaxis_range=[-3.5, 10])
scale_show(fig)

In [None]:
# gas price together with the z-score peak indicator only
fig = px.line(
    df,
    x="date",
    y=["gas_usd_spot", "t2m_zscore"],
    color_discrete_sequence=plt_style_s,
    title="Gas price",
)

# fixed y range: the -10 level of the indicator lies below the visible area
fig.update_layout(yaxis_range=[-3.5, 10])
scale_show(fig)

In [None]:
# gas price over time, coloured by t2m_t1_cat
# NOTE(review): color_continuous_scale presumably only takes effect for a numeric
# colour column - confirm the dtype of t2m_t1_cat
fig = px.scatter(
    df,
    x="date",
    y="gas_usd_spot",
    color="t2m_t1_cat",
    color_continuous_scale=plt_style_s,
    title="Gas price",
)
scale_show(fig)