# 2. Data preparation

In [None]:
#imports
import pandas as pd
import os
import numpy as np
import plotly.graph_objects as go
import plotly.express as px
from datetime import datetime

#folders
data_folder = "data"

#warnings
import warnings
warnings.filterwarnings('ignore')

In [None]:
#set whether the data frames produced below should be saved to disk
save_data = False

In [None]:
#plot styles (plotly colour sequences)
plt_style_c = px.colors.sequential.haline #complex
plt_style_s = px.colors.diverging.Portland #simple

#default plot size in pixels
#NOTE(review): scale_show() uses the same 1500 x 750 values; this dict itself is not read anywhere in the visible code
size = {
    "width" : 1500 ,
    "height" : 750 ,
}

#function for plotting
def scale_show(fig, width = 1500, height = 750):
    """Apply the notebook's default fonts and size to a figure and display it.

    Parameters
    ----------
    fig : object exposing ``update_layout(**kwargs)`` and ``show()``
        (a plotly figure in this notebook).
    width, height : int
        Figure size in pixels. The defaults reproduce the previously
        hard-coded 1500 x 750 output, so existing calls are unaffected.

    Returns
    -------
    None
    """

    #fonts and size in a single layout update
    fig.update_layout(
        font = dict(size=16),
        title_font = dict(size=20),
        xaxis_title_font = dict(size=18),
        yaxis_title_font = dict(size=18),
        width = width,
        height = height,
    )

    #show
    fig.show()

## 2.1 Main data frame feature engineering

In [None]:
df = pd.read_csv(os.path.join(data_folder, "df.csv"))
df.head()

In [None]:
#clean up: remove every column whose name contains "unnamed" (case-insensitive)
unnamed_cols = [name for name in df.columns if "unnamed" in name.lower()]
df.drop(columns = unnamed_cols, inplace = True)

In [None]:
#period lengths for the cyclical date encoding
day : int = 24 * 60 * 60 #[sec]
#mean Gregorian year: a fixed 366-day period would drift out of phase
#by roughly 0.76 days for every year the series covers
year : float = 365.2425 * day #[sec]

year

In [None]:
#create a numeric time axis [sec since epoch] from the date column
df["timestamp"] = pd.to_datetime(df["date"]).apply(datetime.timestamp)

#period lengths
day : int = 24 * 60 * 60 #[sec]
#mean Gregorian year: a fixed 366-day period would drift out of phase
#by roughly 0.76 days for every year the series covers
year : float = 365.2425 * day #[sec]

#encode the position within the year as a point on the unit circle
df["year_sin"] = np.sin(df["timestamp"] * (2*np.pi / year))
df["year_cos"] = np.cos(df["timestamp"] * (2*np.pi / year))

#delete the helper column, it is not needed any more
df.drop(labels = ["timestamp"], axis = 1, inplace = True)
df.head()

In [None]:
# plot the yearly sine/cosine encoding (rows from position 15000 onwards)
fig = px.line(
    data_frame = df.iloc[15000:],
    x = "date",
    y = ["year_sin", "year_cos"],

    title = "Year sine and cosine",
    color_discrete_sequence = plt_style_s,
)

scale_show(fig)

In [None]:
# add rolling means for soi, ao, mjo_amplitude, t2m and nao

cols_rolling_mean = ["soi", "ao", "mjo_amplitude", "t2m", "nao"]
offset = 30 #window length in rows (days, assuming one row per day - TODO confirm)

for col in cols_rolling_mean:

    #get rolling mean, stored next to the raw column as "ma_<col>"
    col_name = f"ma_{col}"
    df[col_name] = df[col].rolling(offset).mean()

    #plot the raw series against its moving average (rows from position 15000 onwards)
    fig = px.line(
        data_frame = df[15000:],
        y = [col, col_name],
        x = "date",
        title = f"Moving average: {col}",
        color_discrete_sequence = plt_style_s
    )

    scale_show(fig)

In [None]:
# add wind speed: magnitude of the (u10, v10) vector
df['wind_speed'] = np.sqrt(df['u10']**2 + df['v10']**2)

#add wind direction in degrees, wrapped to [0, 360)
#NOTE(review): the common meteorological "coming from" convention is
#(180 + atan2(u, v)) % 360; this code adds 90 instead - confirm the intended convention
df['wind_direction'] = np.rad2deg(np.arctan2(df['u10'], df['v10'])) % 360
df['wind_direction'] = (df['wind_direction'] + 90) % 360

df.head().T

In [None]:
#distribution of the wind direction, stacked by month
fig = px.histogram(
    data_frame = df,
    x = "wind_direction",
    histnorm = "probability density",
    title = "Wind direction",
 
    color = "month",
    barmode = "stack",
    opacity = 1,

    nbins = 180,

    labels = {"wind_direction" : "wind direction [°]"},

    color_discrete_sequence = plt_style_c,
)

scale_show(fig)

In [None]:
#distribution of the wind speed, stacked by month
fig = px.histogram(
    data_frame = df,
    x = "wind_speed",
    histnorm = "probability density",
    title = "wind_speed",
 
    color = "month",
    barmode = "stack",
    opacity = 1,

    nbins = 180,

    labels = {"wind_speed" : "wind speed [m/s]"},

    color_discrete_sequence = plt_style_c,
)

scale_show(fig)

In [None]:
# add peak identifiers for mjo, ao, soi

In [None]:
#use the date as index of the feature-engineered frame
df = df.set_index("date")

#optionally persist the result
if save_data:
    out_path = os.path.join(data_folder, "df_fe.csv")
    df.to_csv(out_path)

## 2.2 Polar vortex index engineering

Background reading: https://www.severe-weather.eu/global-weather/strong-polar-vortex-warming-collapse-event-forecast-spring-2022-usa-europe-fa/

Analyzing patterns during breakdown events

In [None]:
#load the polar vortex data; from here on `df` refers to this frame
df = pd.read_csv(os.path.join(data_folder, "df_pv.csv"))
#df["date"] = pd.to_datetime(df["date"])
#constant column passed as size = "size" to the scatter plots below
df["size"] = 1
df.head()

Interpolate pressure levels

In [None]:
#interpolate data for plotting: add the pressure levels listed below and
#fill their values linearly from the neighbouring levels
missing_levels = [40,60,80,90]

dates   = df["date"].unique()
lons    = df["longitude"].unique()
lats    = df["latitude"].unique()
lvls    = missing_levels

#one empty row per (date, longitude, latitude, missing level);
#from_product iterates in the same order as four nested loops would
df_int = pd.MultiIndex.from_product(
    [dates, lons, lats, lvls],
    names = ["date", "longitude", "latitude", "level"],
).to_frame(index = False)

#merge dfs and sort so each new level sits between its neighbouring levels
df = pd.concat(objs = [df, df_int])
df.sort_values(by = ["date", "longitude", "latitude", "level"], inplace = True, axis = 0)

#clean up
df.reset_index(inplace = True)

#flag the rows that belong to an added level
df["interpolated"] = 0
df.loc[df["level"].isin(missing_levels), "interpolated"] = 1

#interpolate values row-wise; only numeric columns can be interpolated,
#the key columns contain no gaps and are left untouched
num_cols = df.select_dtypes(include = "number").columns
df[num_cols] = df[num_cols].interpolate(method = "linear")

#recalculate speed and direction for the interpolated rows from the
#interpolated u / v components (interpolating an angle directly is not meaningful)
is_interpolated = df["interpolated"] == 1
u_int = df.loc[is_interpolated, "u"]
v_int = df.loc[is_interpolated, "v"]

#speed
df.loc[is_interpolated, "speed"] = np.sqrt(u_int ** 2 + v_int ** 2)

#direction in degrees, wrapped to [0, 360)
#NOTE(review): same +90 offset as used for the main data frame - confirm the convention
df.loc[is_interpolated, "direction"] = (np.rad2deg(np.arctan2(u_int, v_int)) % 360 + 90) % 360

df.head(20)

In [None]:
if save_data is True:
    #rows at longitude 8, reduced to the columns kept for clustering, indexed by date
    at_lon_8 = df["longitude"] == 8
    clustering_cols = ["date", "longitude", "latitude", "level", "t", "speed"]
    df_pv_clustering = df.loc[at_lon_8, clustering_cols].set_index("date", drop = True)
    df_pv_clustering.to_csv(os.path.join(data_folder, "df_pv_clustering.csv"))

In [None]:
#wind speed per latitude / pressure level, one facet row per date (2019-03-01 .. 2019-03-04)
plot_scaler = 20
fig = px.scatter(
    data_frame = df.loc[(df["date"] >= "2019-03-01") & (df["date"] <= "2019-03-04")],
    y = "level",
    x = "latitude",
    color = "speed",
    size = "size",
    size_max = 1 * plot_scaler - 1,
    opacity = 1,
    facet_row = "date",
    #animation_frame = "date",

    height = (15 * 4)  * plot_scaler,
    width = 60 * plot_scaler,
    color_continuous_scale =  plt_style_s,

    title = "Polar vortex wind speed",

    labels = {"speed" : "speed [m/s]"},
)

#reversed level axis
fig['layout']['yaxis']['autorange'] = "reversed"

#square markers so the points read as a grid of cells
fig.update_traces(
    marker=dict(symbol="square",),
    selector=dict(mode='markers')
)

fig.show()

In [None]:
#temperature per latitude / pressure level, one facet row per date (2019-03-01 .. 2019-03-04)
plot_scaler = 20
fig = px.scatter(
    data_frame = df.loc[(df["date"] >= "2019-03-01") & (df["date"] <= "2019-03-04")],
    y = "level",
    x = "latitude",
    color = "t",
    size = "size",
    size_max = 1 * plot_scaler - 1,
    opacity = 1,
    facet_row = "date",
    #animation_frame = "date",

    height = (15 * 4)  * plot_scaler,
    width = 60 * plot_scaler,
    color_continuous_scale =  plt_style_s,

    title = "Polar vortex temperature",

    labels = {"t" : "t [k°]"},
)

#reversed level axis
fig['layout']['yaxis']['autorange'] = "reversed"

#square markers so the points read as a grid of cells
fig.update_traces(
    marker=dict(symbol="square",),
    selector=dict(mode='markers')
)

fig.show()

In [None]:
#drop latitude 90. Looks like a cut in the data
#NOTE: this permanently removes those rows from df for every cell that follows
df.drop(df.loc[df["latitude"] == 90].index, inplace = True)


#wind speed per latitude / pressure level, one facet row per date (1979-01-23 .. 1979-01-30)
plot_scaler = 20
fig = px.scatter(
    data_frame = df.loc[(df["date"] >= "1979-01-23") & (df["date"] <= "1979-01-30")],
    y = "level",
    x = "latitude",
    color = "speed",
    size = "size",
    size_max = 1 * plot_scaler - 1,
    opacity = 1,
    facet_row = "date",
    #animation_frame = "date",

    height = (20 * 6)  * plot_scaler,
    width = 60 * plot_scaler,
    color_continuous_scale =  plt_style_s,

    title = "Polar vortex wind speed",

    labels = {"speed" : "speed [m/s]"},
)

#reversed level axis
fig['layout']['yaxis']['autorange'] = "reversed"

#square markers so the points read as a grid of cells
fig.update_traces(
    marker=dict(symbol="square",),
    selector=dict(mode='markers')
)

fig.show()

Custom index 1 (wind speed only)
 - Convolution on latitude level (vertical bar with pieces)
 - Detect the border of the vortex by wind speed (find the most constant latitude via the standard deviation, invert it, multiply by the wind speed)
 - value_0 = border_height (0 .. n, 0 = Switzerland, n = North Pole)
 - value_1 = temperature at height

In [None]:
def pv_i1_w(df):
    """Find, per date, the latitude with the highest stability-weighted wind speed.

    Speed and temperature are averaged over all rows sharing a
    (date, latitude) pair. The standard deviation of speed within each pair
    is min-max normalised per date and inverted, so the steadiest latitude
    gets weight 1 and the most variable weight 0. The row(s) maximising
    ``speed_mean * weight`` per date are returned.

    Parameters
    ----------
    df : frame with at least the columns "date", "latitude", "t", "speed".

    Returns
    -------
    Frame with the columns "date" (converted to datetime), "latitude" and
    "t_mean"; ties yield more than one row for a date.
    """

    #statistics per (date, latitude)
    per_lat = (
        df[["date", "latitude", "t", "speed"]]
        .groupby(by = ["date", "latitude"], as_index = False)
        .aggregate(
            speed_mean = ("speed", "mean"),
            speed_std = ("speed", "std"),
            t_mean = ("t", "mean"),
        )
    )

    #range of the standard deviation per date, used for the normalisation
    # source: https://business.blogthinkbig.com/warning-about-normalizing-data/
    std_range = (
        per_lat[["date", "speed_std"]]
        .groupby(by = ["date"], as_index = False)
        .aggregate(
            speed_std_max = ("speed_std", "max"),
            speed_std_min = ("speed_std", "min"),
        )
    )

    #convert both date keys (via str) to datetime before merging
    for frame in (per_lat, std_range):
        frame["date"] = pd.to_datetime(frame["date"].astype(str))

    per_lat = per_lat.merge(right = std_range, on = "date", how = "left")

    #inverted, normalised standard deviation and the resulting weighted speed
    spread = per_lat["speed_std_max"] - per_lat["speed_std_min"]
    per_lat["speed_norm_inv"] = 1 - (per_lat["speed_std"] - per_lat["speed_std_min"]) / spread
    per_lat["speed_weight"] = per_lat["speed_mean"] * per_lat["speed_norm_inv"]

    #mark the row(s) holding the per-date maximum
    best = per_lat[["date", "speed_weight"]].groupby(by = "date", as_index = False).max()
    best["is_max"] = 1
    per_lat = per_lat.merge(right = best, on = ["date", "speed_weight"], how = "left")

    #keep only the marked rows and the result columns
    return per_lat.loc[per_lat["is_max"] == 1, ["date", "latitude", "t_mean"]]

In [None]:
df_i1_w = pv_i1_w(df)
df_i1_w.head()

In [None]:
def plot_i1(df, df_i1, plot_param):
    """Plot the polar vortex per date and mark the index-1 latitude.

    One scatter figure (latitude vs. level, coloured by ``plot_param``) is
    shown for each of a fixed list of dates. A vertical line is drawn at
    every latitude that ``df_i1`` reports for that date.

    Parameters
    ----------
    df : frame with "date", "latitude", "level", "size" and the
        ``plot_param`` column.
    df_i1 : frame with "date" and "latitude" (e.g. the result of
        ``pv_i1_w`` / ``pv_i1_wt``).
    plot_param : "t" or "speed".

    Raises
    ------
    ValueError
        If ``plot_param`` is neither "t" nor "speed".
    """

    titles = {"t" : "temperature", "speed" : "wind speed"}
    if plot_param not in titles:
        raise ValueError(f"plot_param must be one of {list(titles)}, got {plot_param!r}")
    title = titles[plot_param]

    dates = [
        "2019-03-01", "2019-03-02", "2019-03-03", "2019-03-04"
    ]

    plot_scaler = 20

    for date in dates:

        fig = px.scatter(
            data_frame = df.loc[df["date"] == date],
            x = "latitude",
            y = "level",
            color = plot_param,
            size = "size",
            size_max = 1 * plot_scaler -7,
            opacity = 1,
            facet_col = "date",

            height = 20  * plot_scaler,
            width = 60 * plot_scaler,
            color_continuous_scale =  plt_style_s,

            title = f"Polar vortex {title}",

            labels = {"speed" : "speed [m/s]"},
        )

        fig.update_traces(
            marker=dict(symbol="square",),
            selector=dict(mode='markers')
        )

        #indicator: the index frame may hold no row (date missing) or several
        #rows (ties) for a date, so draw one line per reported latitude
        for lat in df_i1.loc[df_i1["date"] == date, "latitude"]:
            fig.add_vline(x = float(lat))

        #reverse axis
        fig['layout']['yaxis']['autorange'] = "reversed"

        fig.show()

In [None]:
plot_i1(df, df_i1_w, "t")

Custom index 1 (wind speed + temperature)
 - Convolution on latitude level (vertical bar with pieces)
 - Detect the border of the vortex by wind speed and temperature (find the most constant latitude via the standard deviation, invert it, multiply by the wind speed)
 - value_0 = border_height (0 .. n, 0 = Switzerland, n = North Pole)
 - value_1 = temperature at height

In [None]:
def pv_i1_wt(df, speed_weight = 0.3, t_weight = 0.7):
    """Find, per date, the latitude maximising a speed/temperature weighted wind speed.

    Speed and temperature are averaged over all rows sharing a
    (date, latitude) pair. Two per-date min-max normalised and inverted
    scores are built: one from the standard deviation of speed (steady = 1)
    and one from the mean temperature (coldest = 1). The mean speed is
    multiplied by ``speed_weight * speed score + t_weight * temperature
    score`` and the row(s) with the per-date maximum are returned.

    Parameters
    ----------
    df : frame with at least the columns "date", "latitude", "t", "speed".
    speed_weight, t_weight : factors applied to the two scores.

    Returns
    -------
    Frame with the columns "date" (converted to datetime), "latitude" and
    "t_mean"; ties yield more than one row for a date.
    """

    #statistics per (date, latitude)
    per_lat = (
        df[["date", "latitude", "t", "speed"]]
        .groupby(by = ["date", "latitude"], as_index = False)
        .aggregate(
            speed_mean = ("speed", "mean"),
            speed_std = ("speed", "std"),
            t_mean = ("t", "mean"),
        )
    )

    #per-date ranges used for the normalisation
    # source: https://business.blogthinkbig.com/warning-about-normalizing-data/
    ranges = (
        per_lat[["date", "speed_std", "t_mean"]]
        .groupby(by = ["date"], as_index = False)
        .aggregate(
            speed_std_max   = ("speed_std", "max"),
            speed_std_min   = ("speed_std", "min"),
            t_mean_max      = ("t_mean", "max"),
            t_mean_min      = ("t_mean", "min"),
        )
    )

    #convert both date keys (via str) to datetime before merging
    for frame in (per_lat, ranges):
        frame["date"] = pd.to_datetime(frame["date"].astype(str))

    per_lat = per_lat.merge(right = ranges, on = "date", how = "left")

    #inverted, normalised scores
    std_spread = per_lat["speed_std_max"] - per_lat["speed_std_min"]
    t_spread = per_lat["t_mean_max"] - per_lat["t_mean_min"]
    per_lat["speed_norm_inv"] = 1 - (per_lat["speed_std"] - per_lat["speed_std_min"]) / std_spread
    per_lat["t_norm_inv"] = 1 - (per_lat["t_mean"] - per_lat["t_mean_min"]) / t_spread

    #weighted wind speed
    combined_score = per_lat["speed_norm_inv"] * speed_weight + per_lat["t_norm_inv"] * t_weight
    per_lat["weight"] = per_lat["speed_mean"] * combined_score

    #mark the row(s) holding the per-date maximum
    best = per_lat[["date", "weight"]].groupby(by = "date", as_index = False).max()
    best["is_max"] = 1
    per_lat = per_lat.merge(right = best, on = ["date", "weight"], how = "left")

    #keep only the marked rows and the result columns
    return per_lat.loc[per_lat["is_max"] == 1, ["date", "latitude", "t_mean"]]

In [None]:
df_i1_wt = pv_i1_wt(df)
df_i1_wt.head()

In [None]:
plot_i1(df, df_i1_wt, "speed") #"speed", "t"

In [None]:
del df_i1_w, df_i1_wt

 Custom index 2 (edge detection):
 - detect sharp borders of temperature changes
 - create contrast values across n vertical rows
 - detect the sharpest border, based on a threshold
 - if no value reaches the threshold, a breakdown can be detected

In [None]:
# Custom index idea 2:
# - detect sharp borders of temperature changes
# - create a contrast values accors n vertical rows
# - detect sharpest border, based on threshold
# - if no value recheas the threshold, a breakdown can be detected

In [None]:
class PV_ind_2_v2():
    """Polar vortex index 2: edge detection on latitude-to-latitude contrasts.

    The class only groups stateless helpers; ``main`` chains them. All
    members are static, so they work both as ``PV_ind_2_v2.main(...)`` and
    on an instance.
    """

    @staticmethod
    def main(df, n_lat, threshold, break_down_offset, break_down_sensitivity, metric):
        """Run the full pipeline and return one row per detected per-date maximum.

        Parameters
        ----------
        df : frame with "date", "latitude", "t" and "speed".
        n_lat : number of neighbouring latitudes used for the contrast.
        threshold : the per-date maximum must exceed
            ``mean delta * (1 + threshold)`` to count as a vortex edge.
        break_down_offset : rolling window length (rows) over ``pv_edge``.
        break_down_sensitivity : level of the rolling ``pv_edge`` mean; dropping
            from above it to at-or-below it flags a breakdown.
        metric : "t" or "speed".
        """

        df_i2 = PV_ind_2_v2.get_delta(df, n_lat, metric)
        df_i2 = PV_ind_2_v2.get_local_max_delta(df_i2)
        df_i2 = PV_ind_2_v2.apply_threshold(df_i2, threshold)
        df_i2 = PV_ind_2_v2.detect_breakdown(df_i2, break_down_offset, break_down_sensitivity)

        return df_i2

    @staticmethod
    def get_delta(df, n, metric):
        """Return per (date, latitude) means plus the contrast column "delta".

        "delta" is the mean absolute difference between consecutive values of
        the metric over the next ``n`` latitudes towards the south. It is NaN
        for the ``n`` lowest latitudes, which lack a full set of neighbours.
        """

        #create latitude aggregation
        df_i2 = df[["date", "latitude", "t", "speed"]]
        df_i2 = df_i2.groupby(by = ["date", "latitude"], as_index = False).aggregate(
            speed_mean  = ("speed", "mean"),
            t_mean      = ("t", "mean"),
        )

        #north to south within each date
        df_i2.sort_values(by = ["date", "latitude"], ascending = [True , False], inplace = True)

        #shifted copies of the metric for border detection; shifting within
        #each date keeps values of the following date from leaking in
        base_col = f"{metric}_mean"
        cols = [base_col]

        for i in range(1, n+1):
            shifted_col = f"{base_col}_-{i}"
            df_i2[shifted_col] = df_i2.groupby(by = "date")[base_col].shift(-i)
            cols.append(shifted_col)

        #absolute deltas between neighbouring columns
        cols_delta = []
        for i in range(n):
            delta_col = f"delta_{i}"
            df_i2[delta_col] = (df_i2[cols[i]] - df_i2[cols[i + 1]]).abs()
            cols_delta.append(delta_col)

        #mean of the deltas is the border value
        df_i2["delta"] = df_i2[cols_delta].mean(axis=1)

        #blank the n lowest latitudes
        lowest_lats = sorted(df_i2["latitude"].unique())[:n]
        df_i2.loc[df_i2["latitude"].isin(lowest_lats), "delta"] = float("nan")

        #clean up
        df_i2.drop(labels = cols[1:] + cols_delta, axis = 1, inplace = True)

        return df_i2

    @staticmethod
    def get_local_max_delta(df_i2):
        """Add "is_max": 1 on the row(s) holding the per-date maximum delta, NaN elsewhere."""

        #get max values
        df_i2_max = df_i2[["date", "delta"]].groupby(by = ["date"], as_index = False).max()
        df_i2_max["is_max"] = 1

        #set max in master df
        df_i2 = df_i2.merge(right = df_i2_max, on = ["date", "delta"], how = "left")

        return df_i2

    @staticmethod
    def apply_threshold(df_i2, threshold):
        """Keep the per-date maximum rows and flag them as edge in "pv_edge".

        "mean_delta" holds the per-date mean delta multiplied by
        ``1 + threshold``; "pv_edge" is 1 when the maximum exceeds it, else 0.
        """

        #set th as multiplicator
        th = threshold + 1

        #get mean
        df_i2_d = df_i2[["date","delta"]].groupby(by = "date", as_index = False).aggregate(
            mean_delta = ("delta", "mean"),
        )

        #combine and apply th
        df_i2 = df_i2.merge(right = df_i2_d, on = "date", how = "left")
        df_i2["mean_delta"] = df_i2["mean_delta"] * th

        #compare
        df_i2["pv_edge"] = 0
        df_i2.loc[(df_i2["is_max"] == 1) & (df_i2["delta"] > df_i2["mean_delta"]), "pv_edge"] = 1

        #clean up: only the maximum rows remain
        df_i2.dropna(subset = ["is_max"], inplace = True)
        df_i2.drop(labels = ["is_max"], axis = 1, inplace = True)

        return df_i2

    @staticmethod
    def detect_breakdown(df_i2, break_down_offset, break_down_sensitivity):
        """Add "pv_break_down_event" and convert "date" to datetime.

        An event is flagged on a row when the rolling mean of "pv_edge" is at
        or below ``break_down_sensitivity`` while it was above it on the
        previous row, and the row's month is December to April.
        """

        #considered months
        considered_months = [12,1,2,3,4]
        df_i2["date"] = pd.to_datetime(df_i2["date"])

        #consider offset by applying rolling mean.
        df_i2["pv_edge_offset"] = df_i2["pv_edge"].rolling(break_down_offset).mean()

        #break down event: downward crossing of the sensitivity level
        df_i2["pv_break_down_event"] = 0
        df_i2.loc[
            (df_i2["pv_edge_offset"] <= break_down_sensitivity) &
            (df_i2["pv_edge_offset"].shift(1) > break_down_sensitivity) &
            (df_i2["date"].dt.month.isin(considered_months))
            , "pv_break_down_event"] = 1

        #clean up
        df_i2.drop(labels = "pv_edge_offset", axis = 1, inplace = True)

        return df_i2

In [None]:
#params
n_lat                   = 3 #number of neighbouring latitudes used for the delta calculation
threshold               = 1 #the per-date max delta must exceed mean delta * (1 + threshold) to count as a pv edge
break_down_offset       = 3 #rolling window length (rows) used to detect a break down
break_down_sensitivity  = 0.1 #level of the rolling pv_edge mean; dropping to or below it flags a break down

#index 2 based on wind speed
df_i2_s = PV_ind_2_v2.main(
    df = df,
    n_lat = n_lat,
    threshold = threshold,
    break_down_offset = break_down_offset,
    break_down_sensitivity = break_down_sensitivity,
    metric = "speed",
)

df_i2_s.head(5)

In [None]:
#params
n_lat                   = 2 #number of neighbouring latitudes used for the delta calculation
threshold               = 1 #the per-date max delta must exceed mean delta * (1 + threshold) to count as a pv edge
break_down_offset       = 3 #rolling window length (rows) used to detect a break down
break_down_sensitivity  = 0.1 #level of the rolling pv_edge mean; dropping to or below it flags a break down

#index 2 based on temperature
df_i2_t = PV_ind_2_v2.main(
    df = df,
    n_lat = n_lat,
    threshold = threshold,
    break_down_offset = break_down_offset,
    break_down_sensitivity = break_down_sensitivity,
    metric = "t",
)

df_i2_t.head(5)

In [None]:
def plot_i2(df, df_i, plot_param, dates = None):
    """Plot the polar vortex per date and mark the latitude reported by an index frame.

    One scatter figure (latitude vs. level, coloured by ``plot_param`` on a
    fixed colour range) is shown per date. A vertical line is drawn at every
    latitude that ``df_i`` holds for that date.

    Parameters
    ----------
    df : frame with "date", "latitude", "level", "size" and the
        ``plot_param`` column.
    df_i : index frame with "date" and "latitude".
    plot_param : "t" or "speed".
    dates : optional list of dates to plot; defaults to 2019-03-01 .. 2019-03-05.

    Raises
    ------
    ValueError
        If ``plot_param`` is neither "t" nor "speed".
    """

    #title and colour range per parameter
    settings = {
        "t"     : ("temperature", [190, 230]),
        "speed" : ("wind speed", [0, 50]),
    }
    if plot_param not in settings:
        raise ValueError(f"plot_param must be one of {list(settings)}, got {plot_param!r}")
    title, color_bounds = settings[plot_param]

    if dates is None:

        dates = [
            "2019-03-01", "2019-03-02", "2019-03-03", "2019-03-04", "2019-03-05",
        ]

    plot_scaler = 20

    for date in dates:

        fig = px.scatter(
            data_frame = df.loc[df["date"] == date],
            x = "latitude",
            y = "level",
            color = plot_param,
            size = "size",
            size_max = 1 * plot_scaler - 3,
            opacity = 1,
            facet_col = "date",
            range_color = color_bounds,

            height = 20  * plot_scaler,
            width = 60 * plot_scaler,
            color_continuous_scale  = plt_style_s,

            title = f"Polar vortex: {title}",

            labels = {"speed" : "speed [m/s]", "t" : "t [°k]"},
        )

        fig.update_traces(
            marker=dict(symbol="square",),
            selector=dict(mode='markers')
        )

        fig['layout']['yaxis']['autorange'] = "reversed"

        #indicator: the index frame may hold no row (date missing) or several
        #rows (ties) for a date, so draw one line per reported latitude
        for lat in df_i.loc[df_i["date"] == date, "latitude"]:
            fig.add_vline(x = float(lat))

        fig.show()

In [None]:
df

In [None]:
plot_i2(df, df_i2_s, "speed")

In [None]:
plot_i2(df, df_i2_t, "t")

In [None]:
#break down events of the wind speed based index 2 (last 5000 rows)
fig = px.line(
    data_frame = df_i2_s.iloc[-5000:],
    x = "date",
    y = "pv_break_down_event",

    title = "Polar vortex edge detection",
    color_discrete_sequence = plt_style_s,
)

scale_show(fig)

Custom index 3 (delta index):
 - convolution of the data at each end with a weighted mean, based on distance (twice: once for wind speed and once for temperature)
 - get the delta of the area as a single value for each metric from the two obtained values
 - or leave the two values as they are

In [None]:
class PV_ind_3():
    """Polar vortex index 3: north/south weighted sums of a normalised metric.

    The class only groups stateless helpers; ``main`` chains them. All
    members are static, so they work both as ``PV_ind_3.main(...)`` and on
    an instance.
    """

    @staticmethod
    def main(df, metric):
        """Return one row per date with "p_north", "p_south" and "p_delta".

        Parameters
        ----------
        df : frame with "date", "latitude" and the ``metric`` column.
        metric : name of the column to evaluate (e.g. "t" or "speed").
        """

        df_i3 = PV_ind_3.agg(df, metric)
        df_i3 = PV_ind_3.set_weights(df_i3)
        df_i3 = PV_ind_3.normalize_metric(df_i3, metric)
        df_i3 = PV_ind_3.set_p_values(df_i3, metric)

        return df_i3

    @staticmethod
    def agg(df, metric):
        """Average the metric over all rows sharing a (date, latitude) pair."""

        df_i3 = df[["date", "latitude", metric]].groupby(by = ["date", "latitude"], as_index = False).mean()

        return df_i3

    @staticmethod
    def normalize_metric(df_i3, metric):
        """Add "<metric>_norm": the metric min-max normalised within each date."""

        #get local min and max
        df_i3_minmax = df_i3[["date", metric]].groupby(by = "date", as_index = False).aggregate(
            metric_max      = (metric, "max"),
            metric_min      = (metric, "min"),
        )

        #append data together
        df_i3 = df_i3.merge(right = df_i3_minmax, on = "date", how = "left")

        #calculate normalized metric
        df_i3[f"{metric}_norm"] = (df_i3[metric] - df_i3["metric_min"]) / (df_i3["metric_max"] - df_i3["metric_min"])

        #clean up
        df_i3 = df_i3.drop(labels = ["metric_max", "metric_min"], axis = 1)

        return df_i3

    @staticmethod
    def set_weights(df_i3):
        """Add "weight_n" (0 at the lowest, 1 at the highest latitude) and "weight_s" = 1 - weight_n."""

        min_lat = float(df_i3["latitude"].min())
        max_lat = float(df_i3["latitude"].max())
        lat_range = max_lat - min_lat

        #scale by the latitude range (not by the maximum latitude) so the
        #weights really span 0 .. 1
        if lat_range == 0:
            #a single latitude: no north/south gradient to weight
            df_i3["weight_n"] = 0.0
        else:
            df_i3["weight_n"] = (df_i3["latitude"] - min_lat) / lat_range

        df_i3["weight_s"] = 1 - df_i3["weight_n"]

        return df_i3

    @staticmethod
    def set_p_values(df_i3, metric):
        """Sum the weighted normalised metric per date and add the south-minus-north delta."""

        #get p_value
        df_i3["p_north"] = df_i3[f"{metric}_norm"] * df_i3["weight_n"]
        df_i3["p_south"] = df_i3[f"{metric}_norm"] * df_i3["weight_s"]

        #get sum per date
        df_i3 = df_i3[["date", "p_north", "p_south"]].groupby(by = ["date"], as_index = False).sum()

        #get delta
        df_i3["p_delta"] = df_i3["p_south"] - df_i3["p_north"]

        return df_i3

In [None]:
df_i3 = PV_ind_3.main(df, metric = "t")
df_i3.head(10)

In [None]:
def plot_i3(df, df_i3, plot_param):
    """Plot the polar vortex per date with the index-3 delta in the title.

    One scatter figure (latitude vs. level, coloured by ``plot_param``) is
    shown for each of a fixed list of dates.

    Parameters
    ----------
    df : frame with "date", "latitude", "level", "size" and the
        ``plot_param`` column.
    df_i3 : frame with "date" and "p_delta" (result of ``PV_ind_3.main``).
    plot_param : "t" or "speed".

    Raises
    ------
    ValueError
        If ``plot_param`` is neither "t" nor "speed".
    """

    titles = {"t" : "temperature", "speed" : "wind speed"}
    if plot_param not in titles:
        raise ValueError(f"plot_param must be one of {list(titles)}, got {plot_param!r}")
    title = titles[plot_param]

    dates = [
        "2019-03-01", "2019-03-02", "2019-03-03", "2019-03-04",
    ]

    plot_scaler = 19

    for date in dates:

        #indicator: take the first matching row; a date without an index
        #value is labelled instead of aborting the whole loop
        matches = df_i3.loc[df_i3["date"] == date, "p_delta"]
        p_delta = round(float(matches.iloc[0]), 2) if len(matches) else "n/a"

        fig = px.scatter(
            data_frame = df.loc[df["date"] == date],
            x = "latitude",
            y = "level",
            color = plot_param,
            size = "size",
            size_max = 1 * plot_scaler - 3,
            opacity = 1,
            facet_col = "date",

            height = 20 * plot_scaler,
            width = 60 * plot_scaler,
            color_continuous_scale  = plt_style_s,

            #plotly renders line breaks in text as <br>, not as "\n"
            title = f"Polar vortex: {title}<br>P_delta: {p_delta}",

            labels = {"speed" : "speed [m/s]", "t" : "t [°k]"},
        )

        fig.update_traces(
            marker=dict(symbol="square",),
            selector=dict(mode='markers')
        )

        fig['layout']['yaxis']['autorange'] = "reversed"

        fig.show()

In [None]:
plot_i3(df = df, df_i3 = df_i3, plot_param = "t")

In [None]:
df_i3.head()

In [None]:
#index 3 delta over time (last 5000 rows)
fig = px.line(
    data_frame = df_i3.iloc[-5000:],
    x = "date",
    y = "p_delta",

    color_discrete_sequence = plt_style_s,
    title = "Index 3: Temperature",
)

scale_show(fig)

Custom index 4 (local max):
 - get the local max through all pressure levels

In [None]:
def pv_ind_4(df, metric, agg):
    """Return, per date, the latitude row(s) where the aggregated metric is extreme.

    Speed and temperature are aggregated with ``agg`` over all rows sharing
    a (date, latitude) pair. For "speed" the per-date maximum of the
    aggregate is looked up, for "t" the per-date minimum; every row matching
    that value is returned.

    metric = ["speed", "t"]
    agg = ["sum", "max", "mean", "median"]

    Returns a frame with "date" (converted to datetime), "latitude",
    "speed_agg" and "t_agg".
    """

    #aggregate per (date, latitude)
    per_lat = (
        df[["date", "latitude", "t", "speed"]]
        .groupby(by = ["date", "latitude"], as_index = False)
        .aggregate(
            speed_agg = ("speed", agg),
            t_agg =    ("t", agg),
        )
    )

    #per-date extreme of the chosen metric: largest speed or lowest temperature
    extreme_func = {"speed" : "max", "t" : "min"}
    metric_col = f"{metric}_agg"

    extremes = per_lat[["date", metric_col]]
    extremes = extremes.groupby(by = ["date"], as_index = False).aggregate(
        metric_max = (metric_col, extreme_func[metric]),
    )

    #convert both date keys (via str) to datetime before merging
    for frame in (per_lat, extremes):
        frame["date"] = pd.to_datetime(frame["date"].astype(str))

    per_lat = per_lat.merge(right = extremes, on = ["date"], how = "left")

    #keep the rows whose aggregate equals the per-date extreme
    hits = per_lat.loc[per_lat["metric_max"] == per_lat[metric_col]]

    return hits.drop(labels = "metric_max", axis = 1)


In [None]:
df_i4_s = pv_ind_4(df, "speed", "mean")
df_i4_s.head()

In [None]:
plot_i2(df, df_i4_s, "speed")

In [None]:
df_i4_t = pv_ind_4(df, "t", "mean")
df_i4_t.head()

In [None]:
plot_i2(df, df_i4_t, "t")

In [None]:
#label both index-4 variants so they can share one frame
df_i4_t["type"] = "temp"
df_i4_s["type"] = "speed"

#stack temperature and speed results and order them by date
df_i4 = pd.concat(objs = [df_i4_t, df_i4_s]).sort_values(by = "date", ascending = True)

df_i4.head()

In [None]:
#latitude found by index 4 over time, one line per type (last 2000 rows)
fig = px.line(
    data_frame  = df_i4.iloc[-2000:],
    x = "date",
    y = "latitude",
    title = "Polar vortex: Index 4",
    color_discrete_sequence = plt_style_s,
    color = "type",
)


scale_show(fig)

In [None]:
#temperature: index 2
#wind speed: index 4

Custom index 5 (SSW)

## 2.3 Index optimization and setting
Sudden stratospheric warming events: https://csl.noaa.gov/groups/csl8/sswcompendium/majorevents.html

### 2.3.1 PVT: Polar vortex temp (index 2)

In [None]:
def plot_i_breakdown(df, plot_param, break_down_date):
    """Plot the days around a breakdown event, one facet row per date.

    Parameters
    ----------
    df : frame already limited to the dates to show; needs "date",
        "latitude", "level", "size" and the ``plot_param`` column.
    plot_param : "t" or "speed".
    break_down_date : date of the event, only used in the figure title.

    Raises
    ------
    ValueError
        If ``plot_param`` is neither "t" nor "speed".
    """

    titles = {"t" : "temperature", "speed" : "wind speed"}
    if plot_param not in titles:
        raise ValueError(f"plot_param must be one of {list(titles)}, got {plot_param!r}")
    title = titles[plot_param]

    plot_scaler = 19

    #the figure height grows with the number of facet rows
    n_dates = len(df["date"].unique())

    fig = px.scatter(
        data_frame = df,
        x = "latitude",
        y = "level",
        color = plot_param,
        size = "size",
        size_max = 1 * plot_scaler - 1,
        opacity = 1,
        facet_row = "date",

        height = 15 * plot_scaler * n_dates,
        width = 60 * plot_scaler,
        color_continuous_scale  = plt_style_s,

        title = f"Polar vortex breakdown - {break_down_date}: {title}",

        labels = {"speed" : "speed [m/s]", "t" : "t [k]"},
    )

    fig.update_traces(
        marker=dict(symbol="square",),
        selector=dict(mode='markers')
    )

    fig['layout']['yaxis']['autorange'] = "reversed"

    fig.show()

In [None]:
del df_i2_t

In [None]:
#optimize temperature (i2)
#params
n_lat                   = 3 #number of neighbouring latitudes used for the delta calculation
threshold               = 1.0 #the per-date max delta must exceed mean delta * (1 + threshold) to count as a pv edge
break_down_offset       = 15 #rolling window length (rows) used to detect a break down
break_down_sensitivity  = 0.1 #level of the rolling pv_edge mean; dropping to or below it flags a break down

df_i2_t = PV_ind_2_v2.main(
    df = df,
    n_lat = n_lat,
    threshold = threshold,
    break_down_offset = break_down_offset,
    break_down_sensitivity = break_down_sensitivity,
    metric = "t",
)

df_i2_t.head(5)

In [None]:
#NOTE(review): n is not used by this cell - confirm whether a row limit was intended
n = -1000

#break down events of the temperature based index 2 over the full period
fig = px.line(
    data_frame = df_i2_t,
    x = "date",
    y = "pv_break_down_event",
    title = "PV break down events i2: Temperature",
    color_discrete_sequence = plt_style_s,
)

scale_show(fig)


In [None]:
#add month
df_i2_t["month"] = pd.DatetimeIndex(df_i2_t["date"]).month


#plot the number of break down events per month
fig = px.histogram(
    data_frame = df_i2_t,
    x = "month",
    y = "pv_break_down_event",
    histfunc = "sum",
    barmode = "stack",
    title = "PV break down events i2 per month: Temperature",
    color_discrete_sequence = plt_style_s,

    width = 700,
    height = 700,
)

fig.show()

In [None]:
#add month
df_i2_t["month"] = pd.DatetimeIndex(df_i2_t["date"]).month


#plot the number of detected pv edges per month
fig = px.histogram(
    data_frame = df_i2_t,
    x = "month",
    y = "pv_edge",
    histfunc = "sum",
    barmode = "stack",
    title = "PV edges i2: Temperature",
    color_discrete_sequence = plt_style_s,
)

scale_show(fig)

In [None]:
#plot breakdown events
df_i2_t.reset_index(inplace = True, drop = True)

plot_range = 5
offset = int(plot_range / 2) #rows shown before and after each event
last_pos = len(df_i2_t) - 1

#dates as strings, so they can be compared with the date column of df
#(presumably df["date"] holds strings as read from the csv - verify)
df_i2_t["date"] = df_i2_t["date"].astype(str)

#the last four break down events
event_positions = df_i2_t.loc[df_i2_t["pv_break_down_event"] == 1].index[-4:]

for i in event_positions:

    break_down_date = df_i2_t.iloc[i]["date"]
    print(f"break down event at row {i}: {break_down_date}")

    #clamp the window to the frame: a negative position would wrap around
    #to the end and a position past the last row would raise an IndexError
    start_date      = df_i2_t.iloc[max(i - offset, 0)]["date"]
    end_date        = df_i2_t.iloc[min(i + offset, last_pos)]["date"]

    df_plot = df.loc[(df["date"] >= start_date) & (df["date"] <= end_date)]

    for param in ["t", "speed"]:
        plot_i_breakdown(df = df_plot, plot_param = param, break_down_date = break_down_date)

In [None]:
#prepare data frame for merging
unneded_cols = ["speed_mean", "delta", "mean_delta", "month"]

for col in unneded_cols:

    #check explicitly instead of catching every exception, so that real
    #errors are not hidden behind the "does not exist" message
    if col in df_i2_t.columns:
        df_i2_t.drop(labels = col, axis = 1, inplace = True)
    else:
        print(f"{col} does not exist")

#add prefix to generate unique column names
df_i2_t = df_i2_t.add_prefix("pvt_") #polar vortex temperature

df_i2_t.head()

### 2.3.2 PVS: Polar vortex wind speed (index 4)

In [None]:
class PV_ind_4():
    """Polar vortex index 4: edge latitude, edge flag and break down events.

    All methods are static, the class only groups the processing steps.
    Use `PV_ind_4.main` to run the whole pipeline.
    """

    #aggregation that selects the edge latitude of a date for each metric
    _EXTREME_FUNC = {"speed" : "max", "t" : "min"}

    @staticmethod
    def main(df, metric, threshold, break_down_offset, break_down_sensitivity, considered_months = (12, 1, 2, 3, 4)):
        """Run the three steps local extreme -> threshold -> break down detection.

        Parameters
        ----------
        df : DataFrame with the columns "date", "latitude", "t" and "speed".
        metric : "speed" or "t".
        threshold : relative margin; an edge is flagged when the extreme latitude
            mean exceeds the overall mean of the date times (1 + threshold).
        break_down_offset : window (in rows) of the rolling mean of "pv_edge".
        break_down_sensitivity : level of the rolling mean; a break down event is
            flagged on the row where it falls from above the level to at or below it.
        considered_months : months in which a break down event may be flagged.

        Returns
        -------
        DataFrame with the columns "date", "latitude", f"{metric}_mean", "pv_edge"
        and "pv_break_down_event".

        Raises
        ------
        KeyError if metric is neither "speed" nor "t".
        """

        df_i4 = PV_ind_4.get_local_max(df, metric)
        df_i4 = PV_ind_4.apply_threshold(df_i4, df, threshold, metric)
        df_i4 = PV_ind_4.detect_breakdown(df_i4, break_down_offset, break_down_sensitivity, considered_months)

        return df_i4

    @staticmethod
    def get_local_max(df, metric):
        """Return, per date, the latitude whose mean metric is the extreme.

        The extreme is the maximum for "speed" and the minimum for "t". If several
        latitudes of one date share the extreme value all of them are returned, so
        a date can appear more than once. The returned "date" column is datetime.
        """

        #fail early on an unknown metric
        extreme = PV_ind_4._EXTREME_FUNC[metric]
        metric_col = f"{metric}_mean"

        #mean per date and latitude
        df_lat = df[["date", "latitude", "t", "speed"]].groupby(by = ["date", "latitude"], as_index = False).aggregate(
            speed_mean = ("speed", "mean"),
            t_mean =    ("t", "mean"),
        )

        #extreme latitude mean of every date, aligned to the rows of df_lat
        date_extreme = df_lat.groupby(by = "date")[metric_col].transform(extreme)

        #keep the matching rows and only the needed columns; the copy makes the
        #result independent of df_lat so that writing to it is safe
        df_i4 = df_lat.loc[df_lat[metric_col] == date_extreme, ["date", "latitude", metric_col]].copy()

        #normalise the dates (whatever type they arrived in) to datetime
        df_i4["date"] = pd.to_datetime(df_i4["date"].astype(str))

        return df_i4

    @staticmethod
    def apply_threshold(df_i4, df, threshold, metric):
        """Add the 0/1 column "pv_edge" to the result of get_local_max.

        A row is an edge when its f"{metric}_mean" is larger than the mean of
        metric over all rows of df for the same date multiplied by (1 + threshold).
        df_i4 must hold datetime dates; it is not modified, a new frame is returned.
        """

        #overall mean of the metric per date
        df_i4_mean = df[["date", metric]].groupby(by = "date", as_index = False).mean()

        #the merge key must have the same type on both sides
        df_i4_mean["date"] = df_i4_mean["date"].astype("datetime64[ns]")

        df_i4 = df_i4.merge(right = df_i4_mean, on = "date", how = "left")

        #threshold as multiplicator; the helper column is removed again right away
        limit = df_i4.pop(metric) * (threshold + 1)

        #rows without a limit (NaN) compare as False and are therefore no edge
        df_i4["pv_edge"] = (df_i4[f"{metric}_mean"] > limit).astype("int64")

        return df_i4

    @staticmethod
    def detect_breakdown(df_i4, break_down_offset, break_down_sensitivity, considered_months = (12, 1, 2, 3, 4)):
        """Add the 0/1 column "pv_break_down_event" to df_i4 and return df_i4.

        The rolling mean of "pv_edge" over break_down_offset rows is compared with
        break_down_sensitivity. An event is flagged on the row where the rolling
        mean is at or below the level while the previous row was above it, and
        only if the month of the row is in considered_months.
        The rolling window counts rows, so df_i4 is expected in date order.
        The column is added to the passed frame itself.
        """

        #share of edge rows within the window
        edge_share = df_i4["pv_edge"].rolling(break_down_offset).mean()

        #downward crossing of the sensitivity level
        crossed = (edge_share <= break_down_sensitivity) & (edge_share.shift(1) > break_down_sensitivity)

        #relevant months
        in_season = df_i4["date"].dt.month.isin(list(considered_months))

        df_i4["pv_break_down_event"] = (crossed & in_season).astype("int64")

        return df_i4


In [None]:
#relative margin: PV_ind_4 multiplies the daily overall mean by (1 + threshold)
threshold = 0.5

#last n span, used to detect a break down (window of the rolling mean of pv_edge)
break_down_offset = 60

#level of that rolling mean (a fraction, not a percentage); a breakdown is
#detected where the rolling mean crosses this level downwards
break_down_sensitivity = 0.875

df_i4_s = PV_ind_4.main(df, "speed", threshold, break_down_offset, break_down_sensitivity)

In [None]:
df_i4_s.head()

In [None]:
#not used by the plot below, the whole series is drawn
n = -1000

#break down flag of the wind speed index over time
fig = px.line(
    df_i4_s,
    x = "date",
    y = "pv_break_down_event",
    color_discrete_sequence = plt_style_s,
    title = "PV break down events i4: Wind speed",
)
scale_show(fig)

In [None]:
#add the calendar month of every row
df_i4_s["month"] = pd.DatetimeIndex(df_i4_s["date"]).month

#number of break down events per month (square figure, shown without scale_show)
fig = px.histogram(
    df_i4_s,
    x = "month",
    y = "pv_break_down_event",
    histfunc = "sum",
    barmode = "stack",
    color_discrete_sequence = plt_style_s,
    title = "PV break down events i4 per month: Wind speed",
    width = 700,
    height = 700,
)
fig.show()

In [None]:
#add month
df_i4_s["month"] = pd.DatetimeIndex(df_i4_s["date"]).month


#number of rows flagged as pv edge per month
fig = px.histogram(
    data_frame = df_i4_s,
    x = "month",
    y = "pv_edge",
    histfunc = "sum",
    barmode = "stack",
    #df_i4_s is the wind speed index; the title used to say "Temperature"
    title = "PV edges i4: Wind speed",
    color_discrete_sequence = plt_style_s,
)

scale_show(fig)

In [None]:
df_i4_s.info()

In [None]:
#plot the surroundings of the last four break down events
df_i4_s.reset_index(inplace = True, drop = True)
df_i4_s["date"] = df_i4_s["date"].astype(str)

plot_range = 5 #number of rows of df_i4_s spanned by one plot window
offset = int(plot_range / 2)
last_row = len(df_i4_s) - 1

for i in df_i4_s.loc[df_i4_s["pv_break_down_event"] == 1].index[-4:]:

    print(i)

    #use the dates of the current event; the fixed debugging dates
    #("2022-03-03" .. "2022-03-07") made every iteration plot the same period
    break_down_date = df_i4_s.iloc[i]["date"]

    #clamp the window to the frame: a negative position would wrap around to the
    #end of the frame and a position past the last row raises an IndexError
    start_date      = df_i4_s.iloc[max(i - offset, 0)]["date"]
    end_date        = df_i4_s.iloc[min(i + offset, last_row)]["date"]

    #the window is the same for every plotted parameter, so it is selected once
    df_plot = df.loc[(df["date"] >= start_date) & (df["date"] <= end_date)]

    for param in ["t", "speed"]:
        plot_i_breakdown(df = df_plot, plot_param = param, break_down_date = break_down_date)

In [None]:
#remove unneeded cols
#check for the column explicitly instead of hiding every possible error behind a bare except
if "month" in df_i4_s.columns:
    df_i4_s.drop(labels = "month", axis = 1, inplace = True)
else:
    print("month col does not exist")

#add prefix
df_i4_s = df_i4_s.add_prefix("pvs_")

#check
df_i4_s.head()

### 2.3.3 Merge and compare

In [None]:
#left join of the wind speed index onto the temperature index by date,
#then keep a single key column called "date"
df_pv = (
    df_i2_t
    .merge(df_i4_s, how = "left", left_on = "pvt_date", right_on = "pvs_date")
    .drop(columns = "pvs_date")
    .rename(columns = {"pvt_date" : "date"})
)

df_pv.head()

In [None]:
#plotting copy with a constant marker size column
df_pv_plot = df_pv.assign(size = 1)

#edge latitude of the wind speed index against the one of the temperature index
fig = px.scatter(
    df_pv_plot,
    x = "pvs_latitude",
    y = "pvt_latitude",
    size = "size",
    size_max = 17,
    opacity = 0.05,
    color_discrete_sequence = plt_style_s,
    title = "Temperature and wind speed comparison: latitude",
    width = 1000,
    height = 1000,
)

#square markers
fig.update_traces(marker = {"symbol" : "square"}, selector = {"mode" : "markers"})

fig.show()

In [None]:
#break down events of both indices over time
fig = px.line(
    data_frame = df_pv_plot,
    x = "date",
    y = ["pvs_pv_break_down_event", "pvt_pv_break_down_event"],

    title = "Break down events",
    color_discrete_sequence = plt_style_s,

    labels = {"value" : ""}
)

#legend names shown instead of the column names (spelling of "pvs breakdown" corrected)
newnames = {"pvs_pv_break_down_event" : "pvs breakdown", "pvt_pv_break_down_event" : "pvt breakdown"}
fig.for_each_trace(lambda t: t.update(name = newnames[t.name],
                                      legendgroup = newnames[t.name],
                                      hovertemplate = t.hovertemplate.replace(t.name, newnames[t.name])
                                     )
                  )

scale_show(fig)

In [None]:
#edge flags of both indices for the last 2000 rows
fig = px.line(
    df_pv_plot.iloc[-2000:],
    x = "date",
    y = ["pvs_pv_edge", "pvt_pv_edge"],
    labels = {"value" : ""},
    color_discrete_sequence = plt_style_s,
    title = "PV edges",
)

#show short names in legend and hover text instead of the column names
newnames = {"pvs_pv_edge" : "pvs edge", "pvt_pv_edge" : "pvt edge"}
for trace in fig.data:
    short_name = newnames[trace.name]
    trace.update(
        name = short_name,
        legendgroup = short_name,
        hovertemplate = trace.hovertemplate.replace(trace.name, short_name),
    )

scale_show(fig)

In [None]:
df_pv

In [None]:
#merge with main dataframe: load the feature engineered data the pv columns are added to
#NOTE(review): the save cell further down writes to this same df_fe.csv; after a save the file
#presumably already contains the pv columns - confirm before running the merge again
df_fe = pd.read_csv(os.path.join(data_folder, "df_fe.csv"))
df_fe.head(2)

In [None]:
print(f"df_fe: {df_fe.shape}\ndf_pv: {df_pv.shape}")

In [None]:
#inner join (pandas default) of the pv columns onto the main data by date
df_fe = df_fe.merge(df_pv, on = "date")


In [None]:
df_fe.head().T

In [None]:
#save main data frame with added pv data
if save_data is True:
    #write "date" as the first csv column (the index) without changing df_fe itself;
    #the check keeps the cell re-runnable in case "date" already is the index
    df_fe_out = df_fe.set_index("date", drop = True) if "date" in df_fe.columns else df_fe
    df_fe_out.to_csv(os.path.join(data_folder, "df_fe.csv"))

## 2.4 Categorizing data for ml models

In [None]:
#"date" stays exactly as read from the csv. The former
#`df["date"] = pd.DataFrame(df["date"])` assigned the column back to itself
#without converting it, so it is removed.
df = pd.read_csv(os.path.join(data_folder, "df_fe.csv"))
df.head().T

In [None]:
#box plot of t2m per day for the first three months found in the data
for month in df["month"].unique()[:3]:

    df_month = df[df["month"] == month]

    fig = px.box(
        df_month,
        x = "day",
        y = "t2m",
        color_discrete_sequence = plt_style_s,
        title = f"Temperature by month: {month}",
        height = size["height"] * 12,
        width = size["width"],
    )

    scale_show(fig)


In [None]:
#fortnight periods 1 and 2: mean t2m of the next 14 rows (t1) and of the 14 rows after those (t2)
t2m_fortnight = df["t2m"].rolling(14).mean()
df["t2m_t1"] = t2m_fortnight.shift(-14)
df["t2m_t2"] = t2m_fortnight.shift(-28)

#reference value per calendar day: mean of the targets over all rows of that month and day
df_median = df[["month", "day", "t2m_t1", "t2m_t2"]].groupby(by = ["month", "day"], as_index = False).agg(
    t2m_t1_mean = ("t2m_t1", "mean"),
    t2m_t2_mean = ("t2m_t2", "mean"),
)

#merge the reference values to main and restore the date order
df = df.merge(df_median, on = ["month", "day"]).sort_values(by = "date", ascending = True)

#category 1 if the target is at or above the reference value, else 0
#NOTE(review): rows without a target (NaN at the end of the series) also end up as 0
for period in ("t1", "t2"):
    df[f"t2m_{period}_cat"] = (df[f"t2m_{period}"] >= df[f"t2m_{period}_mean"]).astype("int64")

#see df
df.head(15).T

In [None]:
#number of rows per year, split by the t1 category
fig = px.histogram(
    df,
    x = "year",
    y = "t2m_t1_cat",
    color = "t2m_t1_cat",
    histfunc = "count",
    barmode = "stack",
    color_discrete_sequence = plt_style_s,
    title = "Target variable distribution",
)
scale_show(fig)

In [None]:
#copy the two target columns: a column is added below, and writing to a plain
#selection of df is a chained assignment (SettingWithCopyWarning)
df_target = df[["t2m_t1_cat", "t2m_t2_cat"]].copy()
df_target["same_cat"] = df_target["t2m_t1_cat"] == df_target["t2m_t2_cat"]

#how often both fortnights fall into the same category
fig = px.histogram(
    data_frame = df_target,
    x = "same_cat",
    color = "same_cat",
    histfunc = "count",
    title = "Same category in target vector (t1 = t2)",
    color_discrete_sequence = plt_style_s,

    width = 500,
    height = 500,
)

#reference line at half of all rows
fig.add_hline(
    y = len(df_target) / 2,
    line_width=3,
    line_dash="dash",
    line_color="grey",

    annotation_text = "q = 0.5",
    annotation_position="right",
    annotation_font_color = "black",
)

fig.update_layout(showlegend=False)

fig.show()

In [None]:
df_median

In [None]:
#reference value of the first fortnight per calendar day, coloured by month
fig = px.scatter(
    df_median,
    x = df_median.index,
    y = "t2m_t1_mean",
    color = "month",
    color_continuous_scale = plt_style_c,
    labels = {"t2m_t1_mean" : "t2m_mean [k]"},
    title = "Target: t2m_mean",
)
scale_show(fig)

In [None]:
if save_data is True:

    #write "date" as the first csv column (the index) without changing df itself:
    #the cells below still select df["date"]. The explicit column check replaces
    #the bare except, which would also have hidden unrelated errors.
    df_main_out = df.set_index(keys = "date") if "date" in df.columns else df

    df_main_out.to_csv(os.path.join(data_folder, "df_main.csv"))

In [None]:
#outlier analysis
df_median.iloc[59]

In [None]:
df.loc[(df["day"].isin([27,28,29])) & (df["month"] == 2)][["t2m", "sp", "cdir"]].T

In [None]:
#the outlier is negligible

## 2.5 t2m and pv_breakdown correlations

In [None]:
#columns needed for the comparison; copied because a later cell adds a column to
#df_comp, and writing to a plain selection of df is a chained assignment
df_comp = df[["date", "t2m", "pvs_latitude", "pvt_latitude", "pvt_pv_break_down_event", "pvs_pv_break_down_event", "t2m_t1_cat", "pvs_pv_edge", "pvt_pv_edge"]].copy()
df_comp.head()

In [None]:
# Correlation of the numeric columns only: pandas >= 2.0 no longer drops
# non-numeric columns silently in corr() but raises on them
df_corr = df_comp.select_dtypes(include = "number").corr().round(1)

# Mask to matrix: hide the upper triangle including the diagonal
mask = np.zeros_like(df_corr, dtype=bool)
mask[np.triu_indices_from(mask)] = True

# Viz: remove the rows and columns that are completely masked
# (axis has to be passed by keyword, it is keyword-only in pandas >= 2.0)
df_corr_viz = df_corr.mask(mask).dropna(how='all').dropna(axis='columns', how='all')

fig = px.imshow(

    df_corr_viz,
    text_auto=True,
    color_continuous_scale = plt_style_c,

    title = "Correlation matrix",
    width = 700,
    height = 700,
    )

fig.show()

del df_corr

In [None]:
#t2m against the edge latitude of the wind speed index, one facet per edge flag
fig = px.scatter(
    data_frame = df_comp,
    y = "t2m",
    x = "pvs_latitude",
    color = "pvs_pv_break_down_event",
    range_color = [0,1],
    opacity = 0.1,

    color_continuous_scale = plt_style_s,
    facet_col = "pvs_pv_edge",

    title = "t2m and pvs latitude correlation",
    labels = {"t2m" : "t2m [k]", "pvs_pv_break_down_event" : "breakdown"},
    trendline = "ols",
    trendline_color_override = "red",

)

#reference line at 273.15
fig.add_hline(
    y = 273.15,
    line_width=3,
    line_dash="dash",
    #CSS colour names contain no spaces ("dark blue" is not a valid name)
    line_color="darkblue",

    annotation_text = "273.15",
    annotation_position="bottom left",
    annotation_font_color = "black",
)

scale_show(fig)

In [None]:
#distribution of t2m for every edge latitude of the wind speed index
fig = px.box(
    df_comp,
    x = "pvs_latitude",
    y = "t2m",
    labels = {"t2m" : "t2m °k"},
    color_discrete_sequence = plt_style_s,
    title = "t2m and pvs latitude correlation",
)
scale_show(fig)

In [None]:
#t2m against the edge latitude of the temperature index, one facet per edge flag
fig = px.scatter(
    data_frame = df_comp,
    y = "t2m",
    x = "pvt_latitude",
    color = "pvt_pv_break_down_event",
    range_color = [0,1],
    opacity = 0.1,

    color_continuous_scale = plt_style_s,
    facet_col = "pvt_pv_edge",

    title = "t2m and pvt latitude correlation",
    labels = {"t2m" : "t2m [k]", "pvt_pv_break_down_event" : "breakdown"},
    trendline = "ols",
    trendline_color_override = "red",
)

#reference line at 273.15
fig.add_hline(
    y = 273.15,
    line_width=3,
    line_dash="dash",
    #CSS colour names contain no spaces ("dark blue" is not a valid name)
    line_color="darkblue",

    annotation_text = "273.15",
    annotation_position="bottom left",
    annotation_font_color = "black",
)


scale_show(fig)

In [None]:
#number of most recent rows to show
n = 7000

#dates of the pvs break down events within the shown rows
pvs_breakdown_list = df_comp.iloc[-n:].loc[df_comp["pvs_pv_break_down_event"] == 1, "date"].to_list()

fig = px.scatter(
    df_comp.iloc[-n:],
    x = "date",
    y = "t2m",
    color = "t2m_t1_cat",
    color_continuous_scale = plt_style_s,
    title = "PVS break down events",
)

#one vertical line per event
for date in pvs_breakdown_list:
    fig.add_vline(x = date)

scale_show(fig)

In [None]:
#number of most recent rows to show
n = 7000

#dates of the pvt break down events within the shown rows
pvt_breakdown_list = df_comp.iloc[-n:].loc[df_comp["pvt_pv_break_down_event"] == 1, "date"].to_list()

fig = px.scatter(
    df_comp.iloc[-n:],
    x = "date",
    y = "t2m",
    color = "t2m_t1_cat",
    color_continuous_scale = plt_style_s,
    title = "PVT break down events",
)

#one vertical line per event
for date in pvt_breakdown_list:
    fig.add_vline(x = date)

scale_show(fig)

In [None]:
#comparison with ssw data
#source: https://csl.noaa.gov/groups/csl8/sswcompendium/majorevents.html

#every event is given as the first day of a month
ssw_events = [
    "1958-01-01", "1958-11-01",
    "1960-01-01", "1963-01-01", "1965-03-01", "1965-12-01", "1966-02-01",
    "1968-01-01", "1968-11-01", "1969-03-01",
    "1970-01-01", "1971-01-01", "1971-03-01", "1973-01-01", "1977-01-01", "1979-02-01",
    "1980-02-01", "1981-02-01", "1981-03-01", "1981-12-01", "1984-02-01", "1985-01-01",
    "1987-01-01", "1987-12-01", "1988-03-01", "1989-02-01",
    "1998-12-01", "1999-02-01",
    "2000-03-01", "2001-02-01", "2001-12-01", "2002-02-01", "2003-01-01", "2004-01-01",
    "2006-01-01", "2007-02-01", "2008-02-01", "2009-01-01",
    "2010-02-01", "2010-03-01", "2013-01-01", "2018-02-01", "2019-01-01",
]

#flag the rows whose date is listed as ssw event
df_comp["ssw"] = df_comp["date"].isin(ssw_events).astype("int64")

In [None]:
#number of most recent rows to show
n = 5000

#the shown rows and the rolling mean of the category are the same for every plot.
#The former `pvs_breakdown_list = ...` line was removed: it was never used and
#overwrote the list of pvs events with the ssw dates.
df_comp_recent = df_comp.iloc[-n:]
t2m_cat_rolling = df_comp_recent["t2m_t1_cat"].rolling(60).mean()

for col in ["ssw", "pvt_pv_break_down_event", "pvs_pv_break_down_event"]:

    #dates of the events of the current source within the shown rows
    bd_list = df_comp_recent.loc[df_comp_recent[col] == 1, "date"].to_list()

    fig = px.scatter(
        data_frame = df_comp_recent,
        x = "date",
        y = t2m_cat_rolling,
        color = "t2m_t1_cat",
        color_continuous_scale =  plt_style_s,
        title = f"Break down event: {col}",

        labels = {"t2m" : "t2m [k]"}
    )

    #one vertical line per event
    for date in bd_list:
        fig.add_vline(x = date)

    scale_show(fig)

In [None]:
df_comp

In [None]:
#pvs break down flag and ssw flag over time
fig = px.line(
    df_comp,
    x = "date",
    y = ["pvs_pv_break_down_event", "ssw"],
    color_discrete_sequence = plt_style_s,
    title = "Polar vortex break down event (pvs) and SSW comparison",
)
scale_show(fig)

In [None]:
#break down flags of both indices and ssw flag over time
fig = px.line(
    df_comp,
    x = "date",
    y = ["pvt_pv_break_down_event","pvs_pv_break_down_event" ,"ssw"],
    labels = {"value" : "event", "pvt_pv_break_down_event" : "pvt","pvs_pv_break_down_event" : "pvs"},
    color_discrete_sequence = plt_style_s,
    title = "Polar vortex break down event (pvt, pvs) and SSW comparison",
)
scale_show(fig)

In [None]:
df_comp[["pvt_pv_break_down_event", "pvs_pv_break_down_event", "ssw"]].sum()

In [None]:
#del df_comp