# 0. Data gathering
All data will be aggregated to daily resolution for further aggregation and feature engineering.

In [None]:
#imports
import pandas as pd
import os
import numpy as np
import plotly.graph_objects as go
import plotly.express as px
from datetime import datetime

#folders
#root folder that holds the raw_* sub folders and the compiled df_*.csv files
data_folder = "data"
#alternative location on a local drive
#data_folder = os.path.join("D:","bthe_downloads")

In [None]:
#plot styles: plotly colour sequences handed to the figures as color_discrete_sequence
plt_style_c = px.colors.sequential.haline #complex
plt_style_s = px.colors.diverging.Portland #simple

#default plot size, unpacked into plotly express calls as **size
size = {
    "width" : 1500 ,
    "height" : 750 ,
}

#function for plotting
def scale_show(fig, width = 1500, height = 750):
    """Apply the notebook's default fonts and size to a figure and display it.

    Parameters
    ----------
    fig :
        Object offering ``update_layout(**kwargs)`` and ``show()``,
        e.g. a plotly figure.
    width, height : int
        Figure size in pixels. The defaults equal the former hard-coded
        values, so existing calls ``scale_show(fig)`` behave as before.

    Returns
    -------
    None
    """

    #fonts and size are set in a single layout update
    fig.update_layout(
        font = dict(size=16),
        title_font = dict(size=20),
        xaxis_title_font = dict(size=18),
        yaxis_title_font = dict(size=18),
        width = width,
        height = height,
    )

    #show
    fig.show()

In [None]:
#only used for the initial data download and parsing

#optional dependencies: a missing package must not stop the notebook, because
#the libraries are only used when the era5 data is downloaded and compiled anew.
#Only ImportError is caught so that any other failure stays visible.
try:
    from unittest import result
    import cdsapi #additional file needed to run. See docu
    import requests
    import xarray as xr
except ImportError:
    print("libs import failed. Not needed, unless th era5 data is to be downloaded anew and recompile the .nc to .csv")

In [None]:
#general parsing and aggregating of files
#when True, the cleaned MJO / AO / NAO frames are written to data_folder as csv
save_data                   = False

#*_run_era5_download starts the Copernicus download of a chapter,
#*_compile_df compiles the downloaded files of a chapter into one csv

#chapter 0.1
t2_run_era5_download        = False
t2m_compile_df              = False

#chapter 0.2
soi_run_era5_download       = False
soi_compile_df              = False

#chapter 0.6
pv_run_era5_download        = False
pv_compile_df               = False

In [None]:
import warnings
#suppresses every warning category for the rest of the session
#NOTE(review): this also hides pandas deprecation and SettingWithCopy warnings - consider a narrower filter
warnings.filterwarnings('ignore')

In [None]:
#custom and dynamic aggregation function

def dynamic_aggregation(df, grouping_col):
    """Summarise every column of ``df`` per value of ``grouping_col``.

    For each group the mean and the standard deviation of the remaining
    columns are computed. The result is indexed by ``grouping_col``, holds
    the means as ``<column>_mean`` followed by the deviations as
    ``<column>_std`` and is rounded to two decimals.
    """

    grouped = df.groupby([grouping_col], as_index = True)

    #both sides carry the same column names, so every column receives its suffix
    summary = grouped.mean().join(other = grouped.std(), lsuffix = "_mean", rsuffix = "_std")

    return summary.round(2)

## 0.1 General weather and temperature data (t2m)
source: https://cds.climate.copernicus.eu/cdsapp#!/dataset/reanalysis-era5-single-levels?tab=overview

In [None]:
#bounding box of the t2m area in degrees
north, south = 47.8, 45.8
west, east = 6.0, 10.5

#closed outline: starts and ends in the south-west corner
lons = [west, west, east, east, west]
lats = [south, north, north, south, south]

#draw the outline on a map
fig = go.Figure(
    go.Scattermapbox(lon = lons, lat = lats, mode = "markers+lines", marker = dict(size = 10))
)

#adjust view
fig.update_layout(
    margin = dict(l = 0, t = 0, b = 0, r = 0),
    mapbox = dict(
        center = dict(lon = 8.4, lat = 46.85),
        style = "carto-positron",
        zoom = 5,
    ),
)

scale_show(fig)

In [None]:
#data source: https://cds.climate.copernicus.eu/cdsapp#!/home

class Wrapper():
    """Download the ERA5 single-level data of the t2m area and convert it to csv.

    The class is used as a namespace: all methods are static and are called as
    ``Wrapper.<method>(...)``. The target folder is shared through the class
    attribute ``folder_name``, which ``main`` sets before anything else runs.
    """

    #class variables
    folder_name : str = None #folder of the raw .nc / .csv files, set by main()

    #functionality
    @staticmethod
    def main(all_vars : bool, data_folder):
        """Download one .nc file per year and convert every .nc file to csv.

        all_vars    : True requests all variables of generate_var_list, False only 2m_temperature
        data_folder : root data folder, the files are stored in its sub folder "raw_t2m"
        """

        Wrapper.folder_name = os.path.join(data_folder,"raw_t2m")

        #the download target lies inside this folder, so it has to exist before the first request
        os.makedirs(Wrapper.folder_name, exist_ok = True)

        start_year : int        = 1979
        end_year : int          = 2023 #exclusive, see generate_year_list

        years : list            = Wrapper.generate_year_list(start = start_year, end = end_year)
        variables : list        = Wrapper.generate_var_list(all = all_vars)

        #main loop for downloading data
        Wrapper.log(f"Downloading startet for range: {start_year} - {end_year}")

        for year in years:

            print(f"Processing {year}")
            Wrapper.request(year, variables, all_vars)

        #transforms and saves data as a csv for later processing in pandas
        Wrapper.generate_df()

    @staticmethod
    def generate_year_list(start:int, end:int):
        """Return the years from ``start`` up to, but not including, ``end`` as strings."""

        return [str(year) for year in range(start,end)]

    @staticmethod
    def generate_var_list( all : bool):
        """Return the ERA5 variable names to request (five variables or only 2m_temperature)."""

        if all:
            return [
                '10m_u_component_of_wind',
                '10m_v_component_of_wind',
                '2m_temperature',
                'clear_sky_direct_solar_radiation_at_surface',
                'surface_pressure',]

        return ['2m_temperature']

    @staticmethod
    def generate_df():
        """Convert every .nc file in ``folder_name`` into a csv file of the same name."""

        downloads = Wrapper.folder_name
        nc_files = [name for name in os.listdir(downloads) if name.endswith(".nc")]

        for nc_file in nc_files:

            #the context manager closes the file handle once the frame is built
            with xr.open_dataset(os.path.join(downloads, nc_file)) as ds:
                df = ds.to_dataframe()

            #save df next to the .nc file
            df.to_csv(os.path.join(downloads, f"{nc_file[:-3]}.csv"))

    @staticmethod
    def download_data(result:str, year:str, all_vars:bool):
        """[deprecated] Extract the download link from an api response and store the file."""

        #generate download and saving path
        path : str          = Wrapper.download_path()
        file_name : str     = f"t2m_{year}_allvars_{all_vars}.nc" #netCDF file
        file_path :str      = os.path.join(path,file_name)
        print(file_path)

        #get download link
        try:
            link_start : int    = result.index("location=") + len ("location=")
        except (ValueError, AttributeError):
            Wrapper.log(f"{year}: The api response does not contain a  download link")
            return
        url : str = result[link_start:-1]

        #retrieve data from web page and save it
        try:
            response = requests.get(url)
        except requests.exceptions.RequestException:
            Wrapper.log(f"{year}: The download url is not valid")
            return

        with open(file_path, "wb") as file_object:
            file_object.write(response.content)

    @staticmethod
    def download_path():
        """Create ``folder_name`` if needed and return its path below the script folder."""

        folder_name = Wrapper.folder_name
        os.makedirs(folder_name, exist_ok = True)

        #__file__ does not exist inside a notebook kernel, fall back to the working directory there
        try:
            base_dir = os.path.dirname(os.path.abspath(__file__))
        except NameError:
            base_dir = os.getcwd()

        return os.path.join(base_dir, folder_name)

    @staticmethod
    def file_path(year, all_vars):
        """Return (and print) the .nc target path of one year inside ``folder_name``."""

        file_name : str     = f"t2m_{year}_allvars_{all_vars}.nc" #netCDF file
        file_path : str     = os.path.join(Wrapper.folder_name,file_name)

        print(file_path)
        return file_path

    @staticmethod
    def log(message : str):
        """Append ``message`` with a timestamp to era5_log.txt in the working directory."""

        #create log entry
        log_time : datetime = datetime.now()
        message = f"{log_time},{message}\n"

        #write log entry, the context manager closes the file even if the write fails
        with open('era5_log.txt', 'a') as file_object:
            file_object.write(message)

    @staticmethod
    def request(year:str, variable:list, all_vars:bool):
        """Request all hourly values of one year from the CDS api and return the result as string.

        year     : year to download, as produced by generate_year_list
        variable : ERA5 variable names
        all_vars : only used for the name of the target file
        """
        # see: https://www.latlong.net/

        c = cdsapi.Client()
        file_path = Wrapper.file_path(year = year, all_vars = all_vars)

        request = c.retrieve(
            'reanalysis-era5-single-levels',
            {
                'product_type': 'reanalysis',
                'variable': variable,
                'year': year,
                #every month, day and hour; the values are zero padded strings
                'month': [f"{month:02d}" for month in range(1, 13)],
                'day': [f"{day:02d}" for day in range(1, 32)],
                'time': [f"{hour:02d}:00" for hour in range(24)],
                #north, west, south, east
                'area': [47.8, 6, 45.8,10.5,],
                'format': 'netcdf',
            },
            file_path
        )

        return str(request)

#the download only runs when the flag is explicitly set to True
if t2_run_era5_download is True:
    Wrapper.main(all_vars = True, data_folder = data_folder)

In [None]:
#aggregates the hourly t2m data to a daily interval

class Agg_t2m():
    """Compile the raw t2m csv files into one csv file with daily means."""

    data = None             #root data folder, set by main()
    t2m_data :str = None    #folder holding the raw t2m csv files, set by main()

    def main(data_folder):
        """Read every csv in <data_folder>/raw_t2m, aggregate it to daily means and save df_t2m.csv."""

        Agg_t2m.data = data_folder
        Agg_t2m.t2m_data = os.path.join(data_folder,"raw_t2m")

        csv_files : list = [name for name in os.listdir(Agg_t2m.t2m_data) if name.endswith(".csv")]

        dfs : list = []
        for csv in csv_files:
            print(f"Processing: {csv}", end = "\r")
            dfs.append(Agg_t2m.lonlat_mean(csv = csv))

        daily = Agg_t2m.aggregate(Agg_t2m.merge(dfs))
        Agg_t2m.save(daily, data_folder)

    def lonlat_mean(csv : str):
        """Return the mean over all grid points per time step of one raw csv file."""

        hourly = pd.read_csv(os.path.join(Agg_t2m.t2m_data, csv))
        hourly = hourly.drop(columns = ["longitude","latitude"])

        return hourly.groupby("time").mean()

    def aggregate(df):
        """Return the mean per calendar date of a frame indexed by "time" (modifies ``df`` in place)."""

        #turn the time index into a column and derive the calendar date from it
        df.reset_index(inplace = True)
        df["time"] = pd.to_datetime(df["time"])
        df["date"] = df["time"].dt.date

        #the date becomes the index, the time stamp is no longer needed
        df.set_index("date", inplace = True)
        del df["time"]

        return df.groupby(["date"], as_index = True).mean()

    def merge(dfs):
        """Stack the frames and return them sorted by index."""

        return pd.concat(dfs).sort_index()

    def save(df, data_folder):
        """Write ``df`` to <data_folder>/df_t2m.csv."""

        target = os.path.join(data_folder, "df_t2m.csv")
        df.to_csv(target)

#the compilation only runs when the flag is explicitly set to True
if t2m_compile_df is True:
    Agg_t2m.main(data_folder)

In [None]:
#load the compiled daily t2m file from data_folder, the "date" column becomes the index
df_t2m = pd.read_csv(os.path.join(data_folder, "df_t2m.csv"), index_col = "date")
df_t2m.head()

In [None]:
#last rows of the t2m frame
df_t2m.tail()

## 0.2 Southern Oscillation Index (SOI)
source:
- http://www.bom.gov.au/climate/mjo/
- https://www.climate.gov/news-features/understanding-climate/climate-variability-southern-oscillation-index

formula:
- http://www.bom.gov.au/climate/glossary/soi.shtml
- https://www.ncei.noaa.gov/access/monitoring/enso/soi


The Southern Oscillation Index (SOI) is calculated using the atmospheric pressure difference between Tahiti and Darwin, Australia. The most common approach is to use monthly mean sea level pressure values for these two locations, which are then standardized and combined to create the SOI.

- Tahiti: 17.5°S / 149.5°W
- Darwin: 12.5°S / 131.5°E

or

- Tahiti (https://www.latlong.net/place/papeete-french-polynesia-30701.html):
    - lat = -17.53
    - lon = -149.56
- Darwin (https://www.latlong.net/place/darwin-northern-territory-australia-5517.html):
    - lat = -12.46
    - lon = 130.84

Here is a source for more information on the calculation of the SOI:
https://www.cpc.ncep.noaa.gov/products/precip/CWlink/daily_ao_index/soi.html


In [None]:
#coordinates of both SOI stations, "size" only controls the marker size
lonlat_data = dict(
    place = ["tahiti", "darwin"],
    lat = [-17.53, -12.46],
    lon = [-149.56, 130.84],
    size = [1, 1],
)

df = pd.DataFrame(lonlat_data)

#one marker per station
fig = px.scatter_mapbox(
    df,
    lat = "lat",
    lon = "lon",
    hover_name = "place",
    size = "size",
    size_max = 15,
    color_continuous_scale = plt_style_s,
)

#adjust view
fig.update_layout(
    margin = dict(l = 0, t = 0, b = 0, r = 0),
    mapbox = dict(
        center = dict(lon = 178, lat = -18),
        style = "carto-positron",
        zoom = 2,
    ),
)

scale_show(fig)

In [None]:
#show the coordinate table of both stations
#lat = north / south
#lon = east / west
df

In [None]:
#fetch the sea surface preassure data from copernicus

class Wrapper():
    """Download the ERA5 mean sea level pressure at Tahiti and Darwin and convert it to csv.

    The class is used as a namespace: all methods are static and are called as
    ``Wrapper.<method>(...)``. The target folder is shared through the class
    attribute ``folder_name``, which ``main`` sets before anything else runs.
    """

    #class variables
    folder_name : str = None #folder of the raw .nc / .csv files, set by main()

    #functionality
    @staticmethod
    def main(data_folder):
        """Download one .nc file per year and city and convert every .nc file to csv.

        data_folder : root data folder, the files are stored in its sub folder "raw_soi"
        """

        Wrapper.folder_name = os.path.join(data_folder,"raw_soi")

        #the download target lies inside this folder, so it has to exist before the first request
        os.makedirs(Wrapper.folder_name, exist_ok = True)

        start_year : int        = 1979
        end_year : int          = 2023 #exclusive, see generate_year_list

        years : list            = Wrapper.generate_year_list(start = start_year, end = end_year)
        variables : list        = Wrapper.generate_var_list()

        #request area per city, handed to the api unchanged
        cities = {
            "tahiti" : [-17.50, -149.60, -17.50, -149.55,],
            "darwin" : [-12.45, 130.80, -12.50, 130.85,],
            }

        #main loop for downloading data
        Wrapper.log(f"Downloading startet for range: {start_year} - {end_year}")

        for year in years:
            for city, area in cities.items():
                print(f"Processing {year}, {city}")
                Wrapper.request(year = year, variable = variables, city = city , area = area)

        #transforms and saves data as a csv for later processing in pandas
        Wrapper.generate_df()

    @staticmethod
    def generate_year_list(start:int, end:int):
        """Return the years from ``start`` up to, but not including, ``end`` as strings."""

        return [str(year) for year in range(start,end)]

    @staticmethod
    def generate_var_list():
        """Return the ERA5 variable names to request."""

        return ["mean_sea_level_pressure"]

    @staticmethod
    def generate_df():
        """Convert every .nc file in ``folder_name`` into a csv file of the same name."""

        downloads = Wrapper.folder_name
        nc_files = [name for name in os.listdir(downloads) if name.endswith(".nc")]

        for nc_file in nc_files:

            #the context manager closes the file handle once the frame is built
            with xr.open_dataset(os.path.join(downloads, nc_file)) as ds:
                df = ds.to_dataframe()

            #save df next to the .nc file
            df.to_csv(os.path.join(downloads, f"{nc_file[:-3]}.csv"))

    @staticmethod
    def download_path():
        """Create ``folder_name`` if needed and return its path below the script folder."""

        folder_name = Wrapper.folder_name
        os.makedirs(folder_name, exist_ok = True)

        #__file__ does not exist inside a notebook kernel, fall back to the working directory there
        try:
            base_dir = os.path.dirname(os.path.abspath(__file__))
        except NameError:
            base_dir = os.getcwd()

        return os.path.join(base_dir, folder_name)

    @staticmethod
    def file_path(year, city):
        """Return the .nc target path of one year and city inside ``folder_name``."""

        file_name : str     = f"soi_{year}_{city}.nc" #netCDF file
        return os.path.join(Wrapper.folder_name,file_name)

    @staticmethod
    def log(message : str):
        """Append ``message`` with a timestamp to era5_log.txt in the working directory."""

        #create log entry
        log_time : datetime = datetime.now()
        message = f"{log_time},{message}\n"

        #write log entry, the context manager closes the file even if the write fails
        with open('era5_log.txt', 'a') as file_object:
            file_object.write(message)

    @staticmethod
    def request(year:str, variable:list, area : list, city : str):
        """Request all hourly values of one year and city from the CDS api and return the result as string.

        year     : year to download, as produced by generate_year_list
        variable : ERA5 variable names
        area     : request area of the city, passed to the api unchanged
        city     : only used for the name of the target file
        """
        # see: https://www.latlong.net/

        c = cdsapi.Client()
        file_path = Wrapper.file_path(year = year, city = city)

        request = c.retrieve(
            'reanalysis-era5-single-levels',
            {
                'product_type': 'reanalysis',
                'format': 'netcdf',
                'variable': variable,
                'area': area,
                #every hour, day and month; the values are zero padded strings
                'time': [f"{hour:02d}:00" for hour in range(24)],
                'day': [f"{day:02d}" for day in range(1, 32)],
                'month': [f"{month:02d}" for month in range(1, 13)],
                'year': year,
            },
            file_path
        )

        return str(request)

#the download only runs when the flag is explicitly set to True
if soi_run_era5_download is True:
    Wrapper.main(data_folder = data_folder)

In [None]:
class Compile_soi():
    """Compile the raw pressure files of Tahiti and Darwin into one daily SOI csv file.

    The class is used as a namespace: all methods are static and are called as
    ``Compile_soi.<method>(...)``.
    """

    data = None             #root data folder, set by main()
    t2m_data :str = None    #folder holding the raw soi csv files, set by main() (attribute name kept for compatibility)

    @staticmethod
    def main(data_folder):
        """Read the raw csv files in <data_folder>/raw_soi, calculate the daily SOI and save df_soi.csv."""

        Compile_soi.data = data_folder
        Compile_soi.t2m_data = os.path.join(data_folder,"raw_soi")

        files : list        = os.listdir(Compile_soi.t2m_data)

        #os.listdir returns the names in arbitrary order; both lists are sorted
        #so that zip() pairs the files by their position in the same name order
        csv_files_tahiti : list    = sorted(file for file in files if file.endswith(".csv") and "tahiti" in file)
        csv_files_darwin : list    = sorted(file for file in files if file.endswith(".csv") and "darwin" in file)

        if len(csv_files_tahiti) != len(csv_files_darwin):
            print(f"warning: {len(csv_files_tahiti)} tahiti files but {len(csv_files_darwin)} darwin files, surplus files are ignored")

        dfs : list          = []

        for csv_tahiti, csv_darwin in zip(csv_files_tahiti, csv_files_darwin):
            print(f"processing:{csv_tahiti}, {csv_darwin}")

            #remove lon lat dependency
            df_tahiti = Compile_soi.lonlat_mean(csv = csv_tahiti)
            df_darwin = Compile_soi.lonlat_mean(csv = csv_darwin)

            #combine the dataframes, this also aggregates the data to a daily interval
            dfs.append(Compile_soi.combine(df_tahiti = df_tahiti, df_darwin = df_darwin))

        df = Compile_soi.merge(dfs)
        df = Compile_soi.calculate_soi(df)
        Compile_soi.save(df, Compile_soi.data)

    @staticmethod
    def lonlat_mean(csv : str):
        """Return the mean over all grid points per time step of one raw csv file."""

        df = pd.read_csv(os.path.join(Compile_soi.t2m_data, csv))
        df = df.drop(labels = ["longitude","latitude"], axis = 1)

        return df.groupby("time").mean()

    @staticmethod
    def combine(df_tahiti, df_darwin):
        """Join both stations on "time" and return the daily means.

        Columns present in both frames get the suffix of their station, so
        "msl" becomes "msl_tahiti" and "msl_darwin".
        """

        #the left frame is Tahiti, therefore the left suffix has to be "_tahiti"
        df = df_tahiti.join(other = df_darwin, lsuffix = "_tahiti", rsuffix = "_darwin", on = "time")

        return Compile_soi.aggregate(df)

    @staticmethod
    def aggregate(df):
        """Return the mean per calendar date of a frame indexed by "time" (modifies ``df`` in place)."""

        #generate group index as date
        df.reset_index(drop = False, inplace = True)
        df["time"] = pd.to_datetime(df["time"])
        df["date"] = df["time"].dt.date

        #set date as index
        df.set_index("date", drop = True, inplace = True)

        #drop time column
        df.drop(labels = ["time"], inplace = True, axis = 1)

        #aggregate data
        return df.groupby(["date"], as_index = True).mean()

    @staticmethod
    def merge(dfs):
        """Stack the frames and return them sorted by index."""

        df = pd.concat(dfs)
        df.sort_index(inplace=True)

        return df

    @staticmethod
    def calculate_soi(df):
        """Return a frame with the single column "soi", indexed by "date".

        source of formula: http://www.bom.gov.au/climate/glossary/soi.shtml

        soi = (p_diff - p_diff_mean) / p_diff_std with p_diff = Tahiti - Darwin.
        Mean and standard deviation are taken over all values of ``df`` that
        share the same day and month. The factor 10 of the linked formula is
        not applied. ``df`` needs the columns "msl_tahiti" and "msl_darwin"
        and a date index named "date"; it is modified in place.
        """

        #key "dd-mm" that is shared by the same calendar day of all years
        df["day_index"] = pd.to_datetime(df.index, errors='coerce')
        df["day_index"] = df["day_index"].dt.strftime('%d-%m')

        #get pressure diff
        df["p_diff"] = df["msl_tahiti"] - df["msl_darwin"]

        #get long term values
        df_long_term = df.groupby(["day_index"], as_index = False).agg(
            p_diff_mean     = ("p_diff", "mean"),
            p_diff_std      = ("p_diff", "std"),
        )

        #placeholder of the original implementation (marked "delete this later"):
        #fills every missing long term value with 20
        df_long_term.fillna(20, inplace = True)

        #join data
        df["day_index"] = df["day_index"].astype("string")
        df_long_term["day_index"] = df_long_term["day_index"].astype("string")
        df.reset_index(inplace = True)
        df = pd.merge(df, df_long_term, on = "day_index", how = "left")

        #calculate soi; no sign flip is needed because the station columns are labelled correctly
        df["soi"] = (df["p_diff"] - df["p_diff_mean"]) / df["p_diff_std"]

        #clean up
        df.drop(labels = ["day_index", "p_diff", "msl_tahiti", "msl_darwin", "p_diff_mean", "p_diff_std"], axis = 1, inplace = True)

        #set index
        df.set_index("date", inplace = True, drop = True)

        return df

    @staticmethod
    def save(df, data_folder):
        """Write ``df`` to <data_folder>/df_soi.csv."""

        df.to_csv(os.path.join(data_folder, "df_soi.csv"))

#the compilation only runs when the flag is explicitly set to True
if soi_compile_df is True:
    Compile_soi.main(data_folder = data_folder)


In [None]:
#load the compiled daily soi file from data_folder, "date" stays a regular column
df_soi = pd.read_csv(os.path.join(data_folder, "df_soi.csv"))
df_soi.head(5)

In [None]:
#derive year and month from the date strings
#(complicated, but the normal way throws an unsolvable error)
df_soi["year"] = pd.DatetimeIndex(df_soi["date"]).year
df_soi["month"] = pd.DatetimeIndex(df_soi["date"]).month

#get the monthly mean of the soi
#only the "soi" column is averaged: the "date" strings cannot be averaged and make
#a frame-wide mean() raise a TypeError on pandas >= 2.0
df_soi = df_soi.groupby(["year", "month"], as_index = False)["soi"].mean()

#every month is represented by its first day
df_soi["day"] = "01"
df_soi["index"] = pd.to_datetime(df_soi[["year", "month", "day"]])

df_soi.shape

In [None]:
#compare to monthly values of soi by trusted source
#source: https://www.cpc.ncep.noaa.gov/data/indices/soi

#read compare file
#the first column is skipped below and "year" is read by name; all other columns are treated as months
df_enso_raw = pd.read_csv(os.path.join(data_folder, "raw_enso", "raw_enso.csv"))


#new columns
enso_dict : dict = {
    "index" : [],
    "year" : [],
    "month" : [],
    "enso" : [],
}

#reshape from one row per year to one row per year and month
#the values are read once as an array instead of filtering the frame for every single cell
month_cols : list = df_enso_raw.columns.to_list()[1:]
enso_values = df_enso_raw[month_cols].to_numpy(dtype = float)

for year, enso_row in zip(df_enso_raw["year"].to_list(), enso_values):
    for month, enso in zip(month_cols, enso_row):

        enso_dict["index"].append(f"{year}-{month}")
        enso_dict["year"].append(int(year))
        enso_dict["month"].append(int(month))
        enso_dict["enso"].append(float(enso))

#create new df
df_enso = pd.DataFrame(data = enso_dict)
df_enso.set_index("index", drop = True, inplace = True)

#every month is represented by its first day
df_enso["day"] = "01"
df_enso["index"] = pd.to_datetime(df_enso[["year", "month", "day"]])

df_enso.shape


In [None]:
#comparison frame: calculated and published soi side by side
#both value columns are attached by position, so both sources need the same number of rows
df_soi_comp = pd.DataFrame({
    "index" : df_soi["index"],
    "soi_calculated" : df_soi["soi"].tolist(),
    "soi_source" : df_enso["enso"].tolist(),
})

In [None]:
#visual check of the calculated soi against the published values
#finding: they seem to be scaled differently, but the shape is the same
fig = px.line(
    df_soi_comp,
    x = "index",
    y = ["soi_calculated", "soi_source"],
    color_discrete_sequence = plt_style_s,
    title = "SOI comparison",
)

scale_show(fig)

In [None]:

#ratio of the absolute values of both series per row, and its mean as scale factor
df_soi_comp["sacling"] = df_soi_comp["soi_source"].abs().div(df_soi_comp["soi_calculated"].abs())
scale_factor : float = df_soi_comp["sacling"].mean()

#bring the published series onto the scale of the calculated one
df_soi_comp["soi_source"] = df_soi_comp["soi_source"].div(scale_factor)

In [None]:
#same comparison after rescaling the published series
fig = px.line(
    df_soi_comp,
    x = "index",
    y = ["soi_calculated", "soi_source"],
    color_discrete_sequence = plt_style_s,
    title = "SOI comparison scaled",
)

scale_show(fig)


## 0.3 Madden–Julian oscillation (MJO)
- Source: http://www.bom.gov.au/climate/mjo/
- Note:
    - The data was downloaded and prepared manually as a .csv file (reformating).
    - The data points from the year 1978 are missing. A time series with a gap does not make sense, and the ENSO data also starts in 1979. Therefore all data from years before 1979 is dropped.
    - The aggregation will be done in two ways:
        - Values of the last day of month
        - Aggregation:
            - RMM1, RMM2, amplitude: std, mean
            - phase: mode
- Range: 1979 - 2023

In [None]:
#load the manually prepared mjo file from data_folder/raw_mjo
df_mjo_raw = pd.read_csv(os.path.join(data_folder,"raw_mjo","raw_mjo.csv"))
df_mjo_raw.head()

In [None]:
#understanding the mjo data: RMM1 against RMM2 for the rows 310 to 390, coloured by month
#shown with fig.show() because the plot is square and smaller than the default size
fig = px.line(
    df_mjo_raw.loc[310:390],
    x = "RMM1",
    y = "RMM2",
    color = "month",
    range_x = (3,-3),
    range_y = (3,-3),
    width = 500,
    height = 500,
    color_discrete_sequence = plt_style_s,
    title = "MJO",
)

fig.show()

In [None]:
#amplitude of the raw mjo data over the row number
fig = px.line(
    df_mjo_raw,
    y = "amplitude",
    color_discrete_sequence = plt_style_s,
    title = "MJO",
    **size,
)

scale_show(fig)

In [None]:
#drop unneeded cols
df_mjo_raw.drop(axis = 1, labels = ["Unnamed: 0","unnamed", "MissingValue=1.E36or999"], inplace = True)

#keep the years from 1979 on
#the explicit copy makes df_mjo independent of df_mjo_raw, so the in-place
#changes of the following cells do not operate on a slice of the raw frame
df_mjo = df_mjo_raw.loc[df_mjo_raw["year"] >= 1979].copy()
df_mjo.head()

In [None]:
#amplitude of the filtered mjo data
fig = px.line(
    df_mjo,
    y = "amplitude",
    color_discrete_sequence = plt_style_s,
    title = "MJO",
    **size,
)

scale_show(fig)

In [None]:
#summary statistics of the mjo columns
df_mjo.describe()

In [None]:
#number of missing values per column
df_mjo.isna().sum()

In [None]:
#clean up and prep data: build the date from its parts
df_mjo = df_mjo.reset_index(drop = True)
df_mjo["date"] = pd.to_datetime(df_mjo[["year", "month", "day"]])

#the date becomes the index, every remaining column gets the prefix "mjo_"
df_mjo = (
    df_mjo
    .drop(columns = ["year", "month", "day"])
    .set_index("date")
    .add_prefix("mjo_")
)

#lower case column names
df_mjo.columns = df_mjo.columns.str.lower()

#sort
df_mjo = df_mjo.sort_index()

df_mjo.head()

In [None]:
#the file is only written when save_data is explicitly set to True
if save_data is True:
    df_mjo.to_csv(os.path.join(data_folder, "df_mjo.csv"))

## 0.4 Arctic oscillation index (AO)
- source: https://ftp.cpc.ncep.noaa.gov/cwlinks/

In [None]:
#load the raw daily ao index file from data_folder/raw_ao
df_ao_raw = pd.read_csv(os.path.join(data_folder, "raw_ao", "norm_daily_ao_cda_z1000_19500101_current.csv"))
df_ao_raw.head(10)

In [None]:
#raw ao index over the row number
fig = px.line(
    df_ao_raw,
    y = "ao_index_cdas",
    color_discrete_sequence = plt_style_s,
    title = "AO index",
    **size,
)

scale_show(fig)

In [None]:
#work on a renamed copy, the raw frame stays untouched
df_ao = df_ao_raw.rename(columns = {"ao_index_cdas" : "ao"})
df_ao.describe()

In [None]:
#number of missing values per column
df_ao.isna().sum()

In [None]:
#rows that contain at least one missing value
df_ao[df_ao.isna().any(axis=1)]

In [None]:
#ao of April and May 2003 before the missing values are filled
fig = px.line(
    df_ao.loc[(df_ao["year"] == 2003) & (df_ao["month"].isin([4,5]))],
    y = "ao",
    color_discrete_sequence = plt_style_s,
    title = "AO 2003",
    **size,
)

scale_show(fig)

In [None]:
#use linear interpolation to fill the na values, applied in place to df_ao
df_ao.interpolate(method = "linear", inplace = True)

In [None]:
#same period after the interpolation
fig = px.line(
    df_ao.loc[(df_ao["year"] == 2003) & (df_ao["month"].isin([4,5]))],
    y = "ao",
    color_discrete_sequence = plt_style_s,
    title = "ao 2003",
    **size,
)

scale_show(fig)

In [None]:
#number of missing values per column after the interpolation
df_ao.isna().sum()

In [None]:
#first rows of the ao frame
df_ao.head(5)

In [None]:
#clean up and prep data: build the date from its parts
df_ao = df_ao.reset_index(drop = True)
df_ao["date"] = pd.to_datetime(df_ao[["year", "month", "day"]])

#the date becomes the sorted index (aggregation not needed)
df_ao = (
    df_ao
    .drop(columns = ["year", "month", "day"])
    .set_index("date")
    .sort_index()
)

df_ao.head()

In [None]:
#the file is only written when save_data is explicitly set to True
if save_data is True:
    df_ao.to_csv(os.path.join(data_folder, "df_ao.csv"))

## 0.5 North Atlantic Oscillation (NAO)
source: https://www.cpc.ncep.noaa.gov/products/precip/CWlink/pna/nao.shtml

In [None]:
#load the raw daily nao index file from data_folder/raw_nao
df_nao_raw = pd.read_csv(os.path.join(data_folder, "raw_nao","norm_daily_nao_cdas_z500_19500101_current.csv"))
df_nao_raw.head()

In [None]:
#raw nao index over the row number
fig = px.line(
    df_nao_raw,
    y = "nao_index_cdas",
    color_discrete_sequence = plt_style_s,
    title = "NAO",
)

scale_show(fig)

In [None]:
#number of missing values per column
df_nao_raw.isna().sum()

In [None]:
#rows that contain at least one missing value
df_nao_raw[df_nao_raw.isna().any(axis=1)]

In [None]:
#fill na with interpolation
df_nao = df_nao_raw.interpolate(method = "linear")
df_nao.isna().sum()

In [None]:
#clean up and prep data: build the date from its parts
df_nao = df_nao.reset_index(drop = True)
df_nao["date"] = pd.to_datetime(df_nao[["year", "month", "day"]])

#the date becomes the sorted index and the value column gets its short name (aggregation not needed)
df_nao = (
    df_nao
    .drop(columns = ["year", "month", "day"])
    .set_index("date")
    .rename(columns = {"nao_index_cdas" : "nao"})
    .sort_index()
)

df_nao.head()

In [None]:
#the file is only written when save_data is explicitly set to True
if save_data is True:
    df_nao.to_csv(os.path.join(data_folder, "df_nao.csv"))

## 0.6 Polar vortex data (PV)

relevant pressure level: 1000 hPa level (see bthe, chapter 2.4 on polar vortex and ao)

In [None]:
#borders of the pv area in degrees
#NOTE(review): west and east are identical, so the outline collapses to a line,
#while the pv request further down asks for longitudes 8 to 9 - confirm the intended east border
north, south = 90, 45.0
west, east = 8.0, 8.0

#closed outline: starts and ends in the south-west corner
lons = [west, west, east, east, west]
lats = [south, north, north, south, south]

#draw the outline on a map
fig = go.Figure(
    go.Scattermapbox(lon = lons, lat = lats, mode = "markers+lines", marker = dict(size = 10))
)

#adjust view
fig.update_layout(
    margin = dict(l = 0, t = 0, b = 0, r = 0),
    mapbox = dict(
        center = dict(lon = 8.4, lat = 60),
        style = "carto-positron",
        zoom = 3,
    ),
)

scale_show(fig)

In [None]:
#resolution of data: one point per whole degree of latitude and longitude

lons = []
lats = []

#the northern border belongs to the requested area, therefore the range
#has to include round(north) itself
for lat in range(round(south) , round(north) + 1):
    for lon in [8,9]:
        lons.append(lon)
        lats.append(lat)

print(f"datapoint per time step: {len(lons)}")
print(f"image resolution: {len(set(lons))} x {len(set(lats))}")

#create plot
fig = go.Figure(go.Scattermapbox(
    mode = "markers",
    lon = lons,
    lat = lats,
    marker = {'size': 10})
)

#adjust view
fig.update_layout(
    margin ={'l':0,'t':0,'b':0,'r':0},
    mapbox = {
        'center': {'lon': 8.4, 'lat': 60},
        'style': "carto-positron",
        'zoom': 3})


scale_show(fig)

In [None]:
#data source: https://cds.climate.copernicus.eu/cdsapp#!/home

class Wrapper():
    """Download the ERA5 pressure-level data of the pv area month by month and convert it to csv.

    The class is used as a namespace: all methods are static and are called as
    ``Wrapper.<method>(...)``. The target folder is shared through the class
    attribute ``folder_name``, which ``main`` sets before anything else runs.
    """

    #class variables
    folder_name : str = None #folder of the raw .nc / .csv files, set by main()

    #functionality
    @staticmethod
    def main(data_folder):
        """Download one .nc file per year and month and convert every .nc file to csv.

        data_folder : root data folder, the files are stored in its sub folder "raw_pv"
        """

        Wrapper.folder_name = os.path.join(data_folder,"raw_pv")

        #the download target lies inside this folder, so it has to exist before the first request
        os.makedirs(Wrapper.folder_name, exist_ok = True)

        start_year : int        = 1979
        end_year : int          = 2023 #exclusive, see generate_year_list

        years : list            = Wrapper.generate_year_list(start = start_year, end = end_year)
        variable : list         = Wrapper.generate_var_list()

        #main loop for downloading data
        Wrapper.log(f"Downloading startet for range: {start_year} - {end_year}")

        for year in years:
            for month in range(1,13):

                print(f"Processing {year}-{month}")

                #optional progress notification: "emailer" is not defined in this part of the
                #notebook, and a failed notification must never stop the download
                try:
                    emailer.message(f"Downloading pv: {year}-{month}")
                except Exception:
                    pass

                Wrapper.request(year = year, month = month ,variable = variable)

        #transforms and saves data as a csv for later processing in pandas
        Wrapper.generate_df()

    @staticmethod
    def generate_year_list(start:int, end:int):
        """Return the years from ``start`` up to, but not including, ``end`` as strings."""

        return [str(year) for year in range(start,end)]

    @staticmethod
    def generate_var_list():
        """Return the ERA5 variable names to request."""

        return ['u_component_of_wind', 'v_component_of_wind','temperature']

    @staticmethod
    def generate_df():
        """Convert every .nc file in ``folder_name`` into a csv file of the same name."""

        downloads = Wrapper.folder_name

        #only .nc files are opened: the folder also receives the csv files written below,
        #which would otherwise be handed to xarray on the next run
        nc_files = [name for name in os.listdir(downloads) if name.endswith(".nc")]

        for nc_file in nc_files:

            #the context manager closes the file handle once the frame is built
            with xr.open_dataset(os.path.join(downloads, nc_file)) as ds:
                df = ds.to_dataframe()

            #save df next to the .nc file
            df.to_csv(os.path.join(downloads, f"{nc_file[:-3]}.csv"))

    @staticmethod
    def download_path():
        """Create ``folder_name`` if needed and return its path below the script folder."""

        folder_name = Wrapper.folder_name
        os.makedirs(folder_name, exist_ok = True)

        #__file__ does not exist inside a notebook kernel, fall back to the working directory there
        try:
            base_dir = os.path.dirname(os.path.abspath(__file__))
        except NameError:
            base_dir = os.getcwd()

        return os.path.join(base_dir, folder_name)

    @staticmethod
    def file_path(year, month):
        """Return (and print) the .nc target path of one year and month inside ``folder_name``."""

        file_name : str     = f"pv_{year}-{month}.nc" #netCDF file
        file_path : str     = os.path.join(Wrapper.folder_name,file_name)

        print(file_path)
        return file_path

    @staticmethod
    def log(message : str):
        """Append ``message`` with a timestamp to era5_log.txt in the working directory."""

        #create log entry
        log_time : datetime = datetime.now()
        message = f"{log_time},{message}\n"

        #write log entry, the context manager closes the file even if the write fails
        with open('era5_log.txt', 'a') as file_object:
            file_object.write(message)

    @staticmethod
    def request(year:str, month : int, variable:list):
        """Request all hourly values of one month from the CDS api and return the result as string.

        year     : year to download, as produced by generate_year_list
        month    : month number, passed to the api unchanged
        variable : ERA5 variable names
        """
        # see: https://www.latlong.net/

        c = cdsapi.Client()
        file_path = Wrapper.file_path(year = year, month = month)

        print(year)

        request = c.retrieve(
            'reanalysis-era5-pressure-levels',
            {
                'product_type': 'reanalysis',
                'format': 'netcdf',
                'pressure_level': ['10', '20', '30' ,'50', '70', '100',],
                'variable': variable,
                'area': [
                    90, 8, 45,
                    9,
                ],
                #every hour and day; the values are zero padded strings
                'time': [f"{hour:02d}:00" for hour in range(24)],
                'day': [f"{day:02d}" for day in range(1, 32)],
                'month': month,
                'year': year,
            },
            file_path
        )

        return str(request)

#the download only runs when the flag is explicitly set to True
if pv_run_era5_download is True:
    Wrapper.main(data_folder = data_folder)

In [None]:
class Agg_pv():
    """Aggregate the raw polar vortex csv files into one daily data frame.

    The class is used as a namespace: all methods are static and are called
    on the class itself, e.g. ``Agg_pv.main(data_folder)``.
    """

    #data folder passed to main (set by main)
    data = None
    #sub folder "raw_pv" of the data folder, holds the raw csv files (set by main)
    pv_data :str = None

    @staticmethod
    def main(data_folder):
        """Aggregate all raw csv files and write the result to "df_pv.csv".

        Every .csv file in <data_folder>/raw_pv is read and aggregated, the
        results are concatenated, the "speed" and "direction" columns are
        added and the frame is saved to <data_folder>/df_pv.csv.

        Args:
            data_folder: Folder that contains "raw_pv" and receives the output.

        Raises:
            ValueError: If "raw_pv" contains no .csv file.
        """

        Agg_pv.data = data_folder
        Agg_pv.pv_data = os.path.join(data_folder,"raw_pv")

        #os.listdir returns the entries in arbitrary order; sorting keeps the
        #row order of the result reproducible
        csv_files : list = sorted(
            file for file in os.listdir(Agg_pv.pv_data) if file.endswith(".csv")
        )

        if not csv_files:
            raise ValueError(f"No .csv files found in {Agg_pv.pv_data}")

        dfs : list = []

        for csv_file in csv_files:

            print(f"Processing: {csv_file}", end = "\r")
            df = pd.read_csv(os.path.join(Agg_pv.pv_data, csv_file))
            dfs.append(Agg_pv.aggregate(df))

        df = Agg_pv.merge(dfs)

        #add base infos
        df = Agg_pv.calculate_wind_speeds(df)
        df = Agg_pv.calculate_wind_direction(df)

        Agg_pv.save(df, data_folder)

        return

    @staticmethod
    def aggregate(df):
        """Reduce one raw data frame to daily means per grid cell and level.

        Args:
            df: Raw data with the columns "time", "longitude", "latitude"
                and "level" plus the value columns. It is not modified.

        Returns:
            Data frame indexed by (date, longitude, latitude, level) that
            holds the mean of every remaining column.
        """

        #work on a new frame so that the caller's data stays untouched;
        #the existing index is moved into the columns
        df = df.reset_index(drop = False)

        #generate group key as date
        df["date"] = pd.to_datetime(df["time"]).dt.date

        #the time stamp is not needed once the date is known
        df = df.drop(columns = "time")

        #define grid resolution: coordinates are rounded to whole numbers
        df["longitude"] = df["longitude"].round()
        df["latitude"] = df["latitude"].round()

        #aggregate data
        return df.groupby(["date", "longitude", "latitude", "level"], as_index = True).mean()

    @staticmethod
    def merge(dfs):
        """Concatenate the per-file aggregates row-wise into one data frame."""

        return pd.concat(dfs)

    @staticmethod
    def calculate_wind_speeds(df):
        """Add the column "speed", the magnitude of the ("u", "v") vector.

        The column is added to the passed frame, which is also returned.
        """

        df['speed'] = np.sqrt(df['u']**2 + df['v']**2)
        return df

    @staticmethod
    def calculate_wind_direction(df):
        """Add the column "direction" in degrees within [0, 360).

        The column is added to the passed frame, which is also returned.
        """

        #angle from arctan2(u, v), converted to degrees and wrapped to [0, 360)
        df['direction'] = np.rad2deg(np.arctan2(df['u'], df['v'])) % 360

        #NOTE(review): a shift of 90 degrees matches neither the common
        #"blowing towards" (+0) nor the "coming from" (+180) convention -
        #confirm the intended reference axis
        df['direction'] = (df['direction'] + 90) % 360

        return df

    @staticmethod
    def save(df, data_folder):
        """Write the aggregated data to "df_pv.csv" inside data_folder.

        A helper column named "index" (created by reset_index during the
        aggregation) is removed before writing. The passed frame is not
        modified.
        """

        #NOTE(review): the previous code called df.rename(mapper = {...}) here
        #without assigning the result and without targeting the columns, so
        #nothing was ever renamed. The written names (longitude, latitude, u,
        #v, t) are kept because readers of df_pv.csv may rely on them - confirm
        #before applying the intended mapping longitude -> lon,
        #latitude -> lat, u -> wind_u, v -> wind_v, t -> temp.

        #the check has to look at the columns, the index never contains "index"
        if "index" in df.columns:
            df = df.drop(columns = "index")

        df.to_csv(os.path.join(data_folder, "df_pv.csv"))
        return

#rebuild df_pv.csv from the raw files only when the pv_compile_df flag is set to True
if pv_compile_df is True:
    Agg_pv.main(data_folder)

In [None]:
#load the aggregated file df_pv.csv with the date as index and preview the first 10 rows
df_pv = pd.read_csv(os.path.join(data_folder, "df_pv.csv"), index_col = "date")
df_pv.head(10)

In [None]:
#release the preview frame again
del df_pv

## 0.7 Merge all (except custom polar vortex, date-based)

In [None]:
#enable saving for this chapter (the flag is initialised as False at the top of the notebook)
save_data = True

In [None]:
#NOTE(review): folder_contents is not used anywhere in this cell
folder_contents = os.listdir(data_folder)
#files to merge, each one is expected to provide a "date" column
csv_files = ["df_ao.csv", "df_nao.csv", "df_mjo.csv", "df_soi.csv", "df_t2m.csv"]

#collects one data frame per file, in the order of csv_files
dfs : list = []
print(csv_files)

#read files
for csv in csv_files:

    csv_path = os.path.join(data_folder, csv)
    df = pd.read_csv(csv_path, index_col = "date")
    #show the columns each file contributes
    print(df.columns)

    dfs.append(df)

#merge files: join all remaining frames onto the first one via the date index
#(DataFrame.join keeps the rows of the calling frame by default)
df = dfs[0].join(other = dfs[1:])

In [None]:
#preview the first rows of the merged frame
df.head()

In [None]:
#preview the last rows of the merged frame
df.tail()

In [None]:
#reset index: move the date back into a regular column so it can be filtered
df = df.reset_index(drop = False)

#presumed cleaned data range
start = "1979-01-01"
end = "2022-12-31"

#keep only the rows inside the range (both bounds included) and use the date
#as index again; the chained calls return a new frame instead of mutating a
#filtered slice in place
df = df.loc[df["date"].between(start, end)].set_index("date", drop = True)

#clean up: remove a leftover "index" column, nothing happens if there is none
df = df.drop(columns = "index", errors = "ignore")

#check: number of missing values per column
df.isna().sum()

In [None]:
#adding date time values to df for plotting
date_index = pd.to_datetime(df.index)

#set values: calendar parts taken directly from the parsed index
df["year"] = date_index.year
df["month"] = date_index.month
df["day"] = date_index.day

df.head()

In [None]:
#preview the last rows including the new year, month and day columns
df.tail()

In [None]:
#save master data frame as df.csv in the data folder, only when save_data is True
if save_data is True:
    df.to_csv(os.path.join(data_folder, "df.csv"))