# 0. Data collcetion and preprocessing
- Create unifrom data resolution, format and index to join the data
- Suggeted columns:
    - index : str = year-month
    - col_0 : int = year
    - col_1 : int = momth
- Aggregate everything down to monthly intervalls if needed:
    - calculate mean
    - calculate std

In [1]:
import pandas as pd
import os
import numpy as np
import statistics as st
import plotly.graph_objects as go
import plotly.express as px

#plot styles
plt_style_c = px.colors.sequential.haline #complex
plt_style_s = px.colors.diverging.Portland #simple

#data folder
data_folder : str = "data"

#decide what to execute from chapter 0.1
t2_run_era5_download : bool     = False     #ca. 550 minuntes
t2_compile_nc : bool            = False     #ca. 5 minutes
t2_compile_df : bool            = False     #ca. 2 minutes
t2_csv_clean_up : bool          = False     #ca. 1 minutes

#decide what to execute from chapter 0.2, 0.3, 0.5
save_data : bool                = False

#decide what to execute from chapter 0.4
pv_run_era5_download : bool     = False     #ca. 550 minuntes
pv_compile_nc : bool            = False     #ca. 5 minutes
pv_compile_df : bool            = False     #ca. 2 minutes
pv_csv_clean_up : bool          = False     #ca. 1 minutes


## 0.1 ERA5 data (temp)
- Source: https://cds.climate.copernicus.eu/cdsapp#!/dataset/reanalysis-era5-single-levels?tab=overview
- Note: The data was downloaded using the era5_download.py script. The code can also be found in the cell below.
- Range: 1973 - 2023

In [2]:
#if the data needs to pe processed by the api, the exectuion is aprrox. 10 minute per year(550 min total)

#copy from era5_download_t2.py
try:
    from unittest import result
    import cdsapi #additional file needed to run. See docu
    import os
    import requests
    from datetime import datetime
    import xarray as xr
except:
    pass

#add the .cdsapi file to you user folder
#data source: https://cds.climate.copernicus.eu/cdsapp#!/home


class Wrapper():

    def main():

        start_year : int        = 1979
        end_year : int          = 2024
        all_vars : bool         = False

        years : list            = Wrapper.generate_year_list(start = start_year, end = end_year)
        variables : list        = Wrapper.generate_var_list(all = all_vars)

        #main loop for downloading data
        Wrapper.log(f"Downloading startet for range: {start_year} - {end_year}")

        for year in years:
            
            print(f"Processing {year}")
            result : str = Wrapper.request(year, variables)
            Wrapper.download_data(result = result, year = year, all_vars = all_vars)

        #tranforms and saves data as a csv for later processing in pandas
        Wrapper.generate_df()

        return

    def generate_year_list(start:int, end:int):

        year_list_str : list = [str(year) for year in range(start,end)]
        return year_list_str

    def generate_var_list( all : bool):

        if all == True:
            return ['10m_u_component_of_wind', '10m_v_component_of_wind', '2m_temperature', 'surface_pressure']
        else:
            return ['2m_temperature']

    def generate_df():

        downloads = Wrapper.download_path()
        files = os.listdir(downloads)

        for file in files:

            #open .nc files
            file = os.path.join(downloads,file)
            ds = xr.open_dataset(file)
            df = ds.to_dataframe()

            #save df
            name = f"{file[-3]}.csv"
            df.to_csv(name)

    def download_data(result:str, year:str, all_vars:bool):

        #genearte download and saving path
        path : str          = Wrapper.download_path()
        file_name : str     = f"era5_{year}_allvars{all_vars}.nc" #type nasCat data
        file_path :str      = os.path.join(path,file_name)
        print(file_path)
        
        #get download link
        try:
            link_start : int    = result.index("location=") + len ("location=")
            url : str           = result[link_start:-1]
        except:
            Wrapper.log(f"{year}: The api response does not contain a  download link")
            return

        #retrieve data from web page and save it
        try:
            response = requests.get(url)
        except:
            Wrapper.log(f"{year}: The download url is not valid")
            return

        open(file_path, "wb").write(response.content)

        return

    def download_path():

        folder_name = "era5_downloads_t2"

        if os.path.isdir(folder_name) == False:
            os.makedirs(folder_name)

        download_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), folder_name)

        return download_path

    def log(message : str):

        #create log entry
        log_time : str = datetime.now()
        message = f"{log_time},{message}\n"

        #write log entry
        file_object = open('era5_log.txt', 'a')
        file_object.write(message)
        file_object.close()

        return

    def request(year:list, variable:list):
        # see: https://www.latlong.net/

        c = cdsapi.Client()

        request = c.retrieve(
            'reanalysis-era5-single-levels',
            {
                'product_type': 'reanalysis',
                'variable': variable,
                'year': year,
                'month': [
                    '01', '02', '03',
                    '04', '05', '06',
                    '07', '08', '09',
                    '10', '11', '12',
                ],
                'day': [
                    '01', '02', '03',
                    '04', '05', '06',
                    '07', '08', '09',
                    '10', '11', '12',
                    '13', '14', '15',
                    '16', '17', '18',
                    '19', '20', '21',
                    '22', '23', '24',
                    '25', '26', '27',
                    '28', '29', '30',
                    '31',
                ],
                'time': [
                    '00:00', '01:00', '02:00',
                    '03:00', '04:00', '05:00',
                    '06:00', '07:00', '08:00',
                    '09:00', '10:00', '11:00',
                    '12:00', '13:00', '14:00',
                    '15:00', '16:00', '17:00',
                    '18:00', '19:00', '20:00',
                    '21:00', '22:00', '23:00',
                ],
                'area': [
                    48, 5.8, 45.7,
                    11,
                ],
                'format': 'netcdf',
            },
            'download.nc'
        )

        return str(request)

if t2_run_era5_download is True:
    Wrapper.main()

In [3]:
#create csv files from .nc files
era5_data = os.path.join("data","era5_downloads_t2")

try:
    import xarray as xr
except:
    pass

#ds = xr.open_dataset('/path/to/netcdf')
#df = ds.to_dataframe()

def compile_nc_data():

    for file in os.listdir(era5_data):

        if file[-3:] != ".nc":
            continue

        file_df = file
        file_df = file_df.replace(".nc", ".csv")
        print(file_df)

        ds = xr.open_dataarray(os.path.join(era5_data,file))
        df = ds.to_dataframe()
        df.to_csv(os.path.join(era5_data,file_df))

if t2_compile_nc is True:
    compile_nc_data()

In [4]:
#define borders
lons = [5.8,5.8,11,11,5.8]
lats = [45.7,48.0,48.0,45.7,45.7]

#create plot 
fig = go.Figure(go.Scattermapbox(
    mode = "markers+lines",
    lon = lons,
    lat = lats,
    marker = {'size': 10})
)

#adjust view
fig.update_layout(
    margin ={'l':0,'t':0,'b':0,'r':0},
    mapbox = {
        'center': {'lon': 8.4, 'lat': 46.85},
        'style': "carto-positron",
        'zoom': 5})


fig.show()

Process of averaging the temperature data:
- Calculate average grouped by the hourly time stamps (eradicate the coordinates and create and hourly average of switzerland)
- Calculate average and std grouped by by month for each csv
- Creat an all encompassing dataframe with the whole time periode

In [5]:
class Avg_era5:

    era5_data = os.path.join("data","era5_downloads_t2")
    data = "data"

    def main(data_folder):

        files : list        = os.listdir(Avg_era5.era5_data)
        csv_files : list    = [file for file in files if file[-4:] == ".csv"]

        dfs : list          = []

        for csv in csv_files:

            print(f"Processing: {csv}", end = "\r")

            df = Avg_era5.lonlat_mean(csv = csv)
            df = Avg_era5.month_mean_std(df)
            df = Avg_era5.format(df)

            dfs.append(df)

        Avg_era5.merge_and_save(dfs, data_folder)
        print("Aggregaed all data")

    def lonlat_mean(csv : str):

        #average over lon and lattitude
        df = pd.read_csv(os.path.join(Avg_era5.era5_data, csv))
        df = df[["time", "t2m"]]
        df_avg_lon_lat = df.groupby("time").mean("t2m")

        del df #free up memory
        return df_avg_lon_lat

    def month_mean_std(df):

        #create col for grouping
        df["month"] = pd.DatetimeIndex(df.index).month
        df["year"] = pd.DatetimeIndex(df.index).year

        df.reset_index(drop = True, inplace = True)

        #aggregate for mean and std
        df_mean_std = df.groupby(["year", "month"], as_index=False).agg(
            t2m_mean    = ("t2m", "mean"),
            t2m_std     = ("t2m", "std"),
        )

        del df #free up memory
        return df_mean_std

    def format(df):

        #format
        df["index"] = df["year"].astype(str) + "-" + df["month"].astype(str)
        df.set_index(keys = "index", inplace = True, drop = True)

        #drop unneeded cols
        df.drop(labels = ["year", "month"], axis = 1, inplace = True)

        #save
        #df.to_csv(os.path.join(Avg_era5.era5_data_agg,csv))

        return df

    def merge_and_save(dfs, data_folder):

        df = pd.concat(dfs)
        df.to_csv(os.path.join(data_folder, "df_era5_t2.csv"))

if t2_compile_df is True:
    Avg_era5.main(data_folder)

In [6]:
# clean up unnecessary csv

def clean_up_csv():

    era5_data = os.path.join("data","era5_downloads_t2")

    files : list        = os.listdir(era5_data)
    csv_files : list    = [file for file in files if file[-4:] == ".csv"]

    for csv in csv_files:
        os.remove(os.path.join(era5_data, csv))

if t2_csv_clean_up is True:
    clean_up_csv()

## 0.2 ENSO data
- Source: https://psl.noaa.gov/enso/mei/
- Note: The data was downloaded and prepared manually as a .csv file (reformating)
- Range: 1979 - 2023

In [7]:
df_enso_raw = pd.read_csv(os.path.join(data_folder, "raw_enso.csv"))
df_enso_raw.head()

Unnamed: 0,year,1,2,3,4,5,6,7,8,9,10,11,12
0,1979,0.47,0.26,-0.08,0.2,0.27,-0.15,-0.14,0.44,0.38,0.24,0.52,0.65
1,1980,0.35,0.19,0.41,0.59,0.55,0.62,0.62,0.15,0.2,0.09,-0.03,-0.06
2,1981,-0.33,-0.24,0.33,0.41,-0.27,-0.6,-0.51,-0.31,-0.01,-0.1,-0.22,-0.16
3,1982,-0.38,-0.47,-0.28,-0.34,-0.03,0.78,1.79,2.02,1.81,1.93,2.28,2.48
4,1983,2.57,2.74,2.68,2.79,2.89,2.02,0.75,-0.11,-0.41,-0.43,-0.43,-0.43


In [8]:
#new columns

enso_dict : dict = {
    "index" : [],
    "year" : [],
    "month" : [],
    "enso" : [],
}

#iterrate over df to retrieve values
for year in df_enso_raw["year"].to_list():
    for month in df_enso_raw.columns.to_list()[1:]:

        enso : float        = float(df_enso_raw.loc[df_enso_raw["year"] == year][str(month)])
        index : str =       f"{year}-{month}"

        enso_dict["index"].append(index)
        enso_dict["year"].append(int(year))
        enso_dict["month"].append(int(month))
        enso_dict["enso"].append(enso)

#create new df
df_enso = pd.DataFrame(data = enso_dict)
df_enso.set_index("index", drop = True, inplace = True)
df_enso.head(5)

Unnamed: 0_level_0,year,month,enso
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1979-1,1979,1,0.47
1979-2,1979,2,0.26
1979-3,1979,3,-0.08
1979-4,1979,4,0.2
1979-5,1979,5,0.27


In [9]:
#delete unneeded and flase data
df_enso = df_enso.loc[df_enso["enso"] != -999.0]

#create a df for later use
df_index_value = df_enso.copy().drop(labels = "enso", axis = 1)

if save_data is True:
    df_index_value.to_csv(os.path.join(data_folder,"df_index_value.csv"))

#drop unneeded data
df_enso.drop(labels = ["month", "year"], inplace = True, axis = 1)
df_enso.tail(5)

Unnamed: 0_level_0,enso
index,Unnamed: 1_level_1
2022-9,-1.78
2022-10,-1.75
2022-11,-1.53
2022-12,-1.26
2023-1,-1.12


In [10]:
if save_data is True:
    df_enso.to_csv(os.path.join(data_folder,"df_enso.csv"))

del df_enso_raw

## 0.3 MJO data
- Source: http://www.bom.gov.au/climate/mjo/
- Note:
    - The data was downloaded and prepared manually as a .csv file (reformating).
    - The datapoint from the year 1978 are missing. A broken up date time series does not make sense. The enso also starts from the year 1979. Therefore the data from year < 1979 is being dropped
    - The aggreagtion will be done in two ways:
        - Values of the last day of month
        - Aggregation:
            - RMM1, RMM2, apmlitude: std, mean
            - phase: mode
- Range: 1979 - 2023

In [11]:
df_mjo_raw = pd.read_csv(os.path.join(data_folder,"raw_mjo.csv"))
df_mjo_raw.head()

Unnamed: 0.1,Unnamed: 0,year,month,day,RMM1,RMM2,phase,amplitude,MissingValue=1.E36or999,unnamed
0,,1974,6,1,1.63447,1.20304,5,2.02948,Final_value:__OLR_&_NCEP_winds,
1,,1974,6,2,1.60289,1.01512,5,1.89729,Final_value:__OLR_&_NCEP_winds,
2,,1974,6,3,1.51625,1.08551,5,1.86476,Final_value:__OLR_&_NCEP_winds,
3,,1974,6,4,1.50981,1.03573,5,1.83092,Final_value:__OLR_&_NCEP_winds,
4,,1974,6,5,1.55906,1.30518,5,2.03326,Final_value:__OLR_&_NCEP_winds,


In [12]:
#understanding the mjo data
fig = px.line(
    data_frame = df_mjo_raw.loc[310:390],
    x = "RMM1",
    y = "RMM2",
    color = "month",

    title = "mjo",
    color_discrete_sequence = plt_style_s,
    width = 500,
    height = 500,

    range_x = (3,-3),
    range_y = (3,-3),
)

fig.show()

In [13]:
#drop unneeded cols
df_mjo_raw.drop(axis = 1, labels = ["Unnamed: 0","unnamed", "MissingValue=1.E36or999"], inplace = True)
df_mjo = df_mjo_raw.loc[df_mjo_raw["year"] >= 1979]
df_mjo.head()

Unnamed: 0,year,month,day,RMM1,RMM2,phase,amplitude
1675,1979,1,1,0.142507,1.05047,6,1.06009
1676,1979,1,2,-0.2042,1.37361,7,1.3887
1677,1979,1,3,-0.15861,1.53943,7,1.54758
1678,1979,1,4,-0.18245,1.45972,7,1.47108
1679,1979,1,5,-0.32005,1.1368,7,1.181


In [14]:
#create new index for joining later
df_mjo["index"] = df_mjo["year"].astype(str) + "-" + df_mjo["month"].astype(str)
df_mjo.set_index(keys = "index", drop = True, inplace= True)
df_mjo.head()



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



Unnamed: 0_level_0,year,month,day,RMM1,RMM2,phase,amplitude
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1979-1,1979,1,1,0.142507,1.05047,6,1.06009
1979-1,1979,1,2,-0.2042,1.37361,7,1.3887
1979-1,1979,1,3,-0.15861,1.53943,7,1.54758
1979-1,1979,1,4,-0.18245,1.45972,7,1.47108
1979-1,1979,1,5,-0.32005,1.1368,7,1.181


In [15]:
#create df aggregated with last values

#empty dataframe with same structure
df_mjo_last = df_mjo.iloc[0:0,:]

indexes = list(set(df_mjo.index.to_list()))

for index in indexes:
    last = df_mjo.loc[df_mjo.index == index].query('day == day.max()')
    df_mjo_last = pd.concat(objs = [df_mjo_last, last], axis = 0)

#resort values
df_mjo_last.sort_values(by = ["year","month"], ascending = [True, True], axis = 0, inplace= True)

#clean up
df_mjo_last.drop(labels=["year","month","day"], axis = 1, inplace= True)
df_mjo_last = df_mjo_last.add_suffix("_last")
df_mjo_last.columns= df_mjo_last.columns.str.lower()

df_mjo_last.head(5)


Unnamed: 0_level_0,rmm1_last,rmm2_last,phase_last,amplitude_last
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1979-1,0.128084,-0.5824,3,0.596314
1979-2,0.891485,-0.65551,4,1.10654
1979-3,0.338955,-1.18882,3,1.23619
1979-4,-1.12556,-1.15801,2,1.61489
1979-5,-2.01605,0.075006,8,2.01744


In [16]:
if save_data is True:
    df_mjo_last.to_csv(os.path.join(data_folder, "df_mjo_last.csv"))

In [17]:
#create df aggregated with last values

#create clean df
df_mjo_avg = df_mjo.copy()
df_index_values = df_mjo_avg
df_mjo_avg.drop(labels = ["day"], axis = 1, inplace = True)

#aggregate
df_mjo_avg = df_mjo_avg.groupby("index").agg(

    rmm1_mean        = ("RMM1", "mean"),
    rmm2_mean        = ("RMM2", "mean"),
    phase_mean       = ("phase", "mean"),
    amplitude_mean   = ("amplitude", "mean"),

    rmm1_std        = ("RMM1", "std"),
    rmm2_std        = ("RMM2", "std"),
    phase_std       = ("phase", "std"),
    amplitude_std   = ("amplitude", "std"),

    phase_mode      = ("phase", st.mode),
    year            = ("year", st.mode), #used for sorting
    month           = ("month", st.mode), #usd for sorting
)

df_mjo_avg.sort_values(by = ["year","month"], ascending = [True, True], axis = 0, inplace= True)
df_mjo_avg.drop(labels = ["year","month"], axis = 1, inplace = True)


df_mjo_avg.head(3)


Unnamed: 0_level_0,rmm1_mean,rmm2_mean,phase_mean,amplitude_mean,rmm1_std,rmm2_std,phase_std,amplitude_std,phase_mode
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1979-1,-0.750029,1.106029,5.774194,1.719532,0.795243,0.988432,2.390393,0.632004,7
1979-2,0.458212,-0.745376,3.285714,1.063415,0.490798,0.587141,1.049061,0.454729,3
1979-3,0.636508,-0.322627,4.0,1.389601,1.202352,0.615136,1.788854,0.595723,4


In [18]:
if save_data is True:
    df_mjo_avg.to_csv(os.path.join(data_folder, "df_mjo_avg.csv"))

In [19]:
del df_mjo_last, df_mjo_avg, df_mjo, df_mjo_raw, df_index_values

## 0.4 Polar vortex
- Source: https://cds.climate.copernicus.eu/cdsapp#!/dataset/reanalysis-era5-pressure-levels?tab=form
- Note:
    - The download was done over the api. The data aggregation has to be deinfed.
    - Chosen Pressure level: 10 hPa (approximates to 25 km alltitude)
    - Chose lon lat: Lat: 81 - 82 / Lon: 12 -13. The nroth pole would be 90.0000° N, 135.0000° W. But the winds at the eye of a vortex can be zero. The chosen point is offset from the north pole into the direction of siwtzerland.
- Range: 1979 - 2023

In [20]:
#pv_run_era5_download : bool     = True      #ca. 550 minuntes
#pv_compile_nc : bool            = True     #ca. 5 minutes
#pv_compile_df : bool            = True     #ca. 2 minutes
#pv_csv_clean_up : bool          = True     #ca. 1 minutes

In [21]:
from unittest import result
import cdsapi
import os
import requests
from datetime import datetime
import xarray as xr

class Wrapper():

    def main():

        start_year : int        = 1979
        end_year : int          = 2024

        years : list            = Wrapper.generate_year_list(start = start_year, end = end_year)
        variables : list        = ['u_component_of_wind', 'v_component_of_wind', 'vertical_velocity',]

        #main loop for downloading data
        Wrapper.log(f"Downloading startet for range: {start_year} - {end_year}")

        for year in years:
            
            print(f"Processing {year}")
            result : str = Wrapper.request(year)
            Wrapper.download_data(result = result, year = year)

        #tranforms and saves data as a csv for later processing in pandas
        Wrapper.generate_df()

        return

    def generate_year_list(start:int, end:int):

        year_list_str : list = [str(year) for year in range(start,end)]
        return year_list_str

    def generate_df():

        downloads = Wrapper.download_path()
        files = os.listdir(downloads)

        for file in files:

            #open .nc files
            file = os.path.join(downloads,file)
            ds = xr.open_dataset(file)
            df = ds.to_dataframe()

            #save df
            name = f"{file[-3]}.csv"
            df.to_csv(name)

    def download_data(result:str, year:str):

        #genearte download and saving path
        path : str          = Wrapper.download_path()
        file_name : str     = f"era5_{year}.nc" #type nasCat data
        file_path :str      = os.path.join(path,file_name)
        print(file_path)
        
        #get download link
        try:
            link_start : int    = result.index("location=") + len ("location=")
            url : str           = result[link_start:-1]
        except:
            Wrapper.log(f"{year}: The api response does not contain a download link")
            return

        #retrieve data from web page and save it
        try:
            response = requests.get(url)
        except:
            Wrapper.log(f"{year}: The download url is not valid")
            return

        open(file_path, "wb").write(response.content)

        return

    def download_path():

        folder_name = "era5_downloads_pv"

        if os.path.isdir(folder_name) == False:
            os.makedirs(folder_name)

        download_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), folder_name)

        return download_path

    def log(message : str):

        #create log entry
        log_time : str = datetime.now()
        message = f"{log_time},{message}\n"

        #write log entry
        file_object = open('era5_log.txt', 'a')
        file_object.write(message)
        file_object.close()

        return

    def request(year:list):
        # see: https://www.latlong.net/

        c = cdsapi.Client()

        request = c.retrieve(
            'reanalysis-era5-pressure-levels',
            {
                'product_type': 'reanalysis',
                'format': 'netcdf',
                'variable': [
                    'u_component_of_wind', 'v_component_of_wind', 'vertical_velocity',
                ],
                'pressure_level': '10',
                'year': year,
                'month': [
                    '01', '02', '03',
                    '04', '05', '06',
                    '07', '08', '09',
                    '10', '11', '12',
                ],
                'day': [
                    '01', '02', '03',
                    '04', '05', '06',
                    '07', '08', '09',
                    '10', '11', '12',
                    '13', '14', '15',
                    '16', '17', '18',
                    '19', '20', '21',
                    '22', '23', '24',
                    '25', '26', '27',
                    '28', '29', '30',
                    '31',
                ],
                'time': [
                    '00:00', '01:00', '02:00',
                    '03:00', '04:00', '05:00',
                    '06:00', '07:00', '08:00',
                    '09:00', '10:00', '11:00',
                    '12:00', '13:00', '14:00',
                    '15:00', '16:00', '17:00',
                    '18:00', '19:00', '20:00',
                    '21:00', '22:00', '23:00',
                ],
                'area': [
                    82, 12, 81,
                    13,
                ],
            },
            'download.nc')

        return str(request)

if pv_run_era5_download is True:
    Wrapper.main()

In [22]:
#define borders
lons = [12,12,13,13,12]
lats = [81,82,82,81,81]

#create map plot
fig = go.Figure(go.Scattermapbox(
    mode = "markers+lines",
    lon = lons,
    lat = lats,
    marker = {'size': 10})
)

#adjust view
fig.update_layout(
    margin ={'l':0,'t':0,'b':0,'r':0},
    mapbox = {
        'center': {'lon': 12.5, 'lat': 81.5},
        'style': "carto-positron",
        'zoom': 2})


fig.show()

In [23]:
#steal code from 0.1
#create csv files from .nc files
era5_data = os.path.join("data","era5_downloads_pv")

try:
    import xarray as xr
except:
    pass

#ds = xr.open_dataset('/path/to/netcdf')
#df = ds.to_dataframe()

def compile_nc_data():

    for file in os.listdir(era5_data):

        if file[-3:] != ".nc":
            continue

        file_df = file
        file_df = file_df.replace(".nc", ".csv")
        print(file_df)

        ds = xr.open_dataset(os.path.join(era5_data,file))
        df = ds.to_dataframe()
        df.to_csv(os.path.join(era5_data,file_df))

if pv_compile_nc is True:
    compile_nc_data()

In [24]:
class Avg_era5:

    era5_data = os.path.join("data","era5_downloads_pv")
    data = "data"

    def main(data_folder):

        files : list        = os.listdir(Avg_era5.era5_data)
        csv_files : list    = [file for file in files if file[-4:] == ".csv"]

        dfs : list          = []

        for csv in csv_files:

            print(f"Processing: {csv}", end = "\r")

            df = Avg_era5.lonlat_mean(csv = csv)
            df = Avg_era5.month_mean_std(df)
            df = Avg_era5.format(df)

            dfs.append(df)

        Avg_era5.merge_and_save(dfs, data_folder)
        print("Aggregaed all data")

    def lonlat_mean(csv : str):

        df = pd.read_csv(os.path.join(Avg_era5.era5_data, csv))
        df = df[["time","u","v","w"]]
        df_avg_lon_lat = df.groupby("time").mean()

        del df #free up memory
        return df_avg_lon_lat

    def month_mean_std(df):

        #create col for grouping
        df["month"] = pd.DatetimeIndex(df.index).month
        df["year"] = pd.DatetimeIndex(df.index).year

        df.reset_index(drop = True, inplace = True)

        #aggregate for mean and std
        df_mean_std = df.groupby(["year", "month"], as_index=False).agg(

            pv_u_mean      = ("u", "mean"),
            pv_u_std       = ("u", "std"),
            
            pv_v_mean      = ("v", "mean"),
            pv_v_std       = ("v", "std"),
            
            pv_w_mean      = ("w", "mean"),
            pv_w_std       = ("w", "std")
        )

        del df #free up memory
        return df_mean_std

    def format(df):

        #format
        df["index"] = df["year"].astype(str) + "-" + df["month"].astype(str)
        df.set_index(keys = "index", inplace = True, drop = True)

        #drop unneeded cols
        df.drop(labels = ["year", "month"], axis = 1, inplace = True)

        #save
        #df.to_csv(os.path.join(Avg_era5.era5_data_agg,csv))

        return df

    def merge_and_save(dfs, data_folder):

        df = pd.concat(dfs)
        df.to_csv(os.path.join(data_folder, "df_era5_pv.csv"))

if pv_compile_df is True:
    Avg_era5.main(data_folder)

In [25]:
# clean up unnecessary csv

def clean_up_csv():

    era5_data = os.path.join("data","era5_downloads_pv")

    files : list        = os.listdir(era5_data)
    csv_files : list    = [file for file in files if file[-4:] == ".csv"]

    for csv in csv_files:
        os.remove(os.path.join(era5_data, csv))

if pv_csv_clean_up is True:
    clean_up_csv()

## 0.5 Merging

In [26]:
#merging

csv_files : list = [csv for csv in os.listdir(data_folder) if csv[:3] == "df_"]
csv_files.remove("df_index_value.csv")
csv_files.sort()

df = df_index_value

for csv in csv_files:

    df_i = pd.read_csv(os.path.join(data_folder, csv), index_col = "index")
    df = pd.merge(df, df_i, left_index=True, right_index=True)

df.head().T

index,1979-1,1979-2,1979-3,1979-4,1979-5
year_x,1979.0,1979.0,1979.0,1979.0,1979.0
month_x,1.0,2.0,3.0,4.0,5.0
enso_x,0.47,0.26,-0.08,0.2,0.27
pv_u_mean_x,-7.705095,-10.069668,0.974978,-2.369439,-2.055928
pv_u_std_x,27.281851,23.430696,11.492888,4.466771,2.138089
pv_v_mean_x,-4.846178,-19.538984,13.36226,-2.108374,-0.777698
pv_v_std_x,12.622939,10.261012,15.734375,4.275214,2.176962
pv_w_mean_x,-0.000723,-0.001674,0.00016,-7e-06,-9e-05
pv_w_std_x,0.003739,0.005229,0.003275,0.00194,0.001966
t2m_mean_x,266.677674,271.801301,274.849874,276.097354,281.669298


In [27]:

if save_data is True:
    df.to_csv(os.path.join(data_folder, "df_merged.csv"))