In [181]:
import pandas as pd
import carbonpipeline.cli as cli
from carbonpipeline.constants import *
import cfgrib
import xarray as xr

In [182]:
# Load the predictors table; the relative path is resolved against the
# kernel's current working directory.
df = pd.read_csv("predictors.csv")

In [183]:
# Pass the raw frame through the project helper together with
# COLUMN_NAME_MAPPING. Judging by its name the helper filters and renames
# columns; COLUMN_NAME_MAPPING presumably comes from the star import of
# carbonpipeline.constants — TODO confirm both against the package.
filtered_df = cli.filtered_and_renamed_columns(df, COLUMN_NAME_MAPPING)
# Parse the 'timestamp' column into pandas datetimes (needed for `.dt` access).
filtered_df['timestamp'] = pd.to_datetime(filtered_df['timestamp'])

In [184]:
# Display the filtered frame; for a long frame pandas elides the middle rows.
filtered_df

Unnamed: 0,CO2,G,H,LE,LW_IN,LW_OUT,NETRAD,PA,PPFD_IN,PPFD_OUT,P,RH,SW_IN,SW_OUT,TA,USTAR,VPD,WD,WS,timestamp
0,,,,,,,,,,,,,,,,,,,,1994-01-01 00:00:00
1,,,,,,,,,,,,,,,,,,,,1994-01-01 01:00:00
2,,,,,,,,,,,,,,,,,,,,1994-01-01 02:00:00
3,,,,,,,,,,,,,,,,,,,,1994-01-01 03:00:00
4,,,,,,,,,,,,,,,,,,,,1994-01-01 04:00:00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
236683,-0.301087,-0.167411,-0.173071,-0.162175,-0.360071,,,,,,,,-0.5,,-0.016972,,-0.495032,,,2020-12-31 19:00:00
236684,-0.301083,-0.167410,-0.173181,-0.162261,-0.360130,,,,,,,,-0.5,,-0.016956,,-0.495005,,,2020-12-31 20:00:00
236685,-0.301080,-0.167409,-0.173203,-0.162386,-0.360191,,,,,,,,-0.5,,-0.016956,,-0.494991,,,2020-12-31 21:00:00
236686,-0.301077,-0.167408,-0.173198,-0.162516,-0.360256,,,,,,,,-0.5,,-0.016956,,-0.494991,,,2020-12-31 22:00:00


In [185]:
# Variables whose columns get an explicit source tag.
input_vars = ['CO2', 'G', 'LW_IN']

# Build "<old name> -> CS, <old name>" and apply it; every other column keeps
# its label. `renamed_vars` is a one-shot iterator consumed by `zip`.
renamed_vars = ("CS, " + var for var in input_vars)
column_map = dict(zip(input_vars, renamed_vars))
a = filtered_df.rename(columns=column_map)
a

Unnamed: 0,"CS, CO2","CS, G",H,LE,"CS, LW_IN",LW_OUT,NETRAD,PA,PPFD_IN,PPFD_OUT,P,RH,SW_IN,SW_OUT,TA,USTAR,VPD,WD,WS,timestamp
0,,,,,,,,,,,,,,,,,,,,1994-01-01 00:00:00
1,,,,,,,,,,,,,,,,,,,,1994-01-01 01:00:00
2,,,,,,,,,,,,,,,,,,,,1994-01-01 02:00:00
3,,,,,,,,,,,,,,,,,,,,1994-01-01 03:00:00
4,,,,,,,,,,,,,,,,,,,,1994-01-01 04:00:00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
236683,-0.301087,-0.167411,-0.173071,-0.162175,-0.360071,,,,,,,,-0.5,,-0.016972,,-0.495032,,,2020-12-31 19:00:00
236684,-0.301083,-0.167410,-0.173181,-0.162261,-0.360130,,,,,,,,-0.5,,-0.016956,,-0.495005,,,2020-12-31 20:00:00
236685,-0.301080,-0.167409,-0.173203,-0.162386,-0.360191,,,,,,,,-0.5,,-0.016956,,-0.494991,,,2020-12-31 21:00:00
236686,-0.301077,-0.167408,-0.173198,-0.162516,-0.360256,,,,,,,,-0.5,,-0.016956,,-0.494991,,,2020-12-31 22:00:00


In [186]:
import numpy as np

# Derive a new frame from `a` with one all-NaN "ERA5, <var>" column appended
# per input variable. NOTE: this rebinds `df`, which previously held the raw
# CSV contents.
era5_placeholders = {"ERA5, " + var: np.nan for var in input_vars}
df = a.assign(**era5_placeholders)
df

Unnamed: 0,"CS, CO2","CS, G",H,LE,"CS, LW_IN",LW_OUT,NETRAD,PA,PPFD_IN,PPFD_OUT,...,SW_OUT,TA,USTAR,VPD,WD,WS,timestamp,"ERA5, CO2","ERA5, G","ERA5, LW_IN"
0,,,,,,,,,,,...,,,,,,,1994-01-01 00:00:00,,,
1,,,,,,,,,,,...,,,,,,,1994-01-01 01:00:00,,,
2,,,,,,,,,,,...,,,,,,,1994-01-01 02:00:00,,,
3,,,,,,,,,,,...,,,,,,,1994-01-01 03:00:00,,,
4,,,,,,,,,,,...,,,,,,,1994-01-01 04:00:00,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
236683,-0.301087,-0.167411,-0.173071,-0.162175,-0.360071,,,,,,...,,-0.016972,,-0.495032,,,2020-12-31 19:00:00,,,
236684,-0.301083,-0.167410,-0.173181,-0.162261,-0.360130,,,,,,...,,-0.016956,,-0.495005,,,2020-12-31 20:00:00,,,
236685,-0.301080,-0.167409,-0.173203,-0.162386,-0.360191,,,,,,...,,-0.016956,,-0.494991,,,2020-12-31 21:00:00,,,
236686,-0.301077,-0.167408,-0.173198,-0.162516,-0.360256,,,,,,...,,-0.016956,,-0.494991,,,2020-12-31 22:00:00,,,


In [187]:
def _split_source_label(col):
    """Split a flat column label into a ``(variable, source)`` pair.

    Labels shaped like ``"<source>, <variable>"`` are split on the FIRST
    ``", "`` only, so a variable name that itself contains ``", "`` is kept
    whole instead of being truncated. Labels without the separator are
    attributed to the ``'CS'`` source.
    """
    src, sep, var = col.partition(', ')
    if not sep:
        # Vars not needed to be requested => no ERA5 level, only CS
        return col, 'CS'
    # Input vars => have both CS and ERA5 levels
    return var, src


raw_cols = df.columns.tolist()

# Only convert flat labels. Once the columns are a MultiIndex the labels are
# tuples and cannot be split again, so re-running this cell skips the
# conversion instead of raising.
if not isinstance(df.columns, pd.MultiIndex):
    levels = [_split_source_label(col) for col in raw_cols]
    # Variable is the outer level, source is the inner level.
    df.columns = pd.MultiIndex.from_tuples(levels, names=['variable', 'source'])

df = df.sort_index(axis=1, level='variable')  # Group by variable
df

variable,CO2,CO2,G,G,H,LE,LW_IN,LW_IN,LW_OUT,NETRAD,...,PPFD_OUT,RH,SW_IN,SW_OUT,TA,USTAR,VPD,WD,WS,timestamp
source,CS,ERA5,CS,ERA5,CS,CS,CS,ERA5,CS,CS,...,CS,CS,CS,CS,CS,CS,CS,CS,CS,CS
0,,,,,,,,,,,...,,,,,,,,,,1994-01-01 00:00:00
1,,,,,,,,,,,...,,,,,,,,,,1994-01-01 01:00:00
2,,,,,,,,,,,...,,,,,,,,,,1994-01-01 02:00:00
3,,,,,,,,,,,...,,,,,,,,,,1994-01-01 03:00:00
4,,,,,,,,,,,...,,,,,,,,,,1994-01-01 04:00:00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
236683,-0.301087,,-0.167411,,-0.173071,-0.162175,-0.360071,,,,...,,,-0.5,,-0.016972,,-0.495032,,,2020-12-31 19:00:00
236684,-0.301083,,-0.167410,,-0.173181,-0.162261,-0.360130,,,,...,,,-0.5,,-0.016956,,-0.495005,,,2020-12-31 20:00:00
236685,-0.301080,,-0.167409,,-0.173203,-0.162386,-0.360191,,,,...,,,-0.5,,-0.016956,,-0.494991,,,2020-12-31 21:00:00
236686,-0.301077,,-0.167408,,-0.173198,-0.162516,-0.360256,,,,...,,,-0.5,,-0.016956,,-0.494991,,,2020-12-31 22:00:00


In [188]:
# Rows of `filtered_df` in which at least one non-timestamp column is null.
value_columns = filtered_df.columns.drop('timestamp')
has_gap = filtered_df[value_columns].isnull().any(axis=1)
miss = filtered_df[has_gap].copy()

# Expose the calendar components of each timestamp as separate columns.
for part in ('year', 'month', 'day'):
    miss[part] = getattr(miss['timestamp'].dt, part)

In [189]:
# Build the combined dataset with the project helper (called with no arguments)
# and display it.
# NOTE(review): the helper's inputs are not visible from this cell — presumably
# it reads previously downloaded files from disk; confirm in carbonpipeline.cli.
combined = cli.merge_datasets()
combined

failed to set key 'missingValue' to np.float32(3.4028235e+38)
failed to set key 'missingValue' to np.float32(3.4028235e+38)
failed to set key 'missingValue' to np.float32(3.4028235e+38)
failed to set key 'missingValue' to np.float32(3.4028235e+38)
failed to set key 'missingValue' to np.float32(3.4028235e+38)
failed to set key 'missingValue' to np.float32(3.4028235e+38)
failed to set key 'missingValue' to np.float32(3.4028235e+38)
failed to set key 'missingValue' to np.float32(3.4028235e+38)
failed to set key 'missingValue' to np.float32(3.4028235e+38)
failed to set key 'missingValue' to np.float32(3.4028235e+38)
failed to set key 'missingValue' to np.float32(3.4028235e+38)
failed to set key 'missingValue' to np.float32(3.4028235e+38)
failed to set key 'missingValue' to np.float32(3.4028235e+38)
failed to set key 'missingValue' to np.float32(3.4028235e+38)


Unnamed: 0_level_0,number,step,surface,latitude,longitude,valid_time,forecast_albedo
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1994-01-01 00:00:00,0,0 days,0.0,74.88,44.88,1994-01-01 00:00:00,0.059998
1994-01-01 01:00:00,0,0 days,0.0,74.88,44.88,1994-01-01 01:00:00,0.059998
1994-01-01 02:00:00,0,0 days,0.0,74.88,44.88,1994-01-01 02:00:00,0.059998
1994-01-01 03:00:00,0,0 days,0.0,74.88,44.88,1994-01-01 03:00:00,0.059998
1994-01-01 04:00:00,0,0 days,0.0,74.88,44.88,1994-01-01 04:00:00,0.059998
1994-01-01 05:00:00,0,0 days,0.0,74.88,44.88,1994-01-01 05:00:00,0.059998


In [190]:
# Relabel columns via SHORTNAME_TO_FULLNAME. Assuming it is a dict-like
# mapping (TODO confirm), labels absent from it are left unchanged.
combined = combined.rename(columns=SHORTNAME_TO_FULLNAME)

In [191]:
# Inspect the name(s) of the row-index level(s) of the combined frame.
combined.index.names

FrozenList(['time'])