In [12]:
import pandas as pd
import carbonpipeline.cli as cli
from carbonpipeline.constants import *
from carbonpipeline.processing_utils import *
# Notebook-wide pandas display settings, so wide frames render on a single
# row block instead of being wrapped or truncated.
pd.options.display.width = 200                 # total character width
pd.options.display.max_columns = None          # don't limit columns
pd.options.display.expand_frame_repr = False   # never wrap a frame across several blocks

In [13]:
# Load the raw predictors table from the working directory.
# NOTE(review): the name `df` is rebound further down to the merged dataset
# returned by `cli.merge_datasets`; re-running cells out of order will pick up
# the wrong frame.
df = pd.read_csv("predictors.csv")

In [14]:
# Reduce the raw table via the project helper, driven by COLUMN_NAME_MAPPING
# (presumably it selects and renames columns; confirm against carbonpipeline.cli).
filtered_df = cli.filtered_and_renamed_columns(df, COLUMN_NAME_MAPPING)
# Parse the timestamp column into datetime64 so the `.dt` accessor works later.
filtered_df['timestamp'] = pd.to_datetime(filtered_df['timestamp'])
# `.loc` slicing is label-based and includes the end label, so this keeps the
# rows labelled 0..5375.
# NOTE(review): 5375 is a magic number; it presumably matches the number of
# hourly records available in the merged dataset. Confirm, and consider
# deriving it from the data instead.
filtered_df = filtered_df.loc[:5375]

In [15]:
# Preview the first five rows of the filtered frame.
filtered_df.head(5)

Unnamed: 0,CO2,G,H,LE,LW_IN,LW_OUT,NETRAD,PA,PPFD_IN,PPFD_OUT,P,RH,SW_IN,SW_OUT,TA,USTAR,VPD,WD,WS,timestamp
0,,,,,,,,,,,,,,,,,,,,1994-01-01 00:00:00
1,,,,,,,,,,,,,,,,,,,,1994-01-01 01:00:00
2,,,,,,,,,,,,,,,,,,,,1994-01-01 02:00:00
3,,,,,,,,,,,,,,,,,,,,1994-01-01 03:00:00
4,,,,,,,,,,,,,,,,,,,,1994-01-01 04:00:00


In [16]:
# Variables passed to the restructuring helper as `input_vars`.
input_vars = ['P']
# NOTE(review): judging by the rendered output, the helper returns a frame with
# two-level (variable, source) columns in which each variable listed in
# `input_vars` gets an additional, initially empty, 'ERA5' column next to its
# 'CS' column. Confirm against carbonpipeline.cli.
renamed_df = cli.dataframe_restructuration(filtered_df, input_vars)
# Preview the first five rows of the restructured frame.
renamed_df.head(5)

variable,CO2,G,H,LE,LW_IN,LW_OUT,NETRAD,P,P,PA,PPFD_IN,PPFD_OUT,RH,SW_IN,SW_OUT,TA,USTAR,VPD,WD,WS,timestamp
source,CS,CS,CS,CS,CS,CS,CS,CS,ERA5,CS,CS,CS,CS,CS,CS,CS,CS,CS,CS,CS,CS
0,,,,,,,,,,,,,,,,,,,,,1994-01-01 00:00:00
1,,,,,,,,,,,,,,,,,,,,,1994-01-01 01:00:00
2,,,,,,,,,,,,,,,,,,,,,1994-01-01 02:00:00
3,,,,,,,,,,,,,,,,,,,,,1994-01-01 03:00:00
4,,,,,,,,,,,,,,,,,,,,,1994-01-01 04:00:00


In [17]:
# Rows in which at least one non-timestamp column is null, returned as an
# independent frame with the calendar components of the timestamp appended as
# `year`, `month` and `day` columns.
miss = (
    filtered_df
    .loc[lambda frame: frame.drop(columns='timestamp').isna().any(axis=1)]
    .assign(
        year=lambda rows: rows['timestamp'].dt.year,
        month=lambda rows: rows['timestamp'].dt.month,
        day=lambda rows: rows['timestamp'].dt.day,
    )
)

In [18]:
import os


# Collect every sub-folder of the unzip directory and merge their contents.
# `os.listdir` returns entries in arbitrary, platform-dependent order, so the
# names are sorted first to hand `merge_datasets` the same list on every run.
dir_ = './datasets/unzip'
unzip_sub_fldrs = [
    path_to_sub_fldr
    for path_to_sub_fldr in (
        os.path.join(dir_, fldr) for fldr in sorted(os.listdir(dir_))
    )
    if os.path.isdir(path_to_sub_fldr)  # skip plain files sitting next to the folders
]

# NOTE(review): this rebinds `df`, which earlier held the table read from
# predictors.csv; later cells that use `df` expect the merged dataset.
df = cli.merge_datasets(unzip_sub_fldrs)


In [19]:
# Select every entry of the remaining two index levels for the first timestamp.
# `pd.IndexSlice` builds the same (label, slice(None), slice(None)) indexer.
first_timestamp = pd.IndexSlice['1994-01-01 00:00:00', :, :]
df_mod = df.loc[first_timestamp]
df_mod

Unnamed: 0_level_0,Unnamed: 1_level_0,total_precipitation
latitude,longitude,Unnamed: 2_level_1
75.125,44.875,0.000286
75.125,45.125,0.000341
74.875,44.875,0.000449
74.875,45.125,0.000505


In [20]:
# Collapse the merged dataset to one row per `valid_time` by averaging all
# rows that share the same timestamp.
df_grouped = df.groupby(by='valid_time').mean()
df_grouped


Unnamed: 0_level_0,total_precipitation
valid_time,Unnamed: 1_level_1
1994-01-01 00:00:00,3.952007e-04
1994-01-01 01:00:00,5.666020e-04
1994-01-01 02:00:00,5.888748e-04
1994-01-01 03:00:00,5.840920e-04
1994-01-01 04:00:00,4.086231e-04
...,...
1994-08-12 19:00:00,1.191765e-07
1994-08-12 20:00:00,1.191765e-07
1994-08-12 21:00:00,1.191765e-07
1994-08-12 22:00:00,1.191765e-07


In [22]:

def ameriflux_to_era5(df: pd.DataFrame, pred: str) -> np.ndarray:
    """Compute the values of predictor ``pred`` from the columns of ``df``.

    The columns to read are looked up in ``VARIABLES_FOR_PREDICTOR[pred]`` and
    an optional conversion function in ``PROCESSORS``.

    Parameters
    ----------
    df : pd.DataFrame
        Frame containing the columns listed in ``VARIABLES_FOR_PREDICTOR[pred]``.
    pred : str
        Predictor name, used as key into ``VARIABLES_FOR_PREDICTOR`` and
        ``PROCESSORS``.

    Returns
    -------
    np.ndarray
        One value per row of ``df``. When no processor is registered for
        ``pred``, this is the first listed column unchanged; otherwise it is
        the processor's result for each row.

    Raises
    ------
    KeyError
        If ``pred`` has no entry in ``VARIABLES_FOR_PREDICTOR`` or a listed
        column is absent from ``df``.
    """
    try:
        cols = VARIABLES_FOR_PREDICTOR[pred]
    except KeyError:
        # Same exception type as before, but say what was being looked up.
        raise KeyError(
            f"no source variables are registered for predictor {pred!r}"
        ) from None
    func = PROCESSORS.get(pred)

    if func is None:
        # No conversion registered: the first listed column is used as-is.
        return df[cols[0]].to_numpy()

    # The processor receives the row's values as positional arguments, in the
    # order the columns are listed in VARIABLES_FOR_PREDICTOR.
    return df[cols].apply(lambda row: func(*row), axis=1).to_numpy()

# Fill every ERA5-sourced column of `renamed_df` from the per-timestamp means.
# The values are aligned on the timestamp rather than assigned by position, so
# the fill no longer depends on both frames having exactly the same number of
# rows in the same order; timestamps missing from `df_grouped` are left as NaN.
era5_times = pd.to_datetime(df_grouped.index)
row_times = pd.DatetimeIndex(renamed_df[('timestamp', 'CS')])
for pred, origin in renamed_df.columns:
    if 'ERA' in origin:
        era5_values = pd.Series(ameriflux_to_era5(df_grouped, pred), index=era5_times)
        renamed_df.loc[:, (pred, 'ERA5')] = era5_values.reindex(row_times).to_numpy()
renamed_df

variable,CO2,G,H,LE,LW_IN,LW_OUT,NETRAD,P,P,PA,PPFD_IN,PPFD_OUT,RH,SW_IN,SW_OUT,TA,USTAR,VPD,WD,WS,timestamp
source,CS,CS,CS,CS,CS,CS,CS,CS,ERA5,CS,CS,CS,CS,CS,CS,CS,CS,CS,CS,CS,CS
0,,,,,,,,,3.952007e-04,,,,,,,,,,,,1994-01-01 00:00:00
1,,,,,,,,,5.666020e-04,,,,,,,,,,,,1994-01-01 01:00:00
2,,,,,,,,,5.888748e-04,,,,,,,,,,,,1994-01-01 02:00:00
3,,,,,,,,,5.840920e-04,,,,,,,,,,,,1994-01-01 03:00:00
4,,,,,,,,,4.086231e-04,,,,,,,,,,,,1994-01-01 04:00:00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5371,,,,,,,,,1.191765e-07,,,,,,,,,,,,1994-08-12 19:00:00
5372,,,,,,,,,1.191765e-07,,,,,,,,,,,,1994-08-12 20:00:00
5373,,,,,,,,,1.191765e-07,,,,,,,,,,,,1994-08-12 21:00:00
5374,,,,,,,,,1.191765e-07,,,,,,,,,,,,1994-08-12 22:00:00
