In [11]:
import pandas as pd
import numpy as np
from sklearn.metrics import r2_score, mean_squared_error
from tqdm import tqdm


def cal_total_metrics(dataframe: pd.DataFrame) -> pd.DataFrame:
    dataframe = dataframe.dropna(axis=0, how="any")

    y_true = dataframe["true"].values.reshape(-1)
    y_pred = dataframe["pred"].values.reshape(-1)

    res = {"r2_total": [r2_score(y_true, y_pred)], "r_total": [np.corrcoef(y_true, y_pred)[0][1]], 
           "rmse_total": [mean_squared_error(y_true, y_pred, squared=False)], 
           "ubrmse_total": [mean_squared_error(y_true - y_true.mean(), y_pred - y_pred.mean(), squared=False)]}

    df_res = pd.DataFrame(res)
    return df_res


def cal_metrics(df: pd.DataFrame):
    ubrmse = mean_squared_error(df['true'] - df['true'].mean(), df['pred'] - df['pred'].mean(), squared=False)
    rmse = mean_squared_error(df['true'], df['pred'], squared=False)
    r = np.corrcoef(df['true'], df['pred'])[0][1]
    return pd.Series(dict(rmse=rmse, r=r, ubrmse=ubrmse))


df_metrics = pd.read_csv("../../data/plot/metrics.csv")
df_metrics["exp"] = df_metrics["exp"].map({
    "AutoML": "AutoML_16_600", 
    "RF": "RF_16_600", 
    "ERA5": "ERA5", 
    "GLDAS": "GLDAS", 
    "SMCI": "SMCI"
})

lst = []
for s in tqdm(["spatial", "temporal"]):
    for l in range(1, 6):
        ensemble_lst = []
        cols = []
        for estimator in ["catboost", "extra_tree", "lgbm", "rf", "RF", "xgb_limitdepth", "xgboost"]:
            df_log = pd.read_csv(f"../../discussion/logs/{estimator}_split_method_{s}_layer_layer{l}_iid_adversial_validation_time_budget_2400/results.csv")
            df_res = pd.read_csv(f"../../discussion/test_results/{estimator}_split_method_{s}_layer_layer{l}_iid_adversial_validation_time_budget_2400/test_results.csv")

            lst.append(pd.concat([df_log, cal_total_metrics(df_res.copy()), pd.DataFrame({"exp": [f"{estimator}_64_2400"], "layer": [f"layer{l}"], "split_method": [s]})], axis=1))

            if estimator == "RF":
                continue

            if len(ensemble_lst) == 0:
                ensemble_lst.append(df_res.rename(columns={"pred": estimator}))
            else:
                ensemble_lst.append(df_res[["pred"]].rename(columns={"pred": estimator}))

            cols.append(estimator)

        df_ensemble = pd.concat(ensemble_lst, axis=1)
        df_ensemble["pred"] = df_ensemble[cols].mean(axis=1)

        df_ensemble = df_ensemble[["station_idx", "pred", "true"]].copy()

        lst.append(pd.concat([
            pd.DataFrame(df_ensemble.groupby("station_idx").apply(cal_metrics).reset_index(drop=True).mean()).T, 
            cal_total_metrics(df_ensemble.copy()), 
            pd.DataFrame({"exp": ["ensemble_64_2400"], "layer": [f"layer{l}"], "split_method": [s]})
        ], axis=1))



df_discussion_metrics = pd.concat(lst, axis=0, ignore_index=True)
df_discussion_metrics = pd.concat([df_metrics, df_discussion_metrics], axis=0, ignore_index=True)

df_discussion_metrics.to_csv("../../data/plot/discussion_metrics.csv", index=False)

  c /= stddev[:, None]
  c /= stddev[None, :]
  c /= stddev[:, None]
  c /= stddev[None, :]
  c = cov(x, y, rowvar, dtype=dtype)
  c *= np.true_divide(1, fact)
  c *= np.true_divide(1, fact)
  c /= stddev[:, None]
  c /= stddev[None, :]
  c = cov(x, y, rowvar, dtype=dtype)
  c *= np.true_divide(1, fact)
  c *= np.true_divide(1, fact)
100%|██████████| 2/2 [00:05<00:00,  2.73s/it]


In [27]:
def cal_metrics(df: pd.DataFrame):
    ubrmse = mean_squared_error(df['true'] - df['true'].mean(), df['pred'] - df['pred'].mean(), squared=False)
    rmse = mean_squared_error(df['true'], df['pred'], squared=False)
    # count
    count = len(df)
    r = np.corrcoef(df['true'], df['pred'])[0][1]
    return pd.Series(dict(rmse=rmse, r=r, ubrmse=ubrmse, count=count))


import os
save_root = "../../data/plot/discussion_insitu_metrics"
os.makedirs(save_root, exist_ok=True)

for s in ["spatial", "temporal"]:
    for l in tqdm(range(1, 6)):
        ensemble_lst = []
        cols = []
        for estimator in ["catboost", "extra_tree", "lgbm", "rf", "RF", "xgb_limitdepth", "xgboost"]:
            df_res = pd.read_csv(f"../../discussion/test_results/{estimator}_split_method_{s}_layer_layer{l}_iid_adversial_validation_time_budget_2400/test_results.csv")
            df_metrics = df_res.copy()
            df_metrics = df_metrics.groupby("station_idx").apply(cal_metrics).reset_index(drop=False)
            df_metrics.to_csv(os.path.join(save_root, f"{estimator}_{s}_layer{l}.csv"), index=False)

            if estimator == "RF":
                continue

            if len(ensemble_lst) == 0:
                ensemble_lst.append(df_res.rename(columns={"pred": estimator}))
            else:
                ensemble_lst.append(df_res[["pred"]].rename(columns={"pred": estimator}))

            cols.append(estimator)

        df_ensemble = pd.concat(ensemble_lst, axis=1)
        df_ensemble["pred"] = df_ensemble[cols].mean(axis=1)

        df_ensemble = df_ensemble[["station_idx", "pred", "true"]].copy()

        df_ensemble_metrics = df_ensemble.copy()
        df_ensemble_metrics = df_ensemble_metrics.groupby("station_idx").apply(cal_metrics).reset_index(drop=False)

        df_ensemble_metrics.to_csv(os.path.join(save_root, f"ensemble_{s}_layer{l}.csv"), index=False)


100%|██████████| 5/5 [00:09<00:00,  1.89s/it]
  c /= stddev[:, None]
  c /= stddev[None, :]
  c /= stddev[:, None]
  c /= stddev[None, :]
  c /= stddev[:, None]
  c /= stddev[None, :]
  c /= stddev[:, None]
  c /= stddev[None, :]
  c /= stddev[:, None]
  c /= stddev[None, :]
  c /= stddev[:, None]
  c /= stddev[None, :]
  c /= stddev[:, None]
  c /= stddev[None, :]
  c /= stddev[:, None]
  c /= stddev[None, :]
  c /= stddev[:, None]
  c /= stddev[None, :]
  c = cov(x, y, rowvar, dtype=dtype)
  c *= np.true_divide(1, fact)
  c *= np.true_divide(1, fact)
  c /= stddev[:, None]
  c /= stddev[None, :]
  c = cov(x, y, rowvar, dtype=dtype)
  c *= np.true_divide(1, fact)
  c *= np.true_divide(1, fact)
  c /= stddev[:, None]
  c /= stddev[None, :]
  c = cov(x, y, rowvar, dtype=dtype)
  c *= np.true_divide(1, fact)
  c *= np.true_divide(1, fact)
  c /= stddev[:, None]
  c /= stddev[None, :]
  c = cov(x, y, rowvar, dtype=dtype)
  c *= np.true_divide(1, fact)
  c *= np.true_divide(1, fact)
  c /=

In [5]:
import pandas as pd
from tqdm import tqdm

import os

metrics = ["r", "rmse", "ubrmse"]
estimators = ["catboost", "extra_tree", "lgbm", "rf", "RF", "xgb_limitdepth", "xgboost", "ensemble"]

save_root = "../../data/plot/discussion_insitu_metrics_summary"
os.makedirs(save_root, exist_ok=True)

for l in tqdm(range(1, 6)):
    for s_m in ["spatial", "temporal"]:
        df_automl_16_600 = pd.read_csv(f"../../data/plot/insitu_metrics/automl_{s_m}_layer{l}.csv")
        df_rf_16_600 = pd.read_csv(f"../../data/plot/insitu_metrics/rf_{s_m}_layer{l}.csv")

        df_automl_16_600.drop("count", axis=1, inplace=True)
        df_rf_16_600.drop("count", axis=1, inplace=True)

        df_automl_16_600.rename(columns={metric: f"automl_16_600_{metric}" for metric in metrics}, inplace=True)
        df_rf_16_600.rename(columns={metric: f"rf_16_600_{metric}" for metric in metrics}, inplace=True)

        exp_lst = [df_automl_16_600, df_rf_16_600]

        for estimator in estimators:
            df = pd.read_csv(f"../../data/plot/discussion_insitu_metrics/{estimator}_{s_m}_layer{l}.csv")
            df.rename(columns={metric: f"{estimator}_64_2400_{metric}" for metric in metrics}, inplace=True)

            df.drop("count", axis=1, inplace=True)

            exp_lst.append(df)

        df_out  =exp_lst[0]
        for df_item in exp_lst[1:]:
            df_out = pd.merge(df_out, df_item, on="station_idx")

        df_out.to_csv(os.path.join(save_root, f"{s_m}_layer{l}.csv"), index=False)


  0%|          | 0/5 [00:00<?, ?it/s]

100%|██████████| 5/5 [00:01<00:00,  3.21it/s]


In [6]:
save_root = "../../data/plot/discussion_insitu_metrics_summary_split"
os.makedirs(save_root, exist_ok=True)

for l in tqdm(range(1, 6)):
    for s_m in ["spatial", "temporal"]:
        df = pd.read_csv(f"../../data/plot/discussion_insitu_metrics_summary/{s_m}_layer{l}.csv")
        for metric in metrics:
            cols = ["station_idx", f"rf_16_600_{metric}", f"automl_16_600_{metric}"] + [f"{estimator}_64_2400_{metric}" for estimator in estimators]

            df_out = df[cols].copy()

            df_out.to_csv(os.path.join(save_root, f"{s_m}_layer{l}_{metric}.csv"), index=False)


100%|██████████| 5/5 [00:00<00:00, 16.74it/s]


In [7]:
save_root = "../../data/plot/discussion_insitu_metrics_summary_compare"
os.makedirs(save_root, exist_ok=True)

for l in tqdm(range(1, 6)):
    for s_m in ["spatial", "temporal"]:
        for metric in metrics:
            df = pd.read_csv(f"../../data/plot/discussion_insitu_metrics_summary_split/{s_m}_layer{l}_{metric}.csv")

            df.dropna(axis=0, how="all", inplace=True)

            if metric == "r":
                df["best"] = df.drop("station_idx", axis=1).idxmax(axis=1)
            else:
                df["best"] = df.drop("station_idx", axis=1).idxmin(axis=1)

            df.to_csv(os.path.join(save_root, f"{s_m}_layer{l}_{metric}.csv"), index=False)
            

100%|██████████| 5/5 [00:00<00:00, 12.01it/s]


In [8]:
filenames = os.listdir("../../data/plot/discussion_insitu_metrics_summary_compare")

save_root = "../../data/plot/discussion_insitu_metrics_summary_compare_concat"
os.makedirs(save_root, exist_ok=True)

for l in tqdm(range(1, 6)):
    lst = [x for x in filenames if x.split("_")[1] == f"layer{l}"]

    df_lst = []
    for item in lst:
        df = pd.read_csv(os.path.join("../../data/plot/discussion_insitu_metrics_summary_compare", item))

        df_lst.append(df[["best"]])

    df_out = pd.concat(df_lst, axis=0 ,ignore_index=True)

    df_out.to_csv(os.path.join(save_root, f"layer{l}.csv"), index=False)


100%|██████████| 5/5 [00:01<00:00,  4.95it/s]


In [9]:
for l in range(1, 6):
    df = pd.read_csv(f"../../data/plot/discussion_insitu_metrics_summary_compare_concat/layer{l}.csv")

    df["best"] = df["best"].map(
        {
            f"{x}_{metric}": x for x in ["rf_16_600", "automl_16_600"] + [f"{estimator}_64_2400" for estimator in estimators] for metric in metrics
        }
    )

    print('---------------------------------------------------------------------')
    print(f"layer{l}")

    print("#####################################################")
    print(df.value_counts())

    print("#####################################################")
    print(df.value_counts(normalize=True))

    

---------------------------------------------------------------------
layer1
#####################################################
best                  
extra_tree_64_2400        48
xgboost_64_2400           40
xgb_limitdepth_64_2400    25
lgbm_64_2400              24
catboost_64_2400          22
rf_16_600                 22
ensemble_64_2400          21
automl_16_600             17
RF_64_2400                14
rf_64_2400                10
Name: count, dtype: int64
#####################################################
best                  
extra_tree_64_2400        0.197531
xgboost_64_2400           0.164609
xgb_limitdepth_64_2400    0.102881
lgbm_64_2400              0.098765
catboost_64_2400          0.090535
rf_16_600                 0.090535
ensemble_64_2400          0.086420
automl_16_600             0.069959
RF_64_2400                0.057613
rf_64_2400                0.041152
Name: proportion, dtype: float64
---------------------------------------------------------------------


In [16]:
lst = []
for l in range(1, 6):
    df = pd.read_csv(f"../../data/plot/discussion_insitu_metrics_summary_compare_concat/layer{l}.csv")

    df["best"] = df["best"].map(
        {
            f"{x}_{metric}": x for x in ["rf_16_600", "automl_16_600"] + [f"{estimator}_64_2400" for estimator in estimators] for metric in metrics
        }
    )

    df_count_number = df.value_counts()
    df_count_percent = df.value_counts(normalize=True)

    df_count_number = pd.DataFrame(df_count_number).T.reset_index(drop=True)
    df_count_number["layer"] = f"layer{l}"

    df_count_percent = pd.DataFrame(df_count_percent).T.reset_index(drop=True)
    df_count_percent["layer"] = f"layer{l}"

    lst.append(df_count_number)
    lst.append(df_count_percent)  

df_out = pd.concat(lst, axis=0, ignore_index=True)

df_out.to_csv("../../data/plot/discussion_table_0.csv", index=False)