In [None]:
import pandas as pd

In [None]:
# Read the merged dataset from the processed-data folder (path is relative to
# this notebook's working directory) and show it as the cell output.
df = pd.read_parquet(path="../../data/processed/merged_df.parquet")
df

In [None]:
import src.experiment.soil_xgboost as sx

# Experiment settings: context_length is 24 * 7 steps and prediction_length is
# 24 steps (presumably hourly data, i.e. one week in / one day ahead — TODO
# confirm the sampling rate of df).
config = sx.SoilXGBoostConfig(
    stride=1,
    context_length=24 * 7,
    prediction_length=24,
    y_cols=['SWC_5', 'SWC_10', 'SWC_20', 'SWC_50'],
)

# Calling the experiment object returns the feature / target pair.
X, y = sx.SoilXGBoostExperiment(df, config=config)()

# Re-express every target as its difference from the same-named column in X,
# so downstream models predict a change rather than an absolute level.
for target in config.y_cols:
    y[target] = y[target] - X[target]



In [None]:
import numpy as np

# Summary statistics of the (differenced) targets, one column per target.
pd.DataFrame(data=y).describe()

In [None]:
import xgboost
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import root_mean_squared_error, mean_absolute_percentage_error
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression, Lasso
from catboost import CatBoostRegressor

# Walk-forward evaluation of a single-target regressor.
#
# For every TimeSeriesSplit fold the feature and target scalers are fit on the
# training indices ONLY and then applied to the test indices. Fitting them on
# the full data set before splitting would leak statistics of the (future)
# test window into training.
#
# Outputs of this cell:
#   fold_metrics     - list of dicts with keys "fold", "RMSE", "MAPE"
#   predictions_list - list of DataFrames with columns
#                      "time", "Actual", "Predicted", "fold"
# All metrics and stored values are in the standardized target space of the
# respective fold.
tscv = TimeSeriesSplit()

# Use new names instead of overwriting X / y, so re-running this cell always
# starts from the same unscaled inputs.
X_values = np.asarray(X)
# Only the first target column is modelled
# (presumably config.y_cols[0] — TODO confirm the column order of y).
y_values = np.asarray(y)[:, 0].reshape(-1, 1)

fold_metrics = []
predictions_list = []

for i, (train_idx, test_idx) in enumerate(tscv.split(X_values)):
    # Scalers see the training window only; the test window is transformed
    # with the training statistics.
    ss_x = StandardScaler()
    X_train = ss_x.fit_transform(X_values[train_idx])
    X_test = ss_x.transform(X_values[test_idx])

    ss_y = StandardScaler()
    y_train = ss_y.fit_transform(y_values[train_idx])
    y_test = ss_y.transform(y_values[test_idx])

    # Alternative models (swap in by replacing the Lasso line):
    # model = CatBoostRegressor(verbose=False)
    # model = xgboost.XGBRegressor()
    # model = LinearRegression()
    model = Lasso(alpha=0.02)
    model.fit(X_train, y_train)

    # Force column vectors so predictions line up with y_train / y_test
    # regardless of whether the estimator returns 1-D or 2-D output.
    y_train_pred = model.predict(X_train).reshape(-1, 1)
    y_pred = model.predict(X_test).reshape(-1, 1)

    train_rmse = root_mean_squared_error(y_train, y_train_pred)
    rmse = root_mean_squared_error(y_test, y_pred)
    # NOTE(review): MAPE is computed on standardized values, many of which are
    # close to zero, so it can become arbitrarily large — interpret with care.
    mape_val = mean_absolute_percentage_error(y_test, y_pred)
    # Baseline: always predict 0 in standardized space, i.e. the training-fold
    # mean of the target.
    baseline = root_mean_squared_error(y_test, np.zeros_like(y_test))

    fold_metrics.append({"fold": i, "RMSE": rmse, "MAPE": mape_val})
    print(f"Fold {i} - Train RMSE: {train_rmse:.3f}, RMSE: {rmse}, baseline: {baseline}")

    # Per-fold frame for plotting; the positional test indices act as the
    # time axis.
    df_fold = pd.DataFrame({
        "time": test_idx,
        "Actual": y_test[:, 0],
        "Predicted": y_pred[:, 0],
    })
    # Facet label carrying the fold number and its metrics.
    fold_label = f"Fold {i} (RMSE: {rmse:.2f}, MAPE: {mape_val:.2f})"
    df_fold["fold"] = fold_label

    predictions_list.append(df_fold)

In [None]:
import plotly.express as px

# Stack the per-fold prediction frames into a single table.
df_all = pd.concat(predictions_list, ignore_index=True)

# Long format: one row per (fold, time, series type) so Plotly Express can
# colour the "Actual" and "Predicted" series separately.
df_long = df_all.melt(
    id_vars=["fold", "time"],
    value_vars=["Actual", "Predicted"],
    var_name="Type",
    value_name="Value",
)

# Line chart with one facet row per fold.
fig = px.line(
    df_long,
    x="time",
    y="Value",
    color="Type",
    facet_row="fold",
    title="Actual vs. Predicted Values per Fold (Time Series)",
    markers=True,
)

# Grow the figure with the number of folds (200 per facet row).
unique_folds = df_long["fold"].nunique()
fig.update_layout(height=200 * unique_folds)

fig.show()