In [1]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.model_selection import GridSearchCV
import seaborn as sns
from sklearn.linear_model import Ridge
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import RidgeCV
from xgboost import XGBRegressor

In [2]:
# Directory that holds the input CSVs. Kept in a single constant so the
# notebook can be pointed at another location by editing one line.
DATA_DIR = '/kaggle/input/mekong-training-testing-set'

# The training and testing sets are read from two separate CSV files.
training_data = pd.read_csv(f'{DATA_DIR}/landsat7_train.csv')
testing_data = pd.read_csv(f'{DATA_DIR}/landsat7_test.csv')

In [3]:
# Convert the 'date' column to datetime and derive 'month' and 'year' columns
def into_datetime(df):
    """Parse ``df['date']`` in place and add ``month`` and ``year`` columns.

    The frame passed in is modified directly: its ``date`` column is
    overwritten with the result of ``pd.to_datetime``, and the ``month`` and
    ``year`` columns are written from the parsed dates. Nothing is returned.
    """
    parsed_dates = pd.to_datetime(df['date'])
    calendar = parsed_dates.dt
    df['date'] = parsed_dates
    df['month'] = calendar.month
    df['year'] = calendar.year

In [4]:
# Apply the same date handling to both frames; each one is modified in place.
for frame in (training_data, testing_data):
    into_datetime(frame)

In [5]:
# Show the column names, dtypes and non-null counts of the training frame.
training_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 56 entries, 0 to 55
Data columns (total 38 columns):
 #   Column     Non-Null Count  Dtype         
---  ------     --------------  -----         
 0   AWEInsh    56 non-null     float64       
 1   AWEIsh     56 non-null     float64       
 2   MNDWI      56 non-null     float64       
 3   NDWI       56 non-null     float64       
 4   SR_B1      56 non-null     float64       
 5   SR_B2      56 non-null     float64       
 6   SR_B3      56 non-null     float64       
 7   SR_B4      56 non-null     float64       
 8   SR_B5      56 non-null     float64       
 9   SR_B7      56 non-null     float64       
 10  ST_B6      56 non-null     float64       
 11  WI1        56 non-null     float64       
 12  WI2        56 non-null     float64       
 13  date       56 non-null     datetime64[ns]
 14  pivot      56 non-null     int64         
 15  reducer    56 non-null     object        
 16  source     56 non-null     object        
 17 

In [6]:
# Show the column names, dtypes and non-null counts of the testing frame.
testing_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11 entries, 0 to 10
Data columns (total 38 columns):
 #   Column     Non-Null Count  Dtype         
---  ------     --------------  -----         
 0   AWEInsh    11 non-null     float64       
 1   AWEIsh     11 non-null     float64       
 2   MNDWI      11 non-null     float64       
 3   NDWI       11 non-null     float64       
 4   SR_B1      11 non-null     float64       
 5   SR_B2      11 non-null     float64       
 6   SR_B3      11 non-null     float64       
 7   SR_B4      11 non-null     float64       
 8   SR_B5      11 non-null     float64       
 9   SR_B7      11 non-null     float64       
 10  ST_B6      11 non-null     float64       
 11  WI1        11 non-null     float64       
 12  WI2        11 non-null     float64       
 13  date       11 non-null     datetime64[ns]
 14  pivot      11 non-null     int64         
 15  reducer    11 non-null     object        
 16  source     11 non-null     object        
 17 

In [7]:
# Columns used as model inputs. The order is preserved because it determines
# the column order of the feature matrices built from this list.
features = [
    # band columns
    'SR_B4', 'SR_B5', 'SR_B7', 'ST_B6',
    # index columns (presumably water indices -- TODO confirm against the dataset description)
    'WI1', 'WI2',
    # calendar columns derived from 'date' by into_datetime
    'month', 'year',
    # location columns
    'Latitude', 'Longitude',
]

In [8]:
# Name of the column the model is trained to predict.
target = 'max'

In [9]:
# The train/test split already exists as two separate frames, so this cell
# only selects the feature columns and the target column from each of them.
X_train, y_train = training_data[features], training_data[target]
X_test, y_test = testing_data[features], testing_data[target]

In [10]:
# Hyper-parameter values explored by the grid search:
# 3 * 2 * 3 * 2 * 2 * 3 * 3 = 648 combinations.
param_grid = dict(
    n_estimators=[100, 120, 150],
    max_depth=[2, 3],
    learning_rate=[0.015, 0.02, 0.025],
    subsample=[0.6, 0.7],
    colsample_bytree=[0.6, 0.7],
    reg_alpha=[0.5, 1, 2],
    reg_lambda=[10, 20, 30],
)

In [11]:
# Initialize model
# (Superseded: the estimator is constructed inline in the GridSearchCV call in the next cell.)
#xgb = XGBRegressor(objective='reg:squarederror', random_state=42)

In [12]:
# Exhaustive 5-fold cross-validated search over param_grid. Candidates are
# scored with negated MSE, so the lowest mean squared error ranks best.
base_regressor = XGBRegressor(objective='reg:squarederror', random_state=42)
grid_search = GridSearchCV(
    base_regressor,
    param_grid,
    scoring='neg_mean_squared_error',
    cv=5,
    n_jobs=-1,  # run the fits in parallel on all available cores
    verbose=2,  # print one log line per individual fit
)

In [13]:
# Fit the model: cross-validate every parameter combination in param_grid
# on the training data.
grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 648 candidates, totalling 3240 fits
[CV] END colsample_bytree=0.6, learning_rate=0.015, max_depth=2, n_estimators=100, reg_alpha=0.5, reg_lambda=10, subsample=0.6; total time=   0.1s
[CV] END colsample_bytree=0.6, learning_rate=0.015, max_depth=2, n_estimators=100, reg_alpha=0.5, reg_lambda=10, subsample=0.6; total time=   0.1s
[CV] END colsample_bytree=0.6, learning_rate=0.015, max_depth=2, n_estimators=100, reg_alpha=0.5, reg_lambda=10, subsample=0.7; total time=   0.1s
[CV] END colsample_bytree=0.6, learning_rate=0.015, max_depth=2, n_estimators=100, reg_alpha=0.5, reg_lambda=20, subsample=0.6; total time=   0.1s
[CV] END colsample_bytree=0.6, learning_rate=0.015, max_depth=2, n_estimators=100, reg_alpha=0.5, reg_lambda=20, subsample=0.7; total time=   0.1s
[CV] END colsample_bytree=0.6, learning_rate=0.015, max_depth=2, n_estimators=100, reg_alpha=0.5, reg_lambda=30, subsample=0.6; total time=   0.1s
[CV] END colsample_bytree=0.6, learning_rate=0.015, ma

In [14]:
# Get best model: the estimator built with the best-scoring parameter
# combination found by the search, then report those parameters.
best_xgb = grid_search.best_estimator_
print("✅ Best parameters:", grid_search.best_params_)

✅ Best parameters: {'colsample_bytree': 0.7, 'learning_rate': 0.025, 'max_depth': 3, 'n_estimators': 150, 'reg_alpha': 1, 'reg_lambda': 10, 'subsample': 0.6}


In [15]:
def wmape(y_true, y_pred):
    """Return the weighted mean absolute percentage error, in percent.

    Computed as ``100 * sum(|y_true - y_pred|) / sum(|y_true|)``.

    Both inputs are converted to plain float arrays before subtracting, so
    values are compared position by position. Subtracting two pandas Series
    directly aligns them on index labels instead; with differing indexes that
    yields NaNs, which the NaN-skipping sum then turns into a silent 0%.
    The conversion also allows plain lists to be passed.

    Args:
        y_true: Observed values (array-like).
        y_pred: Predicted values (array-like, same length as ``y_true``).

    Returns:
        The WMAPE as a floating-point percentage.
    """
    actual = np.asarray(y_true, dtype=float)
    predicted = np.asarray(y_pred, dtype=float)
    # The small constant keeps the division defined when all targets are zero.
    return 100 * np.sum(np.abs(actual - predicted)) / (np.sum(np.abs(actual)) + 1e-8)

def evaluate_model(model, X, y, dataset_name=""):
    """Print RMSE, R², MAE and WMAPE for ``model`` on one dataset.

    Args:
        model: Fitted estimator exposing ``predict(X)``.
        X: Feature matrix passed to ``model.predict``.
        y: True target values corresponding to ``X``.
        dataset_name: Label inserted into the printed header.

    Returns:
        None. The metrics are printed, not returned.
    """
    y_pred = model.predict(X)
    # Take the square root explicitly instead of passing ``squared=False``:
    # that keyword was deprecated in scikit-learn 1.4 and removed in 1.6,
    # where passing it raises a TypeError.
    rmse = np.sqrt(mean_squared_error(y, y_pred))
    r2 = r2_score(y, y_pred)
    mae = mean_absolute_error(y, y_pred)
    wmape_score = wmape(y, y_pred)

    print(f"\n📊 Evaluation on {dataset_name} Set:")
    print(f"• RMSE  : {rmse:.4f}")
    print(f"• R²    : {r2:.4f}")
    print(f"• MAE   : {mae:.4f}")
    print(f"• WMAPE : {wmape_score:.2f}%")

In [16]:
# Evaluate on the training set (the testing set is evaluated in the following cell)
evaluate_model(best_xgb, X_train, y_train, "Training")



📊 Evaluation on Training Set:
• RMSE  : 3.6025
• R²    : 0.7807
• MAE   : 2.7997
• WMAPE : 26.45%


In [17]:
# Evaluate on the testing set
evaluate_model(best_xgb, X_test, y_test, "Testing")


📊 Evaluation on Testing Set:
• RMSE  : 3.6385
• R²    : 0.7372
• MAE   : 3.1064
• WMAPE : 41.88%
