In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!source /content/drive/MyDrive/colab_env/bin/activate

In [18]:
import pandas as pd
import numpy as np
from tqdm import tqdm
from sklearn.feature_selection import SelectKBest, f_regression, RFE, SequentialFeatureSelector
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import RidgeCV, Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from functools import partial
from statsmodels.tools.eval_measures import rmspe
from scipy import stats
from sklearn.svm import SVR


In [6]:
daily_data = pd.read_parquet("/content/drive/MyDrive/EC_Tower/result/daily_data_central_valley.parquet", engine='pyarrow')
decade_data = pd.read_parquet("/content/drive/MyDrive/EC_Tower/result/decade_data_central_valley.parquet", engine='pyarrow')
monthly_data = pd.read_parquet("/content/drive/MyDrive/EC_Tower/result/monthly_data_central_valley.parquet", engine='pyarrow')
yearly_data = pd.read_parquet("/content/drive/MyDrive/EC_Tower/result/yearly_data_central_valley.parquet", engine='pyarrow')


In [7]:
# Factorize the column
monthly_data['category_encoded_Site_ID'], _ = pd.factorize(monthly_data['Site_ID'])
monthly_data['category_encoded_General_classification'], _ = pd.factorize(monthly_data['General_classification'])
monthly_data['category_encoded_Land_cover_details'], _ = pd.factorize(monthly_data['Land_cover_details'])
monthly_data['category_encoded_Land_cover_type'], _ = pd.factorize(monthly_data['Land_cover_type'])


In [8]:
# Custom transformer for Lasso feature selection
class LassoSelector(BaseEstimator, TransformerMixin):
    def __init__(self, n_features_to_select=10):
        self.n_features_to_select = n_features_to_select
        self.lasso = Lasso(random_state=42)

    def fit(self, X, y):
        self.lasso.fit(X, y)
        self.support_ = np.argsort(np.abs(self.lasso.coef_))[-self.n_features_to_select:]
        return self

    def transform(self, X):
        return X[:, self.support_]


In [23]:
# Assuming 'df' is your dataframe and 'target' is your target column
X = monthly_data.drop(['ET_fill',
                       'Site_ID',
                       'General_classification',
                       'Land_cover_details',
                       'Land_cover_type',
                       'aet_budyko_oudin',
                       'aet_budyko_hargreaves',
                       'aet_budyko_abtew',
                       'aet_budyko_mcguinness_bordne'], axis =1)

y = monthly_data['ET_fill']



# Define estimators
estimators = {
    'RandomForestRegressor': RandomForestRegressor(),
    'RidgeCV': RidgeCV(),
    'DecisionTreeRegressor': DecisionTreeRegressor(),
    'KNeighborsRegressor': KNeighborsRegressor(),
    'GradientBoostingRegressor': GradientBoostingRegressor(),
    'MLPRegressor': MLPRegressor()
}

# Number of features to select
n_features_to_select = 4

In [22]:

results_rfe = []
for est_name, estimator in tqdm(estimators.items()):
  if est_name not in ['KNeighborsRegressor', 'MLPRegressor']:
    selector = RFE(estimator, n_features_to_select=n_features_to_select)
    selector.fit(x_tr, y_tr)
    selected_features = X.columns[selector.support_]
    print(selected_features)
    y_tr_pred = selector.predict(x_tr)
    # train
    print(f"{est_name} in train")
    print('Mean Absolute Error:', mean_absolute_error(y_tr, y_tr_pred))
    print('Mean Squared Error:', mean_squared_error(y_tr, y_tr_pred))
    print('Root Mean Squared Error:', np.sqrt(mean_squared_error(y_tr, y_tr_pred)))
    print('Root Mean Squared Percentage Error:', rmspe(y_tr, y_tr_pred))
    print('R2 Score:', (stats.pearsonr(y_tr_pred,y_tr)[0])**2)

    # test
    print(f"{est_name} in test")
    y_te_pred = selector.predict(x_te)
    print('Mean Absolute Error:', mean_absolute_error(y_te, y_te_pred))
    print('Mean Squared Error:', mean_squared_error(y_te, y_te_pred))
    print('Root Mean Squared Error:', np.sqrt(mean_squared_error(y_te, y_te_pred)))
    print('Root Mean Squared Percentage Error:', rmspe(y_te, y_te_pred))
    print('R2 Score:', (stats.pearsonr(y_te_pred,y_te)[0])**2)

    results_rfe.append({
        'Method': 'RFE',
        'Estimator': est_name,
        'Selected Features': ', '.join(selected_features),
        'MAE_train':  mean_absolute_error(y_tr, y_tr_pred),
        'MSE_train': mean_squared_error(y_tr, y_tr_pred),
        'RMSE_train' : np.sqrt(mean_squared_error(y_tr, y_tr_pred)),
        'RMSPE_train' : rmspe(y_tr, y_tr_pred),
        'R2_train' : (stats.pearsonr(y_tr_pred,y_tr)[0])**2,
        'MAE_test':  mean_absolute_error(y_te, y_te_pred),
        'MSE_test': mean_squared_error(y_te, y_te_pred),
        'RMSE_test' : np.sqrt(mean_squared_error(y_te, y_te_pred)),
        'RMSPE_test' : rmspe(y_te, y_te_pred),
        'R2_test' : (stats.pearsonr(y_te_pred,y_te)[0])**2
        })

 50%|█████     | 3/6 [00:05<00:04,  1.44s/it]

Index(['oudin', 'hargreaves', 'abtew', 'latitude'], dtype='object')
RandomForestRegressor in train
Mean Absolute Error: 0.9244630100657495
Mean Squared Error: 1.724778465288024
Root Mean Squared Error: 1.3133082141249341
Root Mean Squared Percentage Error: 2.9927379277841024
R2 Score: 0.9778393477807026
RandomForestRegressor in test
Mean Absolute Error: 2.710388019778072
Mean Squared Error: 21.397397759863686
Root Mean Squared Error: 4.6257321323076726
Root Mean Squared Percentage Error: 5.539538906582343
R2 Score: 0.8368691620082844
Index(['hargreaves', 'latitude', 'category_encoded_General_classification',
       'category_encoded_Land_cover_type'],
      dtype='object')
RidgeCV in train
Mean Absolute Error: 2.936150907148082
Mean Squared Error: 15.619915085825248
Root Mean Squared Error: 3.952203826452432
Root Mean Squared Percentage Error: 9.730005896050681
R2 Score: 0.7838336986263733
RidgeCV in test
Mean Absolute Error: 2.9294818261281117
Mean Squared Error: 18.19150696448048
Roo

100%|██████████| 6/6 [00:07<00:00,  1.23s/it]

Index(['ASCE_ETo', 'hargreaves', 'abtew', 'latitude'], dtype='object')
GradientBoostingRegressor in train
Mean Absolute Error: 1.3040185294245479
Mean Squared Error: 2.9993424119783865
Root Mean Squared Error: 1.731860967854633
Root Mean Squared Percentage Error: 4.771405115961722
R2 Score: 0.9591043440120212
GradientBoostingRegressor in test
Mean Absolute Error: 2.5300586191652035
Mean Squared Error: 19.016786196941823
Root Mean Squared Error: 4.360824027284502
Root Mean Squared Percentage Error: 6.840891690677658
R2 Score: 0.8529013595887568





In [27]:
x_tr, x_te, y_tr, y_te = train_test_split(X, y, test_size=0.2, random_state=3)

m = RandomForestRegressor(n_estimators=100)

rfr = RFE(
    estimator=m,
    n_features_to_select=4
)

rfr.fit(x_tr, y_tr)

# rfr = RFECV(
#     estimator=m,
# )

# rfr.fit(x_tr, y_tr)

print("Selected Features:")
print(X.columns[rfr.support_])


y_tr_pred = rfr.predict(x_tr)
print('Mean Absolute Error:', mean_absolute_error(y_tr, y_tr_pred))
print('Mean Squared Error:', mean_squared_error(y_tr, y_tr_pred))
print('Root Mean Squared Error:', np.sqrt(mean_squared_error(y_tr, y_tr_pred)))
print('Root Mean Squared Percentage Error:', rmspe(y_tr, y_tr_pred))
print('R2 Score:', r2_score(y_tr, y_tr_pred))

Selected Features:
Index(['oudin', 'hargreaves', 'abtew', 'latitude'], dtype='object')
Mean Absolute Error: 0.9509109595109821
Mean Squared Error: 1.8198556051307118
Root Mean Squared Error: 1.3490202389626005
Root Mean Squared Percentage Error: 2.7575123744940346
R2 Score: 0.9748146473179244


In [28]:
import plotly.express as px

fig = px.scatter(
    x = y_tr_pred,
    y = y_tr,
)

fig.add_shape(
    type='line',
    x0 = 0,
    y0 = 0,
    x1 = 12,
    y1 = 12,
    line = dict(
        color = 'Red',
    )
)

fig.update_layout(
    autosize = False,
    width = 600,
    height = 600,
    xaxis_range = [0, 12],
    yaxis_range = [0, 12],
    xaxis_title = dict(text = 'ET Predicted'),
    yaxis_title = dict(text = 'ET')
)

fig.show()

In [29]:
y_te_pred = rfr.predict(x_te)

fig = px.scatter(
    x = y_te_pred,
    y = y_te,
)

fig.add_shape(
    type='line',
    x0 = 0,
    y0 = 0,
    x1 = 12,
    y1 = 12,
    line = dict(
        color = 'Red',
    )
)

fig.update_layout(
    autosize = False,
    width = 600,
    height = 600,
    xaxis_range = [0, 12],
    yaxis_range = [0, 12],
    xaxis_title = dict(text = 'ET Predicted'),
    yaxis_title = dict(text = 'ET')
)

fig.show()

# Monthly data

## SelectKBest

In [None]:
# 1. SelectKBest (optimized)
results_kbest = []
selector = SelectKBest(score_func=f_regression, k=n_features_to_select)
X_new = selector.fit_transform(X_scaled, y)
selected_features = X.columns[selector.get_support()].tolist()

for est_name, estimator in tqdm(estimators.items()):

  scores = cross_val_score(estimator, X_new, y, cv=5, scoring='r2')
  print(f"\n{est_name}:")
  print(f"Mean R-squared score: {scores.mean():.3f} (+/- {scores.std() * 2:.3f})")
  results_kbest.append({
      'Method': 'SelectKBest',
      'Estimator': est_name,
      'Selected Features': ', '.join(selected_features),
      'R2 Score': scores.mean(),
      'R2 Std': scores.std()
  })




 17%|█▋        | 1/6 [00:41<03:29, 41.89s/it]


RandomForestRegressor:
Mean R-squared score: 0.085 (+/- 1.518)


 33%|███▎      | 2/6 [00:42<01:09, 17.48s/it]


RidgeCV:
Mean R-squared score: 0.469 (+/- 0.910)


 50%|█████     | 3/6 [00:42<00:29,  9.76s/it]


DecisionTreeRegressor:
Mean R-squared score: 0.043 (+/- 1.589)


 67%|██████▋   | 4/6 [00:44<00:13,  6.57s/it]


KNeighborsRegressor:
Mean R-squared score: 0.018 (+/- 1.549)


 83%|████████▎ | 5/6 [01:11<00:13, 13.85s/it]


GradientBoostingRegressor:
Mean R-squared score: 0.387 (+/- 1.059)


100%|██████████| 6/6 [14:03<00:00, 140.60s/it]


MLPRegressor:
Mean R-squared score: 0.406 (+/- 1.002)





## RFE

In [None]:
# 2. RFE (not suitable for KNN)
results_rfe = []
for est_name, estimator in tqdm(estimators.items()):
    if est_name not in ['KNeighborsRegressor', 'MLPRegressor']:
        selector = RFE(estimator, n_features_to_select=n_features_to_select, step=1)
        X_new = selector.fit_transform(X_scaled, y)
        selected_features = X.columns[selector.support_].tolist()

        scores = cross_val_score(estimator, X_new, y, cv=5, scoring='r2')
        print(f"\n{est_name}:")
        print(f"Mean R-squared score: {scores.mean():.3f} (+/- {scores.std() * 2:.3f})")
        results_rfe.append({
            'Method': 'RFE',
            'Estimator': est_name,
            'Selected Features': ', '.join(selected_features),
            'R2 Score': scores.mean(),
            'R2 Std': scores.std()
        })


 17%|█▋        | 1/6 [02:14<11:12, 134.46s/it]


RandomForestRegressor:
Mean R-squared score: 0.392 (+/- 0.779)


 33%|███▎      | 2/6 [02:16<03:46, 56.59s/it] 


RidgeCV:
Mean R-squared score: -1.516 (+/- 7.927)


 50%|█████     | 3/6 [02:19<01:36, 32.12s/it]


DecisionTreeRegressor:
Mean R-squared score: 0.357 (+/- 0.821)


100%|██████████| 6/6 [03:43<00:00, 37.20s/it]


GradientBoostingRegressor:
Mean R-squared score: 0.523 (+/- 0.431)





##  SelectFromModel

### tree-based models

In [None]:

# 3. Tree-based feature importance
results_tree_based = []
for est_name in ['DecisionTreeRegressor', 'GradientBoostingRegressor']:
    estimator = estimators[est_name]
    estimator.fit(X_scaled, y)
    importances = estimator.feature_importances_
    indices = np.argsort(importances)[-n_features_to_select:]
    selected_features = X.columns[indices].tolist()

    X_new = X_scaled[:, indices]
    for eval_est_name, eval_estimator in estimators.items():
        scores = cross_val_score(eval_estimator, X_new, y, cv=5, scoring='r2', n_jobs=-1)
        print(f"\n{est_name}:")
        print(f"\n{eval_est_name}:")
        print(f"Mean R-squared score: {scores.mean():.3f} (+/- {scores.std() * 2:.3f})")
        results_tree_based.append({
            'Method': f'TreeBased-{est_name}',
            'Estimator': eval_est_name,
            'Selected Features': ', '.join(selected_features),
            'R2 Score': scores.mean(),
            'R2 Std': scores.std()
        })



DecisionTreeRegressor:

RandomForestRegressor:
Mean R-squared score: 0.393 (+/- 0.792)

DecisionTreeRegressor:

RidgeCV:
Mean R-squared score: -1.900 (+/- 9.084)

DecisionTreeRegressor:

DecisionTreeRegressor:
Mean R-squared score: 0.300 (+/- 0.845)

DecisionTreeRegressor:

KNeighborsRegressor:
Mean R-squared score: 0.244 (+/- 1.259)

DecisionTreeRegressor:

GradientBoostingRegressor:
Mean R-squared score: 0.714 (+/- 0.122)

DecisionTreeRegressor:

MLPRegressor:
Mean R-squared score: -2.438 (+/- 11.935)

GradientBoostingRegressor:

RandomForestRegressor:
Mean R-squared score: 0.180 (+/- 1.085)

GradientBoostingRegressor:

RidgeCV:
Mean R-squared score: 0.430 (+/- 0.954)

GradientBoostingRegressor:

DecisionTreeRegressor:
Mean R-squared score: 0.077 (+/- 1.286)

GradientBoostingRegressor:

KNeighborsRegressor:
Mean R-squared score: -0.003 (+/- 1.759)

GradientBoostingRegressor:

GradientBoostingRegressor:
Mean R-squared score: 0.407 (+/- 1.108)

GradientBoostingRegressor:

MLPRegressor

### Lasso regularization

In [None]:

results_lasso = []
# 4. Lasso regularization
lasso_selector = LassoSelector(n_features_to_select=n_features_to_select)
X_new = lasso_selector.fit_transform(X_scaled, y)
selected_features = X.columns[lasso_selector.support_].tolist()

for est_name, estimator in estimators.items():
    scores = cross_val_score(estimator, X_new, y, cv=5, scoring='r2', n_jobs=-1)
    print(f"\n{est_name}:")
    print(f"Mean R-squared score: {scores.mean():.3f} (+/- {scores.std() * 2:.3f})")
    results_lasso.append({
        'Method': 'Lasso',
        'Estimator': est_name,
        'Selected Features': ', '.join(selected_features),
        'R2 Score': scores.mean(),
        'R2 Std': scores.std()
    })


RandomForestRegressor:
Mean R-squared score: 0.325 (+/- 0.948)

RidgeCV:
Mean R-squared score: -1.517 (+/- 7.645)

DecisionTreeRegressor:
Mean R-squared score: 0.285 (+/- 0.683)

KNeighborsRegressor:
Mean R-squared score: 0.348 (+/- 1.035)

GradientBoostingRegressor:
Mean R-squared score: 0.672 (+/- 0.238)

MLPRegressor:
Mean R-squared score: -5.996 (+/- 26.207)


##  Sequential Feature Selection

In [None]:
results_sequential_forward = []
# 5. Sequential Feature Selection (forward)
for est_name, estimator in estimators.items():
    selector = SequentialFeatureSelector(estimator, n_features_to_select=n_features_to_select, direction='forward', n_jobs=-1)
    X_new = selector.fit_transform(X_scaled, y)
    selected_features = X.columns[selector.get_support()].tolist()

    scores = cross_val_score(estimator, X_new, y, cv=3, scoring='r2', n_jobs=-1)
    print(f"\n{est_name}:")
    print(f"Mean R-squared score: {scores.mean():.3f} (+/- {scores.std() * 2:.3f})")
    results_sequential_forward.append({
        'Method': 'SequentialForward',
        'Estimator': est_name,
        'Direction' : 'forward',
        'Selected Features': ', '.join(selected_features),
        'R2 Score': scores.mean(),
        'R2 Std': scores.std()
    })

KeyboardInterrupt: 

In [None]:
results_kbest_df = pd.DataFrame(results_kbest)
results_kbest_df.to_csv("/content/drive/MyDrive/EC_Tower/result/feature_selection_kbest_results.csv", index=False)


results_rfe_df = pd.DataFrame(results_rfe)
results_rfe_df.to_csv("/content/drive/MyDrive/EC_Tower/result/feature_selection_rfe_results.csv", index=False)


results_tree_based_df = pd.DataFrame(results_tree_based)
results_tree_based_df.to_csv("/content/drive/MyDrive/EC_Tower/result/feature_selection_tree_based_results.csv", index=False)


results_lasso_df = pd.DataFrame(results_lasso)
results_lasso_df.to_csv("/content/drive/MyDrive/EC_Tower/result/feature_selection_lasso_results.csv", index=False)

# yearly data

In [None]:
# Factorize the column
yearly_data['category_encoded_Site_ID'], _ = pd.factorize(yearly_data['Site_ID'])
yearly_data['category_encoded_General_classification'], _ = pd.factorize(yearly_data['General_classification'])
yearly_data['category_encoded_Land_cover_details'], _ = pd.factorize(yearly_data['Land_cover_details'])
yearly_data['category_encoded_Land_cover_type'], _ = pd.factorize(yearly_data['Land_cover_type'])

In [None]:
# Assuming 'df' is your dataframe and 'target' is your target column
X = yearly_data.drop(['ET_fill', 'Site_ID', 'General_classification', 'Land_cover_details', 'Land_cover_type'], axis =1)
y = yearly_data['ET_fill']

# Standardize features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Define estimators
estimators = {
    'RandomForestRegressor': RandomForestRegressor(),
    'RidgeCV': RidgeCV(),
    'DecisionTreeRegressor': DecisionTreeRegressor(),
    'KNeighborsRegressor': KNeighborsRegressor(),
    'GradientBoostingRegressor': GradientBoostingRegressor(),
    'MLPRegressor': MLPRegressor()
}

# Number of features to select
n_features_to_select = 4

## SelectKBest

In [None]:
# 1. SelectKBest (optimized)
results_kbest_y = []
selector = SelectKBest(score_func=f_regression, k=n_features_to_select)
X_new = selector.fit_transform(X_scaled, y)
selected_features = X.columns[selector.get_support()].tolist()

for est_name, estimator in tqdm(estimators.items()):

  scores = cross_val_score(estimator, X_new, y, cv=5, scoring='r2')
  print(f"\n{est_name}:")
  print(f"Mean R-squared score: {scores.mean():.3f} (+/- {scores.std() * 2:.3f})")
  results_kbest_y.append({
      'Method': 'SelectKBest',
      'Estimator': est_name,
      'Selected Features': ', '.join(selected_features),
      'R2 Score': scores.mean(),
      'R2 Std': scores.std()
  })


 17%|█▋        | 1/6 [00:02<00:14,  2.89s/it]


RandomForestRegressor:
Mean R-squared score: -1.277 (+/- 4.718)

RidgeCV:
Mean R-squared score: -0.311 (+/- 3.145)


 67%|██████▋   | 4/6 [00:03<00:01,  1.65it/s]


DecisionTreeRegressor:
Mean R-squared score: -1.529 (+/- 3.609)

KNeighborsRegressor:
Mean R-squared score: -1.487 (+/- 5.828)


 83%|████████▎ | 5/6 [00:05<00:01,  1.16s/it]


GradientBoostingRegressor:
Mean R-squared score: -1.096 (+/- 5.076)


100%|██████████| 6/6 [01:27<00:00, 14.61s/it]


MLPRegressor:
Mean R-squared score: -0.226 (+/- 3.164)





In [None]:
results_kbest_y_df = pd.DataFrame(results_kbest_y)
results_kbest_y_df.to_csv("/content/drive/MyDrive/EC_Tower/result/feature_selection_kbest_yearly_results.csv", index=False)

## RFE

In [None]:
# 2. RFE (not suitable for KNN)
results_rfe_y = []
for est_name, estimator in tqdm(estimators.items()):
    if est_name not in ['KNeighborsRegressor', 'MLPRegressor']:
        selector = RFE(estimator, n_features_to_select=n_features_to_select, step=1)
        X_new = selector.fit_transform(X_scaled, y)
        selected_features = X.columns[selector.support_].tolist()

        scores = cross_val_score(estimator, X_new, y, cv=5, scoring='r2')
        print(f"\n{est_name}:")
        print(f"Mean R-squared score: {scores.mean():.3f} (+/- {scores.std() * 2:.3f})")
        results_rfe_y.append({
            'Method': 'RFE',
            'Estimator': est_name,
            'Selected Features': ', '.join(selected_features),
            'R2 Score': scores.mean(),
            'R2 Std': scores.std()
        })

 17%|█▋        | 1/6 [00:28<02:24, 28.88s/it]


RandomForestRegressor:
Mean R-squared score: -0.539 (+/- 4.188)


 50%|█████     | 3/6 [00:29<00:19,  6.63s/it]


RidgeCV:
Mean R-squared score: -6.777 (+/- 26.093)

DecisionTreeRegressor:
Mean R-squared score: -0.659 (+/- 3.641)


100%|██████████| 6/6 [00:37<00:00,  6.18s/it]


GradientBoostingRegressor:
Mean R-squared score: -0.352 (+/- 3.825)





In [None]:
results_rfe_y_df = pd.DataFrame(results_rfe_y)
results_rfe_y_df.to_csv("/content/drive/MyDrive/EC_Tower/result/feature_selection_rfe_yearly_results.csv", index=False)

##  SelectFromModel

### tree-based models

In [None]:
# 3. Tree-based feature importance
results_tree_based_y = []
for est_name in ['DecisionTreeRegressor', 'GradientBoostingRegressor']:
    estimator = estimators[est_name]
    estimator.fit(X_scaled, y)
    importances = estimator.feature_importances_
    indices = np.argsort(importances)[-n_features_to_select:]
    selected_features = X.columns[indices].tolist()

    X_new = X_scaled[:, indices]
    for eval_est_name, eval_estimator in estimators.items():
        scores = cross_val_score(eval_estimator, X_new, y, cv=5, scoring='r2', n_jobs=-1)
        print(f"\n{est_name}:")
        print(f"\n{eval_est_name}:")
        print(f"Mean R-squared score: {scores.mean():.3f} (+/- {scores.std() * 2:.3f})")
        results_tree_based_y.append({
            'Method': f'TreeBased-{est_name}',
            'Estimator': eval_est_name,
            'Selected Features': ', '.join(selected_features),
            'R2 Score': scores.mean(),
            'R2 Std': scores.std()
        })



DecisionTreeRegressor:

RandomForestRegressor:
Mean R-squared score: -0.531 (+/- 4.162)

DecisionTreeRegressor:

RidgeCV:
Mean R-squared score: -6.777 (+/- 26.093)

DecisionTreeRegressor:

DecisionTreeRegressor:
Mean R-squared score: -0.678 (+/- 4.735)

DecisionTreeRegressor:

KNeighborsRegressor:
Mean R-squared score: -0.470 (+/- 3.235)

DecisionTreeRegressor:

GradientBoostingRegressor:
Mean R-squared score: -0.348 (+/- 3.857)

DecisionTreeRegressor:

MLPRegressor:
Mean R-squared score: -4.636 (+/- 15.806)

GradientBoostingRegressor:

RandomForestRegressor:
Mean R-squared score: -0.530 (+/- 4.150)

GradientBoostingRegressor:

RidgeCV:
Mean R-squared score: -6.777 (+/- 26.093)

GradientBoostingRegressor:

DecisionTreeRegressor:
Mean R-squared score: -0.484 (+/- 3.885)

GradientBoostingRegressor:

KNeighborsRegressor:
Mean R-squared score: -0.470 (+/- 3.235)

GradientBoostingRegressor:

GradientBoostingRegressor:
Mean R-squared score: -0.391 (+/- 3.908)

GradientBoostingRegressor:

ML

In [None]:
results_tree_based_y_df = pd.DataFrame(results_tree_based_y)
results_tree_based_y_df.to_csv("/content/drive/MyDrive/EC_Tower/result/feature_selection_tree_based_yearly_results.csv", index=False)

### Lasso regularization

In [None]:


results_lasso_y = []
# 4. Lasso regularization
lasso_selector = LassoSelector(n_features_to_select=n_features_to_select)
X_new = lasso_selector.fit_transform(X_scaled, y)
selected_features = X.columns[lasso_selector.support_].tolist()

for est_name, estimator in estimators.items():
    scores = cross_val_score(estimator, X_new, y, cv=5, scoring='r2', n_jobs=-1)
    print(f"\n{est_name}:")
    print(f"Mean R-squared score: {scores.mean():.3f} (+/- {scores.std() * 2:.3f})")
    results_lasso_y.append({
        'Method': 'Lasso',
        'Estimator': est_name,
        'Selected Features': ', '.join(selected_features),
        'R2 Score': scores.mean(),
        'R2 Std': scores.std()
    })


RandomForestRegressor:
Mean R-squared score: -0.515 (+/- 3.623)

RidgeCV:
Mean R-squared score: -1.715 (+/- 6.799)

DecisionTreeRegressor:
Mean R-squared score: -0.713 (+/- 3.987)

KNeighborsRegressor:
Mean R-squared score: -0.535 (+/- 3.449)

GradientBoostingRegressor:
Mean R-squared score: -0.448 (+/- 3.745)

MLPRegressor:
Mean R-squared score: -1.299 (+/- 6.472)


In [None]:
results_lasso_y_df = pd.DataFrame(results_lasso_y)
results_lasso_y_df.to_csv("/content/drive/MyDrive/EC_Tower/result/feature_selection_lasso_yearly_results.csv", index=False)

##  Sequential Feature Selection

In [None]:
# Initialize the estimator
estimator = RandomForestRegressor()

# Create the SequentialFeatureSelector
# n_features_to_select: number of features to select
# direction: 'forward' for forward selection, 'backward' for backward selection
sfs = SequentialFeatureSelector(estimator, n_features_to_select=4, direction='forward')

# Fit the selector
sfs.fit(X, y)

# Get the selected feature names
selected_features = X.columns[sfs.get_support()].tolist()

# Create new feature matrix with only selected features
X_new = sfs.transform(X)

# Evaluate the performance
scores = cross_val_score(estimator, X_new, y, cv=5)
print("SequentialFeatureSelector - Mean cross-validation score: {:.2f}".format(scores.mean()))
print("SequentialFeatureSelector - Selected features:", selected_features)

SequentialFeatureSelector - Mean cross-validation score: -0.07
SequentialFeatureSelector - Selected features: ['aet_budyko_hargreaves', 'aet_budyko_abtew', 'latitude', 'Elevation']


# decade data

## SelectKBest

In [None]:
# Assuming 'df' is your dataframe and 'target' is your target column
X = decade_data.drop(['ET_fill', 'Site_ID', 'General_classification', 'Land_cover_details', 'Land_cover_type'], axis =1)
y = decade_data['ET_fill']

# Select top k features
selector = SelectKBest(score_func=f_regression, k=4)
X_new = selector.fit_transform(X, y)


# Get selected feature names
selected_features = X.columns[selector.get_support()]


rf = RandomForestRegressor()
scores = cross_val_score(rf, X_new, y, cv=5)


print("Mean cross-validation score: {:.2f}".format(scores.mean()))
print("SelectKBest - Selected features:", selected_features)

Mean cross-validation score: 0.03
SelectKBest - Selected features: Index(['aet_budyko_oudin', 'aet_budyko_hargreaves', 'aet_budyko_abtew',
       'aet_budyko_mcguinness_bordne'],
      dtype='object')


## RFE

In [None]:


# Standardize features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Create and fit RFE
estimator = RandomForestRegressor()
selector = RFE(estimator, n_features_to_select=4, step=1)
X_new = selector.fit_transform(X_scaled, y)

# Get selected feature names
selected_features = X.columns[selector.support_]

# Evaluate
rf = RandomForestRegressor()
scores = cross_val_score(rf, X_new, y, cv=5)

print("RFE - Mean cross-validation score: {:.2f}".format(scores.mean()))
print("RFE - Selected features:", selected_features)

RFE - Mean cross-validation score: 0.15
RFE - Selected features: Index(['Month', 'aet_budyko_oudin', 'aet_budyko_hargreaves', 'latitude'], dtype='object')


##  SelectFromModel

### tree-based models

In [None]:
from sklearn.ensemble import RandomForestRegressor
import pandas as pd

# Fit Random Forest
rf = RandomForestRegressor()
rf.fit(X, y)

# Get feature importances
feature_importance = pd.DataFrame({'feature': X.columns, 'importance': rf.feature_importances_})
feature_importance = feature_importance.sort_values('importance', ascending=False)

# Select top 4 features
top_features = feature_importance['feature'][:4].tolist()
X_new = X[top_features]

# Evaluate
scores = cross_val_score(rf, X_new, y, cv=5)
print("Random Forest - Mean cross-validation score: {:.2f}".format(scores.mean()))
print("Random Forest - Selected features:", top_features)

Random Forest - Mean cross-validation score: 0.15
Random Forest - Selected features: ['aet_budyko_hargreaves', 'aet_budyko_oudin', 'latitude', 'Month']


### Lasso regularization

In [None]:


# Standardize features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Fit Lasso
lasso = Lasso(alpha=0.1)
lasso.fit(X_scaled, y)

# Get feature importances
feature_importance = pd.DataFrame({'feature': X.columns, 'importance': np.abs(lasso.coef_)})
feature_importance = feature_importance.sort_values('importance', ascending=False)

# Select features with non-zero coefficients
selected_features = feature_importance[feature_importance['importance'] > 0]['feature'].tolist()
X_new = X[selected_features]

# Evaluate
rf = RandomForestRegressor()
scores = cross_val_score(rf, X_new, y, cv=5)
print("Lasso - Mean cross-validation score: {:.2f}".format(scores.mean()))
print("Lasso - Selected features:", selected_features)

Lasso - Mean cross-validation score: 0.26
Lasso - Selected features: ['aet_budyko_hargreaves', 'latitude', 'Elevation', 'Month']


##  Sequential Feature Selection

In [None]:
# Initialize the estimator
estimator = RandomForestRegressor()

# Create the SequentialFeatureSelector
# n_features_to_select: number of features to select
# direction: 'forward' for forward selection, 'backward' for backward selection
sfs = SequentialFeatureSelector(estimator, n_features_to_select=4, direction='forward')

# Fit the selector
sfs.fit(X, y)

# Get the selected feature names
selected_features = X.columns[sfs.get_support()].tolist()

# Create new feature matrix with only selected features
X_new = sfs.transform(X)

# Evaluate the performance
scores = cross_val_score(estimator, X_new, y, cv=5)
print("SequentialFeatureSelector - Mean cross-validation score: {:.2f}".format(scores.mean()))
print("SequentialFeatureSelector - Selected features:", selected_features)