In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!source /content/drive/MyDrive/colab_env/bin/activate

In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm
from sklearn.feature_selection import SelectKBest, f_regression, RFE, SequentialFeatureSelector
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import RidgeCV, Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from functools import partial
from statsmodels.tools.eval_measures import rmspe
from scipy import stats
from sklearn.svm import SVR


In [2]:
daily_data = pd.read_parquet("/content/drive/MyDrive/EC_Tower/result/daily_data_central_valley.parquet", engine='pyarrow')
decade_data = pd.read_parquet("/content/drive/MyDrive/EC_Tower/result/decade_data_central_valley.parquet", engine='pyarrow')
monthly_data = pd.read_parquet("/content/drive/MyDrive/EC_Tower/result/monthly_data_central_valley.parquet", engine='pyarrow')
yearly_data = pd.read_parquet("/content/drive/MyDrive/EC_Tower/result/yearly_data_central_valley.parquet", engine='pyarrow')


In [3]:
monthly_data

Unnamed: 0,Site_ID,Year,Month,aet_budyko_oudin,aet_budyko_hargreaves,aet_budyko_abtew,aet_budyko_mcguinness_bordne,ET_fill,ppt_ameri,ASCE_ETo,ppt,oudin,hargreaves,abtew,mcguinness_bordne,latitude,General_classification,Elevation,Land_cover_details,Land_cover_type
0,Almond_High,2016,10,5.573189,6.394372,6.817488,7.369026,14.524432,37.592,6.034090,22.815,8.364811,9.833769,10.991679,12.296272,36.1697,Croplands,147.0,Almond,Orchards
1,Almond_High,2016,11,3.457981,4.215298,5.127005,4.580734,8.163006,34.544,5.988743,21.596,4.898145,6.435082,8.551687,7.200273,36.1697,Croplands,147.0,Almond,Orchards
2,Almond_High,2016,12,4.731302,5.249017,6.798754,6.190804,6.858637,39.116,4.393247,23.369,6.452863,8.013008,12.095018,9.485709,36.1697,Croplands,147.0,Almond,Orchards
3,Almond_High,2017,1,14.848838,17.150137,23.597011,20.940994,31.763826,182.372,15.648598,109.738,16.918288,19.394778,28.054068,24.869884,36.1697,Croplands,147.0,Almond,Orchards
4,Almond_High,2017,2,15.876717,16.352400,20.478806,21.716244,15.752455,75.692,11.246272,61.703,20.190877,21.947573,28.997933,29.680589,36.1697,Croplands,147.0,Almond,Orchards
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
429,US-Twt,2016,11,5.199281,5.333597,6.902376,7.107846,6.115304,29.400,6.254415,28.803,7.797869,8.524474,11.685886,11.462867,38.1087,Croplands,-7.0,Rice,Annual crops
430,US-Twt,2016,12,7.733652,8.818499,11.965617,10.497985,5.406630,85.600,2.398865,64.854,9.404073,11.425070,17.105943,13.823988,38.1087,Croplands,-7.0,Rice,Annual crops
431,US-Twt,2017,1,15.152333,15.427181,22.648911,21.743746,15.360403,229.900,10.091391,227.359,15.969643,16.515428,25.217126,23.475375,38.1087,Croplands,-7.0,Rice,Annual crops
432,US-Twt,2017,2,22.347142,20.645603,26.266093,31.124664,19.245805,87.600,9.992841,158.246,26.118873,25.501229,34.377244,38.394743,38.1087,Croplands,-7.0,Rice,Annual crops


In [5]:

era5_monthly_data = pd.read_csv('/content/drive/MyDrive/EC_Tower/result/era5_monthly_data.csv')


In [6]:
# First, let's create a function to map "Point_X" to its corresponding number
def point_to_number(point_str):
    if not point_str.startswith('Point_'):
        return None
    try:
        return int(point_str.split('_')[1]) + 1
    except ValueError:
        return None

In [7]:
dts_data = pd.read_csv("/content/drive/MyDrive/EC_Tower/station_metadata_len.csv")
df = dts_data[:][:161]

df['Latitude'] = df['Latitude'].round(4)
df['Longitude'] = df['Longitude'].round(4)

In [8]:
# Now, apply this function to create the new column
era5_monthly_data['point_id'] = era5_monthly_data['Point'].apply(point_to_number)

stations_id = {
  1 : [-119.58014102395923,36.4587205291358],
  2 : [-120.20100395913578,36.169668221548285],
  3 : [-120.10236864383359,36.94660861179753],
  4 : [-120.20264490832344,36.177667848838176],
  5 : [-120.20999796596058,36.83900158517578],
  6 : [-121.11800025437275,38.28899868843215],
  7 : [-121.49932919916024,38.099152461081694],
  8 : [-121.65310129898836,38.10872169194239],
  9 : [-121.64669802987558,38.11590084463843],
  10 : [-121.53500200758782,38.108998156207704],
  11 : [-121.64330019487288,38.10469958279218],
  12 : [-121.75470191649107,38.03689895127454],
  13 : [-122.02635927086914,38.2005568781381]
}

# Create separate dictionaries for latitude and longitude
lat_dict = {k: v[1] for k, v in stations_id.items()}
lon_dict = {k: v[0] for k, v in stations_id.items()}

# Add new columns
era5_monthly_data['latitude'] = era5_monthly_data['point_id'].map(lat_dict)
era5_monthly_data['longitude'] = era5_monthly_data['point_id'].map(lon_dict)

era5_monthly_data['latitude'] = era5_monthly_data['latitude'].round(4)
era5_monthly_data['longitude'] = era5_monthly_data['longitude'].round(4)

ID_dict = dict(zip(df['Latitude'], df['Site ID']))
era5_monthly_data['Site_ID'] = era5_monthly_data['latitude'].map(ID_dict)
era5_monthly_data

Unnamed: 0,Point,Variable,Date,Value,point_id,latitude,longitude,Site_ID
0,Point_0,dewpoint_temperature_2m,200812,276.433386,1,36.4587,-119.5801,MB_Pch
1,Point_0,dewpoint_temperature_2m,200901,278.279480,1,36.4587,-119.5801,MB_Pch
2,Point_0,dewpoint_temperature_2m,200902,279.188978,1,36.4587,-119.5801,MB_Pch
3,Point_0,dewpoint_temperature_2m,200903,278.190431,1,36.4587,-119.5801,MB_Pch
4,Point_0,dewpoint_temperature_2m,200904,275.580090,1,36.4587,-119.5801,MB_Pch
...,...,...,...,...,...,...,...,...
282745,Point_12,leaf_area_index_low_vegetation_max,202008,1.715332,13,38.2006,-122.0264,US-Srr
282746,Point_12,leaf_area_index_low_vegetation_max,202009,1.663086,13,38.2006,-122.0264,US-Srr
282747,Point_12,leaf_area_index_low_vegetation_max,202010,1.586792,13,38.2006,-122.0264,US-Srr
282748,Point_12,leaf_area_index_low_vegetation_max,202011,1.535522,13,38.2006,-122.0264,US-Srr


In [11]:
wide_era5_monthly_data = era5_monthly_data.pivot(index=['Point', 'point_id', 'Site_ID', 'latitude', 'longitude', 'Date'], columns='Variable', values='Value').reset_index()

# Convert the 'date' column to string type (if it's not already)
wide_era5_monthly_data['Date'] = wide_era5_monthly_data['Date'].astype(str)

# Create new 'year' column
wide_era5_monthly_data['Year'] = wide_era5_monthly_data['Date'].str[:4].astype(int)

# Create new 'month' column
wide_era5_monthly_data['Month'] = wide_era5_monthly_data['Date'].str[4:].astype(int)
wide_era5_monthly_data

Variable,Point,point_id,Site_ID,latitude,longitude,Date,dewpoint_temperature_2m,dewpoint_temperature_2m_max,dewpoint_temperature_2m_min,evaporation_from_bare_soil_max,...,volumetric_soil_water_layer_2_max,volumetric_soil_water_layer_2_min,volumetric_soil_water_layer_3,volumetric_soil_water_layer_3_max,volumetric_soil_water_layer_3_min,volumetric_soil_water_layer_4,volumetric_soil_water_layer_4_max,volumetric_soil_water_layer_4_min,Year,Month
0,Point_0,1,MB_Pch,36.4587,-119.5801,200812,276.433386,282.875626,269.296585,4.656613e-10,...,0.331512,0.188095,0.155579,0.167221,0.152695,0.175210,0.175323,0.175125,2008,12
1,Point_0,1,MB_Pch,36.4587,-119.5801,200901,278.279480,286.926056,272.556259,2.328306e-10,...,0.302643,0.239319,0.174512,0.181427,0.167267,0.175053,0.175125,0.174973,2009,1
2,Point_0,1,MB_Pch,36.4587,-119.5801,200902,279.188978,287.090515,270.089157,2.328306e-10,...,0.383301,0.264297,0.205223,0.237808,0.181427,0.174979,0.175125,0.174927,2009,2
3,Point_0,1,MB_Pch,36.4587,-119.5801,200903,278.190431,287.185394,264.508545,0.000000e+00,...,0.351669,0.220535,0.239431,0.245285,0.228638,0.175518,0.175766,0.175125,2009,3
4,Point_0,1,MB_Pch,36.4587,-119.5801,200904,275.580090,282.904755,261.592682,0.000000e+00,...,0.220093,0.160309,0.211857,0.228516,0.194092,0.175590,0.175751,0.175247,2009,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1880,Point_9,10,US-Bi2,38.1090,-121.5350,202008,286.338399,290.738434,279.741028,-4.051253e-08,...,0.163483,0.162643,0.172163,0.174622,0.170288,0.262490,0.265045,0.260025,2020,8
1881,Point_9,10,US-Bi2,38.1090,-121.5350,202009,285.962769,292.701843,274.092133,0.000000e+00,...,0.162643,0.161880,0.169509,0.170319,0.168915,0.258145,0.260025,0.256363,2020,9
1882,Point_9,10,US-Bi2,38.1090,-121.5350,202010,281.515198,289.958252,259.269516,0.000000e+00,...,0.161880,0.161179,0.168667,0.168930,0.168533,0.254885,0.256363,0.253616,2020,10
1883,Point_9,10,US-Bi2,38.1090,-121.5350,202011,278.166978,287.128830,264.423264,2.328306e-10,...,0.189270,0.160950,0.168947,0.169449,0.168594,0.252882,0.253601,0.252258,2020,11


In [18]:
monthly_data = pd.merge(monthly_data, wide_era5_monthly_data.drop(['latitude', 'Point','point_id', 'Date'], axis =1),
                  on=['Site_ID', 'Year', 'Month'],
                  how='left')






In [19]:
# Factorize the column
monthly_data['category_encoded_Site_ID'], _ = pd.factorize(monthly_data['Site_ID'])
monthly_data['category_encoded_General_classification'], _ = pd.factorize(monthly_data['General_classification'])
monthly_data['category_encoded_Land_cover_details'], _ = pd.factorize(monthly_data['Land_cover_details'])
monthly_data['category_encoded_Land_cover_type'], _ = pd.factorize(monthly_data['Land_cover_type'])


In [20]:
# Custom transformer for Lasso feature selection
class LassoSelector(BaseEstimator, TransformerMixin):
    def __init__(self, n_features_to_select=10):
        self.n_features_to_select = n_features_to_select
        self.lasso = Lasso(random_state=42)

    def fit(self, X, y):
        self.lasso.fit(X, y)
        self.support_ = np.argsort(np.abs(self.lasso.coef_))[-self.n_features_to_select:]
        return self

    def transform(self, X):
        return X[:, self.support_]


In [None]:
# 'ASCE_ETo'
# 'evaporation_from_bare_soil_max',
#  'evaporation_from_bare_soil_min',
#  'evaporation_from_bare_soil_sum',
#  'evaporation_from_open_water_surfaces_excluding_oceans_max',
#  'evaporation_from_open_water_surfaces_excluding_oceans_min',
#  'evaporation_from_open_water_surfaces_excluding_oceans_sum',
#  'evaporation_from_the_top_of_canopy_max',
#  'evaporation_from_the_top_of_canopy_min',
#  'evaporation_from_the_top_of_canopy_sum',
#  'evaporation_from_vegetation_transpiration_max',
#  'evaporation_from_vegetation_transpiration_min',
#  'evaporation_from_vegetation_transpiration_sum',
#  'potential_evaporation_max',
#  'potential_evaporation_min',
#  'potential_evaporation_sum',
#  'total_evaporation_max',
#  'total_evaporation_min',
#  'total_evaporation_sum',

In [61]:
# Assuming 'df' is your dataframe and 'target' is your target column
# X = monthly_data.drop(['ET_fill',
#                        'Site_ID',
#                        'General_classification',
#                        'Land_cover_details',
#                        'Land_cover_type',
#                        'aet_budyko_oudin',
#                        'aet_budyko_hargreaves',
#                        'aet_budyko_abtew',
#                        'aet_budyko_mcguinness_bordne',
#                        'oudin',
#                        'hargreaves',
#                        'abtew',
#                        'mcguinness_bordne',], axis =1)

X = monthly_data.drop(['ET_fill',
                       'Site_ID',
                       'General_classification',
                       'Land_cover_details',
                       'Land_cover_type',
                       'aet_budyko_oudin',
                       'aet_budyko_hargreaves',
                       'aet_budyko_abtew',
                       'aet_budyko_mcguinness_bordne',
                       'oudin',
                       'hargreaves',
                       'abtew',
                       'mcguinness_bordne',
                       'ASCE_ETo',
                       'evaporation_from_bare_soil_max',
                       'evaporation_from_bare_soil_min',
                       'evaporation_from_bare_soil_sum',
                       'evaporation_from_open_water_surfaces_excluding_oceans_max',
                       'evaporation_from_open_water_surfaces_excluding_oceans_min',
                       'evaporation_from_open_water_surfaces_excluding_oceans_sum',
                       'evaporation_from_the_top_of_canopy_max',
                       'evaporation_from_the_top_of_canopy_min',
                       'evaporation_from_the_top_of_canopy_sum',
                       'evaporation_from_vegetation_transpiration_max',
                       'evaporation_from_vegetation_transpiration_min',
                       'evaporation_from_vegetation_transpiration_sum',
                       'potential_evaporation_max',
                       'potential_evaporation_min',
                       'potential_evaporation_sum',
                       'total_evaporation_max',
                       'total_evaporation_min',
                       'total_evaporation_sum'], axis =1)

y = monthly_data['ET_fill']



# Define estimators
estimators = {
    'RandomForestRegressor': RandomForestRegressor(),
    'RidgeCV': RidgeCV(),
    'DecisionTreeRegressor': DecisionTreeRegressor(),
    'KNeighborsRegressor': KNeighborsRegressor(),
    'GradientBoostingRegressor': GradientBoostingRegressor(),
    'MLPRegressor': MLPRegressor()
}

# Number of features to select
n_features_to_select = 7

In [62]:

results_rfe = []
x_tr, x_te, y_tr, y_te = train_test_split(X, y, test_size=0.2, random_state=3)
for est_name, estimator in tqdm(estimators.items()):
  if est_name not in ['KNeighborsRegressor', 'MLPRegressor']:
    selector = RFE(estimator, n_features_to_select=n_features_to_select)
    selector.fit(x_tr, y_tr)
    selected_features = X.columns[selector.support_]
    print(selected_features)
    y_tr_pred = selector.predict(x_tr)
    # train
    print(f"{est_name} in train")
    print('Mean Absolute Error:', mean_absolute_error(y_tr, y_tr_pred))
    print('Mean Squared Error:', mean_squared_error(y_tr, y_tr_pred))
    print('Root Mean Squared Error:', np.sqrt(mean_squared_error(y_tr, y_tr_pred)))
    print('Root Mean Squared Percentage Error:', rmspe(y_tr, y_tr_pred))
    print('R2 Score:', (stats.pearsonr(y_tr_pred,y_tr)[0])**2)

    # test
    print(f"{est_name} in test")
    y_te_pred = selector.predict(x_te)
    print('Mean Absolute Error:', mean_absolute_error(y_te, y_te_pred))
    print('Mean Squared Error:', mean_squared_error(y_te, y_te_pred))
    print('Root Mean Squared Error:', np.sqrt(mean_squared_error(y_te, y_te_pred)))
    print('Root Mean Squared Percentage Error:', rmspe(y_te, y_te_pred))
    print('R2 Score:', (stats.pearsonr(y_te_pred,y_te)[0])**2)

    results_rfe.append({
        'Method': 'RFE',
        'Estimator': est_name,
        'Selected Features': ', '.join(selected_features),
        'MAE_train':  mean_absolute_error(y_tr, y_tr_pred),
        'MSE_train': mean_squared_error(y_tr, y_tr_pred),
        'RMSE_train' : np.sqrt(mean_squared_error(y_tr, y_tr_pred)),
        'RMSPE_train' : rmspe(y_tr, y_tr_pred),
        'R2_train' : (stats.pearsonr(y_tr_pred,y_tr)[0])**2,
        'MAE_test':  mean_absolute_error(y_te, y_te_pred),
        'MSE_test': mean_squared_error(y_te, y_te_pred),
        'RMSE_test' : np.sqrt(mean_squared_error(y_te, y_te_pred)),
        'RMSPE_test' : rmspe(y_te, y_te_pred),
        'R2_test' : (stats.pearsonr(y_te_pred,y_te)[0])**2
        })

 17%|█▋        | 1/6 [04:13<21:09, 253.85s/it]

Index(['ppt_ameri', 'ppt', 'latitude', 'surface_latent_heat_flux_min',
       'surface_latent_heat_flux_sum', 'surface_pressure_max',
       'volumetric_soil_water_layer_1'],
      dtype='object')
RandomForestRegressor in train
Mean Absolute Error: 1.3178636679567524
Mean Squared Error: 3.352426306313558
Root Mean Squared Error: 1.8309632181760391
Root Mean Squared Percentage Error: 5.067359202090018
R2 Score: 0.9659092977417335
RandomForestRegressor in test
Mean Absolute Error: 3.3443883249458297
Mean Squared Error: 22.200814007912715
Root Mean Squared Error: 4.711773976743018
Root Mean Squared Percentage Error: 15.311434012576004
R2 Score: 0.8291604366484906


 33%|███▎      | 2/6 [04:17<07:07, 106.82s/it]

Index(['latitude', 'leaf_area_index_low_vegetation',
       'leaf_area_index_low_vegetation_max', 'runoff_sum',
       'surface_runoff_sum', 'total_precipitation_sum',
       'volumetric_soil_water_layer_3_min'],
      dtype='object')
RidgeCV in train
Mean Absolute Error: 4.985331770612822
Mean Squared Error: 46.08969386226889
Root Mean Squared Error: 6.78893908223287
Root Mean Squared Percentage Error: 21.971484922328337
R2 Score: 0.36422834599571285
RidgeCV in test
Mean Absolute Error: 6.02200271255634
Mean Squared Error: 68.69448972176508
Root Mean Squared Error: 8.28821390419945
Root Mean Squared Percentage Error: 50.62898099891951
R2 Score: 0.466991743428463


 50%|█████     | 3/6 [04:21<02:58, 59.66s/it] 

Index(['ppt_ameri', 'leaf_area_index_high_vegetation',
       'leaf_area_index_low_vegetation', 'surface_latent_heat_flux_min',
       'surface_runoff_sum', 'volumetric_soil_water_layer_1',
       'volumetric_soil_water_layer_2_min'],
      dtype='object')
DecisionTreeRegressor in train
Mean Absolute Error: 0.03365607514096156
Mean Squared Error: 0.07172201266416721
Root Mean Squared Error: 0.2678096575259511
Root Mean Squared Percentage Error: 0.31521148012109357
R2 Score: 0.999007424446795
DecisionTreeRegressor in test
Mean Absolute Error: 3.878086428006864
Mean Squared Error: 29.241813001242672
Root Mean Squared Error: 5.407569971922941
Root Mean Squared Percentage Error: 12.579212027073964
R2 Score: 0.7468697562922747


100%|██████████| 6/6 [06:30<00:00, 65.04s/it]

Index(['ppt_ameri', 'ppt', 'latitude', 'leaf_area_index_high_vegetation_min',
       'surface_latent_heat_flux_min', 'surface_latent_heat_flux_sum',
       'total_precipitation_sum'],
      dtype='object')
GradientBoostingRegressor in train
Mean Absolute Error: 1.7545031579033512
Mean Squared Error: 5.010739994766547
Root Mean Squared Error: 2.2384682250964714
Root Mean Squared Percentage Error: 7.363364338833029
R2 Score: 0.9352781294437628
GradientBoostingRegressor in test
Mean Absolute Error: 3.3952312725154803
Mean Squared Error: 20.54215423273581
Root Mean Squared Error: 4.532345334673408
Root Mean Squared Percentage Error: 23.911558072445214
R2 Score: 0.8224803863740062





In [63]:
results_rfe_output = pd.DataFrame(results_rfe)
results_rfe_output.to_csv('/content/drive/MyDrive/EC_Tower/result/output_model_ML_7features_just_product.csv', index=False)

In [None]:
x_tr, x_te, y_tr, y_te = train_test_split(X, y, test_size=0.2, random_state=3)

m = RandomForestRegressor(n_estimators=100)

rfr = RFE(
    estimator=m,
    n_features_to_select=4
)

rfr.fit(x_tr, y_tr)

# rfr = RFECV(
#     estimator=m,
# )

# rfr.fit(x_tr, y_tr)

print("Selected Features:")
print(X.columns[rfr.support_])


y_tr_pred = rfr.predict(x_tr)
print('Mean Absolute Error:', mean_absolute_error(y_tr, y_tr_pred))
print('Mean Squared Error:', mean_squared_error(y_tr, y_tr_pred))
print('Root Mean Squared Error:', np.sqrt(mean_squared_error(y_tr, y_tr_pred)))
print('Root Mean Squared Percentage Error:', rmspe(y_tr, y_tr_pred))
print('R2 Score:', r2_score(y_tr, y_tr_pred))

Selected Features:
Index(['oudin', 'hargreaves', 'abtew', 'latitude'], dtype='object')
Mean Absolute Error: 0.9509109595109821
Mean Squared Error: 1.8198556051307118
Root Mean Squared Error: 1.3490202389626005
Root Mean Squared Percentage Error: 2.7575123744940346
R2 Score: 0.9748146473179244


In [None]:
import plotly.express as px

fig = px.scatter(
    x = y_tr_pred,
    y = y_tr,
)

fig.add_shape(
    type='line',
    x0 = 0,
    y0 = 0,
    x1 = 12,
    y1 = 12,
    line = dict(
        color = 'Red',
    )
)

fig.update_layout(
    autosize = False,
    width = 600,
    height = 600,
    xaxis_range = [0, 12],
    yaxis_range = [0, 12],
    xaxis_title = dict(text = 'ET Predicted'),
    yaxis_title = dict(text = 'ET')
)

fig.show()

In [None]:
y_te_pred = rfr.predict(x_te)

fig = px.scatter(
    x = y_te_pred,
    y = y_te,
)

fig.add_shape(
    type='line',
    x0 = 0,
    y0 = 0,
    x1 = 12,
    y1 = 12,
    line = dict(
        color = 'Red',
    )
)

fig.update_layout(
    autosize = False,
    width = 600,
    height = 600,
    xaxis_range = [0, 12],
    yaxis_range = [0, 12],
    xaxis_title = dict(text = 'ET Predicted'),
    yaxis_title = dict(text = 'ET')
)

fig.show()

# Monthly data

## SelectKBest

In [None]:
# 1. SelectKBest (optimized)
results_kbest = []
selector = SelectKBest(score_func=f_regression, k=n_features_to_select)
X_new = selector.fit_transform(X_scaled, y)
selected_features = X.columns[selector.get_support()].tolist()

for est_name, estimator in tqdm(estimators.items()):

  scores = cross_val_score(estimator, X_new, y, cv=5, scoring='r2')
  print(f"\n{est_name}:")
  print(f"Mean R-squared score: {scores.mean():.3f} (+/- {scores.std() * 2:.3f})")
  results_kbest.append({
      'Method': 'SelectKBest',
      'Estimator': est_name,
      'Selected Features': ', '.join(selected_features),
      'R2 Score': scores.mean(),
      'R2 Std': scores.std()
  })




 17%|█▋        | 1/6 [00:41<03:29, 41.89s/it]


RandomForestRegressor:
Mean R-squared score: 0.085 (+/- 1.518)


 33%|███▎      | 2/6 [00:42<01:09, 17.48s/it]


RidgeCV:
Mean R-squared score: 0.469 (+/- 0.910)


 50%|█████     | 3/6 [00:42<00:29,  9.76s/it]


DecisionTreeRegressor:
Mean R-squared score: 0.043 (+/- 1.589)


 67%|██████▋   | 4/6 [00:44<00:13,  6.57s/it]


KNeighborsRegressor:
Mean R-squared score: 0.018 (+/- 1.549)


 83%|████████▎ | 5/6 [01:11<00:13, 13.85s/it]


GradientBoostingRegressor:
Mean R-squared score: 0.387 (+/- 1.059)


100%|██████████| 6/6 [14:03<00:00, 140.60s/it]


MLPRegressor:
Mean R-squared score: 0.406 (+/- 1.002)





## RFE

In [None]:
# 2. RFE (not suitable for KNN)
results_rfe = []
for est_name, estimator in tqdm(estimators.items()):
    if est_name not in ['KNeighborsRegressor', 'MLPRegressor']:
        selector = RFE(estimator, n_features_to_select=n_features_to_select, step=1)
        X_new = selector.fit_transform(X_scaled, y)
        selected_features = X.columns[selector.support_].tolist()

        scores = cross_val_score(estimator, X_new, y, cv=5, scoring='r2')
        print(f"\n{est_name}:")
        print(f"Mean R-squared score: {scores.mean():.3f} (+/- {scores.std() * 2:.3f})")
        results_rfe.append({
            'Method': 'RFE',
            'Estimator': est_name,
            'Selected Features': ', '.join(selected_features),
            'R2 Score': scores.mean(),
            'R2 Std': scores.std()
        })


 17%|█▋        | 1/6 [02:14<11:12, 134.46s/it]


RandomForestRegressor:
Mean R-squared score: 0.392 (+/- 0.779)


 33%|███▎      | 2/6 [02:16<03:46, 56.59s/it] 


RidgeCV:
Mean R-squared score: -1.516 (+/- 7.927)


 50%|█████     | 3/6 [02:19<01:36, 32.12s/it]


DecisionTreeRegressor:
Mean R-squared score: 0.357 (+/- 0.821)


100%|██████████| 6/6 [03:43<00:00, 37.20s/it]


GradientBoostingRegressor:
Mean R-squared score: 0.523 (+/- 0.431)





##  SelectFromModel

### tree-based models

In [None]:

# 3. Tree-based feature importance
results_tree_based = []
for est_name in ['DecisionTreeRegressor', 'GradientBoostingRegressor']:
    estimator = estimators[est_name]
    estimator.fit(X_scaled, y)
    importances = estimator.feature_importances_
    indices = np.argsort(importances)[-n_features_to_select:]
    selected_features = X.columns[indices].tolist()

    X_new = X_scaled[:, indices]
    for eval_est_name, eval_estimator in estimators.items():
        scores = cross_val_score(eval_estimator, X_new, y, cv=5, scoring='r2', n_jobs=-1)
        print(f"\n{est_name}:")
        print(f"\n{eval_est_name}:")
        print(f"Mean R-squared score: {scores.mean():.3f} (+/- {scores.std() * 2:.3f})")
        results_tree_based.append({
            'Method': f'TreeBased-{est_name}',
            'Estimator': eval_est_name,
            'Selected Features': ', '.join(selected_features),
            'R2 Score': scores.mean(),
            'R2 Std': scores.std()
        })



DecisionTreeRegressor:

RandomForestRegressor:
Mean R-squared score: 0.393 (+/- 0.792)

DecisionTreeRegressor:

RidgeCV:
Mean R-squared score: -1.900 (+/- 9.084)

DecisionTreeRegressor:

DecisionTreeRegressor:
Mean R-squared score: 0.300 (+/- 0.845)

DecisionTreeRegressor:

KNeighborsRegressor:
Mean R-squared score: 0.244 (+/- 1.259)

DecisionTreeRegressor:

GradientBoostingRegressor:
Mean R-squared score: 0.714 (+/- 0.122)

DecisionTreeRegressor:

MLPRegressor:
Mean R-squared score: -2.438 (+/- 11.935)

GradientBoostingRegressor:

RandomForestRegressor:
Mean R-squared score: 0.180 (+/- 1.085)

GradientBoostingRegressor:

RidgeCV:
Mean R-squared score: 0.430 (+/- 0.954)

GradientBoostingRegressor:

DecisionTreeRegressor:
Mean R-squared score: 0.077 (+/- 1.286)

GradientBoostingRegressor:

KNeighborsRegressor:
Mean R-squared score: -0.003 (+/- 1.759)

GradientBoostingRegressor:

GradientBoostingRegressor:
Mean R-squared score: 0.407 (+/- 1.108)

GradientBoostingRegressor:

MLPRegressor

### Lasso regularization

In [None]:

results_lasso = []
# 4. Lasso regularization
lasso_selector = LassoSelector(n_features_to_select=n_features_to_select)
X_new = lasso_selector.fit_transform(X_scaled, y)
selected_features = X.columns[lasso_selector.support_].tolist()

for est_name, estimator in estimators.items():
    scores = cross_val_score(estimator, X_new, y, cv=5, scoring='r2', n_jobs=-1)
    print(f"\n{est_name}:")
    print(f"Mean R-squared score: {scores.mean():.3f} (+/- {scores.std() * 2:.3f})")
    results_lasso.append({
        'Method': 'Lasso',
        'Estimator': est_name,
        'Selected Features': ', '.join(selected_features),
        'R2 Score': scores.mean(),
        'R2 Std': scores.std()
    })


RandomForestRegressor:
Mean R-squared score: 0.325 (+/- 0.948)

RidgeCV:
Mean R-squared score: -1.517 (+/- 7.645)

DecisionTreeRegressor:
Mean R-squared score: 0.285 (+/- 0.683)

KNeighborsRegressor:
Mean R-squared score: 0.348 (+/- 1.035)

GradientBoostingRegressor:
Mean R-squared score: 0.672 (+/- 0.238)

MLPRegressor:
Mean R-squared score: -5.996 (+/- 26.207)


##  Sequential Feature Selection

In [None]:
results_sequential_forward = []
# 5. Sequential Feature Selection (forward)
for est_name, estimator in estimators.items():
    selector = SequentialFeatureSelector(estimator, n_features_to_select=n_features_to_select, direction='forward', n_jobs=-1)
    X_new = selector.fit_transform(X_scaled, y)
    selected_features = X.columns[selector.get_support()].tolist()

    scores = cross_val_score(estimator, X_new, y, cv=3, scoring='r2', n_jobs=-1)
    print(f"\n{est_name}:")
    print(f"Mean R-squared score: {scores.mean():.3f} (+/- {scores.std() * 2:.3f})")
    results_sequential_forward.append({
        'Method': 'SequentialForward',
        'Estimator': est_name,
        'Direction' : 'forward',
        'Selected Features': ', '.join(selected_features),
        'R2 Score': scores.mean(),
        'R2 Std': scores.std()
    })

KeyboardInterrupt: 

In [None]:
results_kbest_df = pd.DataFrame(results_kbest)
results_kbest_df.to_csv("/content/drive/MyDrive/EC_Tower/result/feature_selection_kbest_results.csv", index=False)


results_rfe_df = pd.DataFrame(results_rfe)
results_rfe_df.to_csv("/content/drive/MyDrive/EC_Tower/result/feature_selection_rfe_results.csv", index=False)


results_tree_based_df = pd.DataFrame(results_tree_based)
results_tree_based_df.to_csv("/content/drive/MyDrive/EC_Tower/result/feature_selection_tree_based_results.csv", index=False)


results_lasso_df = pd.DataFrame(results_lasso)
results_lasso_df.to_csv("/content/drive/MyDrive/EC_Tower/result/feature_selection_lasso_results.csv", index=False)

# yearly data

In [None]:
# Factorize the column
yearly_data['category_encoded_Site_ID'], _ = pd.factorize(yearly_data['Site_ID'])
yearly_data['category_encoded_General_classification'], _ = pd.factorize(yearly_data['General_classification'])
yearly_data['category_encoded_Land_cover_details'], _ = pd.factorize(yearly_data['Land_cover_details'])
yearly_data['category_encoded_Land_cover_type'], _ = pd.factorize(yearly_data['Land_cover_type'])

In [None]:
# Assuming 'df' is your dataframe and 'target' is your target column
X = yearly_data.drop(['ET_fill', 'Site_ID', 'General_classification', 'Land_cover_details', 'Land_cover_type'], axis =1)
y = yearly_data['ET_fill']

# Standardize features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Define estimators
estimators = {
    'RandomForestRegressor': RandomForestRegressor(),
    'RidgeCV': RidgeCV(),
    'DecisionTreeRegressor': DecisionTreeRegressor(),
    'KNeighborsRegressor': KNeighborsRegressor(),
    'GradientBoostingRegressor': GradientBoostingRegressor(),
    'MLPRegressor': MLPRegressor()
}

# Number of features to select
n_features_to_select = 4

## SelectKBest

In [None]:
# 1. SelectKBest (optimized)
results_kbest_y = []
selector = SelectKBest(score_func=f_regression, k=n_features_to_select)
X_new = selector.fit_transform(X_scaled, y)
selected_features = X.columns[selector.get_support()].tolist()

for est_name, estimator in tqdm(estimators.items()):

  scores = cross_val_score(estimator, X_new, y, cv=5, scoring='r2')
  print(f"\n{est_name}:")
  print(f"Mean R-squared score: {scores.mean():.3f} (+/- {scores.std() * 2:.3f})")
  results_kbest_y.append({
      'Method': 'SelectKBest',
      'Estimator': est_name,
      'Selected Features': ', '.join(selected_features),
      'R2 Score': scores.mean(),
      'R2 Std': scores.std()
  })


 17%|█▋        | 1/6 [00:02<00:14,  2.89s/it]


RandomForestRegressor:
Mean R-squared score: -1.277 (+/- 4.718)

RidgeCV:
Mean R-squared score: -0.311 (+/- 3.145)


 67%|██████▋   | 4/6 [00:03<00:01,  1.65it/s]


DecisionTreeRegressor:
Mean R-squared score: -1.529 (+/- 3.609)

KNeighborsRegressor:
Mean R-squared score: -1.487 (+/- 5.828)


 83%|████████▎ | 5/6 [00:05<00:01,  1.16s/it]


GradientBoostingRegressor:
Mean R-squared score: -1.096 (+/- 5.076)


100%|██████████| 6/6 [01:27<00:00, 14.61s/it]


MLPRegressor:
Mean R-squared score: -0.226 (+/- 3.164)





In [None]:
results_kbest_y_df = pd.DataFrame(results_kbest_y)
results_kbest_y_df.to_csv("/content/drive/MyDrive/EC_Tower/result/feature_selection_kbest_yearly_results.csv", index=False)

## RFE

In [None]:
# 2. RFE (not suitable for KNN)
results_rfe_y = []
for est_name, estimator in tqdm(estimators.items()):
    if est_name not in ['KNeighborsRegressor', 'MLPRegressor']:
        selector = RFE(estimator, n_features_to_select=n_features_to_select, step=1)
        X_new = selector.fit_transform(X_scaled, y)
        selected_features = X.columns[selector.support_].tolist()

        scores = cross_val_score(estimator, X_new, y, cv=5, scoring='r2')
        print(f"\n{est_name}:")
        print(f"Mean R-squared score: {scores.mean():.3f} (+/- {scores.std() * 2:.3f})")
        results_rfe_y.append({
            'Method': 'RFE',
            'Estimator': est_name,
            'Selected Features': ', '.join(selected_features),
            'R2 Score': scores.mean(),
            'R2 Std': scores.std()
        })

 17%|█▋        | 1/6 [00:28<02:24, 28.88s/it]


RandomForestRegressor:
Mean R-squared score: -0.539 (+/- 4.188)


 50%|█████     | 3/6 [00:29<00:19,  6.63s/it]


RidgeCV:
Mean R-squared score: -6.777 (+/- 26.093)

DecisionTreeRegressor:
Mean R-squared score: -0.659 (+/- 3.641)


100%|██████████| 6/6 [00:37<00:00,  6.18s/it]


GradientBoostingRegressor:
Mean R-squared score: -0.352 (+/- 3.825)





In [None]:
results_rfe_y_df = pd.DataFrame(results_rfe_y)
results_rfe_y_df.to_csv("/content/drive/MyDrive/EC_Tower/result/feature_selection_rfe_yearly_results.csv", index=False)

##  SelectFromModel

### tree-based models

In [None]:
# 3. Tree-based feature importance
results_tree_based_y = []
for est_name in ['DecisionTreeRegressor', 'GradientBoostingRegressor']:
    estimator = estimators[est_name]
    estimator.fit(X_scaled, y)
    importances = estimator.feature_importances_
    indices = np.argsort(importances)[-n_features_to_select:]
    selected_features = X.columns[indices].tolist()

    X_new = X_scaled[:, indices]
    for eval_est_name, eval_estimator in estimators.items():
        scores = cross_val_score(eval_estimator, X_new, y, cv=5, scoring='r2', n_jobs=-1)
        print(f"\n{est_name}:")
        print(f"\n{eval_est_name}:")
        print(f"Mean R-squared score: {scores.mean():.3f} (+/- {scores.std() * 2:.3f})")
        results_tree_based_y.append({
            'Method': f'TreeBased-{est_name}',
            'Estimator': eval_est_name,
            'Selected Features': ', '.join(selected_features),
            'R2 Score': scores.mean(),
            'R2 Std': scores.std()
        })



DecisionTreeRegressor:

RandomForestRegressor:
Mean R-squared score: -0.531 (+/- 4.162)

DecisionTreeRegressor:

RidgeCV:
Mean R-squared score: -6.777 (+/- 26.093)

DecisionTreeRegressor:

DecisionTreeRegressor:
Mean R-squared score: -0.678 (+/- 4.735)

DecisionTreeRegressor:

KNeighborsRegressor:
Mean R-squared score: -0.470 (+/- 3.235)

DecisionTreeRegressor:

GradientBoostingRegressor:
Mean R-squared score: -0.348 (+/- 3.857)

DecisionTreeRegressor:

MLPRegressor:
Mean R-squared score: -4.636 (+/- 15.806)

GradientBoostingRegressor:

RandomForestRegressor:
Mean R-squared score: -0.530 (+/- 4.150)

GradientBoostingRegressor:

RidgeCV:
Mean R-squared score: -6.777 (+/- 26.093)

GradientBoostingRegressor:

DecisionTreeRegressor:
Mean R-squared score: -0.484 (+/- 3.885)

GradientBoostingRegressor:

KNeighborsRegressor:
Mean R-squared score: -0.470 (+/- 3.235)

GradientBoostingRegressor:

GradientBoostingRegressor:
Mean R-squared score: -0.391 (+/- 3.908)

GradientBoostingRegressor:

ML

In [None]:
results_tree_based_y_df = pd.DataFrame(results_tree_based_y)
results_tree_based_y_df.to_csv("/content/drive/MyDrive/EC_Tower/result/feature_selection_tree_based_yearly_results.csv", index=False)

### Lasso regularization

In [None]:


results_lasso_y = []
# 4. Lasso regularization
lasso_selector = LassoSelector(n_features_to_select=n_features_to_select)
X_new = lasso_selector.fit_transform(X_scaled, y)
selected_features = X.columns[lasso_selector.support_].tolist()

for est_name, estimator in estimators.items():
    scores = cross_val_score(estimator, X_new, y, cv=5, scoring='r2', n_jobs=-1)
    print(f"\n{est_name}:")
    print(f"Mean R-squared score: {scores.mean():.3f} (+/- {scores.std() * 2:.3f})")
    results_lasso_y.append({
        'Method': 'Lasso',
        'Estimator': est_name,
        'Selected Features': ', '.join(selected_features),
        'R2 Score': scores.mean(),
        'R2 Std': scores.std()
    })


RandomForestRegressor:
Mean R-squared score: -0.515 (+/- 3.623)

RidgeCV:
Mean R-squared score: -1.715 (+/- 6.799)

DecisionTreeRegressor:
Mean R-squared score: -0.713 (+/- 3.987)

KNeighborsRegressor:
Mean R-squared score: -0.535 (+/- 3.449)

GradientBoostingRegressor:
Mean R-squared score: -0.448 (+/- 3.745)

MLPRegressor:
Mean R-squared score: -1.299 (+/- 6.472)


In [None]:
results_lasso_y_df = pd.DataFrame(results_lasso_y)
results_lasso_y_df.to_csv("/content/drive/MyDrive/EC_Tower/result/feature_selection_lasso_yearly_results.csv", index=False)

##  Sequential Feature Selection

In [None]:
# Initialize the estimator
estimator = RandomForestRegressor()

# Create the SequentialFeatureSelector
# n_features_to_select: number of features to select
# direction: 'forward' for forward selection, 'backward' for backward selection
sfs = SequentialFeatureSelector(estimator, n_features_to_select=4, direction='forward')

# Fit the selector
sfs.fit(X, y)

# Get the selected feature names
selected_features = X.columns[sfs.get_support()].tolist()

# Create new feature matrix with only selected features
X_new = sfs.transform(X)

# Evaluate the performance
scores = cross_val_score(estimator, X_new, y, cv=5)
print("SequentialFeatureSelector - Mean cross-validation score: {:.2f}".format(scores.mean()))
print("SequentialFeatureSelector - Selected features:", selected_features)

SequentialFeatureSelector - Mean cross-validation score: -0.07
SequentialFeatureSelector - Selected features: ['aet_budyko_hargreaves', 'aet_budyko_abtew', 'latitude', 'Elevation']


# decade data

## SelectKBest

In [None]:
# Assuming 'df' is your dataframe and 'target' is your target column
X = decade_data.drop(['ET_fill', 'Site_ID', 'General_classification', 'Land_cover_details', 'Land_cover_type'], axis =1)
y = decade_data['ET_fill']

# Select top k features
selector = SelectKBest(score_func=f_regression, k=4)
X_new = selector.fit_transform(X, y)


# Get selected feature names
selected_features = X.columns[selector.get_support()]


rf = RandomForestRegressor()
scores = cross_val_score(rf, X_new, y, cv=5)


print("Mean cross-validation score: {:.2f}".format(scores.mean()))
print("SelectKBest - Selected features:", selected_features)

Mean cross-validation score: 0.03
SelectKBest - Selected features: Index(['aet_budyko_oudin', 'aet_budyko_hargreaves', 'aet_budyko_abtew',
       'aet_budyko_mcguinness_bordne'],
      dtype='object')


## RFE

In [None]:


# Standardize features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Create and fit RFE
estimator = RandomForestRegressor()
selector = RFE(estimator, n_features_to_select=4, step=1)
X_new = selector.fit_transform(X_scaled, y)

# Get selected feature names
selected_features = X.columns[selector.support_]

# Evaluate
rf = RandomForestRegressor()
scores = cross_val_score(rf, X_new, y, cv=5)

print("RFE - Mean cross-validation score: {:.2f}".format(scores.mean()))
print("RFE - Selected features:", selected_features)

RFE - Mean cross-validation score: 0.15
RFE - Selected features: Index(['Month', 'aet_budyko_oudin', 'aet_budyko_hargreaves', 'latitude'], dtype='object')


##  SelectFromModel

### tree-based models

In [None]:
from sklearn.ensemble import RandomForestRegressor
import pandas as pd

# Fit Random Forest
rf = RandomForestRegressor()
rf.fit(X, y)

# Get feature importances
feature_importance = pd.DataFrame({'feature': X.columns, 'importance': rf.feature_importances_})
feature_importance = feature_importance.sort_values('importance', ascending=False)

# Select top 4 features
top_features = feature_importance['feature'][:4].tolist()
X_new = X[top_features]

# Evaluate
scores = cross_val_score(rf, X_new, y, cv=5)
print("Random Forest - Mean cross-validation score: {:.2f}".format(scores.mean()))
print("Random Forest - Selected features:", top_features)

Random Forest - Mean cross-validation score: 0.15
Random Forest - Selected features: ['aet_budyko_hargreaves', 'aet_budyko_oudin', 'latitude', 'Month']


### Lasso regularization

In [None]:


# Standardize features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Fit Lasso
lasso = Lasso(alpha=0.1)
lasso.fit(X_scaled, y)

# Get feature importances
feature_importance = pd.DataFrame({'feature': X.columns, 'importance': np.abs(lasso.coef_)})
feature_importance = feature_importance.sort_values('importance', ascending=False)

# Select features with non-zero coefficients
selected_features = feature_importance[feature_importance['importance'] > 0]['feature'].tolist()
X_new = X[selected_features]

# Evaluate
rf = RandomForestRegressor()
scores = cross_val_score(rf, X_new, y, cv=5)
print("Lasso - Mean cross-validation score: {:.2f}".format(scores.mean()))
print("Lasso - Selected features:", selected_features)

Lasso - Mean cross-validation score: 0.26
Lasso - Selected features: ['aet_budyko_hargreaves', 'latitude', 'Elevation', 'Month']


##  Sequential Feature Selection

In [None]:
# Initialize the estimator
estimator = RandomForestRegressor()

# Create the SequentialFeatureSelector
# n_features_to_select: number of features to select
# direction: 'forward' for forward selection, 'backward' for backward selection
sfs = SequentialFeatureSelector(estimator, n_features_to_select=4, direction='forward')

# Fit the selector
sfs.fit(X, y)

# Get the selected feature names
selected_features = X.columns[sfs.get_support()].tolist()

# Create new feature matrix with only selected features
X_new = sfs.transform(X)

# Evaluate the performance
scores = cross_val_score(estimator, X_new, y, cv=5)
print("SequentialFeatureSelector - Mean cross-validation score: {:.2f}".format(scores.mean()))
print("SequentialFeatureSelector - Selected features:", selected_features)