In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
!source /content/drive/MyDrive/colab_env/bin/activate

In [3]:
import pandas as pd
import pyarrow
import numpy as np
from sklearn.feature_selection import SelectKBest, mutual_info_regression, RFE, SelectFromModel, SequentialFeatureSelector
from sklearn.linear_model import LogisticRegression
import seaborn as sns

In [4]:
daily_data = pd.read_parquet("/content/drive/MyDrive/EC_Tower/result/daily_data_central_valley.parquet", engine='pyarrow')
decade_data = pd.read_parquet("/content/drive/MyDrive/EC_Tower/result/decade_data_central_valley.parquet", engine='pyarrow')
monthly_data = pd.read_parquet("/content/drive/MyDrive/EC_Tower/result/monthly_data_central_valley.parquet", engine='pyarrow')
yearly_data = pd.read_parquet("/content/drive/MyDrive/EC_Tower/result/yearly_data_central_valley.parquet", engine='pyarrow')


# Monthly data

## SelectKBest

In [5]:
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestRegressor
# Assuming 'df' is your dataframe and 'target' is your target column
X = monthly_data.drop(['ET_fill', 'Site_ID', 'General_classification', 'Land_cover_details', 'Land_cover_type'], axis =1)
y = monthly_data['ET_fill']

# Select top k features
selector = SelectKBest(score_func=f_regression, k=4)
X_new = selector.fit_transform(X, y)


# Get selected feature names
selected_features = X.columns[selector.get_support()]


rf = RandomForestRegressor()
scores = cross_val_score(rf, X_new, y, cv=5)


print("Mean cross-validation score: {:.2f}".format(scores.mean()))
print("SelectKBest - Selected features:", selected_features)

Mean cross-validation score: 0.06
SelectKBest - Selected features: Index(['aet_budyko_oudin', 'aet_budyko_hargreaves', 'aet_budyko_abtew',
       'aet_budyko_mcguinness_bordne'],
      dtype='object')


## RFE

In [6]:
from sklearn.feature_selection import RFE
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler

# Standardize features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Create and fit RFE
estimator = RandomForestRegressor()
selector = RFE(estimator, n_features_to_select=4, step=1)
X_new = selector.fit_transform(X_scaled, y)

# Get selected feature names
selected_features = X.columns[selector.support_]

# Evaluate
rf = RandomForestRegressor()
scores = cross_val_score(rf, X_new, y, cv=5)

print("RFE - Mean cross-validation score: {:.2f}".format(scores.mean()))
print("RFE - Selected features:", selected_features)



RFE - Mean cross-validation score: 0.40
RFE - Selected features: Index(['Month', 'aet_budyko_oudin', 'aet_budyko_hargreaves', 'latitude'], dtype='object')


##  SelectFromModel

### tree-based models

In [7]:

from sklearn.ensemble import RandomForestRegressor
import pandas as pd

# Fit Random Forest
rf = RandomForestRegressor()
rf.fit(X, y)

# Get feature importances
feature_importance = pd.DataFrame({'feature': X.columns, 'importance': rf.feature_importances_})
feature_importance = feature_importance.sort_values('importance', ascending=False)

# Select top 4 features
top_features = feature_importance['feature'][:4].tolist()
X_new = X[top_features]

# Evaluate
scores = cross_val_score(rf, X_new, y, cv=5)
print("Random Forest - Mean cross-validation score: {:.2f}".format(scores.mean()))
print("Random Forest - Selected features:", top_features)



Random Forest - Mean cross-validation score: 0.39
Random Forest - Selected features: ['aet_budyko_hargreaves', 'aet_budyko_oudin', 'Month', 'latitude']


### Lasso regularization

In [8]:
from sklearn.linear_model import Lasso
from sklearn.preprocessing import StandardScaler
import numpy as np

# Standardize features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Fit Lasso
lasso = Lasso(alpha=0.1)
lasso.fit(X_scaled, y)

# Get feature importances
feature_importance = pd.DataFrame({'feature': X.columns, 'importance': np.abs(lasso.coef_)})
feature_importance = feature_importance.sort_values('importance', ascending=False)

# Select features with non-zero coefficients
selected_features = feature_importance[feature_importance['importance'] > 0]['feature'].tolist()
X_new = X[selected_features]

# Evaluate
rf = RandomForestRegressor()
scores = cross_val_score(rf, X_new, y, cv=5)
print("Lasso - Mean cross-validation score: {:.2f}".format(scores.mean()))
print("Lasso - Selected features:", selected_features)

Lasso - Mean cross-validation score: 0.24
Lasso - Selected features: ['aet_budyko_hargreaves', 'latitude', 'Elevation', 'aet_budyko_abtew', 'Month']


##  Sequential Feature Selection

In [9]:
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score
import pandas as pd



# Initialize the estimator
estimator = RandomForestRegressor()

# Create the SequentialFeatureSelector
# n_features_to_select: number of features to select
# direction: 'forward' for forward selection, 'backward' for backward selection
sfs = SequentialFeatureSelector(estimator, n_features_to_select=4, direction='forward')

# Fit the selector
sfs.fit(X, y)

# Get the selected feature names
selected_features = X.columns[sfs.get_support()].tolist()

# Create new feature matrix with only selected features
X_new = sfs.transform(X)

# Evaluate the performance
scores = cross_val_score(estimator, X_new, y, cv=5)
print("SequentialFeatureSelector - Mean cross-validation score: {:.2f}".format(scores.mean()))
print("SequentialFeatureSelector - Selected features:", selected_features)

SequentialFeatureSelector - Mean cross-validation score: 0.35
SequentialFeatureSelector - Selected features: ['Year', 'Month', 'latitude', 'Elevation']


# yearly data

## SelectKBest

In [12]:
# Assuming 'df' is your dataframe and 'target' is your target column
X = yearly_data.drop(['ET_fill', 'Site_ID', 'General_classification', 'Land_cover_details', 'Land_cover_type'], axis =1)
y = yearly_data['ET_fill']

# Select top k features
selector = SelectKBest(score_func=f_regression, k=4)
X_new = selector.fit_transform(X, y)


# Get selected feature names
selected_features = X.columns[selector.get_support()]


rf = RandomForestRegressor()
scores = cross_val_score(rf, X_new, y, cv=5)


print("Mean cross-validation score: {:.2f}".format(scores.mean()))
print("SelectKBest - Selected features:", selected_features)

Mean cross-validation score: -1.34
SelectKBest - Selected features: Index(['aet_budyko_oudin', 'aet_budyko_hargreaves', 'aet_budyko_abtew',
       'aet_budyko_mcguinness_bordne'],
      dtype='object')


## RFE

In [13]:


# Standardize features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Create and fit RFE
estimator = RandomForestRegressor()
selector = RFE(estimator, n_features_to_select=4, step=1)
X_new = selector.fit_transform(X_scaled, y)

# Get selected feature names
selected_features = X.columns[selector.support_]

# Evaluate
rf = RandomForestRegressor()
scores = cross_val_score(rf, X_new, y, cv=5)

print("RFE - Mean cross-validation score: {:.2f}".format(scores.mean()))
print("RFE - Selected features:", selected_features)

RFE - Mean cross-validation score: -0.53
RFE - Selected features: Index(['aet_budyko_oudin', 'aet_budyko_hargreaves',
       'aet_budyko_mcguinness_bordne', 'latitude'],
      dtype='object')


##  SelectFromModel

### tree-based models

In [14]:
from sklearn.ensemble import RandomForestRegressor
import pandas as pd

# Fit Random Forest
rf = RandomForestRegressor()
rf.fit(X, y)

# Get feature importances
feature_importance = pd.DataFrame({'feature': X.columns, 'importance': rf.feature_importances_})
feature_importance = feature_importance.sort_values('importance', ascending=False)

# Select top 4 features
top_features = feature_importance['feature'][:4].tolist()
X_new = X[top_features]

# Evaluate
scores = cross_val_score(rf, X_new, y, cv=5)
print("Random Forest - Mean cross-validation score: {:.2f}".format(scores.mean()))
print("Random Forest - Selected features:", top_features)

Random Forest - Mean cross-validation score: -0.16
Random Forest - Selected features: ['aet_budyko_mcguinness_bordne', 'latitude', 'aet_budyko_hargreaves', 'Elevation']


### Lasso regularization

In [15]:


# Standardize features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Fit Lasso
lasso = Lasso(alpha=0.1)
lasso.fit(X_scaled, y)

# Get feature importances
feature_importance = pd.DataFrame({'feature': X.columns, 'importance': np.abs(lasso.coef_)})
feature_importance = feature_importance.sort_values('importance', ascending=False)

# Select features with non-zero coefficients
selected_features = feature_importance[feature_importance['importance'] > 0]['feature'].tolist()
X_new = X[selected_features]

# Evaluate
rf = RandomForestRegressor()
scores = cross_val_score(rf, X_new, y, cv=5)
print("Lasso - Mean cross-validation score: {:.2f}".format(scores.mean()))
print("Lasso - Selected features:", selected_features)

  model = cd_fast.enet_coordinate_descent(


Lasso - Mean cross-validation score: -0.20
Lasso - Selected features: ['aet_budyko_oudin', 'aet_budyko_mcguinness_bordne', 'aet_budyko_abtew', 'aet_budyko_hargreaves', 'latitude', 'Elevation', 'Year']


##  Sequential Feature Selection

In [16]:
# Initialize the estimator
estimator = RandomForestRegressor()

# Create the SequentialFeatureSelector
# n_features_to_select: number of features to select
# direction: 'forward' for forward selection, 'backward' for backward selection
sfs = SequentialFeatureSelector(estimator, n_features_to_select=4, direction='forward')

# Fit the selector
sfs.fit(X, y)

# Get the selected feature names
selected_features = X.columns[sfs.get_support()].tolist()

# Create new feature matrix with only selected features
X_new = sfs.transform(X)

# Evaluate the performance
scores = cross_val_score(estimator, X_new, y, cv=5)
print("SequentialFeatureSelector - Mean cross-validation score: {:.2f}".format(scores.mean()))
print("SequentialFeatureSelector - Selected features:", selected_features)

SequentialFeatureSelector - Mean cross-validation score: -0.07
SequentialFeatureSelector - Selected features: ['aet_budyko_hargreaves', 'aet_budyko_abtew', 'latitude', 'Elevation']


# decade data

## SelectKBest

In [18]:
# Assuming 'df' is your dataframe and 'target' is your target column
X = decade_data.drop(['ET_fill', 'Site_ID', 'General_classification', 'Land_cover_details', 'Land_cover_type'], axis =1)
y = decade_data['ET_fill']

# Select top k features
selector = SelectKBest(score_func=f_regression, k=4)
X_new = selector.fit_transform(X, y)


# Get selected feature names
selected_features = X.columns[selector.get_support()]


rf = RandomForestRegressor()
scores = cross_val_score(rf, X_new, y, cv=5)


print("Mean cross-validation score: {:.2f}".format(scores.mean()))
print("SelectKBest - Selected features:", selected_features)

Mean cross-validation score: 0.03
SelectKBest - Selected features: Index(['aet_budyko_oudin', 'aet_budyko_hargreaves', 'aet_budyko_abtew',
       'aet_budyko_mcguinness_bordne'],
      dtype='object')


## RFE

In [19]:


# Standardize features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Create and fit RFE
estimator = RandomForestRegressor()
selector = RFE(estimator, n_features_to_select=4, step=1)
X_new = selector.fit_transform(X_scaled, y)

# Get selected feature names
selected_features = X.columns[selector.support_]

# Evaluate
rf = RandomForestRegressor()
scores = cross_val_score(rf, X_new, y, cv=5)

print("RFE - Mean cross-validation score: {:.2f}".format(scores.mean()))
print("RFE - Selected features:", selected_features)

RFE - Mean cross-validation score: 0.15
RFE - Selected features: Index(['Month', 'aet_budyko_oudin', 'aet_budyko_hargreaves', 'latitude'], dtype='object')


##  SelectFromModel

### tree-based models

In [20]:
from sklearn.ensemble import RandomForestRegressor
import pandas as pd

# Fit Random Forest
rf = RandomForestRegressor()
rf.fit(X, y)

# Get feature importances
feature_importance = pd.DataFrame({'feature': X.columns, 'importance': rf.feature_importances_})
feature_importance = feature_importance.sort_values('importance', ascending=False)

# Select top 4 features
top_features = feature_importance['feature'][:4].tolist()
X_new = X[top_features]

# Evaluate
scores = cross_val_score(rf, X_new, y, cv=5)
print("Random Forest - Mean cross-validation score: {:.2f}".format(scores.mean()))
print("Random Forest - Selected features:", top_features)

Random Forest - Mean cross-validation score: 0.15
Random Forest - Selected features: ['aet_budyko_hargreaves', 'aet_budyko_oudin', 'latitude', 'Month']


### Lasso regularization

In [21]:


# Standardize features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Fit Lasso
lasso = Lasso(alpha=0.1)
lasso.fit(X_scaled, y)

# Get feature importances
feature_importance = pd.DataFrame({'feature': X.columns, 'importance': np.abs(lasso.coef_)})
feature_importance = feature_importance.sort_values('importance', ascending=False)

# Select features with non-zero coefficients
selected_features = feature_importance[feature_importance['importance'] > 0]['feature'].tolist()
X_new = X[selected_features]

# Evaluate
rf = RandomForestRegressor()
scores = cross_val_score(rf, X_new, y, cv=5)
print("Lasso - Mean cross-validation score: {:.2f}".format(scores.mean()))
print("Lasso - Selected features:", selected_features)

Lasso - Mean cross-validation score: 0.26
Lasso - Selected features: ['aet_budyko_hargreaves', 'latitude', 'Elevation', 'Month']


##  Sequential Feature Selection

In [None]:
# Initialize the estimator
estimator = RandomForestRegressor()

# Create the SequentialFeatureSelector
# n_features_to_select: number of features to select
# direction: 'forward' for forward selection, 'backward' for backward selection
sfs = SequentialFeatureSelector(estimator, n_features_to_select=4, direction='forward')

# Fit the selector
sfs.fit(X, y)

# Get the selected feature names
selected_features = X.columns[sfs.get_support()].tolist()

# Create new feature matrix with only selected features
X_new = sfs.transform(X)

# Evaluate the performance
scores = cross_val_score(estimator, X_new, y, cv=5)
print("SequentialFeatureSelector - Mean cross-validation score: {:.2f}".format(scores.mean()))
print("SequentialFeatureSelector - Selected features:", selected_features)