In [9]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.preprocessing import MinMaxScaler , StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.feature_selection import RFE

In [10]:
df=pd.read_csv(r"C:\Users\neevb\OneDrive\Desktop\new_data_final_year\analysis\Data_ETo_PM.csv")

In [11]:
df['TMEAN']=(df['TMIN']+df['TMAX'])/2

In [12]:
# df.isna().sum()

In [13]:
# df.columns

In [14]:
# df.isna().sum()

In [15]:
# df.describe()

<!--  -->

In [16]:
# df.columns

In [17]:
X = df.drop(['ET0_PM','DATE'], axis=1)
y = df["ET0_PM"]

In [18]:
# # scaler = StandardScaler()
# # X_scaled = scaler.fit_transform(X)
X_train, X_test, y_train, y_test =train_test_split(X,y,test_size=0.3,shuffle=False)

<!-- Linear: -->

In [19]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

model1 = LinearRegression()
model1.fit(X_train, y_train)
y_pred1 = model1.predict(X_test)
mae1 = mean_absolute_error(y_test, y_pred1)
mse1 = mean_squared_error(y_test, y_pred1)
r21 = r2_score(y_test, y_pred1)
print("Mean Squared Error: ", mse1)
print("Mean Absolute Error: ", mae1)
print("R-2 score: ", r21)

Mean Squared Error:  4.7956048794614905e-06
Mean Absolute Error:  0.0014864514073429
R-2 score:  0.9999979229222028


<!-- Ridge: -->

In [20]:
model2 = Ridge(alpha=1.0)  # Adjust alpha as needed
model2.fit(X_train, y_train)
y_pred2 = model2.predict(X_test)
mae2=mean_absolute_error(y_test,y_pred2)
mse2=mean_squared_error(y_test,y_pred2)
r22=r2_score(y_test,y_pred2)
print("Mean Squared Error: ",mse2)
print("Mean Absolute Error: ",mae2)
print("R-2 score: ",r22)

Mean Squared Error:  4.703112541476051e-06
Mean Absolute Error:  0.0014825400167915053
R-2 score:  0.9999979629825886


<!-- Lasso: -->

In [21]:
model3=Lasso(alpha=0.01)
model3.fit(X_train, y_train)
y_pred3 = model3.predict(X_test)
mae3=mean_absolute_error(y_test,y_pred3)
mse3=mean_squared_error(y_test,y_pred3)
r23=r2_score(y_test,y_pred3)
print("Mean Squared Error: ",mse3)
print("Mean Absolute Error: ",mae3)
print("R-2 score: ",r23)

Mean Squared Error:  0.00030235926545812953
Mean Absolute Error:  0.013774753160955517
R-2 score:  0.999869041813734


In [19]:
from sklearn.model_selection import GridSearchCV

# Define the parameter grid
param_grid = {
    'max_depth': [3, 5, 7, 10],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Create a Decision Tree Regressor
model = DecisionTreeRegressor()

# Create a Grid Search object
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=5, n_jobs=-1)

# Fit the Grid Search to the training data
grid_search.fit(X_train, y_train)

# Print the best parameters and score
print(grid_search.best_params_)
print(grid_search.best_score_)

# Use the best parameters to train the final model
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)

# Evaluate the final model
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("Mean Squared Error:", mse)
print("Mean Absolute Error:", mae)
print("R-squared Score:", r2)

{'max_depth': 10, 'min_samples_leaf': 2, 'min_samples_split': 5}
0.9978147851297912
Mean Squared Error: 0.004344090720673638
Mean Absolute Error: 0.04556318698827701
R-squared Score: 0.9981184825247802


<!-- Decision Tree: -->

In [18]:
model4 = DecisionTreeRegressor(
    criterion='squared_error',  # You can also use 'mae'
    max_depth=10,  # Adjust the maximum depth of the tree
    min_samples_split=5,  # Minimum samples required to split a node
    min_samples_leaf=2  # Minimum samples required in a leaf node   

)
model4.fit(X_train, y_train)
y_pred4 = model4.predict(X_test)
mae4=mean_absolute_error(y_test,y_pred4)
mse4=mean_squared_error(y_test,y_pred4)
r24=r2_score(y_test,y_pred4)
print("Mean Squared Error: ",mse4)
print("Mean Absolute Error: ",mae4)
print("R-2 score: ",r24)

Mean Squared Error:  0.004330952850134008
Mean Absolute Error:  0.04542600164036401
R-2 score:  0.9981241728140944


In [21]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint

# Define the parameter distribution
param_dist = {
    'n_estimators': randint(low=50, high=200),
    'max_depth': randint(low=1, high=10),
    'min_samples_split': randint(low=2, high=10),
    'min_samples_leaf': randint(low=1, high=5)
}

# Create a Random Forest Regressor
rf = RandomForestRegressor()

# Create a Randomized Search object
random_search = RandomizedSearchCV(estimator=rf, param_distributions=param_dist, n_iter=10, cv=5, n_jobs=-1)

# Fit the Randomized Search to the training data
random_search.fit(X_train, y_train)

# Print the best parameters and score
print(random_search.best_params_)
print(random_search.best_score_)

{'max_depth': 9, 'min_samples_leaf': 2, 'min_samples_split': 8, 'n_estimators': 148}
0.99905460913678


In [18]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV

# Define the model
rf = RandomForestRegressor(random_state=42)

# Define a comprehensive parameter grid
param_grid = {
    'n_estimators': [50, 100, 200, 500],  # Number of trees in the forest
    'max_depth': [None, 10, 20, 30],  # Maximum depth of the tree
    'min_samples_split': [2, 5, 10],  # Minimum samples required to split a node
    'min_samples_leaf': [1, 2, 4],  # Minimum samples required at a leaf node
    'max_features': ['auto', 'sqrt', 'log2'],  # Number of features to consider for split
    'bootstrap': [True, False]  # Whether bootstrap samples are used
}

# Initialize GridSearchCV (exhaustive search over all combinations)
grid_search = GridSearchCV(
    estimator=rf,
    param_grid=param_grid,
    scoring='neg_mean_squared_error',  # Use MSE for regression
    cv=5,  # 5-fold cross-validation
    verbose=2,  # Show progress
    n_jobs=-1  # Use all available cores for parallel processing
)

# Fit GridSearchCV on training data
grid_search.fit(X_train, y_train)

# Get the best parameters and best model
best_params = grid_search.best_params_
best_model = grid_search.best_estimator_

print("Best Parameters:", best_params)


Fitting 5 folds for each of 864 candidates, totalling 4320 fits


1440 fits failed out of a total of 4320.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
461 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\neevb\anaconda3\envs\evdp\lib\site-packages\sklearn\model_selection\_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\neevb\anaconda3\envs\evdp\lib\site-packages\sklearn\base.py", line 1466, in wrapper
    estimator._validate_params()
  File "c:\Users\neevb\anaconda3\envs\evdp\lib\site-packages\sklearn\base.py", line 666, in _validate_params
    validate_parameter_constraints(
  File "c:\Users\neevb\anaconda3\envs\evdp\lib\site-packages\sklearn\utils\_param_validation.py", line 95, in validate_par

Best Parameters: {'bootstrap': False, 'max_depth': None, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}


In [19]:
model5 =RandomForestRegressor(
bootstrap=False, max_depth=None,max_features='sqrt',min_samples_leaf=1,min_samples_split=2,n_estimators=200
)

# Train the model on the training data
model5.fit(X_train, y_train)

# Make predictions on the test data
y_pred5 = model5.predict(X_test)

# Evaluate model performance
mae5 = mean_absolute_error(y_test, y_pred5)
mse5 = mean_squared_error(y_test, y_pred5)
r25 = r2_score(y_test, y_pred5)

# Print evaluation metrics
print("Mean Squared Error: ", mse5)
print("Mean Absolute Error: ", mae5)
print("R-2 score: ", r25)

Mean Squared Error:  0.0017373839798168163
Mean Absolute Error:  0.02476083083677339
R-2 score:  0.9992475022900338


In [23]:
param_grid = {
    'n_estimators': [100, 200, 300],
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 5, 7],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4] 
}

# Create a Gradient Boosting Regressor
gbm = GradientBoostingRegressor()

# Create a Grid Search object
grid_search = GridSearchCV(estimator=gbm, param_grid=param_grid, cv=5, n_jobs=-1)

# Fit the Grid Search to the training data
grid_search.fit(X_train, y_train)

# Print the best   
# parameters and score
print(grid_search.best_params_)
print(grid_search.best_score_)


{'learning_rate': 0.1, 'max_depth': 5, 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 300}
0.999689241466552


In [10]:
model6 = GradientBoostingRegressor(
   learning_rate=0.1,max_depth=5,min_samples_leaf=1,min_samples_split=5,n_estimators=300
)

# Train the model on the training data
model6.fit(X_train, y_train)

# Make predictions on the test data
y_pred6 = model6.predict(X_test)

# Evaluate
mae6 = mean_absolute_error(y_test, y_pred6)
mse6 = mean_squared_error(y_test, y_pred6)
r26 = r2_score(y_test, y_pred6)

# Print evaluation metrics
print("Mean Squared Error:", mse6)
print("Mean Absolute Error:", mae6)
print("R-squared Score:", r26)

Mean Squared Error: 0.0008385553931361603
Mean Absolute Error: 0.021934917644513538
R-squared Score: 0.9996368039418199


In [25]:
param_grid = {
    'C': [0.1, 1, 10],
    'kernel': ['linear',
 'poly', 'rbf', 'sigmoid'],
    'gamma': ['scale', 'auto'],
    'epsilon': [0.1, 0.2, 0.3]
}

# Create an SVR model
svr = SVR()

# Create a Grid Search object
grid_search = GridSearchCV(estimator=svr, param_grid=param_grid, cv=5, n_jobs=-1)

# Fit the Grid Search to the training data
grid_search.fit(X_train, y_train)

# Print the best   
#  parameters and score
print(grid_search.best_params_)
print(grid_search.best_score_)

{'C': 0.1, 'epsilon': 0.1, 'gamma': 'scale', 'kernel': 'linear'}
0.9996333450791044


In [11]:
model7 = SVR(kernel='linear', C=0.1, epsilon=0.1, gamma='scale')

# Train the model on the training data
model7.fit(X_train, y_train)

# Make predictions on the test data
y_pred7 = model7.predict(X_test)

# Evaluate model performance
mae7 = mean_absolute_error(y_test, y_pred7)
mse7 = mean_squared_error(y_test, y_pred7)
r27 = r2_score(y_test, y_pred7)

# Print evaluation metrics
print("Mean Squared Error:", mse7)
print("Mean Absolute Error:", mae7)
print("R-squared Score:", r27)

Mean Squared Error: 0.0010284989312415898
Mean Absolute Error: 0.02602499535838756
R-squared Score: 0.9995545353822455


In [9]:
from xgboost import XGBRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error

# Patch XGBRegressor to work with Scikit-Learn 2.1.1
def _sklearn_tags(self):
    return {"non_deterministic": True}

XGBRegressor.__sklearn_tags__ = _sklearn_tags

# Initialize XGBRegressor
xgb = XGBRegressor(objective='reg:squarederror', random_state=42)

# Define the parameter grid
param_grid = {
    'n_estimators': [100, 200, 500],
    'learning_rate': [0.01, 0.05, 0.1, 0.2],
    'max_depth': [3, 5, 7],
    'subsample': [0.7, 0.8, 0.9, 1.0],
    'colsample_bytree': [0.7, 0.8, 0.9, 1.0],
    'gamma': [0, 0.1, 0.2, 0.3],
    'reg_alpha': [0, 0.01, 0.1, 1],
    'reg_lambda': [0.01, 0.1, 1, 10]
}

# Initialize GridSearchCV
grid_search = GridSearchCV(
    estimator=xgb,
    param_grid=param_grid,
    scoring='neg_mean_squared_error',  
    cv=5,  
    verbose=2,
    n_jobs=-1  
)

# Fit GridSearchCV on training data
grid_search.fit(X_train, y_train)

# Get the best parameters and best model
best_params = grid_search.best_params_
best_model = grid_search.best_estimator_

print("Best Parameters:", best_params)

# Predict on test set
y_pred = best_model.predict(X_test)

# Evaluate performance
mse = mean_squared_error(y_test, y_pred)
print("Test MSE:", mse)


Fitting 5 folds for each of 36864 candidates, totalling 184320 fits


4 fits failed out of a total of 184320.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
2 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\neevb\anaconda3\envs\evdp\lib\site-packages\sklearn\model_selection\_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\neevb\anaconda3\envs\evdp\lib\site-packages\xgboost\core.py", line 726, in inner_f
    return func(**kwargs)
  File "c:\Users\neevb\anaconda3\envs\evdp\lib\site-packages\xgboost\sklearn.py", line 1081, in fit
    train_dmatrix, evals = _wrap_evaluation_matrices(
  File "c:\Users\neevb\anaconda3\envs\evdp\lib\site-packages\xgboost\sklearn.py", line 596, in _wrap_evaluation_matrices
   

Best Parameters: {'colsample_bytree': 1.0, 'gamma': 0, 'learning_rate': 0.05, 'max_depth': 5, 'n_estimators': 500, 'reg_alpha': 0.01, 'reg_lambda': 0.01, 'subsample': 1.0}
Test MSE: 0.001046110048142537


In [11]:
# model8 = from xgboost import XGBRegressor

model8 = XGBRegressor(
    n_estimators=500,
    learning_rate=0.05,
    gamma=0,
    max_depth=5,
    reg_alpha=0.01,
    reg_lambda=0.01,
    colsample_bytree=1.0,
    subsample=1.0
)


# Train the model on the training data
model8.fit(X_train, y_train)

# Make predictions on the test data
y_pred8 = model8.predict(X_test)

# Evaluate   
#  model performance
mae8 = mean_absolute_error(y_test, y_pred8)
mse8 = mean_squared_error(y_test, y_pred8) 

r28 = r2_score(y_test, y_pred8)

# Print evaluation metrics
print("Mean Squared Error:", mse8)
print("Mean Absolute Error:", mae8)
print("R-squared Score:", r28)

Mean Squared Error: 0.001046110048142537
Mean Absolute Error: 0.01848139808308132
R-squared Score: 0.9995469076354193


In [30]:
from sklearn.linear_model import ElasticNet
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
param_grid = {
    'alpha': [0.0001, 0.01, 0.1, 1.0, 10.0],  # Explore a wider range of alpha values
    'l1_ratio': [0.1, 0.3, 0.5, 0.7, 0.9]  # Explore a balanced range of l1_ratio values
}

# Create the ElasticNet model
elastic_net = ElasticNet(random_state=42)  # Set random_state for reproducibility

# Perform Grid Search to find the best hyperparameters
grid_search = GridSearchCV(elastic_net, param_grid, cv=5, scoring="neg_mean_squared_error")
grid_search.fit(X_train,y_train)

# Extract the best parameters and model
best_params = grid_search.best_params_
best_model = grid_search.best_estimator_
print("Best Hyperparameters:", best_params)

Best Hyperparameters: {'alpha': 0.0001, 'l1_ratio': 0.9}


In [22]:
from sklearn.linear_model import ElasticNet
# Define the ElasticNet model with hyperparameters
model9 = ElasticNet(alpha=0.0001, l1_ratio=0.9)  # Adjust alpha and l1_ratio as needed

# Train the model on the training data
model9.fit(X_train, y_train)

# Make predictions on the test data
y_pred9 = model9.predict(X_test)

# Evaluate model performance
mae9 = mean_absolute_error(y_test, y_pred9)
mse9 = mean_squared_error(y_test, y_pred9)
r29 = r2_score(y_test, y_pred9)

# Print evaluation metrics
print("Mean Squared Error:", mse9)
print("Mean Absolute Error:", mae9)
print("R-squared Score:", r29)

Mean Squared Error: 4.6159491820061995e-06
Mean Absolute Error: 0.0014727844667903542
R-squared Score: 0.9999980007348812


In [24]:
from sklearn.neural_network import MLPRegressor
from sklearn.model_selection import GridSearchCV

# Define the parameter grid
param_grid = {
    'hidden_layer_sizes': [(50,), (100,)],
    'activation': ['relu'],  # Experiment with other activations
    'solver': ['adam'],  # Consider other solvers (sgd, lbfgs)
    'alpha': [0.001, 0.01],
    'learning_rate_init': [0.001, 0.01],
    'early_stopping': [True],  # Enable early stopping
    'validation_fraction': [0.1],  # Fraction of training data to set aside for validation
    'n_iter_no_change': [5],  # Number of iterations with no change to wait for early stopping
}

# Create the MLPRegressor model
mlp = MLPRegressor(random_state=42)

# Perform Grid Search
grid_search = GridSearchCV(mlp, param_grid, cv=5, scoring='neg_mean_squared_error', verbose=1)
grid_search.fit(X_train, y_train)

# Get the best model and its parameters
best_model = grid_search.best_estimator_
best_params = grid_search.best_params_

# Output the best parameters
print("Best parameters found: ", best_params)

Fitting 5 folds for each of 8 candidates, totalling 40 fits
Best parameters found:  {'activation': 'relu', 'alpha': 0.01, 'early_stopping': True, 'hidden_layer_sizes': (100,), 'learning_rate_init': 0.001, 'n_iter_no_change': 5, 'solver': 'adam', 'validation_fraction': 0.1}


In [25]:

model10 = MLPRegressor(hidden_layer_sizes=(100,),activation='relu', alpha= 0.01,early_stopping=True,learning_rate_init= 0.001,n_iter_no_change=5,solver='adam',validation_fraction= 0.1)

# Train the model on the training data
model10.fit(X_train, y_train)

# Make predictions on the test data
y_pred10 = model10.predict(X_test)

# Evaluate model performance
mae10 = mean_absolute_error(y_test, y_pred10)
mse10 = mean_squared_error(y_test, y_pred10)
r210 = r2_score(y_test, y_pred10)

# Print evaluation metrics
print("Mean Squared Error:", mse10)
print("Mean Absolute Error:", mae10)
print("R-squared Score:", r210)

Mean Squared Error: 0.0021528462615903227
Mean Absolute Error: 0.03290806170090736
R-squared Score: 0.9990675567977053


In [18]:
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
param_grid = {
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.1, 0.2],
    'max_iter': [100, 200, 300],
}

# Create a HistGradientBoostingRegressor
model = HistGradientBoostingRegressor()

# Create a Grid Search object
grid_search = GridSearchCV(model, param_grid, cv=5, n_jobs=-1)

# Fit the Grid Search to the training data
grid_search.fit(X_train, y_train)

# Print the best parameters and score
print(grid_search.best_params_)



{'learning_rate': 0.1, 'max_depth': 7, 'max_iter': 300}


In [19]:
model = HistGradientBoostingRegressor(max_depth=5, learning_rate=0.1, max_iter=100)

# Train the model on the training data
model.fit(X_train, y_train)

# Make predictions on the test data
y_pred = model.predict(X_test)

# Evaluate model performance
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# Print evaluation metrics
print("Mean Squared Error:", mse)
print("Mean Absolute Error:", mae)
print("R-squared Score:", r2) 

Mean Squared Error: 0.0011570307853411995
Mean Absolute Error: 0.02216060678621396
R-squared Score: 0.9994988655205503


<!--  -->