In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
from scipy import signal
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
import math

# to make this notebook's output stable across runs
np.random.seed(42)

# To plot pretty figures
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)

# Where to save the figures
PROJECT_ROOT_DIR = "."
TOPIC_ID = "decision_trees"
IMAGES_PATH = os.path.join(PROJECT_ROOT_DIR, "images", TOPIC_ID)
os.makedirs(IMAGES_PATH, exist_ok=True)

def save_fig(fig_id, tight_layout=True, fig_extension="png", resolution=300):
    path = os.path.join(IMAGES_PATH, fig_id + "." + fig_extension)
    print("Saving figure", fig_id)
    if tight_layout:
        plt.tight_layout()
    plt.savefig(path, format=fig_extension, dpi=resolution)

### Import libraries

In [None]:
# ML models
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import BaggingRegressor
from sklearn.ensemble import StackingRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.preprocessing import StandardScaler
from sklearn.mixture import GaussianMixture
from sklearn.model_selection import KFold
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import r2_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsRegressor

# Metrics
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

### Cleaning the data

In [None]:
data = pd.read_csv('new_data.csv') 
new_mdb = data[[ "CMRSET.ET", "RAD", "LST", "EVI", "NDVI", "SM", "LAI", "LAT", "LON", "Field.ET"]]
new_mdb = new_mdb.sample(frac=1) # Shuffle the data

X =  new_mdb.iloc[:, :-1].values # Variables used in the model to determine the predictor
y = new_mdb.iloc[:, -1].values # Predictor: 'residual'
LENGTH = len(new_mdb)
df_leave_one_out = pd.DataFrame(columns=['ML Algorithm', 'MSE', 'RMSE', 'R^2'])

### Decision Tree Cross Validation on Predicted.ET

In [None]:
tree = DecisionTreeRegressor(max_depth=5, random_state=42) 

# Set up GMM parameters
k = 2
gmm = GaussianMixture(n_components=k)

# Set up K-Fold cross validation
cv = KFold(n_splits=len(X), shuffle=False)

# Set up an empty DataFrame to store results
results = pd.DataFrame(columns=['predicted', 'actual'])

# Scale the data
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Iterate over the cross validation splits
for train_idx, test_idx in cv.split(X):
    # Split the data into training and test sets
    X_train, X_test = X_scaled[train_idx], X_scaled[test_idx]
    y_train, y_test = y[train_idx], y[test_idx]

    # Fit the GMM model to the training data and make predictions for the test data
    gmm.fit(X_train)
    test_cluster = gmm.predict(X_test.reshape(1, -1))[0]
    
    # Get the training data corresponding to the closest cluster
    cluster_X = X_train[gmm.predict(X_train) == test_cluster]
    cluster_y = y_train[gmm.predict(X_train) == test_cluster]
    
    # Fit the Gradient Boosting model to the training data for the selected cluster
    tree.fit(cluster_X, cluster_y)
    
    # Make predictions on the test data
    y_pred = tree.predict(X_test.reshape(1, -1))[0]
    
    # Store the predicted and actual values in the results DataFrame
    results = results.append({'predicted': y_pred, 'actual': y_test[0]}, ignore_index=True)

# Calculate the R-squared score for the predictions
r2 = r2_score(results['actual'], results['predicted'])
mse = mean_squared_error(results['actual'], results['predicted'])
d = [f'Decision Tree', round(mse, 2),  round((mse)**(1/2.0), 2), round(r2_score(results['actual'], results['predicted']), 2) ] # round(scores_r2.mean(), 2)
df_leave_one_out.loc[len(df_leave_one_out)] = d

df_leave_one_out


Unnamed: 0,ML Algorithm,MSE,RMSE,R^2
0,Decision Tree,109.46,10.46,0.72


### Random Forest Predicted ET directly GMM

In [5]:
rnd_clf = RandomForestRegressor(n_estimators=300, min_samples_split = 7, random_state=42) # Similar results, less trees

# Set up GMM parameters
k = 2
gmm = GaussianMixture(n_components=k)

# Set up K-Fold cross validation
cv = KFold(n_splits=len(X), shuffle=False)

# Set up an empty DataFrame to store results
results = pd.DataFrame(columns=['predicted', 'actual'])

# Scale the data
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Iterate over the cross validation splits
for train_idx, test_idx in cv.split(X):
    # Split the data into training and test sets
    X_train, X_test = X_scaled[train_idx], X_scaled[test_idx]
    y_train, y_test = y[train_idx], y[test_idx]

    # Fit the GMM model to the training data and make predictions for the test data
    gmm.fit(X_train)
    test_cluster = gmm.predict(X_test.reshape(1, -1))[0]
    
    # Get the training data corresponding to the closest cluster
    cluster_X = X_train[gmm.predict(X_train) == test_cluster]
    cluster_y = y_train[gmm.predict(X_train) == test_cluster]
    
    # Fit the Gradient Boosting model to the training data for the selected cluster
    rnd_clf.fit(cluster_X, cluster_y)
    
    # Make predictions on the test data
    y_pred = rnd_clf.predict(X_test.reshape(1, -1))[0]
    
    # Store the predicted and actual values in the results DataFrame
    results = results.append({'predicted': y_pred, 'actual': y_test[0]}, ignore_index=True)

# Calculate the R-squared score for the predictions
r2 = r2_score(results['actual'], results['predicted'])
mse = mean_squared_error(results['actual'], results['predicted'])
d = [f'Random Forest', round(mse, 2),  round((mse)**(1/2.0), 2), round(r2_score(results['actual'], results['predicted']), 2) ] # round(scores_r2.mean(), 2)
df_leave_one_out.loc[len(df_leave_one_out)] = d

df_leave_one_out


Unnamed: 0,ML Algorithm,MSE,RMSE,R^2
0,Decision Tree,109.46,10.46,0.72
1,Random Forest,63.53,7.97,0.84


### Gradient Boosting

In [6]:
# Set up GMM parameters
k = 2
gmm = GaussianMixture(n_components=k)

# Set up Gradient Boosting parameters
gb_clf = GradientBoostingRegressor(max_depth=5,
                                   min_samples_split=7,
                                   n_estimators=200,
                                   learning_rate=0.1,
                                   random_state=42)

# Set up K-Fold cross validation
cv = KFold(n_splits=len(X), shuffle=False)

# Set up an empty DataFrame to store results
results = pd.DataFrame(columns=['predicted', 'actual'])

# Scale the data
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Iterate over the cross validation splits
for train_idx, test_idx in cv.split(X):
    # Split the data into training and test sets
    X_train, X_test = X_scaled[train_idx], X_scaled[test_idx]
    y_train, y_test = y[train_idx], y[test_idx]

    # Fit the GMM model to the training data and make predictions for the test data
    gmm.fit(X_train)
    test_cluster = gmm.predict(X_test.reshape(1, -1))[0]
    
    # Get the training data corresponding to the closest cluster
    cluster_X = X_train[gmm.predict(X_train) == test_cluster]
    cluster_y = y_train[gmm.predict(X_train) == test_cluster]
    
    # Fit the Gradient Boosting model to the training data for the selected cluster
    gb_clf.fit(cluster_X, cluster_y)
    
    # Make predictions on the test data
    y_pred = gb_clf.predict(X_test.reshape(1, -1))[0]
    
    # Store the predicted and actual values in the results DataFrame
    results = results.append({'predicted': y_pred, 'actual': y_test[0]}, ignore_index=True)

# Calculate the R-squared score for the predictions
r2 = r2_score(results['actual'], results['predicted'])
mse = mean_squared_error(results['actual'], results['predicted'])
d = [f'Gradient Boosting', round(mse, 2),  round((mse)**(1/2.0), 2), round(r2_score(results['actual'], results['predicted']), 2) ] # round(scores_r2.mean(), 2)
df_leave_one_out.loc[len(df_leave_one_out)] = d

df_leave_one_out

Unnamed: 0,ML Algorithm,MSE,RMSE,R^2
0,Decision Tree,109.46,10.46,0.72
1,Random Forest,63.53,7.97,0.84
2,Gradient Boosting,60.5,7.78,0.84


### Ada Boosting

In [7]:
ada_clf = AdaBoostRegressor(learning_rate = 0.1, n_estimators=200, random_state=42)

# Set up GMM parameters
k = 2
gmm = GaussianMixture(n_components=k)

# Set up K-Fold cross validation
cv = KFold(n_splits=len(X), shuffle=False)

# Set up an empty DataFrame to store results
results = pd.DataFrame(columns=['predicted', 'actual'])

# Scale the data
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Iterate over the cross validation splits
for train_idx, test_idx in cv.split(X):
    # Split the data into training and test sets
    X_train, X_test = X_scaled[train_idx], X_scaled[test_idx]
    y_train, y_test = y[train_idx], y[test_idx]

    # Fit the GMM model to the training data and make predictions for the test data
    gmm.fit(X_train)
    test_cluster = gmm.predict(X_test.reshape(1, -1))[0]
    
    # Get the training data corresponding to the closest cluster
    cluster_X = X_train[gmm.predict(X_train) == test_cluster]
    cluster_y = y_train[gmm.predict(X_train) == test_cluster]
    
    # Fit the Gradient Boosting model to the training data for the selected cluster
    ada_clf.fit(cluster_X, cluster_y)
    
    # Make predictions on the test data
    y_pred = ada_clf.predict(X_test.reshape(1, -1))[0]
    
    # Store the predicted and actual values in the results DataFrame
    results = results.append({'predicted': y_pred, 'actual': y_test[0]}, ignore_index=True)

# Calculate the R-squared score for the predictions
r2 = r2_score(results['actual'], results['predicted'])
mse = mean_squared_error(results['actual'], results['predicted'])
d = [f'Ada Boosting', round(mse, 2),  round((mse)**(1/2.0), 2), round(r2_score(results['actual'], results['predicted']), 2) ] # round(scores_r2.mean(), 2)
df_leave_one_out.loc[len(df_leave_one_out)] = d

df_leave_one_out

Unnamed: 0,ML Algorithm,MSE,RMSE,R^2
0,Decision Tree,109.46,10.46,0.72
1,Random Forest,63.53,7.97,0.84
2,Gradient Boosting,60.5,7.78,0.84
3,Ada Boosting,80.79,8.99,0.79


### Bagging Regressor

In [8]:
bag_reg = BaggingRegressor(n_estimators=300, random_state=42)
# Set up GMM parameters
k = 2
gmm = GaussianMixture(n_components=k)

# Set up K-Fold cross validation
cv = KFold(n_splits=len(X), shuffle=False)

# Set up an empty DataFrame to store results
results = pd.DataFrame(columns=['predicted', 'actual'])

# Scale the data
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Iterate over the cross validation splits
for train_idx, test_idx in cv.split(X):
    # Split the data into training and test sets
    X_train, X_test = X_scaled[train_idx], X_scaled[test_idx]
    y_train, y_test = y[train_idx], y[test_idx]

    # Fit the GMM model to the training data and make predictions for the test data
    gmm.fit(X_train)
    test_cluster = gmm.predict(X_test.reshape(1, -1))[0]
    
    # Get the training data corresponding to the closest cluster
    cluster_X = X_train[gmm.predict(X_train) == test_cluster]
    cluster_y = y_train[gmm.predict(X_train) == test_cluster]
    
    # Fit the Gradient Boosting model to the training data for the selected cluster
    bag_reg.fit(cluster_X, cluster_y)
    
    # Make predictions on the test data
    y_pred = bag_reg.predict(X_test.reshape(1, -1))[0]
    
    # Store the predicted and actual values in the results DataFrame
    results = results.append({'predicted': y_pred, 'actual': y_test[0]}, ignore_index=True)

# Calculate the R-squared score for the predictions
r2 = r2_score(results['actual'], results['predicted'])
mse = mean_squared_error(results['actual'], results['predicted'])
d = [f'Bagging Regressor', round(mse, 2),  round((mse)**(1/2.0), 2), round(r2_score(results['actual'], results['predicted']), 2) ] # round(scores_r2.mean(), 2)
df_leave_one_out.loc[len(df_leave_one_out)] = d

df_leave_one_out

Unnamed: 0,ML Algorithm,MSE,RMSE,R^2
0,Decision Tree,109.46,10.46,0.72
1,Random Forest,63.53,7.97,0.84
2,Gradient Boosting,60.5,7.78,0.84
3,Ada Boosting,80.79,8.99,0.79
4,Bagging Regressor,64.84,8.05,0.83


### SVM Regression

In [None]:
svm_regr = make_pipeline(StandardScaler(), SVR(C=1.0, epsilon=0.5, kernel = "sigmoid"))

# Set up GMM parameters
k = 2
gmm = GaussianMixture(n_components=k)

# Set up K-Fold cross validation
cv = KFold(n_splits=len(X), shuffle=False)

# Set up an empty DataFrame to store results
results = pd.DataFrame(columns=['predicted', 'actual'])

# Scale the data
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Iterate over the cross validation splits
for train_idx, test_idx in cv.split(X):
    # Split the data into training and test sets
    X_train, X_test = X_scaled[train_idx], X_scaled[test_idx]
    y_train, y_test = y[train_idx], y[test_idx]

    # Fit the GMM model to the training data and make predictions for the test data
    gmm.fit(X_train)
    test_cluster = gmm.predict(X_test.reshape(1, -1))[0]
    
    # Get the training data corresponding to the closest cluster
    cluster_X = X_train[gmm.predict(X_train) == test_cluster]
    cluster_y = y_train[gmm.predict(X_train) == test_cluster]
    
    # Fit the Gradient Boosting model to the training data for the selected cluster
    svm_regr.fit(cluster_X, cluster_y)
    
    # Make predictions on the test data
    y_pred = svm_regr.predict(X_test.reshape(1, -1))[0]
    
    # Store the predicted and actual values in the results DataFrame
    results = results.append({'predicted': y_pred, 'actual': y_test[0]}, ignore_index=True)

# Calculate the R-squared score for the predictions
r2 = r2_score(results['actual'], results['predicted'])
mse = mean_squared_error(results['actual'], results['predicted'])
d = [f'SVM Regressor', round(mse, 2),  round((mse)**(1/2.0), 2), round(r2_score(results['actual'], results['predicted']), 2) ] # round(scores_r2.mean(), 2)
df_leave_one_out.loc[len(df_leave_one_out)] = d

df_leave_one_out

Unnamed: 0,ML Algorithm,MSE,RMSE,R^2
0,Decision Tree,109.46,10.46,0.72
1,Random Forest,63.53,7.97,0.84
2,Gradient Boosting,60.5,7.78,0.84
3,Ada Boosting,80.79,8.99,0.79
4,Bagging Regressor,64.84,8.05,0.83
5,SVM Regressor,227.35,15.08,0.41


### Stacking

In [10]:
estimators = [
     ('gb', GradientBoostingRegressor(max_depth=5,
                                   min_samples_split = 7,
                                   n_estimators=300,
                                   learning_rate=0.1,
                                   random_state=42)),
     ('rf', RandomForestRegressor(n_estimators=300, min_samples_split = 7, random_state=42)),
     ('bg', BaggingRegressor(n_estimators=300, random_state=42))

]

stack_reg = StackingRegressor(
     estimators=estimators
 )


# Set up GMM parameters
k = 2
gmm = GaussianMixture(n_components=k)

# Set up K-Fold cross validation
cv = KFold(n_splits=len(X), shuffle=False)

# Set up an empty DataFrame to store results
results = pd.DataFrame(columns=['predicted', 'actual'])

# Scale the data
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Iterate over the cross validation splits
for train_idx, test_idx in cv.split(X):
    # Split the data into training and test sets
    X_train, X_test = X_scaled[train_idx], X_scaled[test_idx]
    y_train, y_test = y[train_idx], y[test_idx]

    # Fit the GMM model to the training data and make predictions for the test data
    gmm.fit(X_train)
    test_cluster = gmm.predict(X_test.reshape(1, -1))[0]
    
    # Get the training data corresponding to the closest cluster
    cluster_X = X_train[gmm.predict(X_train) == test_cluster]
    cluster_y = y_train[gmm.predict(X_train) == test_cluster]
    
    # Fit the Gradient Boosting model to the training data for the selected cluster
    stack_reg.fit(cluster_X, cluster_y)
    
    # Make predictions on the test data
    y_pred = stack_reg.predict(X_test.reshape(1, -1))[0]
    
    # Store the predicted and actual values in the results DataFrame
    results = results.append({'predicted': y_pred, 'actual': y_test[0]}, ignore_index=True)

# Calculate the R-squared score for the predictions
r2 = r2_score(results['actual'], results['predicted'])
mse = mean_squared_error(results['actual'], results['predicted'])
d = [f'Stacking', round(mse, 2),  round((mse)**(1/2.0), 2), round(r2_score(results['actual'], results['predicted']), 2) ] # round(scores_r2.mean(), 2)
df_leave_one_out.loc[len(df_leave_one_out)] = d

df_leave_one_out

Unnamed: 0,ML Algorithm,MSE,RMSE,R^2
0,Decision Tree,109.46,10.46,0.72
1,Random Forest,63.53,7.97,0.84
2,Gradient Boosting,60.5,7.78,0.84
3,Ada Boosting,80.79,8.99,0.79
4,Bagging Regressor,64.84,8.05,0.83
5,SVM Regressor,227.35,15.08,0.41
6,Stacking,126.43,11.24,0.67


### Linear Regression

In [11]:
from sklearn.linear_model import LinearRegression
lin_reg = LinearRegression()

# Set up GMM parameters
k = 2
gmm = GaussianMixture(n_components=k)

# Set up K-Fold cross validation
cv = KFold(n_splits=len(X), shuffle=False)

# Set up an empty DataFrame to store results
results = pd.DataFrame(columns=['predicted', 'actual'])

# Scale the data
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Iterate over the cross validation splits
for train_idx, test_idx in cv.split(X):
    # Split the data into training and test sets
    X_train, X_test = X_scaled[train_idx], X_scaled[test_idx]
    y_train, y_test = y[train_idx], y[test_idx]

    # Fit the GMM model to the training data and make predictions for the test data
    gmm.fit(X_train)
    test_cluster = gmm.predict(X_test.reshape(1, -1))[0]
    
    # Get the training data corresponding to the closest cluster
    cluster_X = X_train[gmm.predict(X_train) == test_cluster]
    cluster_y = y_train[gmm.predict(X_train) == test_cluster]
    
    # Fit the Gradient Boosting model to the training data for the selected cluster
    lin_reg.fit(cluster_X, cluster_y)
    
    # Make predictions on the test data
    y_pred = lin_reg.predict(X_test.reshape(1, -1))[0]
    
    # Store the predicted and actual values in the results DataFrame
    results = results.append({'predicted': y_pred, 'actual': y_test[0]}, ignore_index=True)

# Calculate the R-squared score for the predictions
r2 = r2_score(results['actual'], results['predicted'])
mse = mean_squared_error(results['actual'], results['predicted'])
d = [f'Linear Regression', round(mse, 2),  round((mse)**(1/2.0), 2), round(r2_score(results['actual'], results['predicted']), 2) ] # round(scores_r2.mean(), 2)
df_leave_one_out.loc[len(df_leave_one_out)] = d

df_leave_one_out

Unnamed: 0,ML Algorithm,MSE,RMSE,R^2
0,Decision Tree,109.46,10.46,0.72
1,Random Forest,63.53,7.97,0.84
2,Gradient Boosting,60.5,7.78,0.84
3,Ada Boosting,80.79,8.99,0.79
4,Bagging Regressor,64.84,8.05,0.83
5,SVM Regressor,227.35,15.08,0.41
6,Stacking,126.43,11.24,0.67
7,Linear Regression,147.6,12.15,0.62


### Majority voting regressor

In [12]:
from sklearn.ensemble import VotingRegressor
rnd_clf = RandomForestRegressor(n_estimators=300, min_samples_split = 7, random_state=42)
gb_clf = GradientBoostingRegressor(max_depth=5,
                                   min_samples_split = 7,
                                   n_estimators=300,
                                   learning_rate=0.1,
                                   random_state=42)
bag_reg = BaggingRegressor(n_estimators=300, random_state=42)
# tree = DecisionTreeRegressor(max_depth=5, random_state=42) 
# lin_reg = LinearRegression()

er = VotingRegressor([('rf', rnd_clf), ('gb', gb_clf), ('bg', bag_reg)]) # ('tree', tree), ('lin_reg', lin_reg)

# Set up GMM parameters
k = 2
gmm = GaussianMixture(n_components=k)

# Set up K-Fold cross validation
cv = KFold(n_splits=len(X), shuffle=False)

# Set up an empty DataFrame to store results
results = pd.DataFrame(columns=['predicted', 'actual'])

# Scale the data
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Iterate over the cross validation splits
for train_idx, test_idx in cv.split(X):
    # Split the data into training and test sets
    X_train, X_test = X_scaled[train_idx], X_scaled[test_idx]
    y_train, y_test = y[train_idx], y[test_idx]

    # Fit the GMM model to the training data and make predictions for the test data
    gmm.fit(X_train)
    test_cluster = gmm.predict(X_test.reshape(1, -1))[0]
    
    # Get the training data corresponding to the closest cluster
    cluster_X = X_train[gmm.predict(X_train) == test_cluster]
    cluster_y = y_train[gmm.predict(X_train) == test_cluster]
    
    # Fit the Gradient Boosting model to the training data for the selected cluster
    er.fit(cluster_X, cluster_y)
    
    # Make predictions on the test data
    y_pred = er.predict(X_test.reshape(1, -1))[0]
    
    # Store the predicted and actual values in the results DataFrame
    results = results.append({'predicted': y_pred, 'actual': y_test[0]}, ignore_index=True)

# Calculate the R-squared score for the predictions
r2 = r2_score(results['actual'], results['predicted'])
mse = mean_squared_error(results['actual'], results['predicted'])
d = [f'Majority Voting Regressor', round(mse, 2),  round((mse)**(1/2.0), 2), round(r2_score(results['actual'], results['predicted']), 2) ] # round(scores_r2.mean(), 2)
df_leave_one_out.loc[len(df_leave_one_out)] = d

df_leave_one_out

Unnamed: 0,ML Algorithm,MSE,RMSE,R^2
0,Decision Tree,109.46,10.46,0.72
1,Random Forest,63.53,7.97,0.84
2,Gradient Boosting,60.5,7.78,0.84
3,Ada Boosting,80.79,8.99,0.79
4,Bagging Regressor,64.84,8.05,0.83
5,SVM Regressor,227.35,15.08,0.41
6,Stacking,126.43,11.24,0.67
7,Linear Regression,147.6,12.15,0.62
8,Majority Voting Regressor,64.04,8.0,0.83


### KNN

In [None]:
knn_reg = KNeighborsRegressor(n_neighbors = 5) 
# Set up GMM parameters
k = 2
gmm = GaussianMixture(n_components=k)

# Set up K-Fold cross validation
cv = KFold(n_splits=len(X), shuffle=False)

# Set up an empty DataFrame to store results
results = pd.DataFrame(columns=['predicted', 'actual'])

# Scale the data
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Iterate over the cross validation splits
for train_idx, test_idx in cv.split(X):
    # Split the data into training and test sets
    X_train, X_test = X_scaled[train_idx], X_scaled[test_idx]
    y_train, y_test = y[train_idx], y[test_idx]

    # Fit the GMM model to the training data and make predictions for the test data
    gmm.fit(X_train)
    test_cluster = gmm.predict(X_test.reshape(1, -1))[0]
    
    # Get the training data corresponding to the closest cluster
    cluster_X = X_train[gmm.predict(X_train) == test_cluster]
    cluster_y = y_train[gmm.predict(X_train) == test_cluster]
    
    # Fit the Gradient Boosting model to the training data for the selected cluster
    knn_reg.fit(cluster_X, cluster_y)
    
    # Make predictions on the test data
    y_pred = knn_reg.predict(X_test.reshape(1, -1))[0]
    
    # Store the predicted and actual values in the results DataFrame
    results = results.append({'predicted': y_pred, 'actual': y_test[0]}, ignore_index=True)

# Calculate the R-squared score for the predictions
r2 = r2_score(results['actual'], results['predicted'])
mse = mean_squared_error(results['actual'], results['predicted'])
d = [f'KNN Regressor', round(mse, 2),  round((mse)**(1/2.0), 2), round(r2_score(results['actual'], results['predicted']), 2) ] # round(scores_r2.mean(), 2)
df_leave_one_out.loc[len(df_leave_one_out)] = d

df_leave_one_out

Unnamed: 0,ML Algorithm,MSE,RMSE,R^2
0,Decision Tree,109.46,10.46,0.72
1,Random Forest,63.53,7.97,0.84
2,Gradient Boosting,60.5,7.78,0.84
3,Ada Boosting,80.79,8.99,0.79
4,Bagging Regressor,64.84,8.05,0.83
5,SVM Regressor,227.35,15.08,0.41
6,Stacking,126.43,11.24,0.67
7,Linear Regression,147.6,12.15,0.62
8,Majority Voting Regressor,64.04,8.0,0.83
9,KNN Regressor,113.42,10.65,0.71


In [None]:
rslt_df = df_leave_one_out.sort_values(by = 'MSE')
rslt_df

Unnamed: 0,ML Algorithm,MSE,RMSE,R^2
2,Gradient Boosting,60.5,7.78,0.84
1,Random Forest,63.53,7.97,0.84
8,Majority Voting Regressor,64.04,8.0,0.83
4,Bagging Regressor,64.84,8.05,0.83
3,Ada Boosting,80.79,8.99,0.79
0,Decision Tree,109.46,10.46,0.72
9,KNN Regressor,113.42,10.65,0.71
6,Stacking,126.43,11.24,0.67
7,Linear Regression,147.6,12.15,0.62
5,SVM Regressor,227.35,15.08,0.41
