In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
from scipy import signal
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
import math

np.random.seed(42)

# To plot pretty figures
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)

# Where to save the figures
PROJECT_ROOT_DIR = "."
TOPIC_ID = "decision_trees"
IMAGES_PATH = os.path.join(PROJECT_ROOT_DIR, "images", TOPIC_ID)
os.makedirs(IMAGES_PATH, exist_ok=True)

def save_fig(fig_id, tight_layout=True, fig_extension="png", resolution=300):
    path = os.path.join(IMAGES_PATH, fig_id + "." + fig_extension)
    print("Saving figure", fig_id)
    if tight_layout:
        plt.tight_layout()
    plt.savefig(path, format=fig_extension, dpi=resolution)

### Import libraries

In [None]:
# ML models
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import BaggingRegressor
from sklearn.ensemble import StackingRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsRegressor

# Metrics
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

### Cleaning the data

In [None]:
data = pd.read_csv('new_data.csv') 
new_mdb = data[[ "CMRSET.ET", "RAD", "LST", "EVI", "NDVI", "SM", "LAI", "Field.ET"]]
new_mdb = new_mdb.sample(frac=1) # Shuffle the data

X =  new_mdb.iloc[:, :-1].values # Variables used in the model to determine the predictor
y = new_mdb.iloc[:, -1].values # Predictor: 'residual'
LENGTH = len(new_mdb)
df_leave_one_out = pd.DataFrame(columns=['ML Algorithm', 'MSE', 'RMSE', 'R^2'])

### Decision Tree

In [None]:
tree = DecisionTreeRegressor(max_depth=5, random_state=42) 
tree.fit(X, y)
y_pred_rf = tree.predict(X) # Predictions on test set
r2 = r2_score(y, y_pred_rf) 

mse = mean_squared_error(y, y_pred_rf)
scores = cross_val_score(tree, X, y, cv=LENGTH, scoring='neg_mean_squared_error')

d = [f'Decision Tree', round(scores.mean()*-1, 2),  round((scores.mean()*-1)**(1/2.0), 2), scores_r2] # round(scores_r2.mean(), 2)
df_leave_one_out.loc[len(df_leave_one_out)] = d
df_leave_one_out

Unnamed: 0,ML Algorithm,MSE,RMSE,R^2
0,Decision Tree,168.81,12.99,


### Random Forest

In [None]:
rnd_clf = RandomForestRegressor(n_estimators=300, min_samples_split = 7, random_state=42) # Similar results, less trees
rnd_clf.fit(X, y)
scores = cross_val_score(rnd_clf, X, y, cv=LENGTH, scoring='neg_mean_squared_error')
scores_r2 = cross_val_score(rnd_clf, X, y, cv=LENGTH, scoring='r2')
d = [f'Random Forest', round(scores.mean()*-1, 2),  round((scores.mean()*-1)**(1/2.0), 2), scores_r2]
df_leave_one_out.loc[len(df_leave_one_out)] = d

### Gradient Boosting

In [None]:
gb_clf = GradientBoostingRegressor(max_depth=5,
                                   min_samples_split = 7,
                                   n_estimators=200,
                                   learning_rate=0.1,
                                   random_state=42) # Similar results, less trees
gb_clf.fit(X, y) 
scores = cross_val_score(gb_clf, X, y, cv=LENGTH, scoring='neg_mean_squared_error')
scores_r2 = cross_val_score(gb_clf, X, y, cv=LENGTH, scoring='r2')
d = [f'Gradient Boosting', round(scores.mean()*-1, 2),  round((scores.mean()*-1)**(1/2.0), 2), scores_r2]
df_leave_one_out.loc[len(df_leave_one_out)] = d
df_leave_one_out

Unnamed: 0,ML Algorithm,MSE,RMSE,R^2
0,Gradient Boosting,106.55,10.32,
1,Gradient Boosting,105.9,10.29,


### Ada Boosting

In [None]:
ada_clf = AdaBoostRegressor(learning_rate = 0.1, n_estimators=200, random_state=42)
scores = cross_val_score(ada_clf, X, y, cv=LENGTH, scoring='neg_mean_squared_error')
scores_r2 = cross_val_score(ada_clf, X, y, cv=LENGTH, scoring='r2')
d = ['Ada Boosting', round(scores.mean()*-1, 2),  round((scores.mean()*-1)**(1/2.0), 2), scores_r2]
df_leave_one_out.loc[len(df_leave_one_out)] = d
df_leave_one_out

Unnamed: 0,ML Algorithm,MSE,RMSE,R^2
0,Decision Tree,168.81,12.99,
1,Random Forest,107.19,10.35,Nan
2,Gradient Boosting,106.54,10.32,
3,Ada Boosting,125.59,11.21,Nan


### Bagging Regressor

In [None]:
bag_reg = BaggingRegressor(n_estimators=300, random_state=42)
scores = cross_val_score(bag_reg, X, y, cv=LENGTH, scoring='neg_mean_squared_error')
scores_r2 = cross_val_score(bag_reg, X, y, cv=10, scoring='r2')
d = ['Bagging Regressor', round(scores.mean()*-1, 2),  round((scores.mean()*-1)**(1/2.0), 2), scores_r2]
df_leave_one_out.loc[len(df_leave_one_out)] = d
df_leave_one_out

Unnamed: 0,ML Algorithm,MSE,RMSE,R^2
0,Decision Tree,168.81,12.99,
1,Random Forest,107.19,10.35,Nan
2,Gradient Boosting,106.54,10.32,
3,Ada Boosting,125.59,11.21,Nan
4,Bagging Regressor,105.95,10.29,Nan


### SVM Regression

In [None]:
svm_regr = make_pipeline(StandardScaler(), SVR(C=1.0, epsilon=0.5, kernel = "sigmoid"))
scores = cross_val_score(svm_regr, X, y, cv=10, scoring='neg_mean_squared_error')
scores_r2 = cross_val_score(svm_regr, X, y, cv=10, scoring='r2')
d = ['SVM Regressor', round(scores.mean()*-1, 2),  round((scores.mean()*-1)**(1/2.0), 2), scores_r2]
df_leave_one_out.loc[len(df_leave_one_out)] = d
df_leave_one_out

Unnamed: 0,ML Algorithm,MSE,RMSE,R^2
0,Decision Tree,168.81,12.99,
1,Random Forest,107.19,10.35,Nan
2,Gradient Boosting,106.54,10.32,
3,Ada Boosting,125.59,11.21,Nan
4,Bagging Regressor,105.95,10.29,Nan
5,Voting Ensemle,102.93,10.15,
6,Linear Regression,177.66,13.33,
7,SVM Regressor,316.72,17.8,
8,SVM Regressor,316.72,17.8,


### Stacking

In [None]:
estimators = [
     ('gb', GradientBoostingRegressor(max_depth=5,
                                   min_samples_split = 7,
                                   n_estimators=300,
                                   learning_rate=0.1,
                                   random_state=42)),
     ('rf', RandomForestRegressor(n_estimators=300, min_samples_split = 7, random_state=42)),
     ('bg', BaggingRegressor(n_estimators=300, random_state=42))

]
stack_reg = StackingRegressor(
     estimators=estimators
 )

scores = cross_val_score(stack_reg, X, y, cv=LENGTH, scoring='neg_mean_squared_error')
scores_r2 = cross_val_score(stack_reg, X, y, cv=10, scoring='r2')
d = ['Stacking', round(scores.mean()*-1, 2),  round((scores.mean()*-1)**(1/2.0), 2), 'NaN']
df_leave_one_out.loc[len(df_leave_one_out)] = d
df_leave_one_out

Unnamed: 0,ML Algorithm,MSE,RMSE,R^2
0,Decision Tree,168.81,12.99,
1,Random Forest,107.19,10.35,Nan
2,Gradient Boosting,106.54,10.32,
3,Ada Boosting,125.59,11.21,Nan
4,Bagging Regressor,105.95,10.29,Nan
5,Voting Ensemle,102.93,10.15,
6,Linear Regression,177.66,13.33,
7,SVM Regressor,316.72,17.8,
8,KNN Regressor,142.15,11.92,
9,Stacking,124.74,11.17,


### Majority voting regressor

In [None]:
from sklearn.ensemble import VotingRegressor
rnd_clf = RandomForestRegressor(n_estimators=300, min_samples_split = 7, random_state=42)
gb_clf = GradientBoostingRegressor(max_depth=5,
                                   min_samples_split = 7,
                                   n_estimators=300,
                                   learning_rate=0.1,
                                   random_state=42)
bag_reg = BaggingRegressor(n_estimators=300, random_state=42)

er = VotingRegressor([('rf', rnd_clf), ('gb', gb_clf), ('bg', bag_reg)]) # ('tree', tree), ('lin_reg', lin_reg)

scores = cross_val_score(er, X, y, cv=LENGTH, scoring='neg_mean_squared_error')
scores_r2 = cross_val_score(er, X, y, cv=10, scoring='r2')
d = ['Voting Ensemle', round(scores.mean()*-1, 2),  round((scores.mean()*-1)**(1/2.0), 2), scores_r2]
df_leave_one_out.loc[len(df_leave_one_out)] = d
df_leave_one_out

Unnamed: 0,ML Algorithm,MSE,RMSE,R^2
0,Decision Tree,168.81,12.99,
1,Random Forest,107.19,10.35,Nan
2,Gradient Boosting,106.54,10.32,
3,Ada Boosting,125.59,11.21,Nan
4,Bagging Regressor,105.95,10.29,Nan
5,Voting Ensemle,102.93,10.15,


## Linear regression

In [None]:
lin_reg = LinearRegression()
scores = cross_val_score(lin_reg, X, y, cv=LENGTH, scoring='neg_mean_squared_error')
scores_r2 = cross_val_score(lin_reg, X, y, cv=10, scoring='r2')
print("\nCross validation:")
d = ['Linear Regression', round(scores.mean()*-1, 2),  round((scores.mean()*-1)**(1/2.0), 2), scores_r2]
df_leave_one_out.loc[len(df_leave_one_out)] = d
df_leave_one_out

### KNN

In [None]:

scaler = MinMaxScaler()#creating an object
scaler.fit(X)#calculate min and max value of the training data
X_norm = scaler.transform(X)
knn_reg = KNeighborsRegressor(n_neighbors = 5) 
scores = cross_val_score(knn_reg, X_norm, y, cv=LENGTH, scoring='neg_mean_squared_error')
scores_r2 = cross_val_score(knn_reg, X_norm, y, cv=10, scoring='r2')
d = ['KNN Regressor', round(scores.mean()*-1, 2),  round((scores.mean()*-1)**(1/2.0), 2), scores_r2]
df_leave_one_out.loc[len(df_leave_one_out)] = d
df_leave_one_out

Unnamed: 0,ML Algorithm,MSE,RMSE,R^2
0,Decision Tree,168.81,12.99,
1,Random Forest,107.19,10.35,Nan
2,Gradient Boosting,106.54,10.32,
3,Ada Boosting,125.59,11.21,Nan
4,Bagging Regressor,105.95,10.29,Nan
5,Voting Ensemle,102.93,10.15,
6,Linear Regression,177.66,13.33,
7,SVM Regressor,316.72,17.8,
8,KNN Regressor,142.15,11.92,


In [None]:
rslt_df = df_leave_one_out.sort_values(by = 'MSE')
rslt_df

Unnamed: 0,ML Algorithm,MSE,RMSE,R^2
5,Voting Ensemle,102.93,10.15,
4,Bagging Regressor,105.95,10.29,Nan
2,Gradient Boosting,106.54,10.32,
1,Random Forest,107.19,10.35,Nan
9,Stacking,124.74,11.17,
3,Ada Boosting,125.59,11.21,Nan
8,KNN Regressor,142.15,11.92,
0,Decision Tree,168.81,12.99,
6,Linear Regression,177.66,13.33,
7,SVM Regressor,316.72,17.8,
