# GRID SEARCH ALL MODELS

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import calendar
import statistics
from sklearn import metrics

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from joblib import dump, load
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

import os
import warnings
import time
start_time = time.time()

In [6]:
# Load the preprocessed dataset. 'D mm' is the column being predicted;
# every remaining column is used as an input feature.
data = pd.read_csv('dataset/processed_data.csv')
y = data['D mm'].values
X = data.drop(columns=['D mm']).values

# Hold out the last 20% of the rows for testing. shuffle=False keeps the
# original row order, so the test set is the tail of the file.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    shuffle=False,
    stratify=None,
)


## 1. ANN HYPERPARAMETER GRID SEARCH

In [7]:
from sklearn.neural_network import MLPRegressor

In [8]:
# Initialize the MLP (neural network) regressor.
# A fixed random_state makes the stochastic parts of training repeatable, so the
# ranking produced by the grid search does not change from one run to the next.
mlp_model = MLPRegressor(random_state=42)

# Hyperparameter grid: 4 * 3 * 2 * 3 * 2 * 1 = 144 candidate configurations.
param_mlp = {
    'hidden_layer_sizes' : [2, 8, 16, 128],
    'activation' : ['logistic', 'tanh', 'relu'],
    'solver' : ['adam', 'sgd'],
    'batch_size': [125,250,500],
    'learning_rate':['constant', 'adaptive'],
    'max_iter' : [2000]  # single value: fixed for every candidate, not searched
}

# Create the Grid Search model with the MLP model and hyperparameter grid
# (5-fold cross-validation, candidates ranked by R2, n_jobs=-1 for parallel fits).
grid_search_ANN = GridSearchCV(estimator=mlp_model, param_grid=param_mlp, cv=5, scoring='r2', n_jobs=-1)

# Train the model on the training data
grid_search_ANN.fit(X_train, y_train)

In [9]:
# Tabulate every configuration evaluated by the ANN grid search, best rank first.
result_ANN = (
    pd.DataFrame(grid_search_ANN.cv_results_)
    .sort_values(by='rank_test_score')
)
result_ANN

In [10]:
# Keep the estimator refitted with the best configuration and report the
# winning parameters together with their mean cross-validated R2.
best_model_ANN = grid_search_ANN.best_estimator_
print("Grid search - best param: ", grid_search_ANN.best_params_)
print("Grid search - best score: ", grid_search_ANN.best_score_)

# joblib.dump does not create missing folders; make sure the output
# directory exists so the cell also works on a fresh checkout.
os.makedirs('model', exist_ok=True)
dump(best_model_ANN, 'model/ann.joblib')

Grid search - best param:  {'activation': 'relu', 'batch_size': 250, 'hidden_layer_sizes': 8, 'learning_rate': 'constant', 'max_iter': 2000, 'solver': 'adam'}
Grid search - best score:  0.899675007873404


['model/ann.joblib']

In [11]:
# Score the tuned network on the held-out test split.
y_pred_ANN = best_model_ANN.predict(X_test)

# Compute both metrics first, then report them.
r2_ANN = r2_score(y_test, y_pred_ANN)
rmse_ANN = np.sqrt(mean_squared_error(y_test, y_pred_ANN))

print(f"R-squared (R2) Score: {r2_ANN}")
print("RMSE: ", rmse_ANN)
# Seconds elapsed since start_time was recorded in the import cell.
print("end time: ", time.time() - start_time)

R-squared (R2) Score: 0.8610641622882163
RMSE:  0.10145414497473079
end time:  82.34773802757263


## 2. RANDOM FOREST GRID SEARCH

In [12]:
from sklearn.ensemble import RandomForestRegressor

In [13]:
# Initialize the Random Forest model.
# random_state is a reproducibility seed, not a hyperparameter: searching over
# it only rewards a lucky seed and triples the number of fits (162 candidates
# with three seeds vs. 54 without). It is therefore fixed on the estimator
# instead of being part of the grid.
rf_model = RandomForestRegressor(random_state=42)
param_rf = {
    'n_estimators': [50, 100, 150],  # Number of trees in the forest
    'max_depth': [None, 5, 10],  # Maximum depth of the trees
    'min_samples_split': [2, 5, 10],  # Minimum number of samples required to split an internal node
    'criterion':['squared_error', 'friedman_mse']
}

# Create the Grid Search model with the Random Forest model and hyperparameter grid
# (5-fold cross-validation, candidates ranked by R2, n_jobs=-1 for parallel fits).
grid_search_RF = GridSearchCV(estimator=rf_model, param_grid=param_rf, cv=5, scoring='r2', n_jobs=-1)

# Train the model on the training data
grid_search_RF.fit(X_train, y_train)



KeyboardInterrupt: 

In [None]:
# Tabulate every configuration evaluated by the Random Forest grid search, best rank first.
result_RF = (
    pd.DataFrame(grid_search_RF.cv_results_)
    .sort_values(by='rank_test_score')
)
result_RF

In [None]:
# Keep the estimator refitted with the best configuration and report the
# winning parameters together with their mean cross-validated R2.
best_model_RF = grid_search_RF.best_estimator_
print("Grid search - best param: ", grid_search_RF.best_params_)
print("Grid search - best score: ", grid_search_RF.best_score_)

# joblib.dump does not create missing folders; make sure the output
# directory exists so the cell also works on a fresh checkout.
os.makedirs('model', exist_ok=True)
dump(best_model_RF, 'model/random_forest_linear.joblib')

In [None]:
# Score the tuned forest on the held-out test split.
y_pred_RF = best_model_RF.predict(X_test)

# Compute both metrics first, then report them.
r2_RF = r2_score(y_test, y_pred_RF)
rmse_RF = np.sqrt(mean_squared_error(y_test, y_pred_RF))

print(f"R-squared (R2) Score: {r2_RF}")
print("RMSE: ", rmse_RF)
# Seconds elapsed since start_time was recorded in the import cell.
print("end time: ", time.time() - start_time)

## 3. SUPPORT VECTOR REGRESSION GRID SEARCH

In [None]:
from sklearn.svm import SVR

In [None]:
# Hyperparameter grid for Support Vector Regression:
# 2 kernels * 2 gamma settings * 4 C values * 4 epsilon values = 64 candidates.
param_svr = {
    'kernel': ['rbf', 'sigmoid'],
    'gamma': ['scale', 'auto'],
    'C': [0.5, 1, 1.5, 2.0],
    'epsilon': [0.001, 0.01, 0.1, 0.5],
}

# Base estimator; every parameter not listed in the grid keeps its default.
svr_model = SVR()

# Grid search over the SVR grid: 5-fold cross-validation, candidates ranked
# by R2, n_jobs=-1 for parallel fits.
grid_search_SVR = GridSearchCV(
    estimator=svr_model,
    param_grid=param_svr,
    cv=5,
    scoring='r2',
    n_jobs=-1,
)

# Fit every candidate on the training data
grid_search_SVR.fit(X_train, y_train)

In [None]:
# Tabulate every configuration evaluated by the SVR grid search, best rank first.
result_SVR = (
    pd.DataFrame(grid_search_SVR.cv_results_)
    .sort_values(by='rank_test_score')
)
result_SVR

In [None]:
# Keep the estimator refitted with the best configuration and report the
# winning parameters together with their mean cross-validated R2.
best_model_SVR = grid_search_SVR.best_estimator_
print("Grid search - best param: ", grid_search_SVR.best_params_)
print("Grid search - best score: ", grid_search_SVR.best_score_)

# joblib.dump does not create missing folders; make sure the output
# directory exists so the cell also works on a fresh checkout.
os.makedirs('model', exist_ok=True)
dump(best_model_SVR, 'model/svr.joblib')

In [None]:
# Score the tuned SVR on the held-out test split.
y_pred_SVR = best_model_SVR.predict(X_test)

r2_SVR = r2_score(y_test, y_pred_SVR)
# Report the SVR score itself. The previous `{r2}` referred to a variable that
# is only assigned in a later cell, so a fresh Run All raised NameError (and a
# stale kernel silently printed another model's score).
print(f"R-squared (R2) Score: {r2_SVR}")

rmse_SVR = np.sqrt(mean_squared_error(y_test, y_pred_SVR))
print("RMSE: ", rmse_SVR)
# Seconds elapsed since start_time was recorded in the import cell.
print("end time: ", time.time() - start_time)

---
# TEST MODEL

In [None]:
# Rebuild the train/test arrays by row position for a manual check.
duration = 3 * 24 * 30    # number of rows in the evaluation window
splitThreshold = -6000    # negative index: counted back from the end of the data

# Training rows: everything before the last 6000 rows.
X_train, y_train = X[:splitThreshold, :], y[:splitThreshold]

# Evaluation window: starts `duration` rows after the end of the training rows
# and is `duration` rows long.
test_start = splitThreshold + duration
test_stop = splitThreshold + duration * 2
X_test = X[test_start:test_stop, :]
y_test = y[test_start:test_stop]

# Quick look at the target distribution inside the window (5 bins).
plt.hist(y_test, bins=5)
plt.show()

In [None]:
# Reload a persisted model and score it on the current X_test / y_test window.
# NOTE(review): the variable is called best_rf but the file loaded is the ANN
# model ('model/ann.joblib') — confirm which model is meant to be tested here.
best_rf = load('model/ann.joblib')

y_pred = best_rf.predict(X_test)
r2 = r2_score(y_test, y_pred)
print(f"R-squared (R2) Score: {r2}")
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print(rmse)

# Per-sample absolute error. np.abs is the direct form of the former
# sqrt(square(...)) round trip (which could overflow/underflow on extreme
# values) and matches how `delta` is computed in the Dummy section.
sqer = np.abs(np.subtract(y_pred, y_test))

In [None]:
# Histogram of the per-sample errors stored in `sqer` (default binning).
plt.hist(x=sqer)
plt.show()

In [None]:
# Empirical cumulative distribution of the prediction errors, in percent.
values, base = np.histogram(sqer)      # counts per bin and the bin edges
pdf = values / sum(values) * 100       # share of samples per bin (%)
cumulative = np.cumsum(pdf)            # share of samples up to and including each bin (%)

# plot the cumulative function
# cumulative[i] covers all errors up to the RIGHT edge of bin i, so it is drawn
# against base[1:]. Using the left edges (base[:-1]) shifted the whole curve one
# bin to the left and placed 100% at the start of the last bin.
plt.plot(base[1:], cumulative, c='blue')
plt.title('CDF')
# x label (Vietnamese): "Forecast error of the expansion joint amplitude (mm)"
plt.xlabel("Sai số dự báo biên độ khe giãn nở (mm)")
# y label (Vietnamese): "Cumulative error distribution (%)"
plt.ylabel("Phân bố sai số tích lũy (%)")
plt.grid(visible=True)
plt.show()

---
## Dummy

In [None]:
# Baseline ("dummy") experiment on the original information only.

# take only original information
# errors='ignore' keeps this cell re-runnable: `data` is reassigned here, so on a
# second execution the columns are already gone and a plain drop raises KeyError.
data = data.drop(
    columns=['Season_heavy rain', 'Season_little rain', 'Period_am', 'Period_pm'],
    errors='ignore',
)
quarter = 6 * 31 * 24  # only used by the commented-out lag feature below
print(data['D mm'].mean(), data['D mm'].std())
# avgD = data.loc[0,'D mm']
# data['prevD'] = data['D mm'].shift(periods=quarter,fill_value=avgD)

print(data.describe())
X = data.drop(['D mm'], axis = 1).values
y = data['D mm'].values

# Positional split: the last 6000 rows are the test set, everything before is training.
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, shuffle=False, stratify=None)
splitThreshold = -6000
X_train = X[0:splitThreshold,:]
y_train = y[0:splitThreshold]

X_test = X[splitThreshold:,:]
y_test = y[splitThreshold:]

print(np.mean(y_test), np.std(y_test))

# dummy model, same config
# criterion='squared_error', 'max_depth': 5, 'min_samples_split': 2, 'n_estimators': 100
# NOTE(review): n_estimators is 20 here, not the 100 quoted above — confirm this is intentional.
# random_state is fixed so the baseline numbers are reproducible between runs.
rf = RandomForestRegressor(criterion='squared_error', max_depth=5, min_samples_split=2,
                           n_estimators=20, random_state=42)
# rf = RandomForestRegressor(n_estimators=50)
svr = SVR()  # alternative baseline; switch with `model = svr`
model = rf
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

r2 = r2_score(y_test, y_pred)
print(f"R-squared (R2) Score: {r2}")

rmse = np.sqrt(mean_squared_error(y_test, y_pred))

# Per-sample absolute error of the baseline.
delta = np.abs(y_test - y_pred)
print(rmse)

# 2x2 overview: error histogram (top-left), target histogram (bottom-left),
# box plot of the targets (top-right) and of the predictions (bottom-right).
figure, axis = plt.subplots(2, 2)
axis[0,0].hist(delta)
axis[1,0].hist(y_test)
axis[0,1].boxplot(y_test)
axis[1,1].boxplot(y_pred)

plt.show()