# Import essential libraries

In [1]:
import numpy as np
import pandas as pd
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, mean_absolute_percentage_error
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.preprocessing import PolynomialFeatures
from sklearn.model_selection import train_test_split
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import numpy as np
import matplotlib.pyplot as plt
from xgboost import XGBRegressor
from sklearn.tree import plot_tree

In [2]:
#Data loading
# Read the pre-split training and testing sets from the Kaggle input directory.
train_data = pd.read_csv('/kaggle/input/kjord-training-testing-data/training_set.csv')
test_data = pd.read_csv('/kaggle/input/kjord-training-testing-data/testing_set.csv')

In [3]:
# Preview the first rows of the training set to check the loaded columns and values.

train_data.head()

Unnamed: 0,Day,Month,Year,B01,B02,B03,B04,B05,B06,B07,Salinity
0,2017,9,2017,0.059865,0.108096,0.029403,0.02533,0.069976,0.160038,0.026542,14.66
1,2016,11,2016,0.362885,0.468959,0.36422,0.353,0.414434,0.945758,0.369478,14.53
2,2015,4,2015,0.019929,0.065494,0.020901,0.019087,0.021868,0.043231,0.019022,15.48
3,2019,5,2019,0.051056,0.12285,0.00553,0.002822,0.084914,0.065135,0.005853,12.51
4,2015,4,2015,0.009498,0.006927,0.000723,0.000723,0.000723,0.001265,0.000723,15.63


In [4]:
# Dimensions of the training set as (rows, columns).
train_data.shape

(67669, 11)

In [5]:
# Preview the first rows of the testing set to check the loaded columns and values.

test_data.head()

Unnamed: 0,Day,Month,Year,B01,B02,B03,B04,B05,B06,B07,Salinity
0,2017,10,2017,0.039656,0.211165,0.035558,0.024633,0.095263,0.122293,0.034937,17.16
1,2015,12,2015,0.245583,0.234031,0.245137,0.219353,0.243156,1.340889,0.226793,23.0
2,2017,8,2017,0.057876,0.10592,0.028864,0.027373,0.070589,0.218493,0.027634,15.64
3,2017,9,2017,0.156547,0.309134,0.148183,0.140741,0.222221,0.565978,0.141779,16.12
4,2017,7,2017,0.393088,0.501025,0.37465,0.369656,0.420214,0.739151,0.369479,17.2


In [6]:
# Dimensions of the testing set as (rows, columns).
test_data.shape

(16918, 11)

# Model Development

In [7]:
# Response column and the predictor columns used by the model.
target = 'Salinity'
feature = [
    'Day', 'Month', 'Year',
    'B01', 'B02', 'B03', 'B04', 'B05', 'B06', 'B07',
]
# NOTE(review): in the head() previews the Day values are identical to the
# Year values (e.g. 2017) — confirm the Day column is populated correctly.

# Separate predictors (X) from the response (y) for each pre-split frame.
X_train, y_train = train_data[feature], train_data[target]
X_test, y_test = test_data[feature], test_data[target]

In [8]:
# Hyperparameter search space for GridSearchCV. Every combination of the
# listed values is a candidate: 2*2*2*2*2*3*2*3*3 = 1728 in total.
param_grid = dict(
    n_estimators=[700, 1000],
    max_depth=[3, 5],
    learning_rate=[0.005, 0.01],
    subsample=[0.7, 0.8],
    colsample_bytree=[0.7, 0.8],
    gamma=[1, 5, 10],
    min_child_weight=[1, 2],
    reg_alpha=[0.01, 0.1, 1],
    reg_lambda=[0.01, 0.1, 1],
)

In [9]:
# Base estimator passed to the grid search: squared-error regression
# objective with a fixed random_state.
xgb = XGBRegressor(
    objective='reg:squarederror',
    random_state=42,
)

In [10]:
# Exhaustive search over param_grid with 10-fold cross-validation, scoring
# candidates by negated mean squared error and running on all cores (n_jobs=-1).
grid_search = GridSearchCV(
    estimator=xgb,
    param_grid=param_grid,
    scoring='neg_mean_squared_error',
    cv=10,
    n_jobs=-1,
    verbose=2,
)
grid_search.fit(X_train, y_train)

Fitting 10 folds for each of 1728 candidates, totalling 17280 fits
[CV] END colsample_bytree=0.7, gamma=1, learning_rate=0.005, max_depth=3, min_child_weight=1, n_estimators=700, reg_alpha=0.01, reg_lambda=0.01, subsample=0.7; total time=   4.1s
[CV] END colsample_bytree=0.7, gamma=1, learning_rate=0.005, max_depth=3, min_child_weight=1, n_estimators=700, reg_alpha=0.01, reg_lambda=0.01, subsample=0.7; total time=   3.9s
[CV] END colsample_bytree=0.7, gamma=1, learning_rate=0.005, max_depth=3, min_child_weight=1, n_estimators=700, reg_alpha=0.01, reg_lambda=0.01, subsample=0.7; total time=   4.0s
[CV] END colsample_bytree=0.7, gamma=1, learning_rate=0.005, max_depth=3, min_child_weight=1, n_estimators=700, reg_alpha=0.01, reg_lambda=0.01, subsample=0.8; total time=   3.8s
[CV] END colsample_bytree=0.7, gamma=1, learning_rate=0.005, max_depth=3, min_child_weight=1, n_estimators=700, reg_alpha=0.01, reg_lambda=0.01, subsample=0.8; total time=   3.9s
[CV] END colsample_bytree=0.7, gamma=1

In [11]:
# Get the best hyperparameters
# best_params_ is the parameter combination the fitted grid search selected.
best_params = grid_search.best_params_
print(f"Best Parameters: {best_params}")

Best Parameters: {'colsample_bytree': 0.8, 'gamma': 1, 'learning_rate': 0.01, 'max_depth': 5, 'min_child_weight': 2, 'n_estimators': 1000, 'reg_alpha': 0.1, 'reg_lambda': 0.01, 'subsample': 0.7}


In [12]:
# Get the best estimator
# With GridSearchCV's default refit=True this is the estimator refit on the
# whole training set using the best parameter combination.
best_xgb = grid_search.best_estimator_

In [13]:
# Predict with the tuned model on both the training and the testing features.
y_train_pred, y_test_pred = best_xgb.predict(X_train), best_xgb.predict(X_test)

In [14]:
# Define a function to calculate evaluation metrics
def calculate_metrics(y_true, y_pred):
    """Compute RMSE, MAE, R-squared and MAPE for a set of predictions.

    Parameters
    ----------
    y_true : array-like
        Observed target values.
    y_pred : array-like
        Predicted values, in the same order as ``y_true``.

    Returns
    -------
    tuple
        ``(rmse, mae, r2, mape)``. ``mape`` is expressed as a percentage and
        is ``np.nan`` when any value in ``y_true`` is zero, because the
        percentage error is undefined there.
    """
    # Convert once to plain arrays so every metric pairs values by position.
    # Subtracting two pandas Series aligns on index labels instead, which can
    # pair the wrong rows or produce NaN, and plain lists do not support `-`.
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)

    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    mae = mean_absolute_error(y_true, y_pred)
    r2 = r2_score(y_true, y_pred)

    if np.any(y_true == 0):
        # Division by a zero target would give inf/NaN; report NaN explicitly.
        mape = np.nan
    else:
        mape = np.mean(np.abs((y_true - y_pred) / y_true)) * 100
    return rmse, mae, r2, mape

In [15]:
# Calculate metrics for training and testing sets
# Each call returns (rmse, mae, r2, mape), unpacked here per split.
train_rmse, train_mae, train_r2, train_mape = calculate_metrics(y_train, y_train_pred)
test_rmse, test_mae, test_r2, test_mape = calculate_metrics(y_test, y_test_pred)

In [16]:
# Print the same four metrics for each split, training first then testing.
metric_report = (
    ("\nTraining Metrics:", train_rmse, train_mae, train_r2, train_mape),
    ("\nTesting Metrics:", test_rmse, test_mae, test_r2, test_mape),
)
for heading, rmse, mae, r2, mape in metric_report:
    print(heading)
    print(f"  RMSE: {rmse}")
    print(f"  MAE: {mae}")
    print(f"  R-squared: {r2}")
    print(f"  MAPE: {mape:.2f}%")


Training Metrics:
  RMSE: 0.8598040340607302
  MAE: 0.650970141369915
  R-squared: 0.8978947945330529
  MAPE: 4.17%

Testing Metrics:
  RMSE: 0.8762260042783936
  MAE: 0.6612368126267278
  R-squared: 0.8934305391908952
  MAPE: 4.25%
