# Import essential library 

In [1]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import StackingRegressor
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

In [2]:
#Data loading
train_data = pd.read_csv('/kaggle/input/kjord-training-testing-data/training_set.csv')
test_data = pd.read_csv('/kaggle/input/kjord-training-testing-data/testing_set.csv')

In [3]:
#Check data

train_data.head()

Unnamed: 0,Day,Month,Year,B01,B02,B03,B04,B05,B06,B07,Salinity
0,2017,9,2017,0.059865,0.108096,0.029403,0.02533,0.069976,0.160038,0.026542,14.66
1,2016,11,2016,0.362885,0.468959,0.36422,0.353,0.414434,0.945758,0.369478,14.53
2,2015,4,2015,0.019929,0.065494,0.020901,0.019087,0.021868,0.043231,0.019022,15.48
3,2019,5,2019,0.051056,0.12285,0.00553,0.002822,0.084914,0.065135,0.005853,12.51
4,2015,4,2015,0.009498,0.006927,0.000723,0.000723,0.000723,0.001265,0.000723,15.63


In [4]:
train_data.shape

(67669, 11)

In [5]:
#Check data

test_data.head()

Unnamed: 0,Day,Month,Year,B01,B02,B03,B04,B05,B06,B07,Salinity
0,2017,10,2017,0.039656,0.211165,0.035558,0.024633,0.095263,0.122293,0.034937,17.16
1,2015,12,2015,0.245583,0.234031,0.245137,0.219353,0.243156,1.340889,0.226793,23.0
2,2017,8,2017,0.057876,0.10592,0.028864,0.027373,0.070589,0.218493,0.027634,15.64
3,2017,9,2017,0.156547,0.309134,0.148183,0.140741,0.222221,0.565978,0.141779,16.12
4,2017,7,2017,0.393088,0.501025,0.37465,0.369656,0.420214,0.739151,0.369479,17.2


In [6]:
test_data.shape

(16918, 11)

# Model Development

In [7]:
target = 'Salinity'
feature = ['Day','Month', 'Year', 'B01', 'B02', 'B03', 'B04', 'B05', 'B06', 'B07']
#Establish X_train, y_train, X_test, y_test
X_train = train_data[feature]  
y_train = train_data[target]  
X_test = test_data[feature]  # Your features
y_test = test_data[target] 

## Train without polymonial

In [8]:
# Define a function to calculate evaluation metrics
def calculate_metrics(y_true, y_pred):
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    mae = mean_absolute_error(y_true, y_pred)
    r2 = r2_score(y_true, y_pred)
    mape = np.mean(np.abs((y_true - y_pred) / y_true)) * 100 if not np.any(y_true == 0) else np.nan
    return rmse, mae, r2, mape

In [9]:
# Define base models
linear_model_no_poly = LinearRegression()
random_forest = RandomForestRegressor(n_estimators=100, random_state=42)
# Define meta-model
meta_model = XGBRegressor(random_state=42, objective='reg:squarederror')

In [10]:
# Stacking regressor without polynomial features
stacking_no_poly = StackingRegressor(
    estimators=[
        ('linear', linear_model_no_poly),
        ('random_forest', random_forest)
    ],
    final_estimator=meta_model
)

In [11]:
# Train the model
stacking_no_poly.fit(X_train, y_train)

In [12]:
# Make predictions
y_train_pred = stacking_no_poly.predict(X_train)
y_test_pred = stacking_no_poly.predict(X_test)

In [13]:
# Calculate metrics for training and testing sets
train_rmse, train_mae, train_r2, train_mape = calculate_metrics(y_train, y_train_pred)
test_rmse, test_mae, test_r2, test_mape = calculate_metrics(y_test, y_test_pred)

In [14]:
# Print the evaluation metrics
print("\nTraining Metrics:")
print(f"  RMSE: {train_rmse}")
print(f"  MAE: {train_mae}")
print(f"  R-squared: {train_r2}")
print(f"  MAPE: {train_mape:.2f}%")

print("\nTesting Metrics:")
print(f"  RMSE: {test_rmse}")
print(f"  MAE: {test_mae}")
print(f"  R-squared: {test_r2}")
print(f"  MAPE: {test_mape:.2f}%")


Training Metrics:
  RMSE: 0.576270910355401
  MAE: 0.37624099617853934
  R-squared: 0.9541327902490334
  MAPE: 2.43%

Testing Metrics:
  RMSE: 0.5848926006989125
  MAE: 0.38107156031999007
  R-squared: 0.9525154040074598
  MAPE: 2.47%


# Train with polynomial feature

In [15]:
# Add polynomial features to Linear Regression
linear_model_with_poly = Pipeline([
    ('poly_features', PolynomialFeatures(degree=2, include_bias=False)),
    ('linear_regression', LinearRegression())
])

In [16]:
# Stacking regressor with polynomial features
stacking_with_poly = StackingRegressor(
    estimators=[
        ('linear', linear_model_with_poly),
        ('random_forest', random_forest)
    ],
    final_estimator=meta_model
)


In [17]:

# Transform the feature set for polynomial features
poly = PolynomialFeatures(degree=2, include_bias=False)
X_train_poly = poly.fit_transform(X_train)
X_test_poly = poly.transform(X_test)

In [18]:
# Train the model
stacking_with_poly.fit(X_train_poly, y_train)

In [19]:
# Make predictions
y_train_pred = stacking_with_poly.predict(X_train_poly)
y_test_pred = stacking_with_poly.predict(X_test_poly)

In [20]:
# Calculate metrics for training and testing sets
train_rmse, train_mae, train_r2, train_mape = calculate_metrics(y_train, y_train_pred)
test_rmse, test_mae, test_r2, test_mape = calculate_metrics(y_test, y_test_pred)

In [21]:
# Print the evaluation metrics
print("\nTraining Metrics:")
print(f"  RMSE: {train_rmse}")
print(f"  MAE: {train_mae}")
print(f"  R-squared: {train_r2}")
print(f"  MAPE: {train_mape:.2f}%")

print("\nTesting Metrics:")
print(f"  RMSE: {test_rmse}")
print(f"  MAE: {test_mae}")
print(f"  R-squared: {test_r2}")
print(f"  MAPE: {test_mape:.2f}%")


Training Metrics:
  RMSE: 0.5791623441478592
  MAE: 0.37895726400307517
  R-squared: 0.9536713589580875
  MAPE: 2.45%

Testing Metrics:
  RMSE: 0.5880880031668062
  MAE: 0.38420703598421113
  R-squared: 0.9519951482562911
  MAPE: 2.50%
