# Import essential library 

In [1]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import StackingRegressor
from sklearn.preprocessing import PolynomialFeatures
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

In [2]:
#Data loading
train_data = pd.read_csv('/kaggle/input/sfb-training-test/training_set.csv')
test_data = pd.read_csv('/kaggle/input/sfb-training-test/testing_set.csv')

In [3]:
#Check data

train_data.head()

Unnamed: 0,Day,Month,Year,MODIS_B01,MODIS_B02,MODIS_B03,MODIS_B04,MODIS_B05,MODIS_B06,MODIS_B07,Salinity
0,2006,11,2006,0.033225,0.033225,0.033225,0.033225,0.033225,0.045671,0.033225,27.55
1,2011,10,2011,0.0,0.0,0.0,0.0,0.0,0.006454,0.0,27.69
2,2014,6,2014,0.014285,0.014285,0.014285,0.014285,0.014285,0.015042,0.014285,30.91
3,2015,3,2015,0.011038,0.083436,0.011038,0.011038,0.038958,0.094582,0.011038,27.86
4,2012,12,2012,0.008225,0.014826,0.012445,0.008225,0.014068,0.039608,0.008225,28.4


In [4]:
train_data.shape

(5451, 11)

In [5]:
#Check data

test_data.head()

Unnamed: 0,Day,Month,Year,MODIS_B01,MODIS_B02,MODIS_B03,MODIS_B04,MODIS_B05,MODIS_B06,MODIS_B07,Salinity
0,2007,10,2007,0.002689,0.002689,0.002689,0.002689,0.002689,0.002689,0.002689,31.57
1,2010,10,2010,0.025649,0.025649,0.025649,0.025649,0.025649,0.072186,0.025649,29.53
2,2003,4,2003,0.084301,0.084301,0.084301,0.084301,0.084301,0.207236,0.084301,27.36
3,2009,6,2009,0.0,0.0,0.0,0.0,0.0,0.025172,0.0,28.74
4,2010,10,2010,0.0,0.013311,0.0,0.0,0.0,0.08257,0.0,30.75


In [6]:
test_data.shape

(1363, 11)

# Model Development

In [7]:
target = 'Salinity'
feature = ['Day','Month', 'Year','MODIS_B02', 'MODIS_B05', 'MODIS_B06']
#Establish X_train, y_train, X_test, y_test
X_train = train_data[feature]  
y_train = train_data[target]  
X_test = test_data[feature]  # Your features
y_test = test_data[target] 

## Train without polymonial

In [8]:
# Define a function to calculate evaluation metrics
def calculate_metrics(y_true, y_pred):
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    mae = mean_absolute_error(y_true, y_pred)
    r2 = r2_score(y_true, y_pred)
    mape = np.mean(np.abs((y_true - y_pred) / y_true)) * 100 if not np.any(y_true == 0) else np.nan
    return rmse, mae, r2, mape

In [9]:
# Define base models
linear_model_no_poly = LinearRegression()
random_forest = RandomForestRegressor(bootstrap = True, max_depth =25, min_samples_leaf= 7, 
                                      min_samples_split= 25, n_estimators=700)
# Define meta-model
meta_model = XGBRegressor(random_state=42, objective='reg:squarederror')

In [10]:
def grid_search_stacking(X_train, y_train, stacking_regressor, param_grid, cv=5):
    grid_search = GridSearchCV(
        estimator=stacking_regressor,
        param_grid=param_grid,
        cv=cv,
        scoring='neg_mean_squared_error',
        verbose=2,
        n_jobs=-1
    )
    grid_search.fit(X_train, y_train)
    print(f"Best Parameters: {grid_search.best_params_}")
    print(f"Best Negative MSE: {grid_search.best_score_}")
    return grid_search.best_estimator_

In [11]:
# Define the parameter grid for GridSearchCV
param_grid = {
    'final_estimator__learning_rate': [0.01],  # Learning rate
    'final_estimator__n_estimators': [300,500],  # Number of boosting rounds
    'final_estimator__max_depth': [5,10],  # Maximum depth of trees
    'final_estimator__min_child_weight': [3],  # Regularization term
    'final_estimator__subsample': [0.7],  # Subsample ratio
    'final_estimator__colsample_bytree': [0.7]
}

In [12]:
# Stacking Regressor without polynomial features
stacking_no_poly = StackingRegressor(
    estimators=[
        ('linear', linear_model_no_poly),
        ('random_forest', random_forest)
    ],
    final_estimator=meta_model
)

In [13]:
# Perform GridSearchCV
best_model_no_poly = grid_search_stacking(X_train, y_train, stacking_no_poly, param_grid)

Fitting 5 folds for each of 4 candidates, totalling 20 fits
Best Parameters: {'final_estimator__colsample_bytree': 0.7, 'final_estimator__learning_rate': 0.01, 'final_estimator__max_depth': 5, 'final_estimator__min_child_weight': 3, 'final_estimator__n_estimators': 500, 'final_estimator__subsample': 0.7}
Best Negative MSE: -4.753941428704466


In [14]:
# Train the model
best_model_no_poly.fit(X_train, y_train)

In [15]:
# Make predictions
y_train_pred = best_model_no_poly.predict(X_train)
y_test_pred = best_model_no_poly.predict(X_test)

In [16]:
# Calculate metrics for training and testing sets
train_rmse, train_mae, train_r2, train_mape = calculate_metrics(y_train, y_train_pred)
test_rmse, test_mae, test_r2, test_mape = calculate_metrics(y_test, y_test_pred)

In [17]:
# Print the evaluation metrics
print("\nTraining Metrics:")
print(f"  RMSE: {train_rmse}")
print(f"  MAE: {train_mae}")
print(f"  R-squared: {train_r2}")
print(f"  MAPE: {train_mape:.2f}%")

print("\nTesting Metrics:")
print(f"  RMSE: {test_rmse}")
print(f"  MAE: {test_mae}")
print(f"  R-squared: {test_r2}")
print(f"  MAPE: {test_mape:.2f}%")


Training Metrics:
  RMSE: 1.9374387144580527
  MAE: 1.3428056440841516
  R-squared: 0.7519854633863383
  MAPE: 5.43%

Testing Metrics:
  RMSE: 2.057126207435509
  MAE: 1.4527693765126686
  R-squared: 0.7238460408508318
  MAPE: 5.79%


# Train with polynomial feature

In [18]:
# Stacking Regressor with polynomial features
linear_model_with_poly = Pipeline([
    ('poly_features', PolynomialFeatures(degree=2, include_bias=False)),
    ('linear_regression', LinearRegression())
])

In [19]:
stacking_with_poly = StackingRegressor(
    estimators=[
        ('linear', linear_model_with_poly),
        ('random_forest', random_forest)
    ],
    final_estimator=meta_model
)

In [20]:
# Transform the feature set for polynomial features
poly = PolynomialFeatures(degree=2, include_bias=False)
X_train_poly = poly.fit_transform(X_train)
X_test_poly = poly.transform(X_test)

In [21]:
# Perform GridSearchCV
best_model_with_poly = grid_search_stacking(X_train_poly, y_train, stacking_with_poly, param_grid)

Fitting 5 folds for each of 4 candidates, totalling 20 fits
Best Parameters: {'final_estimator__colsample_bytree': 0.7, 'final_estimator__learning_rate': 0.01, 'final_estimator__max_depth': 5, 'final_estimator__min_child_weight': 3, 'final_estimator__n_estimators': 500, 'final_estimator__subsample': 0.7}
Best Negative MSE: -4.706115286849649


In [22]:
# Train the model
best_model_with_poly.fit(X_train_poly, y_train)

[CV] END final_estimator__colsample_bytree=0.7, final_estimator__learning_rate=0.01, final_estimator__max_depth=5, final_estimator__min_child_weight=3, final_estimator__n_estimators=300, final_estimator__subsample=0.7; total time=  54.3s
[CV] END final_estimator__colsample_bytree=0.7, final_estimator__learning_rate=0.01, final_estimator__max_depth=5, final_estimator__min_child_weight=3, final_estimator__n_estimators=500, final_estimator__subsample=0.7; total time=  54.6s
[CV] END final_estimator__colsample_bytree=0.7, final_estimator__learning_rate=0.01, final_estimator__max_depth=5, final_estimator__min_child_weight=3, final_estimator__n_estimators=500, final_estimator__subsample=0.7; total time=  54.7s
[CV] END final_estimator__colsample_bytree=0.7, final_estimator__learning_rate=0.01, final_estimator__max_depth=10, final_estimator__min_child_weight=3, final_estimator__n_estimators=300, final_estimator__subsample=0.7; total time=  55.1s
[CV] END final_estimator__colsample_bytree=0.7,

In [23]:
# Make predictions
y_train_pred = best_model_with_poly.predict(X_train_poly)
y_test_pred = best_model_with_poly.predict(X_test_poly)

In [24]:
# Calculate metrics for training and testing sets
train_rmse, train_mae, train_r2, train_mape = calculate_metrics(y_train, y_train_pred)
test_rmse, test_mae, test_r2, test_mape = calculate_metrics(y_test, y_test_pred)

In [25]:
# Print the evaluation metrics
print("\nTraining Metrics:")
print(f"  RMSE: {train_rmse}")
print(f"  MAE: {train_mae}")
print(f"  R-squared: {train_r2}")
print(f"  MAPE: {train_mape:.2f}%")

print("\nTesting Metrics:")
print(f"  RMSE: {test_rmse}")
print(f"  MAE: {test_mae}")
print(f"  R-squared: {test_r2}")
print(f"  MAPE: {test_mape:.2f}%")


Training Metrics:
  RMSE: 1.8928850834743187
  MAE: 1.3013415709435885
  R-squared: 0.7632610665109848
  MAPE: 5.31%

Testing Metrics:
  RMSE: 2.0596339787513376
  MAE: 1.4220152518929405
  R-squared: 0.7231723309974769
  MAPE: 5.75%
