# Import Libraries

In [None]:
# Core Libraries for Data Manipulation and Computation
import pandas as pd
import numpy as np

# Warning suppression for clean output
import warnings
warnings.filterwarnings('ignore')

# LabelEncoder
from sklearn.preprocessing import LabelEncoder

# Model selection utilities: train/test split, hyperparameter search and cross-validation helpers
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_predict, KFold, RandomizedSearchCV

# Importing regression models
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor

# Importing evaluation metrics
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

# Multi Output Regressor
from sklearn.multioutput import MultiOutputRegressor

# Importing the Dataset

In [None]:
# Mount Google Drive into the Colab runtime so files under /content/drive can be read
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Path of the source CSV on the mounted Drive, followed by loading it into a DataFrame.
# NOTE(review): the absolute Drive path ties the notebook to one account's folder layout;
# consider lifting it into a single configurable constant.
file = "/content/drive/MyDrive/Data Science/S4F AI & GREEN SKILLS/Capstone Project/dataset_original.csv"
data = pd.read_csv(file)

# Feature Engineering

In [None]:
# Categorical encoding: label-encode ship type, fuel type, month and weather conditions.
# Every source column gets a companion "<column>_encoded" column, and the fitted encoder
# is stored under the source column name so the integer codes can be mapped back later.
# NOTE: LabelEncoder numbers the classes in sorted order, so the month codes are
# alphabetical (April = 0, ..., September = 11) rather than chronological.
label_encoders = {}

categorical_features = ['ship_type', 'fuel_type', 'month', 'weather_conditions']

for col in categorical_features:
    le = LabelEncoder()
    data[f"{col}_encoded"] = le.fit_transform(data[col])
    label_encoders[col] = le

    # Integer code -> original label, printed for reference
    code_to_label = {code: label for code, label in enumerate(le.classes_)}
    print(f"Encoded '{col}' using Label Encoding.")
    print(f"Mapping for '{col}': {code_to_label}\n")

Encoded 'ship_type' using Label Encoding.
Mapping for 'ship_type': {0: 'Fishing Trawler', 1: 'Oil Service Boat', 2: 'Surfer Boat', 3: 'Tanker Ship'}

Encoded 'fuel_type' using Label Encoding.
Mapping for 'fuel_type': {0: 'Diesel', 1: 'HFO'}

Encoded 'month' using Label Encoding.
Mapping for 'month': {0: 'April', 1: 'August', 2: 'December', 3: 'February', 4: 'January', 5: 'July', 6: 'June', 7: 'March', 8: 'May', 9: 'November', 10: 'October', 11: 'September'}

Encoded 'weather_conditions' using Label Encoding.
Mapping for 'weather_conditions': {0: 'Calm', 1: 'Moderate', 2: 'Stormy'}



In [None]:
# Preview the first rows to confirm the *_encoded columns were added
data.head()

Unnamed: 0,ship_id,ship_type,route_id,month,distance,fuel_type,fuel_consumption,CO2_emissions,weather_conditions,engine_efficiency,ship_type_encoded,fuel_type_encoded,month_encoded,weather_conditions_encoded
0,NG001,Oil Service Boat,Warri-Bonny,January,132.26,HFO,3779.77,10625.76,Stormy,92.14,1,1,4,2
1,NG001,Oil Service Boat,Port Harcourt-Lagos,February,128.52,HFO,4461.44,12779.73,Moderate,92.98,1,1,3,1
2,NG001,Oil Service Boat,Port Harcourt-Lagos,March,67.3,HFO,1867.73,5353.01,Calm,87.61,1,1,7,0
3,NG001,Oil Service Boat,Port Harcourt-Lagos,April,71.68,Diesel,2393.51,6506.52,Stormy,87.42,1,0,0,2
4,NG001,Oil Service Boat,Lagos-Apapa,May,134.32,HFO,4267.19,11617.03,Calm,85.61,1,1,8,0


# Modeling

In [None]:
# Model inputs (distance plus the label-encoded categoricals) and the two regression targets
features = [
    'distance',
    'fuel_type_encoded',
    'month_encoded',
    'weather_conditions_encoded',
    'ship_type_encoded',
]
targets = ['fuel_consumption', 'CO2_emissions']
X, y = data[features], data[targets]

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

## Random Forest

In [None]:
# Baseline multi-output Random Forest (the wrapper fits one forest per target column)
rf_multi = MultiOutputRegressor(
    RandomForestRegressor(n_estimators=100, random_state=42)
)
rf_multi.fit(X_train, y_train)
rf_preds = rf_multi.predict(X_test)

# Hold-out metrics per target; prediction column i is scored against targets[i]
rf_r2 = r2_score(y_test, rf_preds, multioutput='raw_values')
rf_rmse = []
for i, target in enumerate(targets):
    rf_rmse.append(np.sqrt(mean_squared_error(y_test[target], rf_preds[:, i])))

# Key the metrics by target name so later cells can look them up directly
rf_r2_dict = {target: score for target, score in zip(targets, rf_r2)}
rf_rmse_dict = {target: error for target, error in zip(targets, rf_rmse)}

# Report
print("Random Forest R² Scores:")
for target in targets:
    label = target.replace('_', ' ').title()
    print(f"{label}: {rf_r2_dict[target]:.3f}")

print("\nRandom Forest RMSEs:")
for target in targets:
    label = target.replace('_', ' ').title()
    print(f"{label}: {rf_rmse_dict[target]:.2f}")

Random Forest R² Scores:
Fuel Consumption: 0.957
Co2 Emissions: 0.951

Random Forest RMSEs:
Fuel Consumption: 1074.64
Co2 Emissions: 3201.99


## XGBoost

In [None]:
# Baseline multi-output XGBoost (the wrapper fits one booster per target column)
xgb_multi = MultiOutputRegressor(
    XGBRegressor(n_estimators=100, learning_rate=0.1, random_state=42)
)
xgb_multi.fit(X_train, y_train)
xgb_preds = xgb_multi.predict(X_test)

# Hold-out metrics per target; prediction column i is scored against targets[i]
xgb_r2 = r2_score(y_test, xgb_preds, multioutput='raw_values')
xgb_rmse = []
for i, target in enumerate(targets):
    xgb_rmse.append(np.sqrt(mean_squared_error(y_test[target], xgb_preds[:, i])))

# Key the metrics by target name so later cells can look them up directly
xgb_r2_dict = {target: score for target, score in zip(targets, xgb_r2)}
xgb_rmse_dict = {target: error for target, error in zip(targets, xgb_rmse)}

# Report
print("XGBoost R² Scores:")
for target in targets:
    label = target.replace('_', ' ').title()
    print(f"{label}: {xgb_r2_dict[target]:.3f}")

print("\nXGBoost RMSEs:")
for target in targets:
    label = target.replace('_', ' ').title()
    print(f"{label}: {xgb_rmse_dict[target]:.2f}")

XGBoost R² Scores:
Fuel Consumption: 0.945
Co2 Emissions: 0.941

XGBoost RMSEs:
Fuel Consumption: 1215.91
Co2 Emissions: 3486.30


## Random Forest (Fine Tuned)

In [None]:
# Random Forest search space. The "estimator__" prefix routes each setting to the
# regressor wrapped inside MultiOutputRegressor.
param_grid = dict(
    estimator__n_estimators=[50, 100, 150],
    estimator__max_depth=[None, 10, 20],
    estimator__min_samples_split=[2, 5, 10],
    estimator__max_features=['sqrt', 'log2'],
)

In [None]:
# Exhaustive 5-fold grid search over the Random Forest grid, scored by R² on all CPU cores
grid_search = GridSearchCV(
    estimator=rf_multi,
    param_grid=param_grid,
    scoring='r2',
    cv=5,
    n_jobs=-1,
    verbose=2,
)

In [None]:
# Cross-validate every grid candidate on the training split
grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 54 candidates, totalling 270 fits


In [None]:
# Winning hyper-parameters and the corresponding estimator from the grid search
best_param = grid_search.best_params_
best_rf = grid_search.best_estimator_

# Predictions of the tuned Random Forest on the held-out test split
y_pred = best_rf.predict(X_test)

In [None]:
# Hold-out metrics for the tuned Random Forest; prediction column i is scored against targets[i]
r2_tuned = r2_score(y_test, y_pred, multioutput='raw_values')
rmse_tuned = []
for i, target in enumerate(targets):
    rmse_tuned.append(np.sqrt(mean_squared_error(y_test[target], y_pred[:, i])))

# Metrics keyed by target name for the summary cells
r2_tuned_dict = {target: score for target, score in zip(targets, r2_tuned)}
rmse_tuned_dict = {target: error for target, error in zip(targets, rmse_tuned)}

print("Random Forest (Fine Tuned) R² Scores:")
for target in targets:
    label = target.replace('_', ' ').title()
    print(f"{label}: {r2_tuned_dict[target]:.3f}")

print("\nRandom Forest (Fine Tuned) RMSEs:")
for target in targets:
    label = target.replace('_', ' ').title()
    print(f"{label}: {rmse_tuned_dict[target]:.2f}")

Random Forest (Fine Tuned) R² Scores:
Fuel Consumption: 0.961
Co2 Emissions: 0.956

Random Forest (Fine Tuned) RMSEs:
Fuel Consumption: 1029.47
Co2 Emissions: 3015.95


## XGBoost (Fine Tuned)

In [None]:
# XGBoost search space (this rebinds `param_grid`, previously the Random Forest grid).
# The "estimator__" prefix routes each setting to the regressor wrapped inside
# MultiOutputRegressor.
param_grid = dict(
    estimator__n_estimators=[50, 100, 200],
    estimator__learning_rate=[0.01, 0.1, 0.2],
    estimator__max_depth=[3, 5, 10],
    estimator__subsample=[0.6, 0.8, 1.0],
)

In [None]:
# Randomised 5-fold search: 20 candidates sampled from the XGBoost grid, scored by R²;
# the fixed seed makes the sampled candidates repeatable
random_search = RandomizedSearchCV(
    estimator=xgb_multi,
    param_distributions=param_grid,
    n_iter=20,
    scoring='r2',
    cv=5,
    n_jobs=-1,
    verbose=2,
    random_state=42,
)

In [None]:
# Cross-validate the sampled candidates on the training split
random_search.fit(X_train, y_train)

Fitting 5 folds for each of 20 candidates, totalling 100 fits


In [None]:
# Winning hyper-parameters and estimator from the randomised search.
# Note: `best_param` and `y_pred` are rebound here; they previously held the Random Forest results.
best_param = random_search.best_params_
best_xgb = random_search.best_estimator_

# Predictions of the tuned XGBoost model on the held-out test split
y_pred = best_xgb.predict(X_test)

In [None]:
# Hold-out metrics for the tuned XGBoost model; prediction column i is scored against targets[i].
# NOTE: this rebinds `xgb_r2` / `xgb_rmse`, which held the baseline XGBoost metrics; the
# baseline values remain available through xgb_r2_dict / xgb_rmse_dict.
xgb_r2 = r2_score(y_test, y_pred, multioutput='raw_values')
xgb_rmse = []
for i, target in enumerate(targets):
    xgb_rmse.append(np.sqrt(mean_squared_error(y_test[target], y_pred[:, i])))

# Metrics keyed by target name for the summary cells
xr2_tuned_dict = {target: score for target, score in zip(targets, xgb_r2)}
xrmse_tuned_dict = {target: error for target, error in zip(targets, xgb_rmse)}

print("XGBoost (Fine Tuned) R² Scores:")
for target in targets:
    label = target.replace('_', ' ').title()
    print(f"{label}: {xr2_tuned_dict[target]:.3f}")

print("\nXGBoost (Fine Tuned) RMSEs:")
for target in targets:
    label = target.replace('_', ' ').title()
    print(f"{label}: {xrmse_tuned_dict[target]:.2f}")

XGBoost (Fine Tuned) R² Scores:
Fuel Consumption: 0.957
Co2 Emissions: 0.951

XGBoost (Fine Tuned) RMSEs:
Fuel Consumption: 1070.32
Co2 Emissions: 3182.77


## Model Summary

In [None]:
# Summary table of the hold-out metrics: one row per (model, target) pair.
# The rows are generated in a loop so every model shares the same rounding rule
# (R² to 3 decimals, RMSE to 2 decimals).

# (display name, R² by target, RMSE by target) in presentation order
summary_sources = [
    ('Random Forest', rf_r2_dict, rf_rmse_dict),
    ('XGBoost', xgb_r2_dict, xgb_rmse_dict),
    ('Random Forest (Fine Tuned)', r2_tuned_dict, rmse_tuned_dict),
    ('XGBoost (Fine Tuned)', xr2_tuned_dict, xrmse_tuned_dict),
]

# Target column name -> label shown in the table
target_labels = {
    'fuel_consumption': 'Fuel Consumption',
    'CO2_emissions': 'CO2 Emissions',
}

model_results = {'Model': [], 'Target': [], 'R2 Score': [], 'RMSE': []}
for model_label, r2_by_target, rmse_by_target in summary_sources:
    for target_key, target_label in target_labels.items():
        model_results['Model'].append(model_label)
        model_results['Target'].append(target_label)
        model_results['R2 Score'].append(round(r2_by_target[target_key], 3))
        model_results['RMSE'].append(round(rmse_by_target[target_key], 2))

results_df = pd.DataFrame(model_results)

print("Model Performance Summary:\n")
print(results_df.to_string(index=False))

Model Performance Summary:

                     Model           Target  R2 Score     RMSE
             Random Forest Fuel Consumption     0.957 1074.640
             Random Forest    CO2 Emissions     0.951 3201.990
                   XGBoost Fuel Consumption     0.945 1215.910
                   XGBoost    CO2 Emissions     0.941 3486.300
Random Forest (Fine Tuned) Fuel Consumption     0.961 1029.470
Random Forest (Fine Tuned)    CO2 Emissions     0.956 3015.950
      XGBoost (Fine Tuned) Fuel Consumption     0.957 1070.322
      XGBoost (Fine Tuned)    CO2 Emissions     0.951 3182.769


## Best Model

In [None]:
# Candidates as (display name, fitted estimator, fuel-consumption R², fuel-consumption RMSE).
# NOTE(review): these metrics come from the test split, so choosing a winner with them
# makes the reported test score of that winner optimistic.
models = [
    (
        "Random Forest",
        rf_multi,
        rf_r2_dict['fuel_consumption'],
        rf_rmse_dict['fuel_consumption'],
    ),
    (
        "XGBoost",
        xgb_multi,
        xgb_r2_dict['fuel_consumption'],
        xgb_rmse_dict['fuel_consumption'],
    ),
    (
        "Random Forest (Fine Tuned)",
        best_rf,
        r2_tuned_dict['fuel_consumption'],
        rmse_tuned_dict['fuel_consumption'],
    ),
    (
        "XGBoost (Fine Tuned)",
        best_xgb,
        xr2_tuned_dict['fuel_consumption'],
        xrmse_tuned_dict['fuel_consumption'],
    ),
]

# Highest R² wins; a tie is broken by the lower RMSE (earliest entry on a full tie)
best_model = min(models, key=lambda entry: (-entry[2], entry[3]))

# Confirm
print(f"Best Model: {best_model[0]} with R² = {best_model[2]:.4f}, RMSE = {best_model[3]:.4f}")

Best Model: Random Forest (Fine Tuned) with R² = 0.9606, RMSE = 1029.4705


# Model Accuracy and Validation

In [None]:
# 10-fold splitter with shuffling; the fixed seed makes the fold assignment repeatable
kf = KFold(n_splits=10, shuffle=True, random_state=42)

In [None]:
# Cross-validated and hold-out evaluation of every candidate model.
# cv_results / test_results are keyed by the model's display name.
cv_results = {}
test_results = {}

for model_name, model, *_ in models:
    print(f"Evaluating: {model_name}")

    try:
        # Out-of-fold predictions for the training split
        y_cv_pred = cross_val_predict(model, X_train, y_train, cv=kf)

        # Per-target R² / RMSE over all out-of-fold predictions.
        # mean_squared_error(..., multioutput='raw_values') returns a plain ndarray, so the
        # positional lookups rmse_cv[0] / rmse_cv[1] below are well defined. (DataFrame
        # arithmetic would give a label-indexed Series, where integer-position lookups are
        # deprecated in recent pandas.)
        r2_cv = r2_score(y_train, y_cv_pred, multioutput='raw_values')
        rmse_cv = np.sqrt(mean_squared_error(y_train, y_cv_pred, multioutput='raw_values'))

        # Store results.
        # NOTE: r2_cv holds one value per target, so mean_r2 / std_r2 describe the
        # average and spread across the two targets, not across the CV folds.
        cv_results[model_name] = {
            'r2_scores': r2_cv,
            'rmse_scores': rmse_cv,
            'mean_r2': np.mean(r2_cv),
            'std_r2': np.std(r2_cv)
        }

        print(f" - CV Mean R²: {np.mean(r2_cv):.4f}")
        print(f" - CV RMSE (Fuel): {rmse_cv[0]:.2f}, RMSE (CO₂): {rmse_cv[1]:.2f}")

        # Test set evaluation with the already-fitted model
        y_pred = model.predict(X_test)
        r2_test = r2_score(y_test, y_pred, multioutput='raw_values')
        rmse_test = np.sqrt(mean_squared_error(y_test, y_pred, multioutput='raw_values'))
        # Single MAE figure: the default multioutput setting averages over both targets
        mae_test = mean_absolute_error(y_test, y_pred)

        test_results[model_name] = {
            'r2': r2_test,
            'rmse': rmse_test,
            'mae': mae_test
        }

        print(f" - Test R²: Fuel = {r2_test[0]:.4f}, CO₂ = {r2_test[1]:.4f}")
        print(f" - Test RMSE: Fuel = {rmse_test[0]:.2f}, CO₂ = {rmse_test[1]:.2f}")
        print(f" - Test MAE: {mae_test:.2f}")

    except Exception as e:
        # Best effort: report the failure and continue with the remaining models
        print(f"Failed on {model_name}: {e}")

Evaluating: Random Forest
 - CV Mean R²: 0.9496
 - CV RMSE (Fuel): 1061.45, RMSE (CO₂): 3047.23
 - Test R²: Fuel = 0.9571, CO₂ = 0.9506
 - Test RMSE: Fuel = 1074.64, CO₂ = 3201.99
 - Test MAE: 1252.41
Evaluating: XGBoost
 - CV Mean R²: 0.9412
 - CV RMSE (Fuel): 1170.25, RMSE (CO₂): 3225.05
 - Test R²: Fuel = 0.9451, CO₂ = 0.9415
 - Test RMSE: Fuel = 1215.91, CO₂ = 3486.30
 - Test MAE: 1325.88
Evaluating: Random Forest (Fine Tuned)
 - CV Mean R²: 0.9509
 - CV RMSE (Fuel): 1050.46, RMSE (CO₂): 3001.40
 - Test R²: Fuel = 0.9606, CO₂ = 0.9562
 - Test RMSE: Fuel = 1029.47, CO₂ = 3015.95
 - Test MAE: 1213.26
Evaluating: XGBoost (Fine Tuned)
 - CV Mean R²: 0.9535
 - CV RMSE (Fuel): 1021.67, RMSE (CO₂): 2917.96
 - Test R²: Fuel = 0.9574, CO₂ = 0.9512
 - Test RMSE: Fuel = 1070.32, CO₂ = 3182.77
 - Test MAE: 1286.29


In [None]:
# Cross-validation table: one row per evaluated model
print("Cross-Validation Summary:")
print(f"{'Model':<30} {'Mean R²':<10} {'Std R²':<10}")
for model_name in cv_results:
    cv_scores = cv_results[model_name]
    print(f"{model_name:<30} {cv_scores['mean_r2']:<10.4f} {cv_scores['std_r2']:<10.4f}")

# Hold-out table: per-target RMSE plus the MAE stored for each model
print("\nTest Set Performance Summary:")
print(f"{'Model':<30} {'RMSE (Fuel)':<15} {'RMSE (CO₂)':<15} {'MAE':<10}")
for model_name in test_results:
    rmse = test_results[model_name]['rmse']
    mae = test_results[model_name]['mae']
    print(f"{model_name:<30} {rmse[0]:<15.2f} {rmse[1]:<15.2f} {mae:<10.2f}")

# Winner = lowest fuel-consumption RMSE on the test split (first entry of 'rmse')
best_model_name, best = min(
    test_results.items(),
    key=lambda item: item[1]['rmse'][0],
)

print(f"\nBest Model Based on Fuel RMSE: {best_model_name}")
print(f"RMSE (Fuel): {best['rmse'][0]:.2f}, R² (Fuel): {best['r2'][0]:.4f}, MAE: {best['mae']:.2f}")

Cross-Validation Summary:
Model                          Mean R²    Std R²    
Random Forest                  0.9496     0.0018    
XGBoost                        0.9412     0.0003    
Random Forest (Fine Tuned)     0.9509     0.0015    
XGBoost (Fine Tuned)           0.9535     0.0014    

Test Set Performance Summary:
Model                          RMSE (Fuel)     RMSE (CO₂)      MAE       
Random Forest                  1074.64         3201.99         1252.41   
XGBoost                        1215.91         3486.30         1325.88   
Random Forest (Fine Tuned)     1029.47         3015.95         1213.26   
XGBoost (Fine Tuned)           1070.32         3182.77         1286.29   

Best Model Based on Fuel RMSE: Random Forest (Fine Tuned)
RMSE (Fuel): 1029.47, R² (Fuel): 0.9606, MAE: 1213.26


In [None]:
# Look up the fine-tuned Random Forest wrapper among the candidate models (None if absent)
rft_model = next(
    (candidate for name, candidate, *_ in models if name == 'Random Forest (Fine Tuned)'),
    None,
)

if rft_model is None:
    print("Fine-tuned Random Forest model not found.")
else:
    # Individual fitted regressors, one per target and in the same order as `targets`
    estimators = rft_model.estimators_

    # Rank the input features by importance, separately for each target
    for target, fitted_forest in zip(targets, estimators):
        feature_importance_df = (
            pd.DataFrame({
                'Feature': X_train.columns,
                'Importance': fitted_forest.feature_importances_,
            })
            .sort_values(by='Importance', ascending=False)
        )

        print(f"\nFeature Importance for '{target}':")
        print(feature_importance_df)


Feature Importance for 'fuel_consumption':
                      Feature  Importance
0                    distance    0.696560
4           ship_type_encoded    0.265910
2               month_encoded    0.022141
1           fuel_type_encoded    0.008182
3  weather_conditions_encoded    0.007207

Feature Importance for 'CO2_emissions':
                      Feature  Importance
0                    distance    0.697889
4           ship_type_encoded    0.260635
2               month_encoded    0.024828
1           fuel_type_encoded    0.008332
3  weather_conditions_encoded    0.008315


## Export Model

In [None]:
# Persist the tuned Random Forest wrapper to 'rf_tuned.pkl' in the working directory.
# NOTE(review): this import sits mid-notebook; it would be easier to find in the imports cell at the top.
import joblib

joblib.dump(best_rf,'rf_tuned.pkl')

['rf_tuned.pkl']

# Prediction on Unseen Data

In [None]:
# Unseen Data
new_data = pd.DataFrame({
    'distance': [120],
    'fuel_type_encoded': [1], # HFO
    'month_encoded': [3], # February
    'weather_conditions_encoded': [1], # Moderate
    'ship_type_encoded': [2] # Surfer Boat
})

In [None]:
# Reload the exported model from disk. Note that this rebinds `best_model`, which earlier
# held the (name, estimator, R², RMSE) tuple of the selected model.
# joblib.load unpickles the file, so only load artefacts you created or trust.
best_model = joblib.load('rf_tuned.pkl')

# Make prediction
prediction = best_model.predict(new_data)

In [None]:
# Output result
fuel, co2 = prediction[0]
print(f"Predicted Fuel Consumption: {fuel:.2f} liters")
print(f"Predicted CO₂ Emissions: {co2:.2f} kg")

Predicted Fuel Consumption: 2954.04 liters
Predicted CO₂ Emissions: 8368.63 kg


In [31]:
# Export the dataset, including the added *_encoded columns, to the project folder on Drive.
# index=False keeps the DataFrame's row index out of the CSV.
data.to_csv('/content/drive/MyDrive/Data Science/S4F AI & GREEN SKILLS/Capstone Project/dataset_exported.csv', index=False)