In [44]:
#Importing libraries required for this project
import numpy as np
import pandas as pd
import seaborn as sb
import matplotlib.pyplot as plt
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.linear_model import Ridge
from sklearn.model_selection import KFold, cross_val_score
from sklearn.pipeline import make_pipeline

In [45]:
#Load data from csv file to a data frame
ship_fuel_df = pd.read_csv("ship_fuel_efficiency.csv")
#print out the first 5 rows and all the columns
ship_fuel_df.head()

Unnamed: 0,ship_id,ship_type,route_id,month,distance,fuel_type,fuel_consumption,CO2_emissions,weather_conditions,engine_efficiency
0,NG001,Oil Service Boat,Warri-Bonny,January,132.26,HFO,3779.77,10625.76,Stormy,92.14
1,NG001,Oil Service Boat,Port Harcourt-Lagos,February,128.52,HFO,4461.44,12779.73,Moderate,92.98
2,NG001,Oil Service Boat,Port Harcourt-Lagos,March,67.3,HFO,1867.73,5353.01,Calm,87.61
3,NG001,Oil Service Boat,Port Harcourt-Lagos,April,71.68,Diesel,2393.51,6506.52,Stormy,87.42
4,NG001,Oil Service Boat,Lagos-Apapa,May,134.32,HFO,4267.19,11617.03,Calm,85.61


In [46]:
#Check for Nulls
print(pd.isnull(ship_fuel_df).sum())

ship_id               0
ship_type             0
route_id              0
month                 0
distance              0
fuel_type             0
fuel_consumption      0
CO2_emissions         0
weather_conditions    0
engine_efficiency     0
dtype: int64


In [47]:
#Data Cleaning
#Dropping ship_id since this is not required for the data analysis of this project
ship_fuel_df = ship_fuel_df.drop(columns=['ship_id'])
ship_fuel_df.head()

Unnamed: 0,ship_type,route_id,month,distance,fuel_type,fuel_consumption,CO2_emissions,weather_conditions,engine_efficiency
0,Oil Service Boat,Warri-Bonny,January,132.26,HFO,3779.77,10625.76,Stormy,92.14
1,Oil Service Boat,Port Harcourt-Lagos,February,128.52,HFO,4461.44,12779.73,Moderate,92.98
2,Oil Service Boat,Port Harcourt-Lagos,March,67.3,HFO,1867.73,5353.01,Calm,87.61
3,Oil Service Boat,Port Harcourt-Lagos,April,71.68,Diesel,2393.51,6506.52,Stormy,87.42
4,Oil Service Boat,Lagos-Apapa,May,134.32,HFO,4267.19,11617.03,Calm,85.61


In [57]:
#One hot encoding for categorial variables (ship_type, route_id, month, fuel_type, weather_conditions)
#Scaling for numerical variables
#Separate the data into inputs (X) and output (y) 
X = ship_fuel_df.drop('CO2_emissions', axis=1)
y = ship_fuel_df['CO2_emissions']

#Make a list for each variable type (Categorical and Numerical)
catcolumn = ['ship_type','route_id', 'month', 'fuel_type', 'weather_conditions']
numcolumn = ['distance', 'fuel_consumption', 'engine_efficiency']

#Creation of a column tranformer to scale numerical values and one hot endode the categorial values
transformer = ColumnTransformer(transformers=[("num", MinMaxScaler(), numcolumn), ('cat', OneHotEncoder(drop='first',sparse_output=False), catcolumn)])

#Using the transformer to apply scaling and one hot encoding in one shot (the type changes to numpy from pandas df)
X_np = transformer.fit_transform(X)

num_cols_transformed = numcolumn

# Getting categorical feature names after one-hot encoding
cat_cols_transformed = transformer.named_transformers_['cat'].get_feature_names_out(catcolumn)

# Combine both sets of columns
all_cols = np.concatenate([num_cols_transformed, cat_cols_transformed])

# Convert numpy array back to DataFrame WITH column names
X_df = pd.DataFrame(X_np, columns=all_cols)

In [58]:
#Split the data into train and test set (7:3)
X_train, X_test, y_train, y_test = train_test_split(X_df, y, test_size=0.3, random_state=42)
print("Shape of X_train: ", X_train.shape)
print("Shape of X_test: ", X_test.shape)

Shape of X_train:  (1008, 23)
Shape of X_test:  (432, 23)


In [59]:
#Linear Regression
LinearModel = LinearRegression()
LinearModel.fit(X_train, y_train)
y_pred = LinearModel.predict(X_test)

#Calculation of R^2, MSE, MAE
Lr2 = r2_score(y_test, y_pred)
Lmse = mean_squared_error(y_test, y_pred)
Lmae = mean_absolute_error(y_test, y_pred)

print(f"r2: {Lr2}, mse: {Lmse}, mae: {Lmae}")

r2: 0.9950588226958281, mse: 955901.0104059286, mae: 609.3639790767888


In [60]:
#Generatng alpha from 0 to 10 in steps of 0.001
alpha = np.arange(0,10.001,0.001)
alpha = alpha.tolist()
results = {}
#for loop to test out all the alpha values to find the best value
for a in alpha:
    #Ridge Regression
    RidgeModel = Ridge(alpha=a)
    RidgeModel.fit(X_train, y_train)
    y_pred = RidgeModel.predict(X_test)

    #Calculation of R^2, MSE, MAE
    Rr2 = r2_score(y_test, y_pred)
    Rmse = mean_squared_error(y_test, y_pred)
    Rmae = mean_absolute_error(y_test, y_pred)

    results[a] = {'R2': Rr2, 'MSE': Rmse, 'MAE': Rmae}

#Output the best performing alpha and its performance matrics 
best_alpha_val = max(results, key=lambda a: results[a]['R2'])
print("Best alpha: ", best_alpha_val)
print("Performance for best alpha: ", results[best_alpha_val])

Best alpha:  0.0
Performance for best alpha:  {'R2': 0.9950588226958281, 'MSE': 955901.0104059273, 'MAE': 609.3639790767875}
