In [44]:
#Importing libraries required for this project
import numpy as np
import pandas as pd
import seaborn as sb
import matplotlib.pyplot as plt
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.linear_model import Ridge
from sklearn.model_selection import KFold, cross_val_score

In [16]:
#Read the ship fuel-efficiency CSV (expected next to the notebook) into a pandas DataFrame
ship_fuel_df = pd.read_csv(filepath_or_buffer="ship_fuel_efficiency.csv")
#Preview the first five records with every column as this cell's output
ship_fuel_df.head(n=5)

Unnamed: 0,ship_id,ship_type,route_id,month,distance,fuel_type,fuel_consumption,CO2_emissions,weather_conditions,engine_efficiency
0,NG001,Oil Service Boat,Warri-Bonny,January,132.26,HFO,3779.77,10625.76,Stormy,92.14
1,NG001,Oil Service Boat,Port Harcourt-Lagos,February,128.52,HFO,4461.44,12779.73,Moderate,92.98
2,NG001,Oil Service Boat,Port Harcourt-Lagos,March,67.3,HFO,1867.73,5353.01,Calm,87.61
3,NG001,Oil Service Boat,Port Harcourt-Lagos,April,71.68,Diesel,2393.51,6506.52,Stormy,87.42
4,NG001,Oil Service Boat,Lagos-Apapa,May,134.32,HFO,4267.19,11617.03,Calm,85.61


In [17]:
#Count the missing values in each column (all zeros means no imputation is needed)
print(ship_fuel_df.isna().sum())

ship_id               0
ship_type             0
route_id              0
month                 0
distance              0
fuel_type             0
fuel_consumption      0
CO2_emissions         0
weather_conditions    0
engine_efficiency     0
dtype: int64


In [18]:
#Data Cleaning
#Drop ship_id: it is an identifier and is not used in the analysis of this project.
#errors='ignore' keeps this cell re-runnable: ship_fuel_df is reassigned from itself, so on a
#second execution ship_id is already gone and a plain drop would raise KeyError.
ship_fuel_df = ship_fuel_df.drop(columns=['ship_id'], errors='ignore')
ship_fuel_df.head()

Unnamed: 0,ship_type,route_id,month,distance,fuel_type,fuel_consumption,CO2_emissions,weather_conditions,engine_efficiency
0,Oil Service Boat,Warri-Bonny,January,132.26,HFO,3779.77,10625.76,Stormy,92.14
1,Oil Service Boat,Port Harcourt-Lagos,February,128.52,HFO,4461.44,12779.73,Moderate,92.98
2,Oil Service Boat,Port Harcourt-Lagos,March,67.3,HFO,1867.73,5353.01,Calm,87.61
3,Oil Service Boat,Port Harcourt-Lagos,April,71.68,Diesel,2393.51,6506.52,Stormy,87.42
4,Oil Service Boat,Lagos-Apapa,May,134.32,HFO,4267.19,11617.03,Calm,85.61


In [None]:
#One-hot encoding for categorical variables (ship_type, route_id, month, fuel_type, weather_conditions)
#Standard scaling for numerical variables
#NOTE(review): the transformer below is fitted on the whole data set, and numcolumn contains
#CO2_emissions, which a later cell uses as the regression target. The train/test split happens
#after this cell, so the scaler statistics include the test rows and the target ends up in
#standardized units -- consider splitting first and fitting on the training rows only.

#Make a list for each variable type (categorical and numerical)
catcolumn = ['ship_type','route_id', 'month', 'fuel_type', 'weather_conditions']
numcolumn = ['distance', 'fuel_consumption', 'CO2_emissions', 'engine_efficiency']

#Column transformer that scales the numerical columns and one-hot encodes the categorical ones.
#sparse_threshold=0 forces a dense numpy array: OneHotEncoder produces sparse output, and with the
#default threshold the stacked result becomes a scipy sparse matrix as soon as the overall density
#drops below it, which would break the DataFrame construction further down.
processor = ColumnTransformer(
    transformers=[("num", StandardScaler(), numcolumn), ('cat', OneHotEncoder(), catcolumn)],
    sparse_threshold=0,
)

#Apply scaling and one-hot encoding in one shot (the result is a numpy array, not a pandas df)
ship_fuel_processed_np = processor.fit_transform(ship_fuel_df)

#Get the output column names (the transformer prefixes them with "num__" / "cat__")
features = processor.get_feature_names_out()

#Convert back to a pandas df, keeping the row labels of the input frame
ship_fuel_processed_df = pd.DataFrame(
    ship_fuel_processed_np,
    columns=features,
    index=ship_fuel_df.index,
)

In [None]:
#Show the column labels after preprocessing; they differ from the raw CSV headers
processed_column_index = ship_fuel_processed_df.columns
print(processed_column_index)

Index(['num__distance', 'num__fuel_consumption', 'num__CO2_emissions',
       'num__engine_efficiency', 'cat__ship_type_Fishing Trawler',
       'cat__ship_type_Oil Service Boat', 'cat__ship_type_Surfer Boat',
       'cat__ship_type_Tanker Ship', 'cat__route_id_Escravos-Lagos',
       'cat__route_id_Lagos-Apapa', 'cat__route_id_Port Harcourt-Lagos',
       'cat__route_id_Warri-Bonny', 'cat__month_April', 'cat__month_August',
       'cat__month_December', 'cat__month_February', 'cat__month_January',
       'cat__month_July', 'cat__month_June', 'cat__month_March',
       'cat__month_May', 'cat__month_November', 'cat__month_October',
       'cat__month_September', 'cat__fuel_type_Diesel', 'cat__fuel_type_HFO',
       'cat__weather_conditions_Calm', 'cat__weather_conditions_Moderate',
       'cat__weather_conditions_Stormy'],
      dtype='object')


In [43]:
#Build the modelling inputs and hold out 30% of the rows for testing
#Target (y): the CO2 emissions column; inputs (X): every remaining processed column
y = ship_fuel_processed_df['num__CO2_emissions']
X = ship_fuel_processed_df.drop(columns=['num__CO2_emissions'])

#70/30 train/test split; the fixed random_state makes the partition reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)
print("Shape of X_train: ", X_train.shape)
print("Shape of X_test: ", X_test.shape)

Shape of X_train:  (1008, 28)
Shape of X_test:  (432, 28)


In [45]:
#Ordinary least-squares baseline: fit on the training rows, predict the held-out rows
LinearModel = LinearRegression().fit(X_train, y_train)
y_pred = LinearModel.predict(X_test)

#Hold-out metrics (MAE, MSE, R^2). y is the StandardScaler-transformed CO2 column, so the
#error values are expressed in standardized units rather than the original emission units.
mae = mean_absolute_error(y_true=y_test, y_pred=y_pred)
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
r2 = r2_score(y_true=y_test, y_pred=y_pred)

print(f"r2: {r2}, mse: {mse}, mae: {mae}")

#Fitted parameters; the coefficients follow the column order of X_train
print(f"Model Coeff: {LinearModel.coef_}")
print(f"Model Intercept: {LinearModel.intercept_}")

r2: 0.9950588226958281, mse: 0.005196433188175669, mae: 0.04492861097840247
Model Coeff: [ 4.33945257e-03  9.94377873e-01  3.55546094e-03  1.46396095e-03
 -2.15989526e-03  2.48454017e-03 -1.78860586e-03  4.17292083e-03
 -4.03668540e-03 -5.94416434e-03  5.80792890e-03  4.44951290e-04
  1.34670571e-04  2.08488355e-03  6.73251737e-03  7.81170065e-03
 -4.23708020e-03 -5.96126698e-03  1.93685551e-03  9.12261910e-03
 -6.22728102e-03 -8.02348759e-03 -3.81908226e-03 -2.61596685e-03
  2.61596685e-03  2.19277946e-03 -2.59127998e-03  3.98500519e-04]
Model Intercept: 0.0006629829546923281
