In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LinearRegression,LogisticRegression
from sklearn.metrics import mean_absolute_error,mean_squared_error,r2_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor #predicting continuous numeric values
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor

Build a machine learning pipeline to predict the **Fuel Efficiency (km/l)** of a car from its **Engine Size**, **Horsepower**, and **Fuel Type**.

In [2]:
# Toy dataset: ten cars with three predictors and the fuel-efficiency target.
# HorsePower deliberately contains one missing value (np.nan) so that the
# numeric imputer in the pipeline has something to fill.
# NOTE(review): 'CNg' looks like a typo for 'CNG' -- confirm before changing,
# since it is a category label the encoder sees at runtime.
data = pd.DataFrame(
    {
        "EngineSize": [1.0, 1.2, 1.4, 1.6, 1.8, 2.0, 2.0, 2.2, 2.4, 2.6],
        "HorsePower": [68, 75, 90, 100, 110, 115, np.nan, 130, 150, 160],
        "FuelType": [
            "Petrol", "Diesel", "Petrol", "Diesel", "Petrol",
            "Petrol", "CNg", "Diesel", "Petrol", "Diesel",
        ],
        "FuelEfficiency": [20, 22, 19, 25, 18, 21, 23, 26, 17, 24],
    }
)
data

Unnamed: 0,EngineSize,HorsePower,FuelType,FuelEfficiency
0,1.0,68.0,Petrol,20
1,1.2,75.0,Diesel,22
2,1.4,90.0,Petrol,19
3,1.6,100.0,Diesel,25
4,1.8,110.0,Petrol,18
5,2.0,115.0,Petrol,21
6,2.0,,CNg,23
7,2.2,130.0,Diesel,26
8,2.4,150.0,Petrol,17
9,2.6,160.0,Diesel,24


In [3]:
# Separate the target column from the three predictor columns.
y = data["FuelEfficiency"]
x = data.loc[:, ["EngineSize", "HorsePower", "FuelType"]]

In [4]:
# Column groups routed to each preprocessing branch.
num = ["EngineSize", "HorsePower"]
cat = ["FuelType"]

# Numeric branch: fill missing values with the column mean, then standardize.
numeric = Pipeline(steps=[
    ("i", SimpleImputer(strategy="mean")),
    ("s", StandardScaler()),
])

# Categorical branch: fill missing values with the most frequent label, then
# one-hot encode the strings. handle_unknown="ignore" keeps transform from
# failing on a label that was not present when the encoder was fitted.
categorical = Pipeline(steps=[
    ("i", SimpleImputer(strategy="most_frequent")),
    ("ohe", OneHotEncoder(handle_unknown="ignore")),
])

In [5]:
# Apply the numeric branch to the numeric columns and the categorical branch
# to the categorical column, concatenating the results into one feature matrix.
preprocess = ColumnTransformer(
    transformers=[
        ("num", numeric, num),
        ("cat", categorical, cat),
    ]
)

In [6]:
# Full pipeline: preprocessing followed by a single estimator.
# Exactly one "model" step may be active at a time; the commented-out entries
# are the alternatives whose results are recorded in the notes at the end of
# the notebook. Swap the active line to reproduce them.
pipe = Pipeline(
    steps=[
        ("prep", preprocess),
        ("model", KNeighborsRegressor()),
        # ("model", GradientBoostingRegressor(n_estimators=100, random_state=42, max_depth=3)),
        # ("model", LinearRegression()),
        # ("model", RandomForestRegressor(n_estimators=100, random_state=42)),
        # ("model", DecisionTreeRegressor(random_state=42)),
        # ("model", SVR(kernel='rbf', gamma=0.1)),
        # ("model", LogisticRegression()),  # classifier, not a regressor
    ]
)

In [7]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split
# reproducible across runs.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit preprocessing and model together on the training rows only.
pipe.fit(x_train, y_train)

In [8]:
# Predict fuel efficiency for the held-out rows; the fitted pipeline applies
# the same preprocessing before calling the model.
y_pred = pipe.predict(x_test)
print(y_pred)

[22.4 21. ]


In [9]:
# Score the held-out predictions with the standard regression metrics.
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)  # root of MSE, so it is in the target's units (km/l)
r2 = r2_score(y_test, y_pred)

# NOTE: the test split is 20% of 10 rows, i.e. only 2 samples, so these
# scores (r2 in particular) are very noisy and can easily be negative.
print("mae", round(mae, 2))
print("mse", round(mse, 2))
print("rmse", round(rmse, 2))
print("r2", round(r2, 2))

mae 3.2
mse 15.08
rme 3.88
r2 -1.41


Result of Linear Regression =
mae 2.72
mse 8.05
rmse 2.84
r2 -0.29

Result of Random Forest Regression = mae 3.09
mse 16.3
rmse 4.04
r2 -1.61

Result of Decision Tree = mae 3.5
mse 12.5
rmse 3.54
r2 -1.0

Result of SVR =
mae 3.03
mse 17.2
rmse 4.15
r2 -1.75

Result of Logistic Regression (a classifier, so not a suitable model for this continuous target; included only for comparison) =
mae 5.0
mse 29.0
rmse 5.39
r2 -3.64

Result of KNN =
mae 4.0
mse 27.11
rmse 5.21
r2 -3.34

Result of Gradient Boosting =
mae 3.85
mse 15.56
rmse 3.95
r2 -1.49

Results with another dataset

Linear = mae 2.31
mse 10.37
rmse 3.22
r2 -0.66

Logistic = mae 5.0
mse 29.0
rmse 5.39
r2 -3.64

Random Forest = mae 2.89
mse 10.14
rmse 3.18
r2 -0.62

Decision Tree = mae 3.5
mse 12.5
rmse 3.54
r2 -1.0

Gradient Boosting = mae 3.5
mse 12.5
rmse 3.54
r2 -1.0

SVR = mae 3.19
mse 16.44
rmse 4.06
r2 -1.63

KNN = mae 3.2
mse 15.08
rmse 3.88
r2 -1.41