In [378]:
import numpy as np
import pandas as pd 

from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.ensemble import BaggingRegressor, RandomForestRegressor, GradientBoostingRegressor

from warnings import filterwarnings
filterwarnings('ignore')

# Data Preprocessing

In [379]:
dataframe = pd.read_csv("CarDataFrame.csv")
df = dataframe.copy()
df

Unnamed: 0.1,Unnamed: 0,Title,Useable Battery,0 - 100,Top Speed,Range,Efficiency,Fastcharge,Price in Germany,Price in Netherlands,Price in UK
0,0,Mini Cooper SE,49 kwh,6.7 sec,170 km/h,310 km,158 Wh/km,440 km/h,"€36,900","* €40,000","£34,500"
1,1,Mini Cooper E,37 kwh,7.3 sec,160 km/h,235 km,157 Wh/km,360 km/h,"* €33,000","* €35,000","£30,000"
2,2,BMW iX1 eDrive20,64.7 kwh,8.3 sec,170 km/h,385 km,168 Wh/km,550 km/h,"€47,900","* €50,000","£44,560"
3,3,Mini Countryman SE ALL4,64.7 kwh,5.6 sec,180 km/h,370 km,175 Wh/km,530 km/h,"* €50,000","* €53,000","£47,180"
4,4,Mini Countryman E,64.7 kwh,8.6 sec,170 km/h,380 km,170 Wh/km,550 km/h,"* €43,500","* €46,000","£42,080"
...,...,...,...,...,...,...,...,...,...,...,...
360,360,Tesla Model Y Performance,75 kwh,3.7 sec,250 km/h,415 km,181 Wh/km,640 km/h,"€63,667","€60,993","£59,990"
361,361,Audi Q4 e-tron 35,52 kwh,9.0 sec,160 km/h,285 km,182 Wh/km,360 km/h,,"€52,105",
362,362,Renault Zoe ZE50 R110,52 kwh,11.4 sec,135 km/h,315 km,165 Wh/km,230 km/h,"€36,840","€34,895",
363,363,Audi e-tron GT RS,85 kwh,3.3 sec,250 km/h,405 km,210 Wh/km,1000 km/h,"€146,050","€154,835","£119,000"


In [380]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 365 entries, 0 to 364
Data columns (total 11 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   Unnamed: 0            365 non-null    int64 
 1   Title                 365 non-null    object
 2   Useable Battery       365 non-null    object
 3   0 - 100               365 non-null    object
 4   Top Speed             365 non-null    object
 5   Range                 365 non-null    object
 6   Efficiency            365 non-null    object
 7   Fastcharge            365 non-null    object
 8   Price in Germany      326 non-null    object
 9   Price in Netherlands  312 non-null    object
 10  Price in UK           229 non-null    object
dtypes: int64(1), object(10)
memory usage: 31.5+ KB


In [381]:
df = df.dropna()
df = df.drop(["Unnamed: 0", "Price in UK", "Price in Netherlands", "Title"], axis=1)
df

Unnamed: 0,Useable Battery,0 - 100,Top Speed,Range,Efficiency,Fastcharge,Price in Germany
0,49 kwh,6.7 sec,170 km/h,310 km,158 Wh/km,440 km/h,"€36,900"
1,37 kwh,7.3 sec,160 km/h,235 km,157 Wh/km,360 km/h,"* €33,000"
2,64.7 kwh,8.3 sec,170 km/h,385 km,168 Wh/km,550 km/h,"€47,900"
3,64.7 kwh,5.6 sec,180 km/h,370 km,175 Wh/km,530 km/h,"* €50,000"
4,64.7 kwh,8.6 sec,170 km/h,380 km,170 Wh/km,550 km/h,"* €43,500"
...,...,...,...,...,...,...,...
356,90 kwh,12.1 sec,160 km/h,310 km,290 Wh/km,310 km/h,"€71,626"
357,28.5 kwh,8.3 sec,145 km/h,170 km,168 Wh/km,180 km/h,"€39,900"
358,52 kwh,9.5 sec,140 km/h,310 km,168 Wh/km,230 km/h,"€37,840"
360,75 kwh,3.7 sec,250 km/h,415 km,181 Wh/km,640 km/h,"€63,667"


In [382]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 197 entries, 0 to 363
Data columns (total 7 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   Useable Battery   197 non-null    object
 1   0 - 100           197 non-null    object
 2   Top Speed         197 non-null    object
 3   Range             197 non-null    object
 4   Efficiency        197 non-null    object
 5   Fastcharge        197 non-null    object
 6   Price in Germany  197 non-null    object
dtypes: object(7)
memory usage: 12.3+ KB


In [383]:
df["Useable Battery"] = df["Useable Battery"].apply(lambda x: float(x[0:-4]))
df["0 - 100"] = df["0 - 100"].apply(lambda x: float(x[0:-4]))
df["Top Speed"] = df["Top Speed"].apply(lambda x: float(x[0:-5]))
df["Range"] = df["Range"].apply(lambda x: float(x[0:-3]))
df["Efficiency"] = df["Efficiency"].apply(lambda x: float(x[0:-6]))
df["Fastcharge"] = df["Fastcharge"].apply(lambda x: float(x[0:-5]))
df["Price in Germany"] = df["Price in Germany"].apply(lambda x: int(x.replace("*", "").strip()[1:].replace(",", "")))

df

Unnamed: 0,Useable Battery,0 - 100,Top Speed,Range,Efficiency,Fastcharge,Price in Germany
0,49.0,6.7,170.0,310.0,158.0,440.0,36900
1,37.0,7.3,160.0,235.0,157.0,360.0,33000
2,64.7,8.3,170.0,385.0,168.0,550.0,47900
3,64.7,5.6,180.0,370.0,175.0,530.0,50000
4,64.7,8.6,170.0,380.0,170.0,550.0,43500
...,...,...,...,...,...,...,...
356,90.0,12.1,160.0,310.0,290.0,310.0,71626
357,28.5,8.3,145.0,170.0,168.0,180.0,39900
358,52.0,9.5,140.0,310.0,168.0,230.0,37840
360,75.0,3.7,250.0,415.0,181.0,640.0,63667


# Machine Learning

In [384]:
df = df
y_variable = "Price in Germany"

In [385]:
y = df[y_variable]
X = df.drop(y_variable, axis=1)
test_size = 0.20

# Train Test Split

In [386]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=42)

# Linear Regression

In [387]:
lr_model = LinearRegression().fit(X_train, y_train)

In [388]:
print("Mean Squared Error - Train:", mean_squared_error(y_train, lr_model.predict(X_train)))
print("Mean Squared Error - Test:", mean_squared_error(y_test, lr_model.predict(X_test)))
print("Cross Validation Score:", np.mean(cross_val_score(lr_model, X_train, y_train, cv=10)))
print("R2 Score:", r2_score(y_test, lr_model.predict(X_test)))

Mean Squared Error - Train: 301554733.79504985
Mean Squared Error - Test: 2287297202.930994
Cross Validation Score: 0.6841856799210113
R2 Score: 0.40278021519574236


# Ridge

In [389]:
ridge_model = Ridge().fit(X_train, y_train)

In [390]:
print("Mean Squared Error - Train:", mean_squared_error(y_train, ridge_model.predict(X_train)))
print("Mean Squared Error - Test:", mean_squared_error(y_test, ridge_model.predict(X_test)))
print("Cross Validation Score:", np.mean(cross_val_score(ridge_model, X_train, y_train, cv=10)))
print("R2 Score:", r2_score(y_test, ridge_model.predict(X_test)))

Mean Squared Error - Train: 301554784.9810159
Mean Squared Error - Test: 2287420016.8729844
Cross Validation Score: 0.6842866207859105
R2 Score: 0.40274814812727733


# Model Tuning - Ridge

In [391]:
ridge_model_params = {"alpha":[1,2,3,4,5]}
tuned_ridge_model = GridSearchCV(ridge_model, ridge_model_params, cv = 10, n_jobs=-1).fit(X_train,y_train)

In [392]:
print("Best Params:", tuned_ridge_model.best_params_)
print("Mean Squared Error - Train:", mean_squared_error(y_train, tuned_ridge_model.predict(X_train)))
print("Mean Squared Error - Test:", mean_squared_error(y_test, tuned_ridge_model.predict(X_test)))
print("R2 Score:", r2_score(y_test, tuned_ridge_model.predict(X_test)))

Best Params: {'alpha': 5}
Mean Squared Error - Train: 301555955.016799
Mean Squared Error - Test: 2287898909.4350343
R2 Score: 0.40262310792153344


# Lasso

In [393]:
lasso_model = Lasso().fit(X_train, y_train)

In [394]:
print("Mean Squared Error - Train:", mean_squared_error(y_train, lasso_model.predict(X_train)))
print("Mean Squared Error - Test:", mean_squared_error(y_test, lasso_model.predict(X_test)))
print("Cross Validation Score:", np.mean(cross_val_score(lasso_model, X_train, y_train, cv=10)))
print("R2 Score:", r2_score(y_test, lasso_model.predict(X_test)))

Mean Squared Error - Train: 301554734.88365686
Mean Squared Error - Test: 2287315545.000069
Cross Validation Score: 0.6841920608323871
R2 Score: 0.4027754260295021


# Model Tuning - Lasso

In [395]:
lasso_model_params = {"alpha":[1,2,3,4,5], "precompute":[False,True]}
tuned_lasso_model = GridSearchCV(lasso_model, lasso_model_params, cv = 10, n_jobs=-1).fit(X_train,y_train)

In [396]:
print("Best Params:", tuned_lasso_model.best_params_)
print("Mean Squared Error - Train:", mean_squared_error(y_train, tuned_lasso_model.predict(X_train)))
print("Mean Squared Error - Test:", mean_squared_error(y_test, tuned_lasso_model.predict(X_test)))
print("R2 Score:", r2_score(y_test, tuned_lasso_model.predict(X_test)))

Best Params: {'alpha': 5, 'precompute': True}
Mean Squared Error - Train: 301554761.03703296
Mean Squared Error - Test: 2287389073.6265154
R2 Score: 0.40275622749666296


# ElasticNet

In [397]:
elasticnet_model = ElasticNet().fit(X_train, y_train)

In [398]:
print("Mean Squared Error - Train:", mean_squared_error(y_train, elasticnet_model.predict(X_train)))
print("Mean Squared Error - Test:", mean_squared_error(y_test, elasticnet_model.predict(X_test)))
print("Cross Validation Score:", np.mean(cross_val_score(elasticnet_model, X_train, y_train, cv=10)))
print("R2 Score:", r2_score(y_test, elasticnet_model.predict(X_test)))

Mean Squared Error - Train: 301703446.6322094
Mean Squared Error - Test: 2294241699.0162983
Cross Validation Score: 0.6888601387825926
R2 Score: 0.40096698757830607


# Model Tuning - ElasticNet

In [399]:
elasticnet_model_params = {"alpha":[1,2,3,4,5], "precompute":[False,True]}
tuned_elasticnet_model = GridSearchCV(elasticnet_model, elasticnet_model_params, cv = 10, n_jobs=-1).fit(X_train,y_train)

In [400]:
print("Best Params:", tuned_elasticnet_model.best_params_)
print("Mean Squared Error - Train:", mean_squared_error(y_train, tuned_elasticnet_model.predict(X_train)))
print("Mean Squared Error - Test:", mean_squared_error(y_test, tuned_elasticnet_model.predict(X_test)))
print("R2 Score:", r2_score(y_test, tuned_elasticnet_model.predict(X_test)))

Best Params: {'alpha': 5, 'precompute': False}
Mean Squared Error - Train: 302279932.87436855
Mean Squared Error - Test: 2304502548.4620185
R2 Score: 0.39828785069568784


# KNN

In [401]:
knn_model = KNeighborsRegressor().fit(X_train, y_train)

In [402]:
print("Mean Squared Error - Train:", mean_squared_error(y_train, knn_model.predict(X_train)))
print("Mean Squared Error - Test:", mean_squared_error(y_test, knn_model.predict(X_test)))
print("Cross Validation Score:", np.mean(cross_val_score(knn_model, X_train, y_train, cv=10)))
print("R2 Score:", r2_score(y_test, knn_model.predict(X_test)))

Mean Squared Error - Train: 322981844.9235669
Mean Squared Error - Test: 2437707066.4830008
Cross Validation Score: 0.5145462678855944
R2 Score: 0.3635077733688299


# Model Tuning - KNN

In [403]:
knn_params = {'n_neighbors': np.arange(1,30,1)}
tuned_knn_model = GridSearchCV(knn_model, knn_params, cv = 10, n_jobs=-1).fit(X_train,y_train)

In [404]:
print("Best Params:", tuned_knn_model.best_params_)
print("Mean Squared Error - Train:", mean_squared_error(y_train, tuned_knn_model.predict(X_train)))
print("Mean Squared Error - Test:", mean_squared_error(y_test, tuned_knn_model.predict(X_test)))
print("R2 Score:", r2_score(y_test, tuned_knn_model.predict(X_test)))

Best Params: {'n_neighbors': 2}
Mean Squared Error - Train: 110858454.6799363
Mean Squared Error - Test: 2689118669.90625
R2 Score: 0.297863490893722


# SVR

In [405]:
svr_model = SVR().fit(X_train, y_train)

In [406]:
print("Mean Squared Error - Train:", mean_squared_error(y_train, svr_model.predict(X_train)))
print("Mean Squared Error - Test:", mean_squared_error(y_test, svr_model.predict(X_test)))
print("Cross Validation Score:", np.mean(cross_val_score(svr_model, X_train, y_train, cv=10)))
print("R2 Score:", r2_score(y_test, svr_model.predict(X_test)))

Mean Squared Error - Train: 1645083865.7613447
Mean Squared Error - Test: 4647868638.952452
Cross Validation Score: -0.16960710260565984
R2 Score: -0.21357167962111334


# Model Tuning - SVR

In [407]:
svr_params = {"C": np.arange(0.1,2,0.1), "kernel": ["linear","rbf"]}
tuned_svr_model = GridSearchCV(svr_model, svr_params, cv = 10, n_jobs=-1).fit(X_train,y_train)

In [408]:
print("Best Params:", tuned_svr_model.best_params_)
print("Mean Squared Error - Train:", mean_squared_error(y_train, tuned_svr_model.predict(X_train)))
print("Mean Squared Error - Test:", mean_squared_error(y_test, tuned_svr_model.predict(X_test)))
print("R2 Score:", r2_score(y_test, tuned_svr_model.predict(X_test)))

Best Params: {'C': 1.9000000000000001, 'kernel': 'linear'}
Mean Squared Error - Train: 345447588.6039668
Mean Squared Error - Test: 2494474896.495075
R2 Score: 0.34868553200020846


# CART

In [409]:
cart_model = DecisionTreeRegressor().fit(X_train, y_train)

In [410]:
print("Mean Squared Error - Train:", mean_squared_error(y_train, cart_model.predict(X_train)))
print("Mean Squared Error - Test:", mean_squared_error(y_test, cart_model.predict(X_test)))
print("Cross Validation Score:", np.mean(cross_val_score(cart_model, X_train, y_train, cv=10)))
print("R2 Score:", r2_score(y_test, cart_model.predict(X_test)))

Mean Squared Error - Train: 1320840.059447983
Mean Squared Error - Test: 2686103405.65
Cross Validation Score: 0.6023051215181456
R2 Score: 0.2986507849401354


# Model Tuning - CART

In [411]:
cart_params = {"max_depth": np.arange(1,10),
	        "min_samples_split": np.arange(1,10),
            "max_leaf_nodes": np.arange(1,10)}
tuned_cart_model = GridSearchCV(cart_model, cart_params, cv = 10, n_jobs=-1).fit(X_train,y_train)

In [412]:
print("Best Params:", tuned_cart_model.best_params_)
print("Mean Squared Error - Train:", mean_squared_error(y_train, tuned_cart_model.predict(X_train)))
print("Mean Squared Error - Test:", mean_squared_error(y_test, tuned_cart_model.predict(X_test)))
print("R2 Score:", r2_score(y_test, tuned_cart_model.predict(X_test)))

Best Params: {'max_depth': 7, 'max_leaf_nodes': 9, 'min_samples_split': 2}
Mean Squared Error - Train: 113306301.67774312
Mean Squared Error - Test: 2499756861.001142
R2 Score: 0.347306396091847


# Bagging

In [413]:
bagging_model = BaggingRegressor().fit(X_train, y_train)

In [414]:
print("Mean Squared Error - Train:", mean_squared_error(y_train, bagging_model.predict(X_train)))
print("Mean Squared Error - Test:", mean_squared_error(y_test, bagging_model.predict(X_test)))
print("Cross Validation Score:", np.mean(cross_val_score(bagging_model, X_train, y_train, cv=10)))
print("R2 Score:", r2_score(y_test, bagging_model.predict(X_test)))

Mean Squared Error - Train: 56886940.64879919
Mean Squared Error - Test: 2496442438.3740625
Cross Validation Score: 0.7573051230949108
R2 Score: 0.34817180123708047


# Model Tuning - Bagging

In [415]:
bagging_params = {"n_estimators" : [10,50,100,200]}
tuned_bagging_model = GridSearchCV(bagging_model, bagging_params, cv = 10, n_jobs=-1).fit(X_train,y_train)

In [416]:
print("Best Params:", tuned_bagging_model.best_params_)
print("Mean Squared Error - Train:", mean_squared_error(y_train, tuned_bagging_model.predict(X_train)))
print("Mean Squared Error - Test:", mean_squared_error(y_test, tuned_bagging_model.predict(X_test)))
print("R2 Score:", r2_score(y_test, tuned_bagging_model.predict(X_test)))

Best Params: {'n_estimators': 100}
Mean Squared Error - Train: 45219832.89413185
Mean Squared Error - Test: 2328188927.17764
R2 Score: 0.39210327005561685


# Random Forests

In [417]:
rf_model = RandomForestRegressor().fit(X_train, y_train)

In [418]:
print("Mean Squared Error - Train:", mean_squared_error(y_train, rf_model.predict(X_train)))
print("Mean Squared Error - Test:", mean_squared_error(y_test, rf_model.predict(X_test)))
print("Cross Validation Score:", np.mean(cross_val_score(rf_model, X_train, y_train, cv=10)))
print("R2 Score:", r2_score(y_test, rf_model.predict(X_test)))

Mean Squared Error - Train: 46727335.30410666
Mean Squared Error - Test: 2347998422.37596
Cross Validation Score: 0.768035806801212
R2 Score: 0.3869309546939482


# Model Tuning - Random Forests

In [419]:
rf_params = {'max_depth': np.arange(1,10),
             'max_features': [1,3,5,10,15],
             'n_estimators' : [100, 200, 500, 1000, 2000]}

tuned_rf_model = GridSearchCV(rf_model, rf_params, cv = 10, n_jobs=-1).fit(X_train,y_train)

In [420]:
print("Best Params:", tuned_rf_model.best_params_)
print("Mean Squared Error - Train:", mean_squared_error(y_train, tuned_rf_model.predict(X_train)))
print("Mean Squared Error - Test:", mean_squared_error(y_test, tuned_rf_model.predict(X_test)))
print("R2 Score:", r2_score(y_test, tuned_rf_model.predict(X_test)))

Best Params: {'max_depth': 9, 'max_features': 1, 'n_estimators': 500}
Mean Squared Error - Train: 29614735.33222806
Mean Squared Error - Test: 2337812203.0727844
R2 Score: 0.389590605434751


# GBM

In [421]:
gbm_model = GradientBoostingRegressor().fit(X_train, y_train)

In [422]:
print("Mean Squared Error - Train:", mean_squared_error(y_train, gbm_model.predict(X_train)))
print("Mean Squared Error - Test:", mean_squared_error(y_test, gbm_model.predict(X_test)))
print("Cross Validation Score:", np.mean(cross_val_score(gbm_model, X_train, y_train, cv=10)))
print("R2 Score:", r2_score(y_test, gbm_model.predict(X_test)))

Mean Squared Error - Train: 8012710.862799334
Mean Squared Error - Test: 2084561722.14652
Cross Validation Score: 0.781801234590376
R2 Score: 0.4557150240396215


# Model Tuning - GBM

In [423]:
gbm_params = {
    "criterion": ["friedman_mse", "squared_error"],
    'learning_rate': [0.001, 0.01, 0.1, 0.2],
    'max_depth': [1, 3, 5, 8,50,100],
    'n_estimators': [100, 200, 500, 1000, 2000],
    'subsample': [1,0.5,0.75]}

tuned_gbm_model = GridSearchCV(gbm_model, gbm_params, cv = 10, n_jobs=-1).fit(X_train,y_train)

In [424]:
print("Best Params:", tuned_gbm_model.best_params_)
print("Mean Squared Error - Train:", mean_squared_error(y_train, tuned_gbm_model.predict(X_train)))
print("Mean Squared Error - Test:", mean_squared_error(y_test, tuned_gbm_model.predict(X_test)))
print("R2 Score:", r2_score(y_test, tuned_gbm_model.predict(X_test)))

Best Params: {'criterion': 'squared_error', 'learning_rate': 0.2, 'max_depth': 3, 'n_estimators': 100, 'subsample': 1}
Mean Squared Error - Train: 2816201.0592126497
Mean Squared Error - Test: 2069558723.4635723
R2 Score: 0.45963234953338183
