In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and print the full path of every file found.
for root, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the quickest-electric-cars CSV from the Kaggle input directory into `df`.
df = pd.read_csv('/kaggle/input/quickest-electric-cars-ev-database/Quickestelectriccars-EVDatabase.csv')

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Preview the first rows of the raw data.
df.head()

In [None]:
# Summarise column dtypes and non-null counts.
df.info()

In [None]:
# Count the missing values in each column.
# As the price points of these vehicles have nothing to do with their performance,
# we'll ignore the null price values.
df.isna().sum()

# Price in UK

In [None]:
# If we hope to train a regression model to predict the price point of electric
# vehicles, we'll have to clean up the data a bit.
#
# First, we'll have to handle the missing price data. As the price in UK column is
# missing more data, we'll drop this column and make price predictions based on
# the price in Germany column.
#
# We'll also drop the 'Name' column.

df.drop(columns=['PriceinUK', 'Name'], inplace=True)
# Discard every row that still has a missing value in any remaining column.
df.dropna(inplace=True)

# Price in Germany

In [None]:
# Convert the German price strings to floats with vectorized string operations:
# keep the text after the euro sign, remove the comma separators, then cast.
# NOTE(review): assumes every value looks like '€34,900' — confirm against the raw column.
df['PriceinGermany (euros)'] = (
    df['PriceinGermany']
    .str.split('€').str[1]
    .str.replace(',', '', regex=False)
    .astype(float)
)
# The raw string column is no longer needed once the numeric one exists.
df.drop(columns='PriceinGermany', inplace=True)

# Subtitle 

In [None]:
# Making the subtitle column usable: take the fifth whitespace-separated token
# of each 'Subtitle' string and cast it to float.
# NOTE(review): assumes that token is always the kWh figure — confirm against the raw column.
df['subtitle (kWh)'] = df['Subtitle'].str.split().str[4].astype(float)
df.drop(columns='Subtitle', inplace=True)

# Acceleration ? (see comment below)

In [None]:
# NOTE(review): 'Acceleration' is presumably a standing-start sprint time in seconds
# (0-60 or 0-100) — confirm against the data source.
# Keep the leading numeric token of each string and cast it to float.
df['acceleration (s)'] = df['Acceleration'].str.split().str[0].astype(float)
df.drop(columns='Acceleration', inplace=True)

# top speed

In [None]:
# Keep the leading numeric token of each 'TopSpeed' string and cast it to float.
# NOTE(review): the km/h unit in the new column name is assumed from the raw text — confirm.
df['topspeed (km/h)'] = df['TopSpeed'].str.split().str[0].astype(float)
df.drop(columns='TopSpeed', inplace=True)

# range

In [None]:
# Keep the leading numeric token of each 'Range' string and cast it to float.
# NOTE(review): the km unit in the new column name is assumed from the raw text — confirm.
df['range (km)'] = df['Range'].str.split().str[0].astype(float)
df.drop(columns='Range', inplace=True)

# Efficiency

In [None]:
# Keep the leading numeric token of each 'Efficiency' string and cast it to float.
# NOTE(review): with a Wh/km unit, a larger value means more energy used per km — confirm the unit.
df['efficiency (Wh/km)'] = df['Efficiency'].str.split().str[0].astype(float)
df.drop(columns='Efficiency', inplace=True)

# fast charge speed?

In [None]:
# TODO: look up what "fast charge speed" corresponds to in this dataset.
# Keep the leading token of each 'FastChargeSpeed' string (still text at this point).
df['fastchargespeed (km/h)'] = df['FastChargeSpeed'].str.split().str[0]
df.drop(columns='FastChargeSpeed', inplace=True)
# Rows whose token is '-' (presumably "no value available") cannot be cast to float,
# so remove them before the conversion.
df.drop(index=df.index[df['fastchargespeed (km/h)'] == '-'], inplace=True)
df['fastchargespeed (km/h)'] = df['fastchargespeed (km/h)'].astype(float)

# Drive

In [None]:
# One-hot encode 'Drive' (first category dropped) and swap the original column
# for the resulting indicator columns.
drive_dummies = pd.get_dummies(df['Drive'], drop_first=True)
df = pd.concat([df.drop(columns='Drive'), drive_dummies], axis=1)

# EDA

In [None]:
# Annotated heatmap of the pairwise correlations between all columns.
# Some interesting correlations to explore here.
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(df.corr(), annot=True, cmap='Spectral', ax=ax)

In [None]:
# Price vs. energy (the kWh column), coloured by the 'Front Wheel Drive' column.
sns.scatterplot(data=df, x='PriceinGermany (euros)', y='subtitle (kWh)', hue='Front Wheel Drive')

In [None]:
# Price vs. acceleration time, coloured by the 'Front Wheel Drive' column.
sns.scatterplot(data=df, x='PriceinGermany (euros)', y='acceleration (s)', hue='Front Wheel Drive')

In [None]:
# Price vs. top speed, coloured by the 'Front Wheel Drive' column.
sns.scatterplot(data=df, x='PriceinGermany (euros)', y='topspeed (km/h)', hue='Front Wheel Drive')

In [None]:
# Negative correlation between top speed and the acceleration column.
# The front wheel drive cars in the dataset have the largest acceleration values
# (the column is in seconds, so presumably the slowest to accelerate — confirm)
# and the lowest top speeds.
sns.scatterplot(data=df, x='acceleration (s)', y='topspeed (km/h)', hue='Front Wheel Drive')

In [None]:
# Strong positive correlation between top speed and range of the vehicle.
# Front wheel drive vehicles have the least range (and the lowest top speeds).
sns.scatterplot(data=df, x='range (km)', y='topspeed (km/h)', hue='Front Wheel Drive')

In [None]:
# Strong negative correlation between range and acceleration time.
# NOTE(review): the original note named "top speed", but this cell plots range against
# 'acceleration (s)' — confirm the intended pair against the heatmap.
sns.scatterplot(data=df, x='range (km)', y='acceleration (s)', hue='Front Wheel Drive')

In [None]:
# Number of vehicles per seat count, split by the 'Rear Wheel Drive' column.
sns.countplot(data=df, x='NumberofSeats', hue='Rear Wheel Drive')

In [None]:
# Positive correlation between the number of seats and efficiency. This is an
# interesting correlation that, energetically, one would expect to be inverted.
# Do car manufacturers account for the number of seats in the efficiency rating?
# One would also expect vehicles with larger efficiencies to have a greater range...
# NOTE(review): if the column really is Wh/km, a larger value means more energy used
# per km, which would make this trend the expected one — confirm the unit.
sns.violinplot(data=df, x='NumberofSeats', y='efficiency (Wh/km)')

In [None]:
# Somehow efficiency does not correlate with range.
#
# It makes one wonder: is the efficiency quantification process the same for all vehicles?
sns.scatterplot(data=df, x='efficiency (Wh/km)', y='range (km)', hue='NumberofSeats')

In [None]:
# Positive correlation between fast charge speed and range.
# Front wheel drive vehicles make up the bottom of the pack in both features.
sns.scatterplot(data=df, x='fastchargespeed (km/h)', y='range (km)', hue='Front Wheel Drive')

In [None]:
# Negative correlation between fast charge speed and the acceleration column.
sns.scatterplot(data=df, x='fastchargespeed (km/h)', y='acceleration (s)', hue='Front Wheel Drive')

In [None]:
# Positive correlation between fast charge speed and top speed.
sns.scatterplot(data=df, x='fastchargespeed (km/h)', y='topspeed (km/h)', hue='Front Wheel Drive')

# Price point prediction

In [None]:
# We'll train several regression models on this dataset

# Dataset split

In [None]:
# The German price is the regression target; every other column is a feature.
y = df['PriceinGermany (euros)']
X = df.drop(columns='PriceinGermany (euros)')

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 30% of the rows for testing; the fixed random_state keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=101)

# Linear Regression

In [None]:
from sklearn.linear_model import LinearRegression
# Fit a linear model on the training split and predict prices for the held-out rows.
lr = LinearRegression().fit(X_train, y_train)
lr_pred = lr.predict(X_test)

In [None]:
# Predicted (x) vs. actual (y) prices for the linear model.
sns.scatterplot(y=y_test, x=lr_pred)

In [None]:
from sklearn import metrics

# Compute each score once; MAE and r2 are kept for the final comparison table.
# Reported errors are in euros; we'll use the MAE for comparison between models.
lr_MAE = metrics.mean_absolute_error(y_test, lr_pred)
lr_mse = metrics.mean_squared_error(y_test, lr_pred)
lr_r2 = metrics.r2_score(y_test, lr_pred)

print('MAE:', lr_MAE)
print('MSE:', lr_mse)
print('RMSE:', np.sqrt(lr_mse))
print('r2:', lr_r2)

# Decision Tree Regressor

In [None]:
from sklearn.tree import DecisionTreeRegressor
# Fix the estimator's random_state (same seed as the train/test split) so the fitted
# tree, and therefore the reported metrics, are the same on every Restart & Run All.
dtr = DecisionTreeRegressor(random_state=101)
dtr.fit(X_train, y_train)
# Predictions for the held-out rows, used by the metrics cell that follows.
dtr_pred = dtr.predict(X_test)

In [None]:
# Compute each score once; MAE and r2 are kept for the final comparison table.
dtr_MAE = metrics.mean_absolute_error(y_test, dtr_pred)
dtr_mse = metrics.mean_squared_error(y_test, dtr_pred)
dtr_r2 = metrics.r2_score(y_test, dtr_pred)

print('MAE:', dtr_MAE)
print('MSE:', dtr_mse)
print('RMSE:', np.sqrt(dtr_mse))
print('r2:', dtr_r2)

# Random Forest Regressor

In [None]:
from sklearn.ensemble import RandomForestRegressor
# Fix the forest's random_state (same seed as the train/test split): bootstrap sampling
# is random, so an unseeded forest gives different metrics — and possibly a different
# model ranking — on each run.
rfr = RandomForestRegressor(random_state=101)
rfr.fit(X_train, y_train)
# Predictions for the held-out rows, used by the metrics cell that follows.
rfr_pred = rfr.predict(X_test)

In [None]:
# Compute each score once; MAE and r2 are kept for the final comparison table.
rfr_MAE = metrics.mean_absolute_error(y_test, rfr_pred)
rfr_mse = metrics.mean_squared_error(y_test, rfr_pred)
rfr_r2 = metrics.r2_score(y_test, rfr_pred)

print('MAE:', rfr_MAE)
print('MSE:', rfr_mse)
print('RMSE:', np.sqrt(rfr_mse))
print('r2:', rfr_r2)

# XGBoost Regressor

In [None]:
from xgboost import XGBRegressor
# Fit a gradient-boosted regressor with default settings and predict the held-out rows.
xr = XGBRegressor().fit(X_train, y_train)
xr_pred = xr.predict(X_test)

In [None]:
# Compute each score once; MAE and r2 are kept for the final comparison table.
xr_MAE = metrics.mean_absolute_error(y_test, xr_pred)
xr_mse = metrics.mean_squared_error(y_test, xr_pred)
xr_r2 = metrics.r2_score(y_test, xr_pred)

print('MAE:', xr_MAE)
print('MSE:', xr_mse)
print('RMSE:', np.sqrt(xr_mse))
print('r2:', xr_r2)

# Results

In [None]:
# Side-by-side comparison of the four models: one row per model with its MAE and r2.
results = pd.DataFrame(
    [
        ('Linear Regression', lr_MAE, lr_r2),
        ('Decision Tree Regressor', dtr_MAE, dtr_r2),
        ('Random Forest Regressor', rfr_MAE, rfr_r2),
        ('XGBoost Regressor', xr_MAE, xr_r2),
    ],
    columns=['Model', 'MAE', 'r2'],
)

results

# No fantastic model performance. Random Forest Regressor Wins!