# Vehicle Range Prediction

In [1]:
# minzhou@bu.edu

In [2]:
import os
import json
from ast import literal_eval
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

## 1. Load data

In [3]:
data_folder = 'BU-SPARK-DATA'

def save_all_json_file_paths():
    json_files = {}
    for folder in os.listdir(data_folder):
        if 'app-results' in folder:
            app_folder = os.path.join(data_folder, folder)
            json_files[folder] = [os.path.join(app_folder, pos_json) 
                                  for pos_json in os.listdir(app_folder) if pos_json.endswith('.json')]
    return json_files 

In [4]:
json_files = save_all_json_file_paths()

In [5]:
json_files['app-results-1'][0]

'BU-SPARK-DATA/app-results-1/729.json'

## 2. Select the input variables

HEU and HPU specifications:
* system_cost
* system_weight
* num_HEU
* num_HPU
- HEU_nominal_energy
- HPU_nominal_energy
- HEU_cost (cost per cell)
- HPU_cost (cost per cell)
- HEU_weight (cell_mass)
- HPU_weight (cell_mass)
- HEU_max_power
- HPU_max_power
* dP_threshold

vehicle input variables:

- chassis_Mass_Min_Battery_Mass
- drag_Resistance
- frontal_area
- rolling_resistance
- power_Train_Eff


## Output
* system_range

In [182]:
feature_list = []
for result in json_files:
    for file_name in json_files[result]:
        try:
            with open(file_name) as data_file:
                json_file = json.load(data_file)
                python_dict = literal_eval(json_file)
                df = pd.DataFrame.from_dict(python_dict)
                python_dict_input = json.loads(df.input[0])
                
                # HPU and HEU specifications
                df_hpu_heu = pd.DataFrame(python_dict_input, columns=['HPU', 'HEU'])
                df_hpu_heu_series = df_hpu_heu.T[['cost_per_cell', 'nominal_energy', 'cell_mass']].stack()
                
                # vehicle input variables
                df_vehicle = pd.DataFrame(python_dict_input, columns=['vehicle'])
                df_vehicle_series = df_vehicle.T[['chassisMassMinBatteryMass', 'dragResistance', 
                                                  'frontalArea', 'rollingResistance', 'powerTrainEff']].stack()
                # system specifications
                df_spec_series = df.specifications[['system_cost', 'system_weight', 
                                                    'HEU_max_power', 'HPU_max_power', 'num_HEU', 
                                                    'num_HPU', 'dP_threshold', 'system_range']]
                # concate all features
                feature_series = pd.concat([df_spec_series, df_hpu_heu_series, df_vehicle_series])
                feature_list.append(feature_series)
        except:
            continue

In [183]:
feature_df = pd.concat(feature_list, axis=1, sort=False)

In [189]:
feature_df.T

Unnamed: 0,system_cost,system_weight,HEU_max_power,HPU_max_power,num_HEU,num_HPU,dP_threshold,system_range,"(HPU, cost_per_cell)","(HPU, nominal_energy)","(HPU, cell_mass)","(HEU, cost_per_cell)","(HEU, nominal_energy)","(HEU, cell_mass)","(vehicle, chassisMassMinBatteryMass)","(vehicle, dragResistance)","(vehicle, frontalArea)","(vehicle, rollingResistance)","(vehicle, powerTrainEff)"
0,7783.88,259.74,34.4589,0,3330,0,16.6781,300.418,45.7,0,510,1.87,34.1358,60,1166,0.29,2.24,0.015,84
1,10788.5,327.678,69.9118,9.13874,3980,26,9.62739,160.263,45.7,0.0671884,510,1.87,40.799,60,2624,0.3,5.022,0.015,84
2,7783.88,259.74,34.4589,0,3330,0,16.6781,300.418,45.7,0,510,1.87,34.1358,60,1166,0.29,2.24,0.015,84
3,13933.4,383.396,64.4257,9.84278,4690,52,8.35838,160.101,45.7,0.0539417,260,1.87,48.0772,60,2624,0.3,5.022,0.015,84
4,7783.88,259.74,34.4589,0,3330,0,16.6781,300.418,45.7,0,510,1.87,34.1358,60,1166,0.29,2.24,0.015,84
5,16325.5,473.824,53.4813,9.96093,5860,46,7.63938,400.447,45.7,0.0594359,280,1.87,60.0709,60,1756,0.24,2.34,0.015,84
6,11130,376.194,42.9906,0,7420,0,20.8534,170.198,45.7,0,510,1.2,22.8944,39,1542,0.28,2.15,0.015,84
7,7783.88,259.74,34.4589,0,3330,0,16.6781,300.418,45.7,0,510,1.87,34.1358,60,1166,0.29,2.24,0.015,84
8,9724,324.48,36.5705,0,4160,0,23.444,290.688,45.7,0,510,1.87,42.6442,60,1542,0.28,2.15,0.015,84
9,6628,211.224,31.1488,2.81192,2640,8,4.7955,200.668,45.7,0.0206734,510,1.87,27.0626,60,1166,0.29,2.24,0.015,84


In [231]:
pd.options.display.float_format = '{:,.4f}'.format
df = feature_df.T
corr = df.corr()
print(corr)
# corr[np.abs(corr) < 0.65] = 0
# plt.figure(figsize=(16,10))
# sns.heatmap(corr, annot=True, cmap='YlGnBu')
# plt.show()

Empty DataFrame
Columns: []
Index: []


### Convert to array and prepare training and test data

In [254]:
X = feature_df.T.drop(['system_range'], axis=1).values.astype(float)
y = feature_df.T['system_range'].values.astype(float)
print(X.shape, y.shape)

(432, 18) (432,)


## 3. Split the dataset to training and testing set

In [255]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

## 4.1 SVM regressor

In [256]:
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

In [257]:
from sklearn.svm import SVR

clf = SVR()
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

# Regression metrics
print('\nThe MAE is: ')
print(mean_absolute_error(y_test, y_pred))
print('\nThe MSE is: ')
print(mean_squared_error(y_test, y_pred))
print('\nThe r2_score is: ')
print(r2_score(y_test, y_pred))


The MAE is: 
29.797337714574045

The MSE is: 
3031.1917802911485

The r2_score is: 
0.38790682141483834


## 4.2 Random Forest regressor

In [258]:
from sklearn.ensemble import RandomForestRegressor

rf = RandomForestRegressor(n_estimators = 500, random_state = 42)
# Train the model on training data
rf.fit(X_train, y_train)
# Use the forest's predict method on the test data
y_pred = rf.predict(X_test)

# Regression metrics
print('\nThe MAE is: ')
print(mean_absolute_error(y_test, y_pred))
print('\nThe MSE is: ')
print(mean_squared_error(y_test, y_pred))
print('\nThe r2_score is: ')
print(r2_score(y_test, y_pred))


The MAE is: 
3.562427606812576

The MSE is: 
305.23517745621524

The r2_score is: 
0.9383633951503936


## 4.3 K Neighbors Regressor

In [218]:
from sklearn.neighbors import KNeighborsRegressor

knn = KNeighborsRegressor(n_neighbors=3)
knn.fit(X_train, y_train)
y_pred = knn.predict(X_test)

# Regression metrics
print('\nThe MAE is: ')
print(mean_absolute_error(y_test, y_pred))
print('\nThe MSE is: ')
print(mean_squared_error(y_test, y_pred))
print('\nThe r2_score is: ')
print(r2_score(y_test, y_pred))


The MAE is: 
4.7177975324490555

The MSE is: 
336.37341064324727

The r2_score is: 
0.9320756042389436


## 4.4 Decision Tree Regressor

In [223]:
from sklearn.tree import DecisionTreeRegressor

regr = DecisionTreeRegressor(max_depth=5)
regr.fit(X_train, y_train)
y_pred = regr.predict(X_test)

# Regression metrics
print('\nThe MAE is: ')
print(mean_absolute_error(y_test, y_pred))
print('\nThe MSE is: ')
print(mean_squared_error(y_test, y_pred))
print('\nThe r2_score is: ')
print(r2_score(y_test, y_pred))


The MAE is: 
2.918647976393955

The MSE is: 
296.7287726562121

The r2_score is: 
0.9400811064434302


## 4.5 Bayesian Ridge

In [224]:
from sklearn.linear_model import BayesianRidge

clf = BayesianRidge(compute_score=True)
clf.fit(X, y)
y_pred = clf.predict(X_test)

# Regression metrics
print('\nThe MAE is: ')
print(mean_absolute_error(y_test, y_pred))
print('\nThe MSE is: ')
print(mean_squared_error(y_test, y_pred))
print('\nThe r2_score is: ')
print(r2_score(y_test, y_pred))


The MAE is: 
11.634484234977704

The MSE is: 
315.0096338242673

The r2_score is: 
0.936389624270516


## 4.6 OLS from sklearn

In [226]:
from sklearn.linear_model import LinearRegression

ols = LinearRegression()
ols.fit(X, y)
y_pred = ols.predict(X_test)

# Regression metrics
print('\nThe MAE is: ')
print(mean_absolute_error(y_test, y_pred))
print('\nThe MSE is: ')
print(mean_squared_error(y_test, y_pred))
print('\nThe r2_score is: ')
print(r2_score(y_test, y_pred))


The MAE is: 
10.705200871038556

The MSE is: 
247.6264987031455

The r2_score is: 
0.9499964035008817


## 4.7 OLS from statsmodels

In [259]:
import statsmodels.api as sm
import statsmodels.formula.api as smf

model = sm.OLS(y, X)
lin_reg = model.fit()

lin_reg.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.932
Model:,OLS,Adj. R-squared:,0.929
Method:,Least Squares,F-statistic:,334.4
Date:,"Wed, 05 Dec 2018",Prob (F-statistic):,1.67e-229
Time:,22:19:40,Log-Likelihood:,-1865.8
No. Observations:,432,AIC:,3768.0
Df Residuals:,414,BIC:,3841.0
Df Model:,17,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
x1,-0.0041,0.001,-3.747,0.000,-0.006,-0.002
x2,0.2381,0.133,1.795,0.073,-0.023,0.499
x3,-0.3674,0.350,-1.049,0.295,-1.056,0.321
x4,-0.1519,0.482,-0.315,0.753,-1.100,0.796
x5,-0.0036,0.006,-0.596,0.552,-0.015,0.008
x6,0.0400,0.023,1.715,0.087,-0.006,0.086
x7,1.2854,0.307,4.182,0.000,0.681,1.890
x8,-0.0156,0.199,-0.078,0.938,-0.408,0.376
x9,-0.2138,2.349,-0.091,0.928,-4.832,4.404

0,1,2,3
Omnibus:,117.275,Durbin-Watson:,2.124
Prob(Omnibus):,0.0,Jarque-Bera (JB):,3993.269
Skew:,-0.366,Prob(JB):,0.0
Kurtosis:,17.877,Cond. No.,149000000.0
