### Dependencies

In [25]:
import pandas as pd
from pathlib import Path
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
import statsmodels.api as sm
import statsmodels.formula.api as smf
import numpy as np

### Data Preprocessing

In [26]:
# Once Data is cleaned, the code for preprocessing, loading the data will go here! '
# Importing cleaned data into dataframe
oil_prices_df = pd.read_csv("Resources_Clean/brent_oil_prices.csv")
electric_car_data_df = pd.read_csv("Resources_Clean/ecar_data_clean_me.csv")
electric_car_sales_df = pd.read_csv("Resources_Clean/ecar_sales_by_model_in_usa.csv")
charging_stations_df = pd.read_csv("Resources_Clean/ev_charging_stations.csv")

In [27]:
# Check and Drop Null Values
oil_prices_df.isnull().sum()

Unnamed: 0    0
date          0
price         0
dtype: int64

In [28]:
electric_car_data_df.isnull().sum()

Unnamed: 0          0
brand               0
model               0
accel_sec           0
top_speed_kmh       0
range_km            0
battery_pack_kwh    0
efficiency_whkm     0
fast_charge_kmh     0
rapid_charge        0
power_train         0
plug_type           0
body_style          0
price_euro          0
dtype: int64

In [29]:
electric_car_sales_df.isnull().sum()

Unnamed: 0     0
make           0
model          0
jan_12        45
feb_12        52
              ..
aug_19         1
sep_19         1
oct_19         1
nov_19         1
dec_19         0
Length: 99, dtype: int64

In [30]:
# Filling Null Values with 0 and checking
electric_car_sales_df.fillna(value = 0, inplace = True)
electric_car_sales_df.isnull().sum()

Unnamed: 0    0
make          0
model         0
jan_12        0
feb_12        0
             ..
aug_19        0
sep_19        0
oct_19        0
nov_19        0
dec_19        0
Length: 99, dtype: int64

In [31]:
charging_stations_df.isnull().sum()

Unnamed: 0          0
station_name        0
street_address      0
city                0
access_days_time    0
dtype: int64

### Splitting The Data into Training and Testing

In [32]:
# Filtering Oil price data
new_oil_prices_df = oil_prices_df.loc[(oil_prices_df['date'] > "2011-12")]
new_oil_prices_df
new_oil_prices_df['row_num'] = np.arange(len(new_oil_prices_df))
new_oil_prices_df = new_oil_prices_df.drop(labels=range(392, 405))
new_oil_prices_df = new_oil_prices_df.drop(labels=range(296,335))
new_oil_prices_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


Unnamed: 0.1,Unnamed: 0,date,price,row_num
335,335,2015-04,59.524286,39
336,336,2015-05,64.075,40
337,337,2015-06,61.477727,41
338,338,2015-07,56.561304,42
339,339,2015-08,46.515,43
340,340,2015-09,47.623182,44
341,341,2015-10,48.43,45
342,342,2015-11,44.267619,46
343,343,2015-12,38.005455,47
344,344,2016-01,30.6995,48


In [33]:
new_oil_prices_df.shape

(57, 4)

In [34]:
new_electric_car_sales_df = electric_car_sales_df.drop(columns=['make', 'model'], axis =1) 
new_electric_car_sales_df

Unnamed: 0.1,Unnamed: 0,jan_12,feb_12,mar_12,apr_12,may_12,jun_12,jul_12,aug_12,sep_12,...,mar_19,apr_19,may_19,jun_19,jul_19,aug_19,sep_19,oct_19,nov_19,dec_19
0,0,603.0,1626.0,3915.0,5377.0,7057.0,8817.0,10666.0,13497.0,16348.0,...,146667.0,147072.0,14748.0,147813.0,148063.0,148337.0,148687.0,148757.0,148907.0,149057
1,1,0.0,21.0,912.0,2566.0,3652.0,4347.0,5035.0,6082.0,7734.0,...,42345.0,42345.0,42345.0,42345.0,42345.0,42345.0,42345.0,42345.0,42345.0,42345
2,2,676.0,1154.0,1733.0,2103.0,2613.0,3148.0,3543.0,4228.0,5212.0,...,122534.0,123485.0,124701.0,125857.0,126795.0,127912.0,12896.0,129847.0,130987.0,132214
3,3,0.0,0.0,0.0,0.0,0.0,12.0,31.0,74.0,160.0,...,147517.0,148342.0,149367.0,151117.0,152092.0,153142.0,154242.0,154992.0,156492.0,157992
4,4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,42231.0,42231.0,42231.0,42231.0,42231.0,42231.0,42231.0,42231.0,42231.0,42231
5,5,2.0,2.0,2.0,2.0,8.0,97.0,135.0,169.0,228.0,...,9242.0,9242.0,9242.0,9242.0,9242.0,9242.0,9242.0,9242.0,9242.0,9242
6,6,36.0,80.0,136.0,215.0,300.0,333.0,366.0,403.0,439.0,...,2028.0,2028.0,2028.0,2028.0,2028.0,2028.0,2028.0,2028.0,2028.0,2028
7,7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,61.0,...,2472.0,2472.0,2472.0,2472.0,2472.0,2472.0,2472.0,2472.0,2472.0,2472
8,8,0.0,0.0,0.0,0.0,0.0,0.0,7.0,16.0,32.0,...,1069.0,1069.0,1069.0,1069.0,1069.0,1069.0,1069.0,1069.0,1069.0,1069
9,9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,62774.0,63359.0,63964.0,64639.0,65359.0,65979.0,66679.0,67279.0,67879.0,68557


In [35]:
X = new_oil_prices_df[['row_num', 'price']]
y = new_electric_car_sales_df

In [36]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1,)

### Testing Linear Regression Model

In [37]:
X_train = new_oil_prices_df[['row_num', 'price']]
y_test = new_electric_car_sales_df['jan_19']
model = LinearRegression()
model.fit(X_train, y_test)

y_pred = pd.Series(model.predict(X_train), index = X_train.index)

In [38]:
y_pred

335    71129.804579
336    77410.083301
337    71163.223508
338    60851.958491
339    41550.367240
340    41797.467532
341    41516.415772
342    32526.640037
343    19856.908067
344     5357.891513
345     6260.123402
346    15131.078416
347    19347.152433
348    26693.560111
349    27635.910768
350    20165.610624
351    20032.334036
352    19607.338415
353    23090.455887
354    13003.748965
355    26335.716796
356    26863.197055
357    25681.365903
358    18237.351066
359    17801.981293
360    12634.510873
361     4002.325361
362     6005.835648
363     9964.106732
364    16065.267081
365    16744.698692
366    24174.029619
367    25387.501005
368    31935.698047
369    23651.509633
370    23181.794206
371    32158.565507
372    38997.070185
373    32797.172414
374    30838.081853
375    26118.460881
376    35574.386983
377    37631.848015
378     7398.454807
379    -7240.544769
380    -5347.789969
381      932.909819
382     3055.035250
383    10288.779595
384     8741.649911


In [39]:
from sklearn.metrics import r2_score
r2_score(y_test, y_pred)

0.32456498245980203

### Feature Engineering

#### In order to see if we can achieve a higher r-squared value we are going to filter out some of the data in the electric car sales dataframe. We also changed the output testing data from January 2019 to December 2018.

In [59]:
feature_electric_car_sales_df = electric_car_sales_df.drop(columns=['make', 'model'], axis =1) 
feature_electric_car_sales_df.head()

Unnamed: 0.1,Unnamed: 0,jan_12,feb_12,mar_12,apr_12,may_12,jun_12,jul_12,aug_12,sep_12,...,mar_19,apr_19,may_19,jun_19,jul_19,aug_19,sep_19,oct_19,nov_19,dec_19
0,0,603.0,1626.0,3915.0,5377.0,7057.0,8817.0,10666.0,13497.0,16348.0,...,146667.0,147072.0,14748.0,147813.0,148063.0,148337.0,148687.0,148757.0,148907.0,149057
1,1,0.0,21.0,912.0,2566.0,3652.0,4347.0,5035.0,6082.0,7734.0,...,42345.0,42345.0,42345.0,42345.0,42345.0,42345.0,42345.0,42345.0,42345.0,42345
2,2,676.0,1154.0,1733.0,2103.0,2613.0,3148.0,3543.0,4228.0,5212.0,...,122534.0,123485.0,124701.0,125857.0,126795.0,127912.0,12896.0,129847.0,130987.0,132214
3,3,0.0,0.0,0.0,0.0,0.0,12.0,31.0,74.0,160.0,...,147517.0,148342.0,149367.0,151117.0,152092.0,153142.0,154242.0,154992.0,156492.0,157992
4,4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,42231.0,42231.0,42231.0,42231.0,42231.0,42231.0,42231.0,42231.0,42231.0,42231


In [64]:
feature_electric_car_sales_df = feature_electric_car_sales_df.drop(feature_electric_car_sales_df.iloc[:,1:2], axis = 1)
feature_electric_car_sales_df.head()

Unnamed: 0.1,Unnamed: 0,jul_15,aug_15,sep_15,oct_15,nov_15,dec_15,jan_16,feb_16,mar_16,...,mar_19,apr_19,may_19,jun_19,jul_19,aug_19,sep_19,oct_19,nov_19,dec_19
0,0,72295.0,73675.0,74624.0,76659.0,78639.0,80753.0,81749.0,82875.0,84740.0,...,146667.0,147072.0,14748.0,147813.0,148063.0,148337.0,148687.0,148757.0,148907.0,149057
1,1,41576.0,41920.0,42136.0,42227.0,42271.0,42293.0,42303.0,42309.0,42316.0,...,42345.0,42345.0,42345.0,42345.0,42345.0,42345.0,42345.0,42345.0,42345.0,42345
2,2,73619.0,75012.0,76259.0,77497.0,78551.0,79898.0,80653.0,81583.0,82829.0,...,122534.0,123485.0,124701.0,125857.0,126795.0,127912.0,12896.0,129847.0,130987.0,132214
3,3,50189.0,51489.0,53989.0,55889.0,58591.0,62191.0,63041.0,64591.0,68581.0,...,147517.0,148342.0,149367.0,151117.0,152092.0,153142.0,154242.0,154992.0,156492.0,157992
4,4,22197.0,22920.0,23639.0,24334.0,24973.0,25552.0,25902.0,26392.0,27002.0,...,42231.0,42231.0,42231.0,42231.0,42231.0,42231.0,42231.0,42231.0,42231.0,42231


In [69]:
X = new_oil_prices_df[['row_num', 'price']]
y = feature_electric_car_sales_df

In [70]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1,)

In [87]:
X_train = new_oil_prices_df[['row_num', 'price']]
y_test = feature_electric_car_sales_df['dec_18']
model = LinearRegression()
model.fit(X_train, y_test)

y_pred = pd.Series(model.predict(X_train), index = X_train.index)

In [88]:
from sklearn.metrics import r2_score
r2_score(y_test, y_pred)

0.3291715741052291

### Random Forest Regressor Model using electric car feature data

In [152]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.datasets import make_regression

In [153]:
electric_car_data_df

Unnamed: 0.1,Unnamed: 0,accel_sec,top_speed_kmh,range_km,battery_pack_kwh,efficiency_whkm,fast_charge_kmh,rapid_charge,power_train,plug_type,body_style,price_euro
45,45,7.9,167,365,64.0,175,340,Yes,FWD,Type 2 CCS,SUV,36837
46,46,7.3,150,335,58.0,173,210,Yes,FWD,Type 2 CCS,MPV,41906
47,47,4.0,250,365,71.0,195,730,Yes,AWD,Type 2 CCS,Sedan,102945
48,48,10.0,150,575,60.0,104,540,Yes,AWD,Type 2 CCS,Liftback,149000
49,49,9.0,150,335,63.0,188,350,Yes,FWD,Type 2 CCS,SUV,36057
50,50,5.7,200,365,86.5,237,590,Yes,AWD,Type 2 CCS,SUV,79445
51,51,2.1,410,970,200.0,206,920,Yes,AWD,Type 2 CCS,Cabrio,215000
52,52,8.5,150,255,45.0,176,390,Yes,FWD,Type 2 CCS,SUV,35000
53,53,8.8,160,420,77.0,183,560,Yes,RWD,Type 2 CCS,SUV,40000
54,54,4.6,250,450,90.0,211,490,Yes,AWD,Type 2,SUV,85990


In [154]:
electric_car_data_df.shape

(57, 12)

In [196]:
X = electric_car_data_df[['range_km', 'battery_pack_kwh','fast_charge_kmh', 'price_euro']]
y = feature_electric_car_sales_df

In [197]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y,)

In [198]:
X_train = electric_car_data_df[['range_km', 'battery_pack_kwh','fast_charge_kmh', 'price_euro']]
y_test = feature_electric_car_sales_df['jan_19']
model = RandomForestRegressor(max_depth=2)
model.fit(X_train,y_test)

RandomForestRegressor(max_depth=2)

In [199]:
y_pred = pd.Series(model.predict(X_train), index = X_train.index)

In [200]:
r2_score(y_test, y_pred)

0.4347680238295377