In [10]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold
import plotly.graph_objects as go

In [2]:
# Load the auto-mpg table: rows are keyed by the "car name" column and
# literal "?" cells are read as missing values (NaN).
data = pd.read_csv(
    "./data/auto-mpg.csv",
    index_col="car name",
    na_values="?",
)

In [3]:
# Show the table dimensions, discard every row that has at least one
# missing value, then show the dimensions again to see how many rows went.
print(data.shape)
data = data.dropna(axis=0, how="any")
print(data.shape)

(398, 8)
(392, 8)


## Raw data without transformation

In [4]:
# Predictor columns fed to the models, in the order they appear in X.
factors = [
    'cylinders',
    'displacement',
    'horsepower',
    'acceleration',
    'weight',
    'origin',
    'model year',
]

# Independent copies, so later edits to X / y never write back into `data`.
X = data.loc[:, factors].copy()   # feature matrix
y = data.loc[:, 'mpg'].copy()     # regression target

In [57]:
# Standardize every feature column (zero mean, unit variance).
# NOTE(review): the scaler is fitted on ALL rows of X, and the split/model
# cells visible below use the raw X rather than X_scaled. If X_scaled is
# later used for modelling, fit the scaler on the training rows only.
X_scaled = StandardScaler().fit(X).transform(X)

In [58]:
# Inspect the standardized feature matrix; NumPy abbreviates the long array
# with "..." in the printed output.
print(X_scaled)

[[ 1.48394702  1.07728956  0.66413273 ...  0.62054034 -0.71664105
  -1.62531533]
 [ 1.48394702  1.48873169  1.57459447 ...  0.84333403 -0.71664105
  -1.62531533]
 [ 1.48394702  1.1825422   1.18439658 ...  0.54038176 -0.71664105
  -1.62531533]
 ...
 [-0.86401356 -0.56847897 -0.53247413 ... -0.80463202 -0.71664105
   1.63640964]
 [-0.86401356 -0.7120053  -0.66254009 ... -0.41562716 -0.71664105
   1.63640964]
 [-0.86401356 -0.72157372 -0.58450051 ... -0.30364091 -0.71664105
   1.63640964]]


In [5]:
# Hold out 25% of the rows for evaluation; the fixed random_state makes the
# shuffle (and therefore the split) repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=23,
)

In [6]:
# Sanity-check the split sizes: training features first, then test features.
for feature_split in (X_train, X_test):
    print(feature_split.shape)

(294, 7)
(98, 7)


## Linear model

In [7]:
# Linear regression with scikit-learn's default settings.
regressor = LinearRegression()

# Last expression of the cell: displays the estimator's hyper-parameters.
regressor.get_params(deep=True)

{'copy_X': True, 'fit_intercept': True, 'n_jobs': None, 'normalize': False}

In [8]:
# Fit on the training split only; fit() returns the estimator itself, so the
# cell displays its repr.
regressor.fit(X=X_train, y=y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [9]:
# Predicted mpg for the held-out rows (same row order as X_test / y_test).
y_pred = regressor.predict(X=X_test)

In [31]:
# Training-set fit: actual vs. predicted mpg, plotted against the row's
# position in the training split (the x axis is just 0..n-1, not a feature).
train_positions = np.arange(len(y_train))
train_pred = regressor.predict(X_train)

fig = go.Figure()
# Name each trace so the legend is readable instead of "trace 0" / "trace 1".
fig.add_trace(go.Scatter(x=train_positions, y=y_train,
                         mode='markers+lines', name='Actual mpg'))
fig.add_trace(go.Scatter(x=train_positions, y=train_pred,
                         mode='markers+lines', name='Predicted mpg'))
# Title and axis labels let the figure stand alone when the notebook is skimmed.
fig.update_layout(
    title='Linear model: actual vs. predicted mpg (training set)',
    xaxis_title='Training sample (row position)',
    yaxis_title='mpg',
)
# Show explicitly, matching the test-set plot, rather than relying on the
# cell's last expression happening to return the figure.
fig.show()

In [30]:
# Held-out fit: actual vs. predicted mpg, plotted against the row's position
# in the test split (the x axis is just 0..n-1, not a feature).
test_positions = np.arange(len(y_test))

fig = go.Figure()
# Name each trace so the legend is readable instead of "trace 0" / "trace 1".
fig.add_trace(go.Scatter(x=test_positions, y=y_test.tolist(),
                         mode='markers+lines', name='Actual mpg'))
fig.add_trace(go.Scatter(x=test_positions, y=y_pred.tolist(),
                         mode='markers+lines', name='Predicted mpg'))
# Title and axis labels let the figure stand alone when the notebook is skimmed.
fig.update_layout(
    title='Linear model: actual vs. predicted mpg (test set)',
    xaxis_title='Test sample (row position)',
    yaxis_title='mpg',
)
fig.show()

In [45]:
# Test-set RMSE via scikit-learn: mean squared error first, then its root so
# the figure is in the same units as mpg.
test_mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(test_mse)
print(rmse)

3.4936768348677347


In [53]:
# Cross-check: the same test-set RMSE computed by hand with NumPy.
# y_test is converted to a plain array so the subtraction is positional.
residuals = y_pred - np.asarray(y_test)
rmse = np.sqrt(np.mean(np.square(residuals)))
print(rmse)

3.4936768348677347
