In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
# Load the advertising data set. In the preview below, the leading unnamed
# column is the row index rather than a feature: X, built from this frame by
# dropping only 'sales', has three columns (TV, radio, newspaper).
df = pd.read_csv('./data/advertising.csv')
df.head()

Unnamed: 0,TV,radio,newspaper,sales
0,230.1,37.8,69.2,22.1
1,44.5,39.3,45.1,10.4
2,17.2,45.9,69.3,9.3
3,151.5,41.3,58.5,18.5
4,180.8,10.8,58.4,12.9


In [4]:
# Predictors: every column of df except the target.
X = df.drop(columns='sales')
# Target: the 'sales' column, kept as a Series.
y = df.loc[:, 'sales']

In [5]:
# polynomial regression
from sklearn.preprocessing import PolynomialFeatures

In [6]:
# degree=2 generates the original features plus every squared term and
# pairwise product; include_bias=False leaves out the constant column of ones
# (LinearRegression fits its own intercept by default).
poly_converter = PolynomialFeatures(degree=2, include_bias=False)

In [7]:
# fit() only inspects X to decide which feature combinations to generate;
# nothing is transformed until transform() is called.
poly_converter.fit(X)

PolynomialFeatures(include_bias=False)

In [11]:
# Build the expanded feature matrix from X using the fitted converter; the
# result is a NumPy array (see poly_features[0] below), not a DataFrame.
poly_features = poly_converter.transform(X)

In [12]:
poly_features.shape

(200, 9)

In [10]:
X.shape

(200, 3)

In [16]:
X.iloc[0]

TV           230.1
radio         37.8
newspaper     69.2
Name: 0, dtype: float64

In [15]:
# Layout of the 9 values for the first row: the 3 original features (TV, radio,
# newspaper), followed by the degree-2 terms TV^2, TV*radio, TV*newspaper,
# radio^2, radio*newspaper, newspaper^2.
poly_features[0]

array([2.301000e+02, 3.780000e+01, 6.920000e+01, 5.294601e+04,
       8.697780e+03, 1.592292e+04, 1.428840e+03, 2.615760e+03,
       4.788640e+03])

In [17]:
# Sanity check: these squares should match the TV^2, radio^2 and newspaper^2
# entries of poly_features[0].
X.iloc[0] ** 2

TV           52946.01
radio         1428.84
newspaper     4788.64
Name: 0, dtype: float64

In [18]:
# fit_transform(X) performs the separate fit(X) and transform(X) steps shown in
# the earlier cells in a single call, producing the same feature matrix.
poly_features = poly_converter.fit_transform(X)

### Polynomial Regression: Splitting, Training, and Evaluation

In [19]:
from sklearn.model_selection import train_test_split

In [20]:
# Hold out 30% of the rows for testing. random_state pins the shuffle so the
# split is reproducible and can be repeated for other models.
X_train, X_test, y_train, y_test = train_test_split(
    poly_features, y, test_size=0.3, random_state=101)

In [21]:
from sklearn.linear_model import LinearRegression

In [22]:
# Ordinary least squares on the training split of the degree-2 features
# (9 columns), so the model learns one coefficient per generated term.
model = LinearRegression()
model.fit(X_train, y_train)

LinearRegression()

In [23]:
# Predict on the held-out rows only; these predictions feed the MAE and RMSE
# computed below.
test_predictions = model.predict(X_test)

In [24]:
model.coef_

array([ 5.17095811e-02,  1.30848864e-02,  1.20000085e-02, -1.10892474e-04,
        1.14212673e-03, -5.24100082e-05,  3.34919737e-05,  1.46380310e-04,
       -3.04715806e-05])

In [25]:
model.coef_.shape

(9,)

In [26]:
from sklearn.metrics import mean_absolute_error, mean_squared_error

In [27]:
# Mean absolute error of the degree-2 model on the test split.
mae = mean_absolute_error(y_test, test_predictions)

In [28]:
# Mean squared error on the test split; used below as the input to the RMSE.
mse = mean_squared_error(y_test, test_predictions)

In [29]:
# Taking the square root puts the error back in the same units as y.
rmse = np.sqrt(mse)

In [30]:
mae

0.48967980448037

In [31]:
rmse

0.6646431757269196

In [32]:
# NOTE: to compare this MAE and RMSE with the plain linear model's, that model must be evaluated on the same train/test split (same test_size and random_state).

In [33]:
poly_features[0]

array([2.301000e+02, 3.780000e+01, 6.920000e+01, 5.294601e+04,
       8.697780e+03, 1.592292e+04, 1.428840e+03, 2.615760e+03,
       4.788640e+03])

In [34]:
model.coef_

array([ 5.17095811e-02,  1.30848864e-02,  1.20000085e-02, -1.10892474e-04,
        1.14212673e-03, -5.24100082e-05,  3.34919737e-05,  1.46380310e-04,
       -3.04715806e-05])

### Adjusting Model Complexity

In [35]:
# 1. use different polynomial orders
# 2. split the poly features into training and testing sets
# 3. fit on training data
# 4. store/save the rmse for BOTH training and testing data
# 5. plot the results (error vs poly order)

In [37]:
# Sweep polynomial degrees 1-9 and record the train and test RMSE of a linear
# model fitted at each degree. Index i of each list holds degree i + 1.
train_rmse_errors = []
test_rmse_errors = []

for p in range(1, 10):

    # Expand the raw predictors to every polynomial term up to degree p.
    temp_poly_converter = PolynomialFeatures(degree=p, include_bias=False)
    temp_poly_features = temp_poly_converter.fit_transform(X)

    # Same test_size and random_state on every pass, so every degree is
    # trained and scored on the same rows. The temp_ names keep the degree-2
    # split and model from the earlier cells from being overwritten.
    temp_X_train, temp_X_test, temp_y_train, temp_y_test = train_test_split(
        temp_poly_features, y, test_size=0.3, random_state=101)

    temp_model = LinearRegression()
    temp_model.fit(temp_X_train, temp_y_train)

    train_pred = temp_model.predict(temp_X_train)
    test_pred = temp_model.predict(temp_X_test)

    # RMSE on the rows the model was fitted on vs. the held-out rows.
    train_rmse = np.sqrt(mean_squared_error(temp_y_train, train_pred))
    test_rmse = np.sqrt(mean_squared_error(temp_y_test, test_pred))

    train_rmse_errors.append(train_rmse)
    test_rmse_errors.append(test_rmse)

In [39]:
# Training RMSE per degree (index 0 = degree 1). It falls steadily through
# degree 6; the jump at degree 7 (5.42) breaks the trend.
# NOTE(review): presumably numerical ill-conditioning from unscaled high-order
# terms rather than a real loss of fit -- confirm.
train_rmse_errors

[1.734594124329376,
 0.5879574085292233,
 0.4339344356902067,
 0.35170836883993534,
 0.2509342952029336,
 0.19933332834273104,
 5.4214215994181805,
 0.14237972100695595,
 0.16675080548552418]

In [40]:
# Test RMSE per degree (index 0 = degree 1). It is lowest at degree 4 (~0.51)
# and climbs sharply from degree 5 on, while the training RMSE mostly keeps
# shrinking: the higher-degree fits overfit the training rows.
test_rmse_errors

[1.5161519375993873,
 0.6646431757269196,
 0.5803286825231453,
 0.5077742624232109,
 2.5758247603435955,
 4.490868529265006,
 1381.404235838588,
 4449.5681972303655,
 95893.0265813161]