In [3]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np


# Load the advertising dataset. The first CSV column only holds row labels,
# so it is used as the DataFrame index rather than as a feature.
df = pd.read_csv("../data/Advertising.csv", index_col=0)
df.head()

Unnamed: 0,TV,Radio,Newspaper,Sales
1,230.1,37.8,69.2,22.1
2,44.5,39.3,45.1,10.4
3,17.2,45.9,69.3,9.3
4,151.5,41.3,58.5,18.5
5,180.8,10.8,58.4,12.9


In [4]:
# Summarise column dtypes and non-null counts (quick check for missing values).
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 200 entries, 1 to 200
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   TV         200 non-null    float64
 1   Radio      200 non-null    float64
 2   Newspaper  200 non-null    float64
 3   Sales      200 non-null    float64
dtypes: float64(4)
memory usage: 7.8 KB


In [5]:
# 200 samples - each row is one sample, and each sample is a point in feature space
df.shape

(200, 4)

In [6]:
# Separate the label column from the features.
# X is the feature matrix - each column is one feature:
#   TV -> x1, Radio -> x2, Newspaper -> x3
# y is the label vector (Sales).
y = df["Sales"]
X = df.drop(columns="Sales")

X


Unnamed: 0,TV,Radio,Newspaper
1,230.1,37.8,69.2
2,44.5,39.3,45.1
3,17.2,45.9,69.3
4,151.5,41.3,58.5
5,180.8,10.8,58.4
...,...,...,...
196,38.2,3.7,13.8
197,94.2,4.9,8.1
198,177.0,9.3,6.4
199,283.6,42.0,66.2


In [10]:
# vector of labels - the variable that we want to predict (the "answers")

y

1      22.1
2      10.4
3       9.3
4      18.5
5      12.9
       ... 
196     7.6
197     9.7
198    12.8
199    25.5
200    13.4
Name: Sales, Length: 200, dtype: float64

## Multiple linear regression

$y = w_0 + w_1x_1 + w_2x_2 + w_3x_3$

- goal is to estimate $w_i$, $i\in\{0,1,2,3\}$
- we use scikit-learn to do this 

## Scikit-Learn
Steps: 
1. train|test split - some cases train|validation|test - split
2. Scale the dataset 
    - many algorithms require scaling, some don't
    - which type of scaling to use?
    - fit the scaler on the training data only, then use it to transform both the training and the test data, to avoid data leakage
3. Fit the algorithm to the training data
4. Transform the training data, transform the test data
5. Calculate evaluation metrics

### 1: Train/Test split

In [23]:
from sklearn.model_selection import train_test_split

# Hold out 33% of the samples as a test set. The fixed random_state makes the
# shuffle, and therefore the split, reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42)

# The two parts must add up to the full dataset (200 samples), and features
# and labels must have matching lengths within each part.
assert len(X_train) + len(X_test) == len(X)
assert len(X_train) == len(y_train) and len(X_test) == len(y_test)

print(f"{X_train.shape =}")
print(f"{y_train.shape =}")
print(f"{X_test.shape =}")
print(f"{y_test.shape =}")

X_train.shape =(134, 3)
y_train.shape =(134,)
X_test.shape =(66, 3)
y_test.shape =(66,)


In [24]:
# Rows keep their original index labels but appear in shuffled order after the split.
X_train.head(10)


Unnamed: 0,TV,Radio,Newspaper
43,293.6,27.7,1.8
190,18.7,12.1,23.4
91,134.3,4.9,9.3
137,25.6,39.0,9.3
52,100.4,9.6,3.6
128,80.2,0.0,9.2
163,188.4,18.1,25.6
42,177.0,33.4,38.7
119,125.7,36.9,79.2
114,209.6,20.6,10.7


In [25]:
# The labels carry the same index labels as the corresponding rows of X_train.
y_train.head()

43     20.7
190     6.7
91     11.2
137     9.5
52     10.7
Name: Sales, dtype: float64

### 2: Feature scaling

Scaling of data is required for many algorithms 
normalization (min-max)

  
$X' = \frac{X-X_{min}}{X_{max}-X_{min}}$

feature standardization (standard normal distribution)
   -  $X' = \frac{X - \mu}{\sigma}$


In [26]:
from sklearn.preprocessing import MinMaxScaler

# Instantiate a MinMaxScaler and fit it on the training data only:
# fit() finds and stores the per-feature min and max of X_train
# and returns the fitted scaler itself.
scaler = MinMaxScaler().fit(X_train)
scaler

In [27]:
# Both transforms below use the min and max that were learned from X_train.

# transform X_train -> scaled X_train
scaled_X_train = scaler.transform(X_train)

# transform X_test -> scaled X_test (still using the training min and max)
scaled_X_test = scaler.transform(X_test)

# same shape as before; the training features now span exactly [0, 1]
print(f"{scaled_X_train.shape = }")
print(f"{scaled_X_train.min() = }")
print(f"{scaled_X_train.max() = }")

scaled_X_train.shape = (134, 3)
scaled_X_train.min() = 0.0
scaled_X_train.max() = 1.0


In [28]:
print(f"{scaled_X_test.shape = }")
print(f"{scaled_X_test.min() = }")
print(f"{scaled_X_test.max() = }")


# Note that for scaled_X_test the min is not 0 and the max is not 1.
# This is because the scaler uses the min and max of the training data, not of the test data.
# No data has leaked: only X_train was used to fit the scaler.


scaled_X_test.shape = (66, 3)
scaled_X_test.min() = 0.005964214711729622
scaled_X_test.max() = 1.1302186878727631


### 3: Fit algorithm (linear regression) to training data

- train our model -> estimated $w_i$ from training data

In [32]:
from sklearn.linear_model import LinearRegression

# Instantiate a LinearRegression model and train it on the scaled training data.
# Ordinary least squares finds the intercept and coefficients of the (hyper)plane
# that minimise the sum of squared errors between the training points and the plane.
model = LinearRegression().fit(scaled_X_train, y_train)

# one coefficient per feature (w_1, w_2, w_3) and the intercept (w_0)
print(f"{model.coef_ = }")
print(f"{model.intercept_ = }")

model.coef_ = array([13.20747617,  9.75285112,  0.61108329])
model.intercept_ = 2.7911595196243653


In [33]:
# First scaled test sample: one row with the three feature values.
scaled_X_test[0]

array([0.54988164, 0.63709677, 0.52286282])

In [34]:
# Sanity check on a single test sample: compare its true label with the prediction.
test_sample_feature = scaled_X_test[0]
test_sample_label = y_test.values[0]

print(f"{test_sample_feature = }")
print(f"{test_sample_label = }")

# predict() expects a 2D array (n_samples, n_features), so the single
# 1D sample is reshaped into one row before being passed in.
model.predict(test_sample_feature.reshape(1,-1))

test_sample_feature = array([0.54988164, 0.63709677, 0.52286282])
test_sample_label = 16.9


array([16.58673085])

### 4: Predict on test_data

In [14]:
# Predict the label for every sample in the scaled test set.
y_pred = model.predict(scaled_X_test)
y_pred

array([16.58673085, 21.18622524, 21.66752973, 10.81086512, 22.25210881,
       13.31459455, 21.23875284,  7.38400509, 13.43971113, 15.19445383,
        9.01548612,  6.56945204, 14.4156926 ,  8.93560138,  9.56335776,
       12.10760805,  8.86091137, 16.25163621, 10.31036304, 18.83571624,
       19.81058732, 13.67550716, 12.45182294, 21.58072583,  7.67409148,
        5.67090757, 20.95448184, 11.89301758,  9.13043149,  8.49435255,
       12.32217788,  9.99097553, 21.71995241, 12.64869606, 18.25348116,
       20.17390876, 14.20864218, 21.02816483, 10.91608737,  4.42671034,
        9.59359543, 12.53133363, 10.14637196,  8.1294087 , 13.32973122,
        5.27563699,  9.30534511, 14.15272317,  8.75979349, 11.67053724,
       15.66273733, 11.75350353, 13.21744723, 11.06273296,  6.41769181,
        9.84865789,  9.45756213, 24.32601732,  7.68903682, 12.30794356,
       17.57952015, 15.27952025, 11.45659815, 11.12311877, 16.60003773,
        6.90611478])

In [15]:
# True test labels as a NumPy array, for side-by-side comparison with y_pred.
y_test.to_numpy()

array([16.9, 22.4, 21.4,  7.3, 24.7, 12.6, 22.3,  8.4, 11.5, 14.9,  9.5,
        8.7, 11.9,  5.3, 10.3, 11.7,  5.5, 16.6, 11.3, 18.9, 19.7, 12.5,
       10.9, 22.2,  9.3,  8.1, 21.7, 13.4, 10.6,  5.7, 10.6, 11.3, 23.7,
        8.7, 16.1, 20.7, 11.6, 20.8, 11.9,  6.9, 11. , 12.8, 10.1,  9.7,
       11.6,  7.6, 10.5, 14.6, 10.4, 12. , 14.6, 11.7,  7.2,  6.6,  9.4,
       11. , 10.9, 25.4,  7.6, 11.7, 15. , 15.5, 11.9,  9.2, 12.8,  6.6])


<div style="max-width:66ch;">

## 5. Evaluate performance

How well did we predict $\bf{y}$ (label) with $\hat{\bf{y}}$ (y_pred)?

To answer this question we use several **evaluation metrics** or **loss functions**: 

- Mean Absolute Error (MAE) - mean of the absolute errors between $\bf{y}$ and ${\hat{\bf{y}}}$. The unit is the same as that of the measured quantity.

$$MAE = \frac{1}{m}\sum_{i=1}^m |y_i - \hat{y}_i|$$

- Mean Squared Error (MSE) - mean of squared errors between $\bf{y}$ and ${\hat{\bf{y}}}$. It punishes large errors, and the units are in square units of the measured quantity

$$MSE = \frac{1}{m}\sum_{i=1}^m (y_i - \hat{y}_i)^2$$

- Root Mean Squared Error (RMSE) - square root of MSE between $\bf{y}$ and ${\hat{\bf{y}}}$. It punishes large errors, and the units are same as measured quantity, hence easier to interpret.

$$RMSE = \sqrt{\frac{1}{m}\sum_{i=1}^m (y_i - \hat{y}_i)^2}$$




</div>

In [16]:
from sklearn.metrics import mean_absolute_error , mean_squared_error


# MAE - mean absolute difference between the true and the predicted labels
mean_absolute_error(y_test , y_pred)

1.4937750024728977

In [17]:
# MSE - mean of the squared differences between the true and the predicted labels
mean_squared_error(y_test, y_pred)

3.72792833068152

In [18]:
# RMSE - square root of the MSE, which brings the error back to the units of the label
np.sqrt(mean_squared_error(y_test, y_pred))

1.9307843822347228