# Linear regression

Now that we have our linear expression 

$$
Y = \beta_0 + \sum_{i = 1}^{p} \beta_i X_i
$$

with $p$ predictors $X_1, \dots, X_p$, we can attempt to find the value of each coefficient $\beta_i$.

In [1]:
import pandas as pd

# Load the advertising dataset; the path is relative to the notebook's
# working directory.
df_adv = pd.read_csv("./advertising-data.csv")
# Print the whole frame: pandas abbreviates the middle rows ("..") and wraps
# the wide column set across several text blocks.
print(df_adv)

         TV  Billboards  Google_Ads  Social_Media  Influencer_Marketing  \
0    281.42      538.80      123.94        349.30                242.77   
1    702.97      296.53      558.13        180.55                781.06   
2    313.14      295.94      642.96        505.71                438.91   
3    898.52       61.27      548.73        240.93                278.96   
4    766.52      550.72      651.91        666.33                396.33   
..      ...         ...         ...           ...                   ...   
395  873.32      234.67      563.21        401.88                789.44   
396  421.88      449.22      564.77        334.88                689.77   
397  599.22      789.88      831.32        450.88                398.44   
398  784.33      500.67      958.88        635.22                786.44   
399  290.67      321.88      640.56        765.22                409.77   

     Affiliate_Marketing  Product_Sold  
0                 910.10        7164.0  
1                

Instead of trying to find all of these coefficients at once, we will start with a single predictor: `Social_Media`.

In [2]:
import numpy as np

# Pull the single predictor (Social_Media) and the response (Product_Sold)
# out of the DataFrame as NumPy arrays.
cost_social_media = df_adv["Social_Media"].to_numpy()
product_sold = df_adv["Product_Sold"].to_numpy()

# Sanity check: show the array's shape and its first ten values.
print(product_sold.shape)
print(product_sold[:10])

(400,)
[7164. 5055. 6154. 5480. 9669. 7627. 5177. 3726. 9801. 8652.]


## SciPy

In [3]:
def linear_model(betas, x):
    """Evaluate the straight line ``beta_0 + beta_1 * x``.

    Parameters
    ----------
    betas : sequence of two numbers
        The intercept (``beta_0``) followed by the slope (``beta_1``).
    x : number or NumPy array
        Point(s) at which to evaluate the line; arrays are handled
        elementwise.

    Returns
    -------
    The predicted value(s) of ``y`` at ``x``.
    """
    intercept, slope = betas
    return intercept + x * slope

In [4]:
def mse(betas, x_data, y_data):
    """Mean squared error of a straight-line fit on the given data.

    Parameters
    ----------
    betas : sequence of two numbers
        The intercept (``beta_0``) followed by the slope (``beta_1``).
    x_data : NumPy array
        Observed predictor values.
    y_data : NumPy array
        Observed response values, matching ``x_data`` in shape.

    Returns
    -------
    The average of the squared differences between the line's predictions
    and ``y_data``.
    """
    intercept, slope = betas

    # Predictions of the line at every observed x
    predictions = intercept + slope * x_data

    # Residuals, squared and averaged
    residuals = predictions - y_data
    return (residuals ** 2).mean()

In [5]:
from scipy.optimize import minimize

# Starting point for the optimizer, ordered as [beta_0, beta_1].
betas_guess = [1.0, 1.0]

# Numerically search for the betas that minimize the MSE. `args` holds the
# fixed data that `minimize` passes to `mse` after the betas vector.
res = minimize(mse, x0=betas_guess, args=(cost_social_media, product_sold))
betas_scipy = res.x

print(f"beta_0 = {betas_scipy[0]:.3f}")
print(f"beta_1 =    {betas_scipy[1]:.3f}")

beta_0 = 5932.463
beta_1 =    2.418


## sklearn


In [6]:
from sklearn.linear_model import LinearRegression

reg = LinearRegression()

# Deliberate demonstration: scikit-learn requires X to be 2D with shape
# (n_samples, n_features), so fitting on a 1D Series raises a ValueError.
# Catch and print the error so the message is still shown but the notebook
# keeps running under Restart & Run All instead of halting at this cell.
try:
    reg.fit(X=df_adv["Social_Media"], y=df_adv["Product_Sold"])
except ValueError as err:
    print(f"ValueError: {err}")

ValueError: Expected 2D array, got 1D array instead:
array=[349.3  180.55 505.71 240.93 666.33 142.96 271.62  97.85 759.04 329.15
 970.28 992.3  626.87 102.96 358.88 153.75 531.53 609.33  74.83  31.38
 347.92 252.3  145.03 968.13 895.59 695.95 855.9  800.94 580.06 793.12
 526.25 316.48 834.98 213.33 887.1  758.   170.31 304.15 664.95 573.63
  35.43 471.67 298.49 109.31  32.52 624.03 155.25 586.17 276.39 786.06
 470.89 192.2  257.88 829.   206.61 230.22 254.77 393.81 555.73 496.81
 910.23 901.98 407.08 557.15 353.72 767.46  11.69 369.57 873.18 619.22
 917.84 694.84 918.84 438.44 725.44 595.24 837.4  162.85 535.22 630.16
 657.96 593.97 362.43 888.42 539.74 646.87 679.08  50.03 689.61 161.65
 100.35 285.48 373.02 361.36 352.17 595.41 235.03 597.94 421.19  29.11
 744.64 764.15 477.59 531.5  117.63 221.03 314.81 884.94 762.53 379.17
 681.03  48.06 851.19 588.1   63.47 199.95 934.6  547.92 349.42 411.67
 443.14 653.83 360.1  189.84 810.67 320.03 408.09 663.6  485.14 401.46
 574.66 510.64 662.46 809.05 487.63 588.21 144.69 679.19 317.25 425.96
 734.   562.54 569.2  949.21  57.73  55.67  48.26 961.58 429.08 779.52
 906.55 644.91 914.94 552.05 794.89 753.24 440.71 160.58 484.32 436.82
  95.81  44.73  79.95 907.89 406.79 731.43 888.38 277.28 907.29 152.38
 576.07 300.17 364.03 559.48 203.09 512.33 172.98 313.47 857.64 949.07
 230.73 116.79 531.67 296.04 381.83 222.02 642.39 329.66 601.86 905.36
 735.75 620.32 655.5  385.05 944.26 686.41 622.36  40.6  461.4  228.75
 455.   283.05  18.49 464.89 651.29 510.37 579.99  37.54 228.98  99.33
 828.4  656.   479.65  77.17 208.8  650.2  654.59 332.31  48.41 617.71
 578.32 236.32 589.08 299.63 823.89 886.41  91.9  728.87 711.14 477.86
 493.29 581.93 354.38 953.58 583.57 384.3  398.59 284.39 325.73 496.54
 289.98 769.67 962.41 946.4  410.78 454.6  184.65 166.23 226.57  39.88
 660.73 123.44 472.84 475.05 151.49 996.16 268.19  48.   709.57  28.84
 877.91 589.36 295.79 416.52 753.44 584.88 557.25 466.47 244.74 677.43
 578.7  861.16 811.9  852.62 334.76  42.51 723.51 121.48 202.29 712.51
 891.25 217.72 229.34 948.97 986.35 872.34 362.08 868.58 852.48 932.19
 587.2  360.38 622.86 979.08 555.71 172.26 366.95  60.61 343.65 802.3
 431.15 837.32 729.18 114.52 753.47 281.77 325.04 920.04 603.16 793.14
 460.89 401.89 244.39 761.02 588.15 629.94 756.21 322.58 897.83 392.8
 724.99 720.85 892.34 331.55 512.61 904.64 226.84 245.09 953.77 764.22
 552.67 716.33 371.62 564.83 803.65 789.12 423.27 543.34 212.45 347.51
 587.61 723.46 847.77 474.62 408.78 238.32 685.13 982.15 642.81 854.62
 493.23 420.32 321.87 459.12 189.04 632.73 490.32 280.15 125.49 698.12
 287.33 745.36 680.28 301.79 120.67 421.01 189.78 708.49 582.33 121.33
  52.14 409.28 180.45 189.74 890.67 823.09 643.98 759.88 689.22 459.76
 632.21 214.77 798.12 510.32 875.43 289.12 430.67 760.77 889.12 121.44
 360.32 697.44 543.22 765.32 208.88 401.88 334.88 450.88 635.22 765.22].
Reshape your data either using array.reshape(-1, 1) if your data has a single feature or array.reshape(1, -1) if it contains a single sample.

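As the error message says, scikit-learn expects `X` to be a 2D array with one row per sample and one column per feature. Our data has a single feature, so we reshape each 1D array into a single-column 2D array with `reshape(-1, 1)`.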

In [7]:
# Turn each 1D array into a single-column 2D array; the -1 tells NumPy to
# infer the number of rows from the array's length.
# NOTE: this rebinds the same names, so from here on both arrays are 2D.
cost_social_media = cost_social_media.reshape(-1, 1)
product_sold = product_sold.reshape(-1, 1)

# Show the new shape and the first ten values, each now in its own row.
print("\nNew shape")
print(product_sold.shape)
print(product_sold[:10])


New shape
(400, 1)
[[7164.]
 [5055.]
 [6154.]
 [5480.]
 [9669.]
 [7627.]
 [5177.]
 [3726.]
 [9801.]
 [8652.]]


In [8]:
# Fit the regression on the reshaped (single-column) arrays.
reg.fit(X=cost_social_media, y=product_sold)

# y was passed as a 2D column, so intercept_ is indexed as a 1-element array
# and coef_ as a 2D array; index into both to get plain scalars for printing.
print(f"beta_0 = {reg.intercept_[0]:.3f}")
print(f"beta_1 =    {reg.coef_[0][0]:.3f}")

beta_0 = 5932.473
beta_1 =    2.418


## Acknowledgements

Much of this material has been adapted from [An Introduction to Statistical Learning](https://www.statlearning.com/) to be more digestible for undergraduates starting in Biology.