In [1]:
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn as sk

# Log the versions of the core libraries, followed by a run timestamp, so the
# outputs in this notebook can be tied to a specific environment.
for label, module in (
    ('Pandas version :', pd),
    ('Numpy version  :', np),
    ('Matplot version:', matplotlib),
    ('Seaborn version:', sns),
    ('Sklearn version:', sk),
):
    print(label, module.__version__)

print('--------------')
print(pd.Timestamp.now())

Pandas version : 2.2.3
Numpy version  : 2.2.2
Matplot version: 3.10.0
Seaborn version: 0.13.2
Sklearn version: 1.6.1
--------------
2025-02-21 11:52:34.411069


In [2]:
# Load the advertising dataset; index_col=0 turns the CSV's first column into
# the row index instead of keeping it as a regular column.
df = pd.read_csv('../Data/Advertising.csv', index_col=0)
# Preview the first rows to confirm the columns loaded as expected.
df.head()

Unnamed: 0,TV,Radio,Newspaper,Sales
1,230.1,37.8,69.2,22.1
2,44.5,39.3,45.1,10.4
3,17.2,45.9,69.3,9.3
4,151.5,41.3,58.5,18.5
5,180.8,10.8,58.4,12.9


In [3]:
# Target is the Sales column; the features are every remaining column.
y = df['Sales']
X = df.drop(columns='Sales')
X.head()

Unnamed: 0,TV,Radio,Newspaper
1,230.1,37.8,69.2
2,44.5,39.3,45.1
3,17.2,45.9,69.3
4,151.5,41.3,58.5
5,180.8,10.8,58.4


In [4]:
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression

# Expand the raw features into every polynomial and interaction term up to
# degree 3. include_bias=False leaves out the constant column.
pre_process = PolynomialFeatures(degree=3, include_bias=False)

# fit_transform learns the term layout and builds the expanded matrix in one
# step, so X_poly only ever refers to the feature matrix (previously it was
# briefly bound to the fitted transformer returned by .fit()).
X_poly = pre_process.fit_transform(X)

# Names of the generated terms, in the same order as the columns of X_poly.
poly_feature_names = pre_process.get_feature_names_out(X.columns)

In [5]:
# Compare the shapes before and after the polynomial expansion and list the
# names of the generated features.
X.shape, X_poly.shape, poly_feature_names

((200, 3),
 (200, 19),
 array(['TV', 'Radio', 'Newspaper', 'TV^2', 'TV Radio', 'TV Newspaper',
        'Radio^2', 'Radio Newspaper', 'Newspaper^2', 'TV^3', 'TV^2 Radio',
        'TV^2 Newspaper', 'TV Radio^2', 'TV Radio Newspaper',
        'TV Newspaper^2', 'Radio^3', 'Radio^2 Newspaper',
        'Radio Newspaper^2', 'Newspaper^3'], dtype=object))

In [6]:
from sklearn.model_selection import train_test_split 
# Hold out 30% of the rows for testing; the fixed random_state keeps the split
# the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X_poly,
    y,
    test_size=0.3,
    random_state=101,
)
tuple(part.shape for part in (X_train, X_test, y_train, y_test))

((140, 19), (60, 19), (140,), (60,))

In [7]:
# With no axis given, the mean/std pool every entry of the training matrix
# into a single number: these are overall statistics, not per-feature ones.
print('Mean before scaling: %8.5f' % np.mean(X_train))
print('Std  before scaling: %8.5f' % np.std(X_train))

Mean before scaling: 479106.02338
Std  before scaling: 2234807.09059


In [8]:
from sklearn.preprocessing import StandardScaler

# Learn the scaling parameters from the training features ONLY, so nothing
# about the test rows influences the transformation.
scaler = StandardScaler().fit(X_train)

# Apply that same training-derived scaling to both splits.
X_train_scaled, X_test_scaled = scaler.transform(X_train), scaler.transform(X_test)

In [9]:
# Overall statistics (all entries pooled) of the standardized training matrix.
print('Mean AFTER scaling: %8.5f' % np.mean(X_train_scaled))
print('Std  AFTER scaling: %8.5f' % np.std(X_train_scaled))

Mean AFTER scaling:  0.00000
Std  AFTER scaling:  1.00000


## Ridge Regression (L2) 

In [10]:
# check out the help function
from sklearn.linear_model import Ridge

# alpha=10 is the penalty factor (our HYPERPARAMETER), picked by hand here.
# Note the model is trained on the SCALED training features.
ridge_model = Ridge(alpha=10).fit(X_train_scaled, y_train)
ridge_model  # last expression, so the fitted estimator is still displayed

In [11]:
# Predict the hold-out targets from the scaled test features.
y_preds = ridge_model.predict(X_test_scaled)
# (display y_preds here if the raw predictions need inspecting)

In [12]:
# Per-row prediction error (actual minus predicted) for the fixed-alpha Ridge model.
residuals = y_test - y_preds
# NOTE(review): y_preds is reassigned by later cells, so residuals only matches
# the predictions that were current when this cell ran.

In [13]:
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
import math

# Hold-out metrics for the fixed-alpha Ridge model.

# R^2: share of the target's variance explained by the model. Higher is better.
r2 = r2_score(y_test, y_preds)

# Mean absolute error: average absolute distance between predicted and actual
# values. Lower is better.
mae = mean_absolute_error(y_test, y_preds)

# Mean squared error: average squared distance between predicted and actual
# values. Lower is better (outliers will exaggerate this).
mse = mean_squared_error(y_test, y_preds)

# Fitted intercept: the predicted target when every (scaled) feature is zero.
intercept = ridge_model.intercept_

# One (format, value) pair per printed line. 'Coef' shows only the first
# coefficient; 'Score' is the model's own .score() on the test split and
# prints the same value as R2.
for fmt, val in (
    ('R2   : %14.4f', r2),
    ('MAE  : %14.4f', mae),
    ('MSE  : %14.4f', mse),
    ('RMSE : %14.4f\n', math.sqrt(mse)),
    ('Int  : %14.4f', intercept),
    ('Coef : %14.4f', ridge_model.coef_[0]),
    ('Score: %14.4f', ridge_model.score(X_test_scaled, y_test)),
):
    print(fmt % val)

R2   :         0.9717
MAE  :         0.5774
MSE  :         0.8004
RMSE :         0.8946

Int  :        14.3114
Coef :         2.1106
Score:         0.9717


In [14]:
from sklearn.linear_model import RidgeCV

# Let cross-validation pick the penalty from three candidate alphas, ranking
# the candidates by negated MEDIAN absolute error. No cv argument is passed,
# so RidgeCV's default cross-validation scheme is used.
ridge_cv_model = RidgeCV(
    alphas=(0.1, 1, 10),
    scoring='neg_median_absolute_error',
).fit(X_train_scaled, y_train)
ridge_cv_model  # last expression, so the fitted estimator is still displayed

In [15]:
# Penalty strength selected by cross-validation.
# NOTE(review): if the selected value is 0.1, it is the smallest candidate in
# the grid (0.1, 1, 10), so the best alpha may lie below the grid - consider
# adding smaller candidates.
ridge_cv_model.alpha_

np.float64(0.1)

In [16]:
# Predict the hold-out targets with the cross-validated Ridge model.
# This rebinds y_preds, replacing the fixed-alpha Ridge predictions.
y_preds = ridge_cv_model.predict(X_test_scaled)

In [17]:
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
import math

# Hold-out metrics for the cross-validated Ridge model.

# R^2: share of the target's variance explained by the model. Higher is better.
r2 = r2_score(y_test, y_preds)

# Mean absolute error: average absolute distance between predicted and actual
# values. Lower is better.
mae = mean_absolute_error(y_test, y_preds)

# Mean squared error: average squared distance between predicted and actual
# values. Lower is better (outliers will exaggerate this).
mse = mean_squared_error(y_test, y_preds)

# Fitted intercept: the predicted target when every (scaled) feature is zero.
intercept = ridge_cv_model.intercept_

# One (format, value) pair per printed line. 'Coef' shows only the first
# coefficient; 'Score' is the model's own .score() on the test split and
# prints the same value as R2.
for fmt, val in (
    ('R2   : %14.4f', r2),
    ('MAE  : %14.4f', mae),
    ('MSE  : %14.4f', mse),
    ('RMSE : %14.4f\n', math.sqrt(mse)),
    ('Int  : %14.4f', intercept),
    ('Coef : %14.4f', ridge_cv_model.coef_[0]),
    ('Score: %14.4f', ridge_cv_model.score(X_test_scaled, y_test)),
):
    print(fmt % val)

R2   :         0.9865
MAE  :         0.4274
MSE  :         0.3820
RMSE :         0.6181

Int  :        14.3114
Coef :         5.4077
Score:         0.9865


In [18]:
# Fitted coefficients of the cross-validated Ridge model, one per polynomial
# feature; they apply to the scaled features the model was trained on.
ridge_cv_model.coef_

array([ 5.40769392,  0.5885865 ,  0.40390395, -6.18263924,  4.59607939,
       -1.18789654, -1.15200458,  0.57837796, -0.1261586 ,  2.5569777 ,
       -1.38900471,  0.86059434,  0.72219553, -0.26129256,  0.17870787,
        0.44353612, -0.21362436, -0.04622473, -0.06441449])

In [19]:
# Label each coefficient with the polynomial term it belongs to.
cdf = pd.DataFrame(
    data=ridge_cv_model.coef_,
    index=poly_feature_names,
    columns=['Coef'],
)
cdf

Unnamed: 0,Coef
TV,5.407694
Radio,0.588587
Newspaper,0.403904
TV^2,-6.182639
TV Radio,4.596079
TV Newspaper,-1.187897
Radio^2,-1.152005
Radio Newspaper,0.578378
Newspaper^2,-0.126159
TV^3,2.556978


In [20]:
# Order the features from the most positive coefficient to the most negative.
# NOTE(review): the sort uses the signed value, so large negative coefficients
# end up at the bottom; sort on the absolute value to rank by magnitude.
cdf.sort_values(by='Coef', ascending=False)

Unnamed: 0,Coef
TV,5.407694
TV Radio,4.596079
TV^3,2.556978
TV^2 Newspaper,0.860594
TV Radio^2,0.722196
Radio,0.588587
Radio Newspaper,0.578378
Radio^3,0.443536
Newspaper,0.403904
TV Newspaper^2,0.178708


In [21]:
# Keep only the features whose coefficient magnitude exceeds 1.
cdf.loc[cdf['Coef'].abs() > 1]

Unnamed: 0,Coef
TV,5.407694
TV^2,-6.182639
TV Radio,4.596079
TV Newspaper,-1.187897
Radio^2,-1.152005
TV^3,2.556978
TV^2 Radio,-1.389005


## LASSO Regression (L1)


In [22]:
from sklearn.linear_model import LassoCV

# Cross-validated Lasso: 5-fold CV over an automatically generated path of 100
# candidate alphas. eps governs how far that path extends (see the LassoCV
# documentation for its exact definition).
lasso_cv_model = LassoCV(
    eps=0.1,
    n_alphas=100,
    cv=5,
).fit(X_train_scaled, y_train)
lasso_cv_model  # last expression, so the fitted estimator is still displayed

In [23]:
# Penalty strength selected by cross-validation along the generated alpha path.
lasso_cv_model.alpha_

np.float64(0.4943070909225831)

In [24]:
# Predict the hold-out targets with the cross-validated Lasso model.
# This rebinds y_preds, replacing the predictions from the earlier models.
y_preds = lasso_cv_model.predict(X_test_scaled)

In [25]:
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
import math

# Hold-out metrics for the cross-validated Lasso model.

# R^2: share of the target's variance explained by the model. Higher is better.
r2 = r2_score(y_test, y_preds)

# Mean absolute error: average absolute distance between predicted and actual
# values. Lower is better.
mae = mean_absolute_error(y_test, y_preds)

# Mean squared error: average squared distance between predicted and actual
# values. Lower is better (outliers will exaggerate this).
mse = mean_squared_error(y_test, y_preds)

# Fitted intercept: the predicted target when every (scaled) feature is zero.
intercept = lasso_cv_model.intercept_

# One (format, value) pair per printed line. 'Coef' shows only the first
# coefficient; 'Score' is the model's own .score() on the test split and
# prints the same value as R2.
for fmt, val in (
    ('R2   : %14.4f', r2),
    ('MAE  : %14.4f', mae),
    ('MSE  : %14.4f', mse),
    ('RMSE : %14.4f\n', math.sqrt(mse)),
    ('Int  : %14.4f', intercept),
    ('Coef : %14.4f', lasso_cv_model.coef_[0]),
    ('Score: %14.4f', lasso_cv_model.score(X_test_scaled, y_test)),
):
    print(fmt % val)

R2   :         0.9547
MAE  :         0.6542
MSE  :         1.2787
RMSE :         1.1308

Int  :        14.3114
Coef :         1.0027
Score:         0.9547
