In [53]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split as tts
from sklearn.linear_model import LinearRegression as LR
from sklearn.metrics import mean_absolute_error, mean_squared_error, SCORERS
import scipy as sp
from joblib import dump,load
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.linear_model import Ridge, RidgeCV, LassoCV, ElasticNetCV, ElasticNet
from sklearn.model_selection import cross_val_score, cross_validate, GridSearchCV

In [29]:
df = pd.read_csv('E:\Machine Learning\DATA\Advertising.csv')

# TTS 

Let's pull our data and perform a TTS on it.

In [30]:
X = df.drop('sales', axis=1)
y = df['sales']

X_train, X_test, y_train, y_test = tts(X, y, test_size=0.3,
                                       random_state=101)

In [31]:
scaler = StandardScaler()

scaler.fit(X_train)

X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

Let's try our first model only using Ridge with an alpha of 100.

In [32]:
model = Ridge(alpha=100)
model.fit(X_train,y_train)

y_pred = model.predict(X_test)

MAE = mean_absolute_error(y_test,y_pred)
RMSE = np.sqrt(mean_squared_error(y_test,y_pred))

print(MAE)
print(RMSE)

2.1631741364394363
2.709571144855608


As we can see we don't have the best results, let's use a lower alpha and compare the results.

In [33]:
model_2 = Ridge(alpha=1)
model_2.fit(X_train,y_train)

y_pred = model_2.predict(X_test)

MAE = mean_absolute_error(y_test,y_pred)
RMSE = np.sqrt(mean_squared_error(y_test,y_pred))

print(MAE)
print(RMSE)

1.216876844358058
1.5228334050147285


### Validation

The TTS method has a disadvantage of not having a portion of data that can report a kpi on truly unseen data. Meaning that adjustingg the hyperparameters on the test datta is fair as it isn't considered data leakage, but it might cause issues during reporting.

To do this we are going to separate our data into three sets:
- Train
- Validation
- Test

For cases where we have millions of entries, we can use a low percentage for our test data, but for cases with a few entries we would need a larger test set.

We are going to use the train data for training and the validation data for adjusting our hyperparameters, repeating as many times as needed.

Finally we are going to test usingg the test data, this will only be used to get our kpi for our model.

In [34]:
X = df.drop('sales', axis=1)
y = df['sales']

# Let's do our first split

X_train, X_other, y_train, y_other = tts(X, y, test_size=0.3,
                                       random_state=101)

# Second  split

X_eval, X_test, y_eval, y_test = tts(X_other, y_other, test_size=0.5,
                                       random_state=101)

Let's create the scaler and transform our X data.

In [35]:
scaler = StandardScaler()
scaler.fit(X_train)

StandardScaler()

In [36]:
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)
X_eval = scaler.transform(X_eval)

In [37]:
model_1 = Ridge(alpha=100)
model_1.fit(X_train,y_train)
y_eval_pred = model_1.predict(X_eval)
mean_squared_error(y_eval,y_eval_pred)

7.320101458823871

In [38]:
model_2 = Ridge(alpha=1)
model_2.fit(X_train,y_train)
y_eval_pred = model_2.predict(X_eval)
mean_squared_error(y_eval,y_eval_pred)

2.383783075056986

In [39]:
y_final_pred = model_2.predict(X_test)
mean_squared_error(y_test,y_final_pred)

2.254260083800517

### Cross_val_score function

We start with our whole data, then split it into a large training set and a small test set.

Then we choose a k-fold value, take in mind that the higher the value the longer the computation. Here we are going to cross validate by evaluating our model with all portions of the training set. Finally obtaining the average error.

In [40]:
X = df.drop('sales', axis=1)
y = df['sales']

X_train, X_test, y_train, y_test = tts(X, y, test_size=0.3,
                                       random_state=101)

scaler = StandardScaler()

scaler.fit(X_train)

X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [41]:
model = Ridge(alpha=100)
scores = cross_val_score(model,X_train,y_train,
                         scoring='neg_mean_squared_error', cv=5)

abs(scores.mean())

8.215396464543607

In [42]:
model_2 = Ridge(alpha=1)
scores = cross_val_score(model_2,X_train,y_train,
                         scoring='neg_mean_squared_error', cv=5)

abs(scores.mean())

3.344839296530696

In [43]:
model_2.fit(X_train,y_train)
y_final_pred = model_2.predict(X_test)
mean_squared_error(y_test, y_final_pred)

2.319021579428752

### Cross_validate function

It allows us to  view multiple kpis from a cross validation model.

Let's initialize our data.

In [44]:
X = df.drop('sales', axis=1)
y = df['sales']

X_train, X_test, y_train, y_test = tts(X, y, test_size=0.3,
                                       random_state=101)

scaler = StandardScaler()

scaler.fit(X_train)

X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

Now we can create the model and create our scores.

In [45]:
model = Ridge(alpha=100)

scores = cross_validate(model, X_train, y_train,
                        scoring=['neg_mean_squared_error',
                                 'neg_mean_absolute_error'],
                                 cv=10)

scores = pd.DataFrame(scores)

In [46]:
scores.mean()

fit_time                        0.001100
score_time                      0.000200
test_neg_mean_squared_error    -7.565121
test_neg_mean_absolute_error   -2.083773
dtype: float64

Now let's reduce our alpha and check the mean values again.

In [47]:
model = Ridge(alpha=1)

scores = cross_validate(model, X_train, y_train,
                        scoring=['neg_mean_squared_error',
                                 'neg_mean_absolute_error'],
                                 cv=10)

scores = pd.DataFrame(scores)

In [49]:
scores.mean()

fit_time                        0.000600
score_time                      0.000599
test_neg_mean_squared_error    -3.323018
test_neg_mean_absolute_error   -1.308467
dtype: float64

We can see that we reduced our error rate by a half. Now we can fit it into our model.

In [50]:
model.fit(X_train,y_train)

y_final_pred = model.predict(X_test)

mean_squared_error(y_test,y_final_pred)

2.319021579428752

### Grid Search

More complex models have multiple adjustable parameters. A grid search is a  form of training and validating a model on evere possible combination of hyperparameters.

We first are going to initialize our data.


In [52]:
X = df.drop('sales', axis=1)
y = df['sales']

X_train, X_test, y_train, y_test = tts(X, y, test_size=0.3,
                                       random_state=101)

scaler = StandardScaler()

scaler.fit(X_train)

X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

Now we can create our base model, leaving all hyperparametes as stock. Next is adding our parameter grid with this we will create the grid model, which will test all of the hyperparameters.

In [55]:
base_EN_model = ElasticNet()

param_grid = {'alpha':[0.1,1,5,10,50,100],
              'l1_ratio':[.1,.5,.7,.95,.99,1]}

grid_model = GridSearchCV(estimator=base_EN_model, 
                          param_grid=param_grid, 
                          scoring='neg_mean_squared_error', 
                          cv=5, verbose=2)

grid_model.fit(X_train,y_train)

grid_model.best_estimator_

Fitting 5 folds for each of 36 candidates, totalling 180 fits
[CV] END ............................alpha=0.1, l1_ratio=0.1; total time=   0.0s
[CV] END ............................alpha=0.1, l1_ratio=0.1; total time=   0.0s
[CV] END ............................alpha=0.1, l1_ratio=0.1; total time=   0.0s
[CV] END ............................alpha=0.1, l1_ratio=0.1; total time=   0.0s
[CV] END ............................alpha=0.1, l1_ratio=0.1; total time=   0.0s
[CV] END ............................alpha=0.1, l1_ratio=0.5; total time=   0.0s
[CV] END ............................alpha=0.1, l1_ratio=0.5; total time=   0.0s
[CV] END ............................alpha=0.1, l1_ratio=0.5; total time=   0.0s
[CV] END ............................alpha=0.1, l1_ratio=0.5; total time=   0.0s
[CV] END ............................alpha=0.1, l1_ratio=0.5; total time=   0.0s
[CV] END ............................alpha=0.1, l1_ratio=0.7; total time=   0.0s
[CV] END ............................alpha=0.1,

ElasticNet(alpha=0.1, l1_ratio=1)

Now we can predict of this model.

In [57]:
y_pred = grid_model.predict(X_test)
mean_squared_error(y_test, y_pred)

2.3873426420874737

One note from this type of model, we are getting simpler models but it comes at the expense of computation time, which with bigger models it might become an issue.