<a href="https://colab.research.google.com/github/patakrob/scrape/blob/main/O'Reilly_Class_Multiple%2C_Lasso_and_Ridge_Linear_Regression_Models.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Import Libraries

In [None]:
import numpy as np
import pandas as pd
from datetime import datetime

import pandas_datareader.data as pdr
!pip install yfinance
import yfinance as yf

import matplotlib.pyplot as plt
#The bare 'seaborn' style sheet was renamed to 'seaborn-v0_8' in Matplotlib 3.6 and the old name was later removed,
#so pick whichever of the two names this Matplotlib installation actually ships
plt.style.use('seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn')

# Import Data

In [None]:
#Date window shared by every download below
start, end = datetime(2021, 4, 4), datetime(2021, 11, 3)

def _price_history(symbol):
    """Return the yfinance price history of `symbol` between `start` and `end`."""
    return yf.Ticker(symbol).history(start=start, end=end)

stock = _price_history('AAPL') #Apple Inc.
market = _price_history('SPY') #Proxy for the S&P 500 index, a broad value-weighted market index that affects all stocks
vix = _price_history('VXX') #Proxy for the volatility index (30 day expected S&P 500 volatility implied by option prices). NOTE(review): VXX is presumably a traded product on VIX futures rather than the index itself - confirm
dxy = _price_history('UUP') #Proxy for the dollar index, the value of the US dollar relative to a basket of major currencies
junk = _price_history('JNK') #Proxy for a junk bond index of high yielding/risky corporate bonds

# Train Model

In [None]:
#Create target dataframe: the stock's daily close-to-close return in percent
target = (
    stock['Close']
    .pct_change(1) #change of each close relative to the previous row; this is a same-day return, nothing is shifted or lagged
    .mul(100) #express the fractional change as a percentage
    .to_frame('return')
    .dropna() #the first row has no previous close, so its return is NaN
)
target.head()

Unnamed: 0_level_0,return
Date,Unnamed: 1_level_1
2021-04-06,0.246223
2021-04-07,1.339037
2021-04-08,1.923379
2021-04-09,2.025158
2021-04-12,-1.323302


In [None]:
#Create features dataframe: one column per explanatory series, aligned on the market series' dates
features = (
    market['Close'].pct_change(1).mul(100).to_frame('market') #market return in percent
    .assign(
        #Plain first difference of the close rather than a percentage change.
        #NOTE(review): this differences the VXX close price, not a VIX index level quoted in percentage points - confirm that is intended
        vix=vix['Close'].diff(),
        dxy=dxy['Close'].pct_change(1).mul(100), #dollar proxy return in percent
        junk=junk['Close'].pct_change(1).mul(100), #junk bond proxy return in percent
    )
    .dropna() #the first row of every differenced series is NaN
)
features.head()

Unnamed: 0_level_0,market,vix,dxy,junk
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2021-04-06,-0.059056,0.0,-0.240672,0.174718
2021-04-07,0.115724,-1.120003,0.120622,0.00918
2021-04-08,0.474682,-0.639999,-0.441762,0.036722
2021-04-09,0.727016,-0.079998,0.080672,-0.082593
2021-04-12,0.036456,-0.600002,-0.040307,-0.082633


In [None]:
#Run multiple linear regression
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

#features and target were built and dropna()-ed independently, and train_test_split pairs their rows purely by position.
#Fail loudly if the two frames do not cover exactly the same dates in the same order, instead of fitting on misaligned rows.
assert features.index.equals(target.index), "features and target must share the same date index before splitting"

regression = LinearRegression()
#random_state fixes the shuffle so the 75/25 split is reproducible
features_train, features_test, target_train, target_test = train_test_split(features, target, test_size = 0.25, random_state=0)

#Model is trained on 75% of the data
model = regression.fit(features_train, target_train)

print("Model Intercept:", model.intercept_)
print("Model Coefficients:", model.coef_)

Model Intercept: [-0.01440816]
Model Coefficients: [[ 1.49020717  0.07259522 -0.10914577 -0.22047601]]


# Evaluate Model

In [None]:
#Score the fitted model with its default metric (R2) on both halves of the split
train_r2 = model.score(features_train, target_train) #fit quality on the rows used for training
test_r2 = model.score(features_test, target_test) #fit quality on the held-out rows
print("Training score: ", train_r2)
print("Test score: ", test_r2)

Training score:  0.5423937563700386
Test score:  0.5579060612119445


# Lasso Regression/L1 Regularization

In [None]:
#Remove less informative features with Lasso Regression
from sklearn.linear_model import Lasso
from sklearn.preprocessing import StandardScaler

#Split the raw features BEFORE scaling so the held-out rows never influence the scaling statistics (avoids data leakage).
#Same inputs, test_size and random_state as the earlier split, so the same rows land in train and test.
features_train, features_test, target_train, target_test = train_test_split(features, target, test_size=0.25, random_state=0)

scaler = StandardScaler() #Scales input data so that it has a mean of zero and variance of one
features_standardized_train = scaler.fit_transform(features_train) #mean and variance are estimated from the training rows only
features_standardized_test = scaler.transform(features_test) #test rows are scaled with the training statistics
features_standardized = scaler.transform(features) #full matrix, kept for any later cell that still reads it; also scaled with the training statistics

lasso_regression = Lasso (alpha = 0.1) #alpha is a hyperparameter (default = 1). Increasing alpha increases regularization
lasso_model = lasso_regression.fit(features_standardized_train, target_train)

print("Lasso model intercept:", lasso_model.intercept_)
print("Lasso model coefficients:", lasso_model.coef_)

print("Lasso training score: ", lasso_model.score(features_standardized_train, target_train))
print("Lasso test score: ", lasso_model.score(features_standardized_test, target_test))

Lasso model intercept: [0.11295063]
Lasso model coefficients: [ 0.82069163 -0.         -0.          0.        ]
Lasso training score:  0.5329036902899487
Lasso test score:  0.5451706690104516


# Ridge Regression/L2 Regularization

In [None]:
#Reduce effects of all coefficients/collinearity with Ridge Regression
from sklearn.linear_model import Ridge

ridge_regression = Ridge (alpha=10) #alpha is a hyperparameter. Increasing it increases regularization

#Split the raw features first and standardize afterwards, so the scaling statistics come from the training rows only
#and nothing about the held-out rows leaks into the fit
features_train, features_test, target_train, target_test = train_test_split(features, target, test_size=0.25, random_state=0)
ridge_scaler = StandardScaler() #separate scaler so this cell does not depend on how an earlier scaler was fitted
features_standardized_train = ridge_scaler.fit_transform(features_train)
features_standardized_test = ridge_scaler.transform(features_test)
ridge_model = ridge_regression.fit(features_standardized_train, target_train)

print("Ridge model intercept:", ridge_model.intercept_)
print("Ridge model coefficients:", ridge_model.coef_)

print("Ridge training score: ", ridge_model.score(features_standardized_train, target_train))
print("Ridge test score: ", ridge_model.score(features_standardized_test, target_test))

Ridge model intercept: [0.10985986]
Ridge model coefficients: [[ 0.76572876 -0.049869   -0.04121561  0.04678478]]
Ridge training score:  0.5317814889169699
Ridge test score:  0.5411044713799371
