## Implementing in Python

In [1]:
import numpy as np
import pandas as pd

import warnings
warnings.filterwarnings('ignore')

In [2]:
combined_data = pd.read_csv('datasets/regression/combined.csv', 
                            sep=",")

In [3]:
combined_data.head()

Unnamed: 0,Date,Nike,Ralph Lauren,S&P 500,UUP
0,01-09-2018,84.720001,136.9245,2913.97998,25.26
1,01-08-2018,81.998032,132.206055,2901.52002,25.190001
2,01-07-2018,76.721031,134.36618,2816.290039,24.98
3,01-06-2018,79.262825,124.531288,2718.370117,24.940001
4,01-05-2018,71.42408,133.30751,2705.27002,24.76


In [4]:
combined_data['Date'] = pd.to_datetime(combined_data['Date'])

combined_data = combined_data.sort_values(by = 'Date')

combined_data.head()

Unnamed: 0,Date,Nike,Ralph Lauren,S&P 500,UUP
120,2008-01-09,,,,
119,2008-01-10,9.762588,41.688194,968.75,26.298273
118,2008-01-11,9.020608,38.179581,896.23999,26.347893
117,2008-01-12,8.639458,40.13274,903.25,24.502052
116,2009-01-01,7.815672,36.30381,825.880005,25.92557


In [5]:
combined_data = combined_data.reset_index()
combined_data = combined_data.drop(['index'], axis=1)

combined_data.head()

Unnamed: 0,Date,Nike,Ralph Lauren,S&P 500,UUP
0,2008-01-09,,,,
1,2008-01-10,9.762588,41.688194,968.75,26.298273
2,2008-01-11,9.020608,38.179581,896.23999,26.347893
3,2008-01-12,8.639458,40.13274,903.25,24.502052
4,2009-01-01,7.815672,36.30381,825.880005,25.92557


In [6]:
combined_data['Nike Returns'] = combined_data['Nike'].pct_change()
combined_data['Ralph Lauren Returns'] = combined_data['Ralph Lauren'].pct_change()
combined_data['S&P 500 Returns'] = combined_data['S&P 500'].pct_change()
combined_data['UUP Returns'] = combined_data['UUP'].pct_change()

combined_data.head()

Unnamed: 0,Date,Nike,Ralph Lauren,S&P 500,UUP,Nike Returns,Ralph Lauren Returns,S&P 500 Returns,UUP Returns
0,2008-01-09,,,,,,,,
1,2008-01-10,9.762588,41.688194,968.75,26.298273,,,,
2,2008-01-11,9.020608,38.179581,896.23999,26.347893,-0.076002,-0.084163,-0.074849,0.001887
3,2008-01-12,8.639458,40.13274,903.25,24.502052,-0.042253,0.051157,0.007822,-0.070056
4,2009-01-01,7.815672,36.30381,825.880005,25.92557,-0.095352,-0.095407,-0.085657,0.058098


In [7]:
combined_data = combined_data.dropna()

combined_data.head()

Unnamed: 0,Date,Nike,Ralph Lauren,S&P 500,UUP,Nike Returns,Ralph Lauren Returns,S&P 500 Returns,UUP Returns
2,2008-01-11,9.020608,38.179581,896.23999,26.347893,-0.076002,-0.084163,-0.074849,0.001887
3,2008-01-12,8.639458,40.13274,903.25,24.502052,-0.042253,0.051157,0.007822,-0.070056
4,2009-01-01,7.815672,36.30381,825.880005,25.92557,-0.095352,-0.095407,-0.085657,0.058098
5,2009-01-02,7.173147,30.499441,735.090027,26.45507,-0.08221,-0.159883,-0.109931,0.020424
6,2009-01-03,8.098936,37.383286,797.869995,25.61586,0.129063,0.225704,0.085404,-0.031722


In [8]:
from sklearn import datasets, linear_model
from sklearn.linear_model import SGDRegressor, LinearRegression

#### Regress the returns of Nike on the SnP500
But let's consider the effect of UUP as well in this regression, as this may influence the stock returns. We now have a multiple regression problem at hand.

In [9]:
snp500_data = combined_data["S&P 500 Returns"]
uup_data = combined_data["UUP Returns"]

In [10]:
x = np.vstack((snp500_data, uup_data))
x

array([[-0.07484904,  0.00782158, -0.08565734, -0.1099312 ,  0.08540446,
         0.09392508,  0.05308145,  0.00019583,  0.07414173,  0.03356019,
         0.03572335, -0.019762  ,  0.05736406,  0.01777057, -0.03697425,
         0.02851369,  0.05879643,  0.01475923, -0.08197584, -0.05388244,
         0.06877785, -0.04744918,  0.0875511 ,  0.03685599, -0.00229025,
         0.06530004,  0.02264557,  0.03195656, -0.00104731,  0.02849538,
        -0.01350095, -0.01825746, -0.02147443, -0.05679111, -0.07176199,
         0.10772304, -0.00505872,  0.00853276,  0.04358306,  0.04058946,
         0.03133231, -0.00749745, -0.06265073,  0.03955498,  0.01259757,
         0.01976337,  0.02423615, -0.01978941,  0.00284672,  0.00706823,
         0.0504281 ,  0.01106065,  0.03598772,  0.01808577,  0.02076281,
        -0.0149993 ,  0.04946208, -0.03129802,  0.02974952,  0.04459575,
         0.02804947,  0.02356279, -0.03558291,  0.04311703,  0.00693217,
         0.00620079,  0.02103028,  0.01905833, -0.0

In [11]:
x = x.T

x

array([[-0.07484904,  0.00188682],
       [ 0.00782158, -0.07005649],
       [-0.08565734,  0.05809791],
       [-0.1099312 ,  0.02042385],
       [ 0.08540446, -0.03172209],
       [ 0.09392508, -0.01170045],
       [ 0.05308145, -0.0635359 ],
       [ 0.00019583,  0.00716393],
       [ 0.07414173, -0.02426783],
       [ 0.03356019, -0.00257283],
       [ 0.03572335, -0.02020634],
       [-0.019762  , -0.00394913],
       [ 0.05736406, -0.02334802],
       [ 0.01777057,  0.04104645],
       [-0.03697425,  0.01603121],
       [ 0.02851369,  0.0081023 ],
       [ 0.05879643,  0.00676817],
       [ 0.01475923,  0.00672267],
       [-0.08197584,  0.05759605],
       [-0.05388244, -0.01104971],
       [ 0.06877785, -0.05267361],
       [-0.04744918,  0.01600686],
       [ 0.0875511 , -0.05306807],
       [ 0.03685599, -0.02057791],
       [-0.00229025,  0.05051412],
       [ 0.06530004, -0.03361714],
       [ 0.02264557, -0.01541165],
       [ 0.03195656, -0.0120751 ],
       [-0.00104731,

In [12]:
y1 = combined_data["Nike Returns"]

In [13]:
reg_obj = linear_model.LinearRegression()

In [14]:
reg_obj.fit(x,y1)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [15]:
reg_obj.score(x,y1)

0.04698479697879887

In [16]:
reg_obj.coef_

array([ 0.43938334, -0.48866406])

In [17]:
reg_obj.intercept_

0.01955141536286277

### We will now regress the returns of Infosys on the SnP500 index. But let's consider the effect of Nifty on the prices too, in this regression, as this will certainly have its influence on the stock returns. We now have a multiple regression problem at hand.

In [18]:
y2 = combined_data["Ralph Lauren Returns"]

In [19]:
reg_obj = linear_model.LinearRegression()

In [20]:
reg_obj.fit(x, y2)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [21]:
reg_obj.score(x, y2)

0.3096669219502629

In [22]:
reg_obj.coef_

array([1.28397145, 0.03648715])

In [23]:
reg_obj.intercept_

0.0009191074129100054

Scikit learn treats the problem as a machine learning problem, which makes it hard for us to get detailed diagnostics of the llinear model that we fitted. e fix this, especially for multiple regression using the Statsmdels librar

In [24]:
import statsmodels.api as sm

In [25]:
X = sm.add_constant(x)

#### Fit OLS model

In [26]:
model_ols = sm.OLS(y2, X)

results_ols = model_ols.fit()

In [27]:
print(results_ols.summary())

                             OLS Regression Results                             
Dep. Variable:     Ralph Lauren Returns   R-squared:                       0.310
Model:                              OLS   Adj. R-squared:                  0.298
Method:                   Least Squares   F-statistic:                     26.02
Date:                  Sat, 29 Dec 2018   Prob (F-statistic):           4.63e-10
Time:                          15:50:23   Log-Likelihood:                 143.08
No. Observations:                   119   AIC:                            -280.2
Df Residuals:                       116   BIC:                            -271.8
Df Model:                             2                                         
Covariance Type:              nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          0.0009      0.007