In [1]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn import metrics

In [2]:
# read in stock data for one symbol
data = pd.read_csv("data/stock_data.csv")

In [3]:
# print data
data.head()

Unnamed: 0,symbol,marketCap,ipoYear,sector,industry,date,open,high,low,close,volume,adjclose
0,ABCD,$220.4M,-1,Consumer Services,Publishing,2016-11-11,4.9,4.95,4.8,4.95,76300,4.95
1,ABCD,$220.4M,-1,Consumer Services,Publishing,2016-11-10,5.35,5.35,4.77,4.9,180800,4.9
2,ABCD,$220.4M,-1,Consumer Services,Publishing,2016-11-09,5.12,5.51,4.97,5.51,40700,5.51
3,ABCD,$220.4M,-1,Consumer Services,Publishing,2016-11-08,5.3,5.35,5.08,5.15,8900,5.15
4,ABCD,$220.4M,-1,Consumer Services,Publishing,2016-11-07,5.01,5.29,5.01,5.25,15800,5.25


In [4]:
# keep only relevant columns
data = data[['date', 'open', 'high', 'low', 'close', 'volume', 'adjclose']]
data.head()

Unnamed: 0,date,open,high,low,close,volume,adjclose
0,2016-11-11,4.9,4.95,4.8,4.95,76300,4.95
1,2016-11-10,5.35,5.35,4.77,4.9,180800,4.9
2,2016-11-09,5.12,5.51,4.97,5.51,40700,5.51
3,2016-11-08,5.3,5.35,5.08,5.15,8900,5.15
4,2016-11-07,5.01,5.29,5.01,5.25,15800,5.25


In [5]:
# add feature for percent change between open and close
data['percentChange'] = data['close'] / data['open'] - 1
data.head()

Unnamed: 0,date,open,high,low,close,volume,adjclose,percentChange
0,2016-11-11,4.9,4.95,4.8,4.95,76300,4.95,0.010204
1,2016-11-10,5.35,5.35,4.77,4.9,180800,4.9,-0.084112
2,2016-11-09,5.12,5.51,4.97,5.51,40700,5.51,0.076172
3,2016-11-08,5.3,5.35,5.08,5.15,8900,5.15,-0.028302
4,2016-11-07,5.01,5.29,5.01,5.25,15800,5.25,0.047904


In [6]:
# create new feature for the percent change of the previous trading day
data['percentchange-1'] = data['percentChange'].shift(-1)
data.head()

Unnamed: 0,date,open,high,low,close,volume,adjclose,percentChange,percentchange-1
0,2016-11-11,4.9,4.95,4.8,4.95,76300,4.95,0.010204,-0.084112
1,2016-11-10,5.35,5.35,4.77,4.9,180800,4.9,-0.084112,0.076172
2,2016-11-09,5.12,5.51,4.97,5.51,40700,5.51,0.076172,-0.028302
3,2016-11-08,5.3,5.35,5.08,5.15,8900,5.15,-0.028302,0.047904
4,2016-11-07,5.01,5.29,5.01,5.25,15800,5.25,0.047904,-0.038685


In [7]:
# repeat again to get the pecent change from two days ago
data['percentchange-2'] = data['percentChange'].shift(-2)
data.head()

Unnamed: 0,date,open,high,low,close,volume,adjclose,percentChange,percentchange-1,percentchange-2
0,2016-11-11,4.9,4.95,4.8,4.95,76300,4.95,0.010204,-0.084112,0.076172
1,2016-11-10,5.35,5.35,4.77,4.9,180800,4.9,-0.084112,0.076172,-0.028302
2,2016-11-09,5.12,5.51,4.97,5.51,40700,5.51,0.076172,-0.028302,0.047904
3,2016-11-08,5.3,5.35,5.08,5.15,8900,5.15,-0.028302,0.047904,-0.038685
4,2016-11-07,5.01,5.29,5.01,5.25,15800,5.25,0.047904,-0.038685,-0.029354


In [8]:
# reorder to have oldest date at top
# this is useful for creating rolling calculations
data.sort_values('date', ascending=True, inplace=True)

# reset the index since sorting will sort the index as well
data.reset_index(inplace=True, drop=True)

In [9]:
# create a simple moving average using rolling calculations
data['shortSma'] = data['close'].rolling(5).mean()
data.head(10)

Unnamed: 0,date,open,high,low,close,volume,adjclose,percentChange,percentchange-1,percentchange-2,shortSma
0,2012-11-26,1.0,1.06,1.0,1.04,16600,1.04,0.04,,,
1,2012-11-27,1.0,1.03,0.93,0.99,7500,0.99,-0.01,0.04,,
2,2012-11-28,1.0,1.1,1.0,1.02,13000,1.02,0.02,-0.01,0.04,
3,2012-11-29,1.01,1.01,0.92,0.93,16900,0.93,-0.079208,0.02,-0.01,
4,2012-11-30,1.0,1.22,0.92,0.93,4100,0.93,-0.07,-0.079208,0.02,0.982
5,2012-12-03,0.93,1.13,0.93,1.09,67500,1.09,0.172043,-0.07,-0.079208,0.992
6,2012-12-04,1.16,1.22,1.0,1.01,26400,1.01,-0.12931,0.172043,-0.07,0.996
7,2012-12-05,1.04,1.15,1.01,1.01,13000,1.01,-0.028846,-0.12931,0.172043,0.994
8,2012-12-06,1.17,1.18,1.02,1.09,27200,1.09,-0.068376,-0.028846,-0.12931,1.026
9,2012-12-07,1.16,1.25,1.11,1.12,37200,1.12,-0.034483,-0.068376,-0.028846,1.064


In [10]:
# shift the SMA so that a particular day knows the SMA of the previous day
data['shortSma-1'] = data['shortSma'].shift(1)
data.head(10)

Unnamed: 0,date,open,high,low,close,volume,adjclose,percentChange,percentchange-1,percentchange-2,shortSma,shortSma-1
0,2012-11-26,1.0,1.06,1.0,1.04,16600,1.04,0.04,,,,
1,2012-11-27,1.0,1.03,0.93,0.99,7500,0.99,-0.01,0.04,,,
2,2012-11-28,1.0,1.1,1.0,1.02,13000,1.02,0.02,-0.01,0.04,,
3,2012-11-29,1.01,1.01,0.92,0.93,16900,0.93,-0.079208,0.02,-0.01,,
4,2012-11-30,1.0,1.22,0.92,0.93,4100,0.93,-0.07,-0.079208,0.02,0.982,
5,2012-12-03,0.93,1.13,0.93,1.09,67500,1.09,0.172043,-0.07,-0.079208,0.992,0.982
6,2012-12-04,1.16,1.22,1.0,1.01,26400,1.01,-0.12931,0.172043,-0.07,0.996,0.992
7,2012-12-05,1.04,1.15,1.01,1.01,13000,1.01,-0.028846,-0.12931,0.172043,0.994,0.996
8,2012-12-06,1.17,1.18,1.02,1.09,27200,1.09,-0.068376,-0.028846,-0.12931,1.026,0.994
9,2012-12-07,1.16,1.25,1.11,1.12,37200,1.12,-0.034483,-0.068376,-0.028846,1.064,1.026


In [11]:
# add a feature for the SMA from two days ago
data['shortSma-2'] = data['shortSma'].shift(2)
data.head(10)

Unnamed: 0,date,open,high,low,close,volume,adjclose,percentChange,percentchange-1,percentchange-2,shortSma,shortSma-1,shortSma-2
0,2012-11-26,1.0,1.06,1.0,1.04,16600,1.04,0.04,,,,,
1,2012-11-27,1.0,1.03,0.93,0.99,7500,0.99,-0.01,0.04,,,,
2,2012-11-28,1.0,1.1,1.0,1.02,13000,1.02,0.02,-0.01,0.04,,,
3,2012-11-29,1.01,1.01,0.92,0.93,16900,0.93,-0.079208,0.02,-0.01,,,
4,2012-11-30,1.0,1.22,0.92,0.93,4100,0.93,-0.07,-0.079208,0.02,0.982,,
5,2012-12-03,0.93,1.13,0.93,1.09,67500,1.09,0.172043,-0.07,-0.079208,0.992,0.982,
6,2012-12-04,1.16,1.22,1.0,1.01,26400,1.01,-0.12931,0.172043,-0.07,0.996,0.992,0.982
7,2012-12-05,1.04,1.15,1.01,1.01,13000,1.01,-0.028846,-0.12931,0.172043,0.994,0.996,0.992
8,2012-12-06,1.17,1.18,1.02,1.09,27200,1.09,-0.068376,-0.028846,-0.12931,1.026,0.994,0.996
9,2012-12-07,1.16,1.25,1.11,1.12,37200,1.12,-0.034483,-0.068376,-0.028846,1.064,1.026,0.994


In [12]:
#trim the obervations with missing calculated data
data.dropna(inplace=True)
data.head(10)

Unnamed: 0,date,open,high,low,close,volume,adjclose,percentChange,percentchange-1,percentchange-2,shortSma,shortSma-1,shortSma-2
6,2012-12-04,1.16,1.22,1.0,1.01,26400,1.01,-0.12931,0.172043,-0.07,0.996,0.992,0.982
7,2012-12-05,1.04,1.15,1.01,1.01,13000,1.01,-0.028846,-0.12931,0.172043,0.994,0.996,0.992
8,2012-12-06,1.17,1.18,1.02,1.09,27200,1.09,-0.068376,-0.028846,-0.12931,1.026,0.994,0.996
9,2012-12-07,1.16,1.25,1.11,1.12,37200,1.12,-0.034483,-0.068376,-0.028846,1.064,1.026,0.994
10,2012-12-10,1.2,1.2,1.0,1.01,30600,1.01,-0.158333,-0.034483,-0.068376,1.048,1.064,1.026
11,2012-12-11,1.0,1.16,1.0,1.03,13200,1.03,0.03,-0.158333,-0.034483,1.052,1.048,1.064
12,2012-12-12,1.05,1.14,1.03,1.03,7000,1.03,-0.019048,0.03,-0.158333,1.056,1.052,1.048
13,2012-12-13,1.01,1.14,1.01,1.04,10400,1.04,0.029703,-0.019048,0.03,1.046,1.056,1.052
14,2012-12-14,1.02,1.14,0.95,0.95,50200,0.95,-0.068627,0.029703,-0.019048,1.012,1.046,1.056
15,2012-12-17,1.02,1.15,0.95,1.03,14000,1.03,0.009804,-0.068627,0.029703,1.016,1.012,1.046


In [13]:
# separate features 
features = ['percentchange-1', 'percentchange-2', 'shortSma-1', 'shortSma-2']
X = data[features]

# separate the target vector
y = data['percentChange']

In [14]:
# scale the features with unit-mean standard deviation
X = pd.DataFrame(preprocessing.scale(X), index = X.index, columns = X.columns)

In [15]:
# create training and testing sets by splitting the full dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

In [16]:
# create a linear regression object
my_linreg = LinearRegression()

In [17]:
# fit the linear regression object to the training data
my_linreg.fit(X_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [18]:
# print out the linear regression coefficients
print(my_linreg.coef_)


[ -1.83861458e-03   3.81734250e-06  -1.14631594e-01   1.14308136e-01]


In [19]:
# put coefficients in dataframe, take absolute value and sort
coffDf = pd.DataFrame(list(zip(X.columns,np.absolute(my_linreg.coef_))), columns=['Feature','Coefficient'])
coffDf.sort_values('Coefficient', ascending=False)

Unnamed: 0,Feature,Coefficient
2,shortSma-1,0.114632
3,shortSma-2,0.114308
0,percentchange-1,0.001839
1,percentchange-2,4e-06


In [20]:
# make predictions on the testing set
y_prediction = my_linreg.predict(X_test)
print(y_prediction)

[ -1.29455651e-03  -3.96322055e-03   6.59419710e-04  -2.59123916e-03
   1.95729587e-03   3.82718757e-03  -4.94597329e-03   2.88532953e-03
  -2.18723623e-03  -2.78564265e-03   2.24264894e-03  -2.14085641e-03
   5.18296801e-03  -5.71483878e-03  -5.88216743e-04   3.16728605e-04
  -2.61084259e-04   2.01959059e-03  -2.34961277e-03  -2.43163486e-03
   1.79225243e-03   2.64776225e-03   3.02945196e-03  -1.45447217e-04
   3.71679868e-03   2.23369595e-03  -1.06319715e-03   2.11801614e-03
   1.24211895e-03   2.17182682e-03   1.46175361e-03   2.53481327e-03
  -2.05240431e-03  -2.07984762e-04  -1.33967370e-04   2.39843868e-03
   1.26953692e-03   1.02187032e-03   4.24622694e-04   4.20466604e-03
   7.90852921e-05   2.60457521e-03  -9.02863982e-04  -1.11414165e-04
   7.70552481e-03  -2.37259626e-03   1.17625644e-03  -5.22115019e-05
   2.65171041e-03   5.15963112e-03   2.63924883e-03   9.92466798e-04
  -1.06224364e-02   5.15723640e-05   1.74215925e-03  -2.48108974e-03
   5.88346115e-04  -8.76757185e-04

In [21]:
# calculate the mean squared error of the predictions
mse = metrics.mean_squared_error(y_test, y_prediction)

# take the square root of MSE to get Root Mean Squared Error (RMSE)
rmse = np.sqrt(mse)

print(rmse)

0.0447211378805
