Using rolling linear regression to predict MSFT closing prices

In [15]:
'''Predict stock market prices, make billions.'''

# pylint: disable=invalid-name

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# load data in numpy array
STOCK_SYMBOL = 'MSFT'
ALL_PRICES = pd.read_csv('./aapl5.csv')
STOCK_PRICES = np.array(ALL_PRICES)

# csv column indexes

DATE_COL = 0
#SYMBOL = 1
OPEN_COL = 1
CLOSE_COL = 4
LOW_COL = 3
HIGH_COL = 2
VOLUME_COL = 5

# hyper-parameters
WINDOW_SIZE = 20
TRAINING_RATIO = 0.8

In [10]:
ALL_PRICES.head()

Unnamed: 0,Date,Open,High,Low,Close,Volume
0,28-Jun-17,144.49,146.11,143.16,145.83,21982505
1,27-Jun-17,145.01,146.16,143.62,143.73,24761891
2,26-Jun-17,147.17,148.28,145.38,145.82,25692361
3,23-Jun-17,145.13,147.16,145.11,146.28,35439389
4,22-Jun-17,145.77,146.7,145.12,145.63,19106294


SyntaxError: invalid syntax (<ipython-input-22-161d3f41d1a2>, line 1)

In [13]:
def get_r_squared(actuals, predicted):
    '''Calculate r_squared'''
    d1 = actuals - predicted
    d2 = actuals - actuals.mean()
    r_2 = 1 - d1.dot(d1) / d2.dot(d2)
    return r_2


def convert_numpy_dates_to_panda(numpy_dates):
    '''Convert numpy dates to pandas dates'''
    pd_dates = []
    for date in numpy_dates.flatten():
        pd_dates.append(pd.Timestamp(date))
    return pd_dates

In [25]:
# X is matrix of features and bias term
X = np.array(
    STOCK_PRICES[WINDOW_SIZE:, [OPEN, LOW, HIGH, VOLUME]],
    dtype='float'
)
X = np.concatenate((X, np.ones((len(X), 1))), axis=1)
num_orig_cols = X.shape[1]


# Y is matrix of actual output values
Y = np.array(
    STOCK_PRICES[WINDOW_SIZE:, CLOSE_COL],
    dtype='float'
)


# Dates are not features but we want to save them for plotting later
'''
dates = np.array(
    STOCK_PRICES[WINDOW_SIZE:, [DATE_COL]],
    dtype='datetime64'
)
'''

"\ndates = np.array(\n    STOCK_PRICES[WINDOW_SIZE:, [DATE_COL]],\n    dtype='datetime64'\n)\n"

In [26]:
# Add previous closing prices to X for 'Rolling Window Linear Regression'
X = np.concatenate(
    (X, np.zeros((len(X), WINDOW_SIZE))),
    axis=1
)
for row in range(len(X)):
    for day in range(1, WINDOW_SIZE + 1):
        col_offset = num_orig_cols - 1 + day
        row_offset = WINDOW_SIZE + row - day
        X[row, col_offset] = STOCK_PRICES[row_offset, CLOSE_COL]

assert X.shape[1] == (WINDOW_SIZE + num_orig_cols)
# pd.DataFrame(X).to_csv('X.csv')
# pd.DataFrame(Y).to_csv('Y.csv')

In [27]:
# Create training and test sets
train_indexes = np.random.choice(
    len(X),
    round(len(X) * TRAINING_RATIO),
    replace=False
)
train_indexes.sort()
train_indexes.tolist()

test_indexes = list(range(len(X)))
for value in train_indexes:
    test_indexes.remove(value)

assert len(train_indexes) + len(test_indexes) == len(X)
for i, value in enumerate(train_indexes):
    assert value not in test_indexes

X_train = X[train_indexes]
Y_train = Y[train_indexes]
X_test = X[test_indexes]
Y_test = Y[test_indexes]

TypeError: 'float' object cannot be interpreted as an index

In [None]:
# Solve for w (weights) on training data
w = np.linalg.solve(X_train.T.dot(X_train), X_train.T.dot(Y_train))
Y_train_hat = X_train.dot(w)
train_r_2 = get_r_squared(Y_train, Y_train_hat)
print('r_squared of training set is:', train_r_2)

train_dates = convert_numpy_dates_to_panda(dates[train_indexes])
plt.title('Training set')
plt.scatter(train_dates, Y_train)
plt.plot(train_dates, Y_train_hat, color='red')
plt.show()

In [None]:
# Use w from training data to predict values in test data
Y_test_hat = X_test.dot(w)
test_r_2 = get_r_squared(Y_test, Y_test_hat)
print('r_squared of test set is:', test_r_2)

test_dates = convert_numpy_dates_to_panda(dates[test_indexes])
plt.title('Testing set')
plt.scatter(test_dates, Y_test)
plt.plot(test_dates, Y_test_hat, color='red')
plt.show()
