#  LR Implementation - Yuanxing

References (to be updated as the work continues):
- https://www.analyticsvidhya.com/blog/2018/10/predicting-stock-price-machine-learningnd-deep-learning-techniques-python/
- https://github.com/SeanPLeary/time-series-h2o-automl-example/blob/master/h2o_automl_example_with_multivariate_time_series.ipynb

In [2]:
# import packages
import pandas as pd
import numpy as np

# to plot within notebook
import matplotlib.pyplot as plt
%matplotlib inline

# setting figure size
from matplotlib.pylab import rcParams
rcParams['figure.figsize'] = 20,10

# for normalizing data (why?)
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler(feature_range=(0, 1))

ModuleNotFoundError: No module named 'pandas'

In [1]:
def read_stock(nasdaq, data_dir='datasets/stocks'):
    """Load the CSV file of one stock into a DataFrame.

    Parameters
    ----------
    nasdaq : str
        Ticker symbol of the stock. It is lower-cased to build the file
        name, so 'AAPL' and 'aapl' read the same file.
    data_dir : str, optional
        Directory containing the ``<symbol>.csv`` files. Defaults to
        ``'datasets/stocks'``, the location that used to be hard-coded.

    Returns
    -------
    pandas.DataFrame
        The file contents as parsed by ``pd.read_csv`` with default options.
    """
    symbol = nasdaq.lower()
    path = '{}/{}.csv'.format(data_dir, symbol)
    return pd.read_csv(path)

In [None]:
# Parse the 'Date' strings (YYYY-MM-DD) and use them as the row index.
df['Date'] = pd.to_datetime(df['Date'], format='%Y-%m-%d')
df.index = df['Date']

# Columns to draw, given by position in the frame.
values = df.values
groups = [1, 2, 3, 4, 6]

# One stacked subplot per selected column, titled with the column name.
plt.figure()
for i, group in enumerate(groups, start=1):
    plt.subplot(len(groups), 1, i)
    plt.plot(values[:, group])
    plt.title(df.columns[group], y=0.5, loc='right')
plt.show()

From the above graph, data before 2010 (or 2015) doesn't seem to be very representative.

In [None]:
# Discard data before year 2015.
df = df[df['Date'] >= '2015-01-01']

# DataFrame.drop returns a new frame and leaves `df` untouched, so the result
# has to be assigned for the 'Date' column to actually be removed. The dates
# remain available through the index, which was set from 'Date' earlier.
df = df.drop('Date', axis=1)
df

## Daily with 1 step and 5 steps, with or without Volume

In [None]:
# Keep the target (Close) and Volume only. Sorting by the index guarantees
# oldest-first row order (a no-op if the data is already chronological), which
# the positive shift below relies on. copy() gives an independent frame to
# add columns to.
df_processed = df[['Close', 'Volume']].sort_index().copy()

num_lags = 5  # number of past observations turned into feature columns
delay = 1  # forecast horizon: 1 means predict the target one step ahead

# Iterate over a fixed list because the loop adds columns to the frame.
for column in ['Close', 'Volume']:
    for lag in range(1, num_lags + 1):
        # A positive shift moves values down, so row t receives the value
        # observed (lag + delay - 1) rows earlier. A negative shift would pull
        # in later rows and leak future prices into the "lag" features.
        df_processed[column + '_lag' + str(lag)] = df_processed[column].shift(lag + delay - 1)

# drop() is not in-place: assign the result. The same-step Volume is not known
# when the prediction is made, so only its lags may be used as features.
df_processed = df_processed.drop('Volume', axis=1)

# The first rows have no complete history and therefore contain NaN lags.
df_processed.dropna(inplace=True)

df_processed

In [None]:
# Chronological 70/30 split on a fresh 0..n-1 index.
df_processed.reset_index(drop=True, inplace=True)
split_at = int(df_processed.shape[0] * 0.7)

# Positional slicing is end-exclusive, so the boundary row lands in the test
# set only. Label-based .loc[:k] and .loc[k:] both include row k, which would
# put the same observation into the training and the test data.
df_train = df_processed.iloc[:split_at]
df_test = df_processed.iloc[split_at:]

In [None]:
# Draw the Close series of the training and test partitions on one axis;
# the legend order matches the plotting order.
for partition in (df_train, df_test):
    plt.plot(partition.index, partition['Close'])

plt.ylabel('Close', fontsize=18)
plt.legend(['train', 'test'])
plt.show()

### Data Splitting

In [None]:
# Column masks are computed once on the training frame and reused for the
# test frame; both frames are slices of df_processed and share its columns.
mask_lag1_with_vol = df_train.columns.str.contains('lag1')  # every '*lag1*' column
mask_lag5 = df_train.columns.str.contains('Close_lag')      # every 'Close_lag*' column

### Train Data ###
# 1 step Close only
x_train_lag1 = df_train[['Close_lag1']]
# 1 step Close and Vol
x_train_lag1_with_vol = df_train.loc[:, mask_lag1_with_vol]
# 5 steps Close only
x_train_lag5 = df_train.loc[:, mask_lag5]
# 5 steps Close and Vol: every column except the target
x_train_lag5_with_vol = df_train.drop(columns='Close')
# target
y_train = df_train['Close']

### Test Data ###
# 1 step Close only
x_test_lag1 = df_test[['Close_lag1']]
# 1 step Close and Vol
x_test_lag1_with_vol = df_test.loc[:, mask_lag1_with_vol]
# 5 steps Close only
x_test_lag5 = df_test.loc[:, mask_lag5]
# 5 steps Close and Vol: every column except the target
x_test_lag5_with_vol = df_test.drop(columns='Close')
# target
y_test = df_test['Close']

In [None]:
# Linear regression on each of the four feature sets, scored by test RMSE.
from sklearn.linear_model import LinearRegression


def _rmse(actual, predicted):
    """Return the root-mean-square error between two equal-length sequences."""
    residuals = np.array(actual) - np.array(predicted)
    return np.sqrt(np.mean(np.power(residuals, 2)))


# One independent estimator per feature set.
model_lag1 = LinearRegression()               # 1 step, Close only
model_lag1_with_vol = LinearRegression()      # 1 step, Close and Vol
model_lag5 = LinearRegression()               # 5 steps, Close only
model_lag5_with_vol = LinearRegression()      # 5 steps, Close and Vol

# Train every model against the same target.
model_lag1.fit(x_train_lag1, y_train)
model_lag1_with_vol.fit(x_train_lag1_with_vol, y_train)
model_lag5.fit(x_train_lag5, y_train)
model_lag5_with_vol.fit(x_train_lag5_with_vol, y_train)

# Predict on the held-out rows.
preds_lag1 = model_lag1.predict(x_test_lag1)
preds_lag1_with_vol = model_lag1_with_vol.predict(x_test_lag1_with_vol)
preds_lag5 = model_lag5.predict(x_test_lag5)
preds_lag5_with_vol = model_lag5_with_vol.predict(x_test_lag5_with_vol)

# Score each prediction vector against the true Close values.
rms_lag1 = _rmse(y_test, preds_lag1)
rms_lag1_with_vol = _rmse(y_test, preds_lag1_with_vol)
rms_lag5 = _rmse(y_test, preds_lag5)
rms_lag5_with_vol = _rmse(y_test, preds_lag5_with_vol)

for label, score in [
    ('rms_lag1', rms_lag1),
    ('rms_lag1_with_vol', rms_lag1_with_vol),
    ('rms_lag5', rms_lag5),
    ('rms_lag5_with_vol', rms_lag5_with_vol),
]:
    print(label + ': ', score)

In [None]:
# 1 step, Close only: overlay the predictions on the true test-set Close.
# The truth is re-indexed from 0 so it lines up with the prediction array.
df_results = pd.DataFrame({
    'ground_truth': df_test['Close'].reset_index(drop=True),
    'predictions_lag1': preds_lag1,
})

for series_name in df_results.columns:
    plt.plot(df_results[series_name])

plt.ylabel('Close', fontsize=18)
plt.legend(list(df_results.columns))
plt.show()

In [None]:
# 1 step, Close and Vol: overlay the predictions on the true test-set Close.
# The truth is re-indexed from 0 so it lines up with the prediction array.
df_results = pd.DataFrame({
    'ground_truth': df_test['Close'].reset_index(drop=True),
    'preds_lag1_with_vol': preds_lag1_with_vol,
})

for series_name in df_results.columns:
    plt.plot(df_results[series_name])

plt.ylabel('Close', fontsize=18)
plt.legend(list(df_results.columns))
plt.show()

In [None]:
# 5 steps, Close only: overlay the predictions on the true test-set Close.
# The truth is re-indexed from 0 so it lines up with the prediction array.
df_results = pd.DataFrame({
    'ground_truth': df_test['Close'].reset_index(drop=True),
    'predictions_lag5': preds_lag5,
})

for series_name in df_results.columns:
    plt.plot(df_results[series_name])

plt.ylabel('Close', fontsize=18)
plt.legend(list(df_results.columns))
plt.show()

In [None]:
# 5 steps, Close and Vol: overlay the predictions on the true test-set Close.
# The truth is re-indexed from 0 so it lines up with the prediction array.
df_results = pd.DataFrame({
    'ground_truth': df_test['Close'].reset_index(drop=True),
    'preds_lag5_with_vol': preds_lag5_with_vol,
})

for series_name in df_results.columns:
    plt.plot(df_results[series_name])

plt.ylabel('Close', fontsize=18)
plt.legend(list(df_results.columns))
plt.show()

## Weekly

In [None]:
# Group the rows into weekly ('W') bins and average every column per bin.
# Resampling relies on the date index that was set from 'Date' earlier.
weekly_bins = df.resample('W')
df_weekly = weekly_bins.mean()
df_weekly

In [None]:
# Weekly average Close over time (the label only shows if a legend is drawn).
weekly_close = df_weekly['Close']
plt.plot(weekly_close, label='Close Price Weekly')

In [None]:
# Weekly pipeline: build lag features, split chronologically, fit the four
# linear-regression variants and report their RMSE on the test split.
from sklearn.linear_model import LinearRegression


def _rmse(actual, predicted):
    """Return the root-mean-square error between two equal-length sequences."""
    residuals = np.array(actual) - np.array(predicted)
    return np.sqrt(np.mean(np.power(residuals, 2)))


# Keep the target (Close) and Volume only. df_weekly comes from resample(),
# whose bins are in chronological order, so a positive shift looks back in time.
df_processed = df_weekly[['Close', 'Volume']].copy()

num_lags = 5  # number of past observations turned into feature columns
delay = 1  # forecast horizon: 1 means predict the target one step ahead

# Iterate over a fixed list because the loop adds columns to the frame.
for column in ['Close', 'Volume']:
    for lag in range(1, num_lags + 1):
        # A positive shift moves values down, so row t receives the value
        # observed (lag + delay - 1) rows earlier. A negative shift would pull
        # in later rows and leak future prices into the "lag" features.
        df_processed[column + '_lag' + str(lag)] = df_processed[column].shift(lag + delay - 1)

# drop() is not in-place: assign the result. The same-step Volume is not known
# when the prediction is made, so only its lags may be used as features.
df_processed = df_processed.drop('Volume', axis=1)
df_processed.dropna(inplace=True)

# Chronological 70/30 split. Positional slicing is end-exclusive, so the
# boundary row lands in the test set only; label-based .loc[:k] / .loc[k:]
# would include row k in both partitions.
df_processed.reset_index(drop=True, inplace=True)
split_at = int(df_processed.shape[0] * 0.7)
df_train = df_processed.iloc[:split_at]
df_test = df_processed.iloc[split_at:]

# Column masks are computed on the training frame and reused for the test
# frame, which has the same columns.
mask_lag1_with_vol = df_train.columns.str.contains('lag1')
mask_lag5 = df_train.columns.str.contains('Close_lag')

### Train Data ###
# 1 step Close only
x_train_lag1 = df_train[['Close_lag1']]
# 1 step Close and Vol
x_train_lag1_with_vol = df_train[df_train.columns[mask_lag1_with_vol]]
# 5 steps Close only
x_train_lag5 = df_train[df_train.columns[mask_lag5]]
# 5 steps Close and Vol: every column except the target
x_train_lag5_with_vol = df_train.drop('Close', axis=1)
y_train = df_train['Close']

### Test Data ###
# 1 step Close only
x_test_lag1 = df_test[['Close_lag1']]
# 1 step Close and Vol
x_test_lag1_with_vol = df_test[df_test.columns[mask_lag1_with_vol]]
# 5 steps Close only
x_test_lag5 = df_test[df_test.columns[mask_lag5]]
# 5 steps Close and Vol: every column except the target
x_test_lag5_with_vol = df_test.drop('Close', axis=1)
y_test = df_test['Close']

# 1 step Close only
model_lag1 = LinearRegression()
model_lag1.fit(x_train_lag1, y_train)

# 1 step Close and Vol
model_lag1_with_vol = LinearRegression()
model_lag1_with_vol.fit(x_train_lag1_with_vol, y_train)

# 5 steps Close only
model_lag5 = LinearRegression()
model_lag5.fit(x_train_lag5, y_train)

# 5 steps Close and Vol
model_lag5_with_vol = LinearRegression()
model_lag5_with_vol.fit(x_train_lag5_with_vol, y_train)

# make predictions and find the rmse
preds_lag1 = model_lag1.predict(x_test_lag1)
preds_lag1_with_vol = model_lag1_with_vol.predict(x_test_lag1_with_vol)
preds_lag5 = model_lag5.predict(x_test_lag5)
preds_lag5_with_vol = model_lag5_with_vol.predict(x_test_lag5_with_vol)

rms_lag1 = _rmse(y_test, preds_lag1)
rms_lag1_with_vol = _rmse(y_test, preds_lag1_with_vol)
rms_lag5 = _rmse(y_test, preds_lag5)
rms_lag5_with_vol = _rmse(y_test, preds_lag5_with_vol)

print('rms_lag1: ', rms_lag1)
print('rms_lag1_with_vol: ', rms_lag1_with_vol)
print('rms_lag5: ', rms_lag5)
print('rms_lag5_with_vol: ', rms_lag5_with_vol)

In [None]:
# One figure per model, in this order: 1 step Close only, 1 step Close and
# Vol, 5 steps Close only, 5 steps Close and Vol. Each pair is the result
# column name (also used as legend entry) and the matching prediction array.
prediction_sets = [
    ('predictions_lag1', preds_lag1),
    ('preds_lag1_with_vol', preds_lag1_with_vol),
    ('predictions_lag5', preds_lag5),
    ('preds_lag5_with_vol', preds_lag5_with_vol),
]

for column_name, predictions in prediction_sets:
    # The truth is re-indexed from 0 so it lines up with the prediction array.
    df_results = pd.DataFrame()
    df_results['ground_truth'] = df_test['Close'].reset_index(drop=True)
    df_results[column_name] = predictions

    plt.plot(df_results['ground_truth'])
    plt.plot(df_results[column_name])

    plt.ylabel('Close', fontsize=18)
    plt.legend(['ground_truth', column_name])
    plt.show()

## Monthly

In [None]:
# Group the rows into monthly ('M') bins and average every column per bin.
# Resampling relies on the date index that was set from 'Date' earlier.
# NOTE(review): recent pandas releases deprecate the 'M' alias in favour of
# 'ME' - confirm against the pandas version this notebook runs on.
monthly_bins = df.resample('M')
df_monthly = monthly_bins.mean()
df_monthly

In [None]:
# Monthly average Close over time (the label only shows if a legend is drawn).
monthly_close = df_monthly['Close']
plt.plot(monthly_close, label='Close Price Monthly')

In [None]:
# Monthly pipeline: build lag features, split chronologically, fit the four
# linear-regression variants and report their RMSE on the test split.
from sklearn.linear_model import LinearRegression


def _rmse(actual, predicted):
    """Return the root-mean-square error between two equal-length sequences."""
    residuals = np.array(actual) - np.array(predicted)
    return np.sqrt(np.mean(np.power(residuals, 2)))


# Keep the target (Close) and Volume only. df_monthly comes from resample(),
# whose bins are in chronological order, so a positive shift looks back in time.
df_processed = df_monthly[['Close', 'Volume']].copy()

num_lags = 5  # number of past observations turned into feature columns
delay = 1  # forecast horizon: 1 means predict the target one step ahead

# Iterate over a fixed list because the loop adds columns to the frame.
for column in ['Close', 'Volume']:
    for lag in range(1, num_lags + 1):
        # A positive shift moves values down, so row t receives the value
        # observed (lag + delay - 1) rows earlier. A negative shift would pull
        # in later rows and leak future prices into the "lag" features.
        df_processed[column + '_lag' + str(lag)] = df_processed[column].shift(lag + delay - 1)

# drop() is not in-place: assign the result. The same-step Volume is not known
# when the prediction is made, so only its lags may be used as features.
df_processed = df_processed.drop('Volume', axis=1)
df_processed.dropna(inplace=True)

# Chronological 70/30 split. Positional slicing is end-exclusive, so the
# boundary row lands in the test set only; label-based .loc[:k] / .loc[k:]
# would include row k in both partitions.
df_processed.reset_index(drop=True, inplace=True)
split_at = int(df_processed.shape[0] * 0.7)
df_train = df_processed.iloc[:split_at]
df_test = df_processed.iloc[split_at:]

# Column masks are computed on the training frame and reused for the test
# frame, which has the same columns.
mask_lag1_with_vol = df_train.columns.str.contains('lag1')
mask_lag5 = df_train.columns.str.contains('Close_lag')

### Train Data ###
# 1 step Close only
x_train_lag1 = df_train[['Close_lag1']]
# 1 step Close and Vol
x_train_lag1_with_vol = df_train[df_train.columns[mask_lag1_with_vol]]
# 5 steps Close only
x_train_lag5 = df_train[df_train.columns[mask_lag5]]
# 5 steps Close and Vol: every column except the target
x_train_lag5_with_vol = df_train.drop('Close', axis=1)
y_train = df_train['Close']

### Test Data ###
# 1 step Close only
x_test_lag1 = df_test[['Close_lag1']]
# 1 step Close and Vol
x_test_lag1_with_vol = df_test[df_test.columns[mask_lag1_with_vol]]
# 5 steps Close only
x_test_lag5 = df_test[df_test.columns[mask_lag5]]
# 5 steps Close and Vol: every column except the target
x_test_lag5_with_vol = df_test.drop('Close', axis=1)
y_test = df_test['Close']

# 1 step Close only
model_lag1 = LinearRegression()
model_lag1.fit(x_train_lag1, y_train)

# 1 step Close and Vol
model_lag1_with_vol = LinearRegression()
model_lag1_with_vol.fit(x_train_lag1_with_vol, y_train)

# 5 steps Close only
model_lag5 = LinearRegression()
model_lag5.fit(x_train_lag5, y_train)

# 5 steps Close and Vol
model_lag5_with_vol = LinearRegression()
model_lag5_with_vol.fit(x_train_lag5_with_vol, y_train)

# make predictions and find the rmse
preds_lag1 = model_lag1.predict(x_test_lag1)
preds_lag1_with_vol = model_lag1_with_vol.predict(x_test_lag1_with_vol)
preds_lag5 = model_lag5.predict(x_test_lag5)
preds_lag5_with_vol = model_lag5_with_vol.predict(x_test_lag5_with_vol)

rms_lag1 = _rmse(y_test, preds_lag1)
rms_lag1_with_vol = _rmse(y_test, preds_lag1_with_vol)
rms_lag5 = _rmse(y_test, preds_lag5)
rms_lag5_with_vol = _rmse(y_test, preds_lag5_with_vol)

print('rms_lag1: ', rms_lag1)
print('rms_lag1_with_vol: ', rms_lag1_with_vol)
print('rms_lag5: ', rms_lag5)
print('rms_lag5_with_vol: ', rms_lag5_with_vol)

In [None]:
# One figure per model, in this order: 1 step Close only, 1 step Close and
# Vol, 5 steps Close only, 5 steps Close and Vol. Each pair is the result
# column name (also used as legend entry) and the matching prediction array.
prediction_sets = [
    ('predictions_lag1', preds_lag1),
    ('preds_lag1_with_vol', preds_lag1_with_vol),
    ('predictions_lag5', preds_lag5),
    ('preds_lag5_with_vol', preds_lag5_with_vol),
]

for column_name, predictions in prediction_sets:
    # The truth is re-indexed from 0 so it lines up with the prediction array.
    df_results = pd.DataFrame()
    df_results['ground_truth'] = df_test['Close'].reset_index(drop=True)
    df_results[column_name] = predictions

    plt.plot(df_results['ground_truth'])
    plt.plot(df_results[column_name])

    plt.ylabel('Close', fontsize=18)
    plt.legend(['ground_truth', column_name])
    plt.show()