### `ml-tinkering` 1: Linear Regression
##### Using 20 years of stock price data, we fit a linear regression model to predict the close price of AMZN roughly 50 days into the future.

In [53]:
import pandas as pd
import numpy as np
import quandl
import math
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

In [54]:
df = quandl.get('WIKI/AMZN')
df.head()

Unnamed: 0_level_0,Open,High,Low,Close,Volume,Ex-Dividend,Split Ratio,Adj. Open,Adj. High,Adj. Low,Adj. Close,Adj. Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
1997-05-16,22.38,23.75,20.5,20.75,1225000.0,0.0,1.0,1.865,1.979167,1.708333,1.729167,14700000.0
1997-05-19,20.5,21.25,19.5,20.5,508900.0,0.0,1.0,1.708333,1.770833,1.625,1.708333,6106800.0
1997-05-20,20.75,21.0,19.63,19.63,455600.0,0.0,1.0,1.729167,1.75,1.635833,1.635833,5467200.0
1997-05-21,19.25,19.75,16.5,17.13,1571100.0,0.0,1.0,1.604167,1.645833,1.375,1.4275,18853200.0
1997-05-22,17.25,17.38,15.75,16.75,981400.0,0.0,1.0,1.4375,1.448333,1.3125,1.395833,11776800.0


Do some simple data processing. Calculate the high/low percentage difference and the open/close percentage change.

In [55]:
# columns we want to keep
columns = {
    'Adj. Low':    'adj_lo',
    'Adj. High':   'adj_hi',
    'Adj. Open':   'adj_open',
    'Adj. Close':  'adj_close',
    'Adj. Volume': 'adj_vol',
}

# drop and rename columns
df = df[columns.keys()].rename(columns=columns)

# add two columns: high/low percentage difference and open/close percentage change
df['hl_pct'] = (df['adj_hi'] - df['adj_close']) / df['adj_close'] * 100
df['pct_change'] = (df['adj_close'] - df['adj_open']) / df['adj_open'] * 100

In [56]:
df.head()

Unnamed: 0_level_0,adj_lo,adj_hi,adj_open,adj_close,adj_vol,hl_pct,pct_change
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1997-05-16,1.708333,1.979167,1.865,1.729167,14700000.0,14.457831,-7.283289
1997-05-19,1.625,1.770833,1.708333,1.708333,6106800.0,3.658537,0.0
1997-05-20,1.635833,1.75,1.729167,1.635833,5467200.0,6.979114,-5.39759
1997-05-21,1.375,1.645833,1.604167,1.4275,18853200.0,15.294804,-11.012987
1997-05-22,1.3125,1.448333,1.4375,1.395833,11776800.0,3.761194,-2.898551


What we want to predict (i.e. our label):

In [57]:
forecast_col = 'adj_close'

Fill missing values with garbage:

In [58]:
# fill missing values with unary inverse of 99999
df.fillna(~99999, inplace=True)

How many days out we will be forecasting:

In [59]:
forecast_out = int(math.ceil(0.01 * len(df)))
forecast_out

53

In [60]:
df['label'] = df[forecast_col].shift(-forecast_out)
df.dropna(inplace=True)
df.head()

Unnamed: 0_level_0,adj_lo,adj_hi,adj_open,adj_close,adj_vol,hl_pct,pct_change,label
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1997-05-16,1.708333,1.979167,1.865,1.729167,14700000.0,14.457831,-7.283289,2.416667
1997-05-19,1.625,1.770833,1.708333,1.708333,6106800.0,3.658537,0.0,2.3125
1997-05-20,1.635833,1.75,1.729167,1.635833,5467200.0,6.979114,-5.39759,2.208333
1997-05-21,1.375,1.645833,1.604167,1.4275,18853200.0,15.294804,-11.012987,2.25
1997-05-22,1.3125,1.448333,1.4375,1.395833,11776800.0,3.761194,-2.898551,2.1775


Configure features and labels:

In [61]:
X = np.array(df.drop(['label'],1))
y = np.array(df['label'])
X = preprocessing.scale(X)

Configure testing and training set (we will use 20% of the data as the 'test' set)

In [62]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2
)

Compile and run the model:

In [63]:
clf = LinearRegression()
clf.fit(X_train, y_train)
accuracy = clf.score(X_test, y_test)
accuracy

0.9771685944875638