Using a neural network to predict stock prices, using only basic data

In [7]:
%matplotlib inline

from matplotlib import pyplot as plt
import datetime
import pandas_datareader.data as web
import pandas as pd
import numpy as np
import seaborn as sns

from sklearn.preprocessing import StandardScaler
from sklearn.cross_validation import train_test_split
from sklearn.metrics import r2_score

# increase default figure size for matplotlib
from pylab import rcParams
rcParams['figure.figsize'] = 20, 10

In [8]:
# I will implement a forward distribution, to predict values for the future

def create_columns(df, days=7):
    columns = df.columns
    for n in range(1,days):
        for column in columns:
            new_column = "d{}-{}".format(n, column)
            df[new_column] = 0
    return df

def construct_features(df, days=7):
    columns = df.columns
    for n in range(1,days):
        for column in columns: 
            for row in range(df.shape[0]):
                column_to_update = "d{}-{}".format(n, column)
                if row+1 > n:
                    df.ix[row, column_to_update] = df.ix[row-n, column]
                else:
                    df.ix[row, column_to_update] = np.nan

    # drop existing features
    # df = df.drop(columns - ['Adj Close'], axis=1)
    
    #drop NAs
    df = df.dropna()
    
    return df

# train test split, non-randomized
def split(array, test_size):
    return array[:test_size], array[test_size:]

# scale
def scale(X_train, X_test):
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)
    return X_train, X_test

# classify
def train(X_train, y_train, reg=0.1):
    clf = Lasso(alpha=reg)
    clf.fit(X_train, y_train)
    return clf 

# score
def predict(clf, y_test):
    return clf.predict(y_test)


In [9]:
# import data from yahoo finance
start_date = datetime.datetime(2012,1,1)
end_date = datetime.datetime(2016,7,31) 
symbol = "HGTX3.SA"
df_base = web.DataReader(symbol, 'yahoo', start_date, end_date)

#start from scratch
df_base.columns

Index([u'Open', u'High', u'Low', u'Close', u'Volume', u'Adj Close'], dtype='object')

In [10]:
# transform dataframe
df = df_base.drop(['Open', 'High', 'Low', 'Close'], axis=1)
days=7
df = create_columns(df, days=days)
df = construct_features(df, days=days)
df.head()

Unnamed: 0_level_0,Volume,Adj Close,d1-Volume,d1-Adj Close,d2-Volume,d2-Adj Close,d3-Volume,d3-Adj Close,d4-Volume,d4-Adj Close,...,d6-d2-Volume,d6-d2-Adj Close,d6-d3-Volume,d6-d3-Adj Close,d6-d4-Volume,d6-d4-Adj Close,d6-d5-Volume,d6-d5-Adj Close,d6-d6-Volume,d6-d6-Adj Close
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2012-01-19,1845400,32.42,1088600.0,31.62,730700.0,30.73,1893000.0,30.3,1428100.0,30.73,...,1034000.0,28.84,1248800.0,28.36,902900.0,27.75,1322800.0,28.03,1104600.0,27.82
2012-01-20,810900,32.42,1845400.0,32.42,1088600.0,31.62,730700.0,30.73,1893000.0,30.3,...,991900.0,29.67,1034000.0,28.84,1248800.0,28.36,902900.0,27.75,1322800.0,28.03
2012-01-23,1243800,32.38,810900.0,32.42,1845400.0,32.42,1088600.0,31.62,730700.0,30.73,...,2130600.0,30.55,991900.0,29.67,1034000.0,28.84,1248800.0,28.36,902900.0,27.75
2012-01-24,792900,32.47,1243800.0,32.38,810900.0,32.42,1845400.0,32.42,1088600.0,31.62,...,1214300.0,30.73,2130600.0,30.55,991900.0,29.67,1034000.0,28.84,1248800.0,28.36
2012-01-25,0,32.47,792900.0,32.47,1243800.0,32.38,810900.0,32.42,1845400.0,32.42,...,1428100.0,30.73,1214300.0,30.73,2130600.0,30.55,991900.0,29.67,1034000.0,28.84


In [11]:
# get X and y
df_short = df.iloc[:800]
X = df_short.drop(['Adj Close', 'Volume'], axis=1).values
y_price, y_volume = df_short['Adj Close'].values, df_short['Volume'].values

# separate test and train 
test_size = int(X.shape[0]*.9)
X_train, X_test = split(X, test_size)
y_price_train, y_price_test = split(y_price, test_size)
y_volume_train, y_volume_test = split(y_volume, test_size)

# train two classifiers, one for price, one for volume
clf_price = train(X_train, y_price_train, reg=0.2)
clf_volume = train(X_train, y_volume_train, reg=0.2)

X_pred = X_train
y_price_pred = np.array([])
y_volume_pred = np.array([])

for _ in y_price_test:
    # get the features
    x = X_pred[-1]

    # predict 
    price = clf_price.predict(x.reshape(1, -1))
    volume = clf_volume.predict(x.reshape(1, -1))
    
    # append to y values
    y_price_pred = np.append(y_price_pred, price)
    y_volume_pred = np.append(y_volume_pred, price)
    
    # Create a new row, add the predition values, and append to X
    x = X_pred[-1][:-2]
    x = np.append(price, x)
    x = np.append(volume, x)
    X_pred = np.append(X_pred, x.reshape(1, -1), axis=0)
    

NameError: global name 'Lasso' is not defined

In [None]:
# plot
full_pred = pd.DataFrame(np.append(y_price_train, y_price_pred), index=df_short.index)
base = pd.DataFrame(np.append(y_price_train, y_price_test), index=df_short.index)
ax = full_pred.plot(color='red')
base.plot(color='blue', ax=ax)

In [None]:
r2_score(y_price_test, y_price_pred), r2_score(y_volume_test, y_volume_pred)

In [None]:
zip(df.columns.difference(['Adj Close', 'Volume']), clf_price.coef_)

In [None]:
zip(y_price_test, y_price_pred)