Using a neural network to predict stock prices, using only basic data

In [11]:
%matplotlib inline

from matplotlib import pyplot as plt
import datetime
import pandas_datareader.data as web
import pandas as pd
import numpy as np
import seaborn as sns

from sklearn.preprocessing import StandardScaler
from sklearn.cross_validation import train_test_split
from sklearn.metrics import r2_score

# increase default figure size for matplotlib
from pylab import rcParams
rcParams['figure.figsize'] = 20, 10

from collections import defaultdict



In [54]:
# Fetch the quote history for a single ticker from the 'yahoo' data source,
# restricted to a fixed date window.
symbol = "PETR4.SA"
start_date = datetime.datetime(year=2014, month=1, day=1)
end_date = datetime.datetime(year=2016, month=8, day=31)
df = web.DataReader(name=symbol, data_source='yahoo',
                    start=start_date, end=end_date)

In [55]:
# df.ix[:,'Adj Close'].plot()

In [56]:
# Tag every row with its one-row relative change in adjusted close ('Delta')
# and a boolean 'Signal' that is True when that change is positive.
adj_close = df['Adj Close']
previous_close = adj_close.shift(1)
df['Delta'] = adj_close.div(previous_close).sub(1)
df['Signal'] = df['Delta'].gt(0)

# The first row has no predecessor, so its Delta is NaN; drop incomplete rows.
df = df.dropna()

In [57]:
# Build the look-back features: for each of 'Delta' and 'Volume', add one
# column per lag i = 1..backward named "<column> -d<i>" holding the value
# from i rows earlier.
#
# Series.shift replaces the original per-cell writes through the deprecated
# `.ix` indexer (one scalar assignment per row/lag/column).
backward = 30
for column in ['Delta', 'Volume']:
    for i in range(1, backward+1):
        new_column = "{} -d{}".format(column, i)
        lagged = df[column].shift(i)
        # As before, rows without a full `backward`-row history stay NaN in
        # every lag column so a later dropna() removes them as a group.
        lagged.iloc[:backward] = np.nan
        df[new_column] = lagged

In [58]:
# Discard every row that still contains a missing value, then show how many
# columns the frame has.
df = df.dropna(axis=0, how='any')
df.shape[1]

68

In [59]:
# Two-label scheme (buy / don't buy): look back for features, ahead for labels.
# A row is labelled 1 when, within the next `forward` rows, the adjusted close
# rises at least `soft` (as a fraction) above the current row's adjusted close;
# otherwise it is labelled 0.
forward = 5
# boundary: minimum relative gain that counts as "buy"
soft = .05

# Work on a plain array instead of per-cell `.ix` lookups (`.ix` is deprecated).
# Assumes 'Adj Close' holds no NaN here (df.dropna() was applied earlier).
close = df['Adj Close'].values
n_rows = len(close)

# Start from NaN so the final `forward` rows, which lack a complete look-ahead
# window, never keep a stale label from a previous run of this cell.
labels = np.full(n_rows, np.nan)

for row in range(n_rows - forward):
    future_returns = close[row + 1:row + forward + 1] / close[row] - 1
    # best gain reachable in the window, floored at 0 (no gain at all)
    max_uptick = max(0, future_returns.max())
    labels[row] = 1 if max_uptick >= soft else 0

df['Label'] = labels
            

In [60]:
# Relative frequency of each label value, in unsorted order.
df['Label'].value_counts(sort=False, normalize=True)

0.0    0.612056
1.0    0.380216
Name: Label, dtype: float64

In [61]:
# Remove rows with any missing value, e.g. trailing rows that the labelling
# loop did not assign a 'Label' to.
df = df.dropna(axis=0, how='any')

In [62]:
# now on to classification
# this is far from the bare minimum needed,
# but I want to see what scores I can get
# start with a simple tree


In [71]:
# Features are every column except the target; the target is 'Label'.
X = df.drop('Label', axis=1)
y = df['Label']

# Hold out 20% of the rows for testing; random_state fixes the split.
# NOTE(review): this is a random split of time-ordered rows whose labels look
# ahead and whose features look back, so neighbouring train/test rows overlap.
# Consider a chronological split - confirm whether that matters here.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42)

# Coin-flip baseline (0 or 1 with equal probability) to compare models against.
# A dedicated seeded generator makes the baseline reproducible without touching
# the global NumPy random state; randint(0, 2) replaces the deprecated
# np.random.random_integers(low=0, high=1).
baseline_rng = np.random.RandomState(42)
y_pred_random = baseline_rng.randint(0, 2, size=len(y_test))

In [86]:
from sklearn.tree import DecisionTreeClassifier as Tree
from sklearn.ensemble import GradientBoostingClassifier as GBC
# model_selection supersedes the deprecated cross_validation module and is
# available in every scikit-learn release that ships MLPClassifier (used below).
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier as NN
from sklearn.neighbors import KNeighborsClassifier as kNN
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

# Fit a handful of off-the-shelf classifiers with default hyper-parameters and
# print accuracy and F1 on the held-out set, after the random baseline.
# The estimators that draw random numbers get a fixed seed so that re-running
# the cell reproduces the same scores.
# NOTE(review): the features are fed in unscaled although StandardScaler is
# imported at the top of the notebook; SVC's recorded F1 of 0.0 suggests it
# predicts a single class - consider scaling for SVC / MLP / kNN.
seed = 33
classifiers = [GBC(random_state=seed), Tree(random_state=seed), SVC(),
               GaussianNB(), NN(random_state=seed), kNN()]

# print(...) with a single pre-formatted string behaves the same on Python 2
# and Python 3 (the bare print statement is a syntax error on Python 3).
print("{} {}".format(accuracy_score(y_test, y_pred_random),
                     f1_score(y_test, y_pred_random)))
for clf in classifiers:
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    print(clf.__class__)
    print("{} {}".format(accuracy_score(y_test, y_pred),
                         f1_score(y_test, y_pred)))

0.472868217054 0.392857142857
<class 'sklearn.ensemble.gradient_boosting.GradientBoostingClassifier'>
0.604651162791 0.354430379747
<class 'sklearn.tree.tree.DecisionTreeClassifier'>
0.550387596899 0.452830188679
<class 'sklearn.svm.classes.SVC'>
0.635658914729 0.0
<class 'sklearn.naive_bayes.GaussianNB'>
0.503875968992 0.428571428571
<class 'sklearn.neural_network.multilayer_perceptron.MLPClassifier'>
0.527131782946 0.371134020619
<class 'sklearn.neighbors.classification.KNeighborsClassifier'>
0.604651162791 0.4


In [None]:
# kNN and Gradient Boosting are good performers
# what happens if I follow their signal directly?
# I will buy, and sell once the price has gained .05
# I will be right 60% of the time
# so I lose on 40%, win on 60%
# when I win, I win 0.05. when I lose, I lose 0.05
# (assuming it can't move down by more than .05 either)

In [17]:
# Deprecated five-class labelling (the notebook switched to 2 labels - buy or
# don't buy). Look back for features, ahead for labelling.
# Labels: 5 = strong up, 4 = up, 3 = neutral, 2 = down, 1 = strong down.
forward = 10
# boundaries: one and two standard deviations of the one-row change
soft = df['Delta'].std()
hard = soft*2

# Plain arrays replace the per-cell `.ix` lookups (`.ix` is deprecated).
# Assumes 'Adj Close' holds no NaN here (df.dropna() was applied earlier).
close = df['Adj Close'].values
went_up = df['Signal'].values.astype(bool)
n_rows = len(close)

# Start from NaN so the final `forward` rows, which lack a complete look-ahead
# window, do not keep labels written by another labelling scheme.
labels = np.full(n_rows, np.nan)

for row in range(n_rows - forward):
    window = slice(row + 1, row + forward + 1)

    # share of rows in the look-ahead window whose 'Signal' is True
    count_signals = int(went_up[window].sum())
    signals = count_signals*1.0/forward

    # largest gain / largest loss relative to the current row, floored and
    # capped at 0 respectively
    future_returns = close[window] / close[row] - 1
    max_uptick = max(0, future_returns.max())
    min_downtick = min(0, future_returns.min())

    # spot-check a few rows
    if row%123 == 0:
        print("count_signals: {}, signals: {:.1f}, max_uptick: {:.3f}, min_downtick: {:.3f}".format(
            count_signals, signals, max_uptick, min_downtick))

    # up
    if signals >= .8 and max_uptick >= hard:
        labels[row] = 5
    elif (signals >= .7 and max_uptick >= soft) or max_uptick >= hard or signals >=.8:
        labels[row] = 4
    # down
    elif signals <= .2 and min_downtick <= -hard:
        labels[row] = 1
    elif (signals <= .3 and min_downtick <= -soft) or min_downtick <= -hard or signals <=.2:
        labels[row] = 2
    # neutral
    else:
        labels[row] = 3

df['Label'] = labels
            

count_signals: 3, signals: 0.3, max_uptick: 0.000, min_downtick: -0.068
count_signals: 6, signals: 0.6, max_uptick: 0.168, min_downtick: 0.000
count_signals: 4, signals: 0.4, max_uptick: 0.065, min_downtick: -0.024
count_signals: 1, signals: 0.1, max_uptick: 0.000, min_downtick: -0.220
count_signals: 3, signals: 0.3, max_uptick: 0.052, min_downtick: -0.080
count_signals: 6, signals: 0.6, max_uptick: 0.021, min_downtick: -0.045
