Using a neural network to predict stock prices, using only basic data

In [11]:
%matplotlib inline

from matplotlib import pyplot as plt
import datetime
import pandas_datareader.data as web
import pandas as pd
import numpy as np
import seaborn as sns

from sklearn.preprocessing import StandardScaler
from sklearn.cross_validation import train_test_split
from sklearn.metrics import r2_score

# increase default figure size for matplotlib
from pylab import rcParams
rcParams['figure.figsize'] = 20, 10

from collections import defaultdict



In [115]:
start_date = datetime.datetime(2012,1,1)
end_date = datetime.datetime(2016,8,31) 
symbol = "PETR4.SA"
df = web.DataReader(symbol, 'yahoo', start_date, end_date)

In [116]:
# df.ix[:,'Adj Close'].plot()

In [117]:
# first I have to add a signal, either up or down, to each row
df['Delta'] = (df['Adj Close'] / df['Adj Close'].shift(1))-1
df['Signal'] = df['Delta']>0
df = df.dropna()
# df.head()

In [118]:
# done. now I need to do backward filling, then I'm ready to predict
backward = 30
for column in ['Delta', 'Volume']:
    for i in range(1,backward+1):
        new_column = "{} -d{}".format(column, i)
        for row in range(backward, df.shape[0]):
            df.ix[row, new_column] = df.ix[row-i, column]

In [119]:
df = df.dropna()
len(df.columns)

68

In [120]:
# deprecated: switch to 2 labels - buy or don't  buy
# look back for features, ahead for labeling
forward = 10
# boundaries
soft = .05

for row in range(df.shape[0]-forward):
    # first - construct forward 
    count_signals = 0
    max_uptick = 0
    min_downtick = 0 
    
    for i in range(1,forward+1):
        # capture signals
        count_signals += df.ix[row+i, 'Signal']
        delta = (df.ix[row+i, 'Adj Close'] / df.ix[row, 'Adj Close'])-1
        if delta > max_uptick:
            max_uptick = delta
        if delta < min_downtick:
            min_downtick = delta
        
        # convert to label
        signals = count_signals*1.0/forward
    
    # up
    if max_uptick >= soft and min_downtick <= -soft:
        df.ix[row,'Label'] = 1
    else:
        df.ix[row,'Label'] = 0                        
            

In [121]:
df['Label'].value_counts(normalize=True, sort=False)

0.0    0.906278
1.0    0.084881
Name: Label, dtype: float64

In [122]:
df = df.dropna()

In [123]:
# now to classification
# it is far from a minimal necessary
# but I will see what values can I get
# start with a simple tree


In [124]:
X = df.drop('Label', axis=1)
y = df['Label']
X_train, X_test, y_train, y_test =  train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)
y_pred_random = np.random.random_integers(low=0, high=1, size=len(y_test))

In [130]:
from sklearn.tree import DecisionTreeClassifier as Tree
from sklearn.ensemble import GradientBoostingClassifier as GBC
from sklearn.cross_validation import train_test_split
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier as NN
from sklearn.neighbors import KNeighborsClassifier as kNN
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

np.random.seed(33)
classifiers = [GBC(), Tree(), SVC(), GaussianNB(), NN(), kNN()]
# print accuracy_score(y_test, y_pred_random), f1_score(y_test, y_pred_random)
for clf in classifiers:    
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    print clf.__class__
    print "{:.2f}".format(precision_score(y_test, y_pred))
    print confusion_matrix(y_test, y_pred)

<class 'sklearn.ensemble.gradient_boosting.GradientBoostingClassifier'>
0.33
[[304   4]
 [ 27   2]]
<class 'sklearn.tree.tree.DecisionTreeClassifier'>
0.22
[[283  25]
 [ 22   7]]
<class 'sklearn.svm.classes.SVC'>
0.00
[[308   0]
 [ 29   0]]
<class 'sklearn.naive_bayes.GaussianNB'>
0.21
[[275  33]
 [ 20   9]]
<class 'sklearn.neural_network.multilayer_perceptron.MLPClassifier'>
0.10
[[270  38]
 [ 25   4]]
<class 'sklearn.neighbors.classification.KNeighborsClassifier'>
0.00
[[302   6]
 [ 29   0]]


In [126]:
# kNN and Gradient Boosting are good performers
# what happens if I follow this straight
# I will buy it. sell it when it values .05
# I will be right 60% of the time
# so I lose on 40%, win on 60%
# when I win, I win 0.05. when I lose, I loose 0.05
# it can't variate more than .05 down as well

In [131]:
# new challenge
# implement a qlearning algorithm
# in which the goal is to maximize the strategy parameters
# the strategy parameters are:
# how much days I wait to sell
# how  much up it will get
# how much down it will get

# implement an agent
# he will set a strategy
# that will be defined by trading period
# profit_realization and stop_loss parameters
# he will then train an algorithm
# and test its precision
# if it is greater than the minimum requirement, done, he will apply
# if not, he will update the parameters and try again

# he doesn't need to apply the strategy to see if it's effective
# he only needs to see the precision and he will know the outcome

# how does reward works?
# he know beforehand how much he is expect to lose if he follow the strategy

# what are the states?
# the state is a combination of these 3 parameters
# it is not a RL problem
# but I can solve with GA

class Strategy():
    
    def __init__(self, span = 10, stop_loss = .5, profit_margin = .5):
        self.span = span
        self.stop_loss = stop_loss
        self.profit_margin = profit_margin
    

class 
    
            

In [127]:
# deprecated: switch to 2 labels - buy or don't  buy
# look back for features, ahead for labeling
forward = 10
# boundaries
soft = df['Delta'].std()
hard = soft*2                     

for row in range(df.shape[0]-forward):
    # first - construct forward 
    count_signals = 0
    max_uptick = 0
    min_downtick = 0 
    
    for i in range(1,forward+1):
        # capture signals
        count_signals += df.ix[row+i, 'Signal']
        delta = (df.ix[row+i, 'Adj Close'] / df.ix[row, 'Adj Close'])-1
        if delta > max_uptick:
            max_uptick = delta
        if delta < min_downtick:
            min_downtick = delta
        
        # convert to label
        signals = count_signals*1.0/forward

    # test
    if row%123 == 0:
        print "count_signals: {}, signals: {:.1f}, max_uptick: {:.3f}, min_downtick: {:.3f}".format(
            count_signals, signals, max_uptick, min_downtick)
    
    # up
    if signals >= .8 and max_uptick >= hard:
        df.ix[row,'Label'] = 5
    elif (signals >= .7 and max_uptick >= soft) or max_uptick >= hard or signals >=.8:
        df.ix[row,'Label'] = 4
    # down
    elif signals <= .2 and min_downtick <= -hard:
        df.ix[row,'Label'] = 1
    elif (signals <= .3 and min_downtick <= -soft) or min_downtick <= -hard or signals <=.2:
        df.ix[row,'Label'] = 2
    # neutral
    else:
        df.ix[row,'Label'] = 3                        
            

count_signals: 4, signals: 0.4, max_uptick: 0.052, min_downtick: -0.012
count_signals: 3, signals: 0.3, max_uptick: 0.004, min_downtick: -0.040
count_signals: 7, signals: 0.7, max_uptick: 0.167, min_downtick: -0.007
count_signals: 6, signals: 0.6, max_uptick: 0.085, min_downtick: -0.007
count_signals: 4, signals: 0.4, max_uptick: 0.026, min_downtick: -0.066
count_signals: 5, signals: 0.5, max_uptick: 0.077, min_downtick: -0.081
count_signals: 4, signals: 0.4, max_uptick: 0.036, min_downtick: -0.083
count_signals: 6, signals: 0.6, max_uptick: 0.053, min_downtick: -0.111
count_signals: 4, signals: 0.4, max_uptick: 0.110, min_downtick: -0.068
count_signals: 5, signals: 0.5, max_uptick: 0.019, min_downtick: -0.052
