In [1]:
import numpy  as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import read_quote      as rq
import stock_functions as sf
import df_visualizations as dv
import remap_values as rv

import sys
import time

In [2]:
roll_nums = [3,5,7,10,12,15,18,20,22,25,28,30] # Generate an algorithm for predicting every few days
mom_nums  = [3,5,10,20,30]                     # Momentum has many good tracers
rsi_nums  = [10,15]                            # Good for some long term trends
band_nums = [5,10,15,20,25]                    # A few trace different areas well

In [3]:
inpFileList = ['aapl','ba','farm','hes','ibm','mas','sbux','tgt']

In [4]:
# Loop over inpFileList, so multiple stocks being trained
#for inpFile in inpFileList:

In [5]:
fileName = 'quotes/' + inpFileList[0] + '.csv'

print 'Reading quote: ', fileName

my_quote = rq.readQuote( fileName )

print 'Done.\n'


##################################################
############Generate Variables####################
##################################################

print 'Generating variables...'

# The variables we will use for the training data
diffs = sf.generate_differentials   ( my_quote            ).drop('diff_v',axis=1)
moms  = sf.generate_momentum_close  ( my_quote, mom_nums  )
rsis  = sf.generate_rsi             ( my_quote, rsi_nums  )
bands = sf.generate_bollinger_bands ( my_quote, band_nums )


print 'Done.\n'

##################################################
#############Normalize Variables##################
##################################################

print 'Normalizing variables...'

# Differentials in a day can be smart scaled
diffs['diff_hl'] = np.log10( diffs['diff_hl'] )
for col in diffs.columns:
    diffs[col] = rv.smart_scale( diffs, col, show_plot=False )

# Momentums can also be smart scaled
for col in moms.columns:
    moms[col] = rv.smart_scale( moms, col, show_plot=False )
    
# RSIs have natural distribution centered at 0.5, scale accordingly
for col in rsis.columns:
    rsis[col] = ( rsis[col] - 0.5 ) / rsis[col].std()
    
# Bands also centered at 0.5, use stdev of bollinger band of 0.25 for scaling
for col in bands.columns:
    bands[col] = ( bands[col] - 0.5 ) / 0.5
    

print 'Done.\n'

var_df_list = [ diffs, moms, rsis, bands ]
all_train_variables = reduce( lambda left,right: left.join(right,how='inner'), var_df_list )


##################################################
#############Generate Target Variables############
##################################################



# The target variables are stored in the data frame
rolls = sf.generate_rolling_close   ( my_quote, roll_nums, onlyMean=True )
for i in roll_nums:
    rolls['close_mean_'+str(i)] = ( rolls['close_mean_'+str(i)].shift(i) / rolls['close_mean_'+str(i)] - 1 )
rolls = rolls.replace( [np.inf, -np.inf], np.nan )


# The predicted value column heads
target_list = rolls.columns.values


# Loop over target list and train multiple machine learning methods on it:
#for target_col in target_list:

Reading quote:  quotes/aapl.csv
Done.

Generating variables...
Done.

Normalizing variables...
Done.



In [49]:
target_col = target_list[-1]

big_df = rolls[target_col].dropna().to_frame().join( all_train_variables, how='inner' )
big_df = big_df[::-1]

target_values   = big_df[target_col].values
variable_values = big_df.drop( target_col, axis=1 ).values

In [50]:
big_df.head()

Unnamed: 0_level_0,close_mean_30,diff_co,diff_hl,momentum_3,momentum_5,momentum_10,momentum_20,momentum_30,rsi_10,rsi_15,bollinger_5,bollinger_10,bollinger_15,bollinger_20,bollinger_25
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
2007/04/23,0.171045,1.750816,0.574995,1.12812,0.400426,-0.299621,-0.351259,-0.385663,-0.031876,0.125007,0.87614,0.649741,0.304775,0.169085,0.185976
2007/04/24,0.180585,-0.677285,1.962264,1.071768,0.649879,-0.466712,-0.699728,0.014442,-0.237591,-0.088544,0.49608,0.674522,0.236426,0.140652,0.082452
2007/04/25,0.187966,0.975853,-0.228525,1.643102,1.264052,0.267156,-0.364984,0.452949,0.620872,0.16694,0.652888,1.018127,0.799388,0.787826,0.662723
2007/04/26,0.193398,-2.311491,1.452117,1.976169,2.350031,1.034394,0.42157,0.662458,1.154375,0.733335,0.758913,1.124311,1.260867,1.361793,1.360417
2007/04/27,0.196227,1.471979,0.346209,2.525957,2.44315,1.671056,0.495442,0.847363,1.796068,0.810014,0.611901,0.91283,1.130042,1.251044,1.314632


In [45]:
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import ElasticNet
from sklearn.neural_network import MLPRegressor

In [44]:
reload(rv)

<module 'remap_values' from 'remap_values.py'>

In [51]:
my_grid = {'n_estimators':[100,150],
           'min_samples_split':[2,5,10]}
rv.optimize_timeseries_reg( RandomForestRegressor(), variable_values, target_values, my_grid )

Fold  1 accuracy: 0.3860  ,  -0.9372  {'min_samples_split': 10, 'n_estimators': 150}
Fold  2 accuracy: -1.0783  ,  -0.6193  {'min_samples_split': 10, 'n_estimators': 100}
Fold  3 accuracy: 0.0409  ,  -0.0810  {'min_samples_split': 5, 'n_estimators': 150}
Fold  4 accuracy: -0.4852  ,  0.2282  {'min_samples_split': 10, 'n_estimators': 150}
Fold  5 accuracy: 0.4500  ,  0.1066  {'min_samples_split': 2, 'n_estimators': 100}
Fold  6 accuracy: -1.0947  ,  -0.0031  {'min_samples_split': 5, 'n_estimators': 100}
Fold  7 accuracy: 0.2525  ,  0.2529  {'min_samples_split': 5, 'n_estimators': 100}
Fold  8 accuracy: -0.3233  ,  0.2456  {'min_samples_split': 2, 'n_estimators': 150}
Fold  9 accuracy: -0.0040  ,  0.3159  {'min_samples_split': 10, 'n_estimators': 150}
Fold 10 accuracy: 0.1032  ,  0.3911  {'min_samples_split': 5, 'n_estimators': 150}
 
Found  7  unique parameter combinations
 
Clf  0 Final Accuracy: 0.1589 +/- 0.5585
Clf  1 Final Accuracy: 0.1555 +/- 0.5519
Clf  2 Final Accuracy: 0.1228 +

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_split=1e-07, min_samples_leaf=1,
           min_samples_split=5, min_weight_fraction_leaf=0.0,
           n_estimators=100, n_jobs=1, oob_score=False, random_state=None,
           verbose=0, warm_start=False)

In [52]:
my_grid = {'fit_intercept':[True,False],
           'l1_ratio':[0.1,0.3,0.5,0.7,0.9]}
rv.optimize_timeseries_reg( ElasticNet(), variable_values, target_values, my_grid )

Fold  1 accuracy: 0.3612  ,  -1.0870  {'l1_ratio': 0.1, 'fit_intercept': False}
Fold  2 accuracy: -1.7827  ,  -0.3216  {'l1_ratio': 0.1, 'fit_intercept': False}
Fold  3 accuracy: 0.2242  ,  -0.3600  {'l1_ratio': 0.1, 'fit_intercept': True}
Fold  4 accuracy: 0.0605  ,  0.1006  {'l1_ratio': 0.1, 'fit_intercept': True}
Fold  5 accuracy: 0.0573  ,  0.0787  {'l1_ratio': 0.1, 'fit_intercept': True}
Fold  6 accuracy: -2.2429  ,  -0.0640  {'l1_ratio': 0.1, 'fit_intercept': True}
Fold  7 accuracy: -0.0003  ,  0.0499  {'l1_ratio': 0.1, 'fit_intercept': True}
Fold  8 accuracy: -0.0546  ,  0.0565  {'l1_ratio': 0.1, 'fit_intercept': True}
Fold  9 accuracy: -1.1303  ,  -0.0286  {'l1_ratio': 0.1, 'fit_intercept': True}
Fold 10 accuracy: -0.0031  ,  0.0199  {'l1_ratio': 0.1, 'fit_intercept': True}
 
Found  2  unique parameter combinations
 
Clf  0 Final Accuracy: -0.2330 +/- 0.5800
Clf  1 Final Accuracy: -0.0031 +/- 0.7661
 
Using CLF with accuracy:   0.000000
CLF params:  {'normalize': False, 'warm_s

ElasticNet(alpha=1.0, copy_X=True, fit_intercept=False, l1_ratio=0.1,
      max_iter=1000, normalize=False, positive=False, precompute=False,
      random_state=None, selection='cyclic', tol=0.0001, warm_start=False)

In [53]:
my_grid = {'hidden_layer_sizes':[100,150,200],
           'alpha':10**np.arange(-5,-3,0.5)}
rv.optimize_timeseries_reg( MLPRegressor(), variable_values, target_values, my_grid )

Fold  1 accuracy: -0.2858  ,  -1.3955  {'alpha': 0.0001, 'hidden_layer_sizes': 100}
Fold  2 accuracy: -2.0755  ,  -1.6481  {'alpha': 3.1622776601683795e-05, 'hidden_layer_sizes': 150}
Fold  3 accuracy: -0.3176  ,  -0.4443  {'alpha': 3.1622776601683795e-05, 'hidden_layer_sizes': 200}
Fold  4 accuracy: -1.9789  ,  -0.0365  {'alpha': 0.0001, 'hidden_layer_sizes': 200}
Fold  5 accuracy: 0.2285  ,  -0.2336  {'alpha': 1.0000000000000001e-05, 'hidden_layer_sizes': 200}
Fold  6 accuracy: -0.8961  ,  -0.0650  {'alpha': 3.1622776601683795e-05, 'hidden_layer_sizes': 200}
Fold  7 accuracy: -0.4071  ,  0.0873  {'alpha': 0.0001, 'hidden_layer_sizes': 150}
Fold  8 accuracy: -1.0916  ,  0.0813  {'alpha': 0.0001, 'hidden_layer_sizes': 200}
Fold  9 accuracy: -0.1278  ,  0.1539  {'alpha': 0.00031622776601683794, 'hidden_layer_sizes': 200}
Fold 10 accuracy: -0.1185  ,  0.2360  {'alpha': 0.00031622776601683794, 'hidden_layer_sizes': 200}
 
Found  8  unique parameter combinations
 
Clf  0 Final Accuracy: -0

MLPRegressor(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=100, learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=None,
       shuffle=True, solver='adam', tol=0.0001, validation_fraction=0.1,
       verbose=False, warm_start=False)