In [1]:
import numpy  as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import read_quote      as rq
import stock_functions as sf
import df_visualizations as dv
import remap_values as rv

import sys
import time

In [94]:
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import ElasticNet
from sklearn.neural_network import MLPRegressor
from sklearn.ensemble import BaggingRegressor

In [2]:
# Window sizes handed to the stock_functions generators further down.
# A denser sweep that was previously tried for roll_nums:
#   [3,5,7,10,12,15,18,20,22,25,28,30]
roll_nums = [ 5, 15 ]                    # Generate an algorithm for predicting every few days
mom_nums  = [ 3, 5, 10, 20, 30 ]         # Momentum has many good tracers
rsi_nums  = [ 10, 15 ]                   # Good for some long term trends
band_nums = list( range( 5, 30, 5 ) )    # 5,10,15,20,25 -- a few trace different areas well

In [128]:
# Ticker symbols to process, in order; each one is read from quotes/<ticker>.csv.
# The training loop fits on every entry except the final one, which is only scored.
# Alternative ordering kept for reference:
#   ['aapl','ba','farm','hes','ibm','mas','sbux','tgt']
inpFileList = [
    'tgt',
    'aapl',
    'ba',
    'farm',
    'hes',
    'ibm',
    'mas',
    'sbux',
]

In [129]:
rf_estimators_base = 10 # Increase number of estimators each time
rf_estimators      = rf_estimators_base

nn_reg = MLPRegressor         ( hidden_layer_sizes=200, warm_start=True )
rf_reg = RandomForestRegressor( n_estimators=rf_estimators, min_samples_split=10, warm_start=True )
ba_reg = BaggingRegressor     ( n_estimators=rf_estimators, warm_start=True)
#en_reg = ElasticNet           ( warm_start= True )

# Loop over inpFileList, so multiple stocks being trained
for inpFile in inpFileList:

    fileName = 'quotes/' + inpFile + '.csv'


    print inpFile

    print '\tReading quote: ', fileName

    my_quote = rq.readQuote( fileName )

    print '\tDone.\n'


    ##################################################
    ############Generate Variables####################
    ##################################################

    print '\tGenerating variables...'

    # The variables we will use for the training data
    diffs = sf.generate_differentials   ( my_quote            ).drop('diff_v',axis=1)
    moms  = sf.generate_momentum_close  ( my_quote, mom_nums  )
    rsis  = sf.generate_rsi             ( my_quote, rsi_nums  )
    bands = sf.generate_bollinger_bands ( my_quote, band_nums )


    print '\tDone.\n'

    ##################################################
    #############Normalize Variables##################
    ##################################################

    print '\tNormalizing variables...'

    # Differentials in a day can be smart scaled
    diffs['diff_hl'] = np.log10( diffs['diff_hl'] )
    for col in diffs.columns:
        diffs[col] = rv.smart_scale( diffs, col, show_plot=False )

    # Momentums can also be smart scaled
    for col in moms.columns:
        moms[col] = rv.smart_scale( moms, col, show_plot=False )

    # RSIs have natural distribution centered at 0.5, scale accordingly
    for col in rsis.columns:
        rsis[col] = ( rsis[col] - 0.5 ) / rsis[col].std()

    # Bands also centered at 0.5, use stdev of bollinger band of 0.25 for scaling
    for col in bands.columns:
        bands[col] = ( bands[col] - 0.5 ) / 0.5


    print '\tDone.\n'

    var_df_list = [ diffs, moms, rsis, bands ]
    all_train_variables = reduce( lambda left,right: left.join(right,how='inner'), var_df_list )


    ##################################################
    #############Generate Target Variables############
    ##################################################


    print '\tGenerating target variables...'

    # The target variables are stored in the data frame
    rolls = sf.generate_rolling_close   ( my_quote, roll_nums, onlyMean=True )
    for i in roll_nums:
        rolls['close_mean_'+str(i)] = ( rolls['close_mean_'+str(i)].shift(i) / rolls['close_mean_'+str(i)] - 1 )
    rolls = rolls.replace( [np.inf, -np.inf], np.nan )


    # The predicted value column heads
    target_list = rolls.columns.values


    print '\tDone.\n'


    ##################################################
    ##########Train the model on all but last#########
    ##################################################


    # Loop over target list and train multiple machine learning methods on it:
    #for target_col in target_list:

    # Generate a target values and variable values
    target_col = target_list[-1]

    # Drops rows containing na, and reverse order for training/testing
    big_df = rolls[target_col].dropna().to_frame().join( all_train_variables, how='inner' )
    big_df = big_df[::-1]

    # Break up target and variables, not really a train_x/test_x since training over whole datasets
    target_values   = big_df[target_col].values
    variable_values = big_df.drop( target_col, axis=1 ).values

    # Only fit using all but last data
    if ( inpFile != inpFileList[-1] ):
    
        print '\tTraining data...'

        rf_reg.set_params( n_estimators=rf_estimators )
        ba_reg.set_params( n_estimators=rf_estimators )
        
        rf_estimators = rf_estimators + rf_estimators_base
        
        nn_reg.fit( variable_values, target_values )
        rf_reg.fit( variable_values, target_values )
        ba_reg.fit( variable_values, target_values )
#        en_reg.fit( variable_values, target_values )

        print '\tDone.'

        print 'Neural Network internal accuracy of : %7.4f' % ( nn_reg.score( variable_values, target_values ) )
        print 'Random Forest  internal accuracy of : %7.4f' % ( rf_reg.score( variable_values, target_values ) )
        print 'Bagging        internal accuracy of : %7.4f' % ( ba_reg.score( variable_values, target_values ) )
#        print 'Elastic Net    internal accuracy of : %7.4f' % ( en_reg.score( variable_values, target_values ) )
        print ''
        
    else:
       print 'Neural Network fit accuracy: %7.4f' % nn_reg.score( variable_values, target_values )
       print 'Random Forest  fit accuracy: %7.4f' % rf_reg.score( variable_values, target_values )
       print 'Bagging        fit accuracy: %7.4f' % ba_reg.score( variable_values, target_values )
#       print 'Elastic Net    fit accuracy: %7.4f' % en_reg.score( variable_values, target_values )

tgt
	Reading quote:  quotes/tgt.csv
	Done.

	Generating variables...
	Done.

	Normalizing variables...
	Done.

	Generating target variables...
	Done.

	Training data...
	Done.
Neural Network internal accuracy of :  0.1460
Random Forest  internal accuracy of :  0.8456
Bagging        internal accuracy of :  0.9033

aapl
	Reading quote:  quotes/aapl.csv
	Done.

	Generating variables...
	Done.

	Normalizing variables...
	Done.

	Generating target variables...
	Done.

	Training data...
	Done.
Neural Network internal accuracy of :  0.2119
Random Forest  internal accuracy of :  0.6690
Bagging        internal accuracy of :  0.7175

ba
	Reading quote:  quotes/ba.csv
	Done.

	Generating variables...
	Done.

	Normalizing variables...
	Done.

	Generating target variables...
	Done.

	Training data...
	Done.
Neural Network internal accuracy of : -0.6207
Random Forest  internal accuracy of :  0.5805
Bagging        internal accuracy of :  0.6157

farm
	Reading quote:  quotes/farm.csv
	Done.

	Generati

In [130]:
# Residual table for the most recently processed ticker: one 'true' column plus,
# per model, the signed error (prediction minus true value).
plot_frame = pd.DataFrame( {'true':target_values} )

for _col_name, _model in ( ( 'random_forest' , rf_reg ),
                           ( 'neural_network', nn_reg ),
                           ( 'bagging'       , ba_reg ) ):
    _predicted = _model.predict( variable_values )
    plot_frame[_col_name] = _predicted - plot_frame['true']

# Mean of the three residual columns (same summation order as before)
_residual_sum = plot_frame['random_forest'] + plot_frame['bagging']
_residual_sum = _residual_sum + plot_frame['neural_network']
plot_frame['average'] = _residual_sum / 3

In [131]:
# Overlay the per-model residuals (and their average) against the true value
# on a single set of axes.
ll = 0.5    # y-axis is bounded to [-ll, ll]
reg1, reg2, reg3, reg4 = 'random_forest', 'neural_network', 'bagging', 'average'

# Options shared by every scatter layer
scatter_kw = dict( kind='scatter', x='true', alpha=0.3 )

# The first call creates the axes; the remaining layers draw onto ax1
ax1 = plot_frame.plot( y=reg1, color='g', label=reg1,         **scatter_kw )
ax2 = plot_frame.plot( y=reg2, color='r', label=reg2, ax=ax1, **scatter_kw )
ax3 = plot_frame.plot( y=reg3, color='b', label=reg3, ax=ax1, **scatter_kw )
ax4 = plot_frame.plot( y=reg4, color='k', label=reg4, ax=ax1, **scatter_kw )

# Disabled diagonal reference line:
#ax1.plot( [-ll,ll], [-ll,ll], color='k' )
ax1.set_ybound( lower=-ll, upper=ll )
ax1.set_xlabel( 'True Value' )
ax1.set_ylabel( 'predicted-true' )
ax1.legend( loc='upper left' )

plt.show()

In [44]:
# Development helper: re-import the remap_values module (bound to `rv`) so that
# edits made to remap_values.py take effect without restarting the kernel.
# `reload` is used here as a builtin, i.e. this cell expects a Python 2 kernel.
reload(rv)

<module 'remap_values' from 'remap_values.py'>

In [51]:
# Parameter search for a random forest: the candidate values below are handed
# to rv.optimize_timeseries_reg together with the feature matrix and target
# vector left over from the training cell. The call is the last expression so
# its return value is shown as the cell output.
my_grid = dict( n_estimators      = [ 100, 150 ],
                min_samples_split = [ 2, 5, 10 ] )
rv.optimize_timeseries_reg( RandomForestRegressor(),
                            variable_values,
                            target_values,
                            my_grid )

Fold  1 accuracy: 0.3860  ,  -0.9372  {'min_samples_split': 10, 'n_estimators': 150}
Fold  2 accuracy: -1.0783  ,  -0.6193  {'min_samples_split': 10, 'n_estimators': 100}
Fold  3 accuracy: 0.0409  ,  -0.0810  {'min_samples_split': 5, 'n_estimators': 150}
Fold  4 accuracy: -0.4852  ,  0.2282  {'min_samples_split': 10, 'n_estimators': 150}
Fold  5 accuracy: 0.4500  ,  0.1066  {'min_samples_split': 2, 'n_estimators': 100}
Fold  6 accuracy: -1.0947  ,  -0.0031  {'min_samples_split': 5, 'n_estimators': 100}
Fold  7 accuracy: 0.2525  ,  0.2529  {'min_samples_split': 5, 'n_estimators': 100}
Fold  8 accuracy: -0.3233  ,  0.2456  {'min_samples_split': 2, 'n_estimators': 150}
Fold  9 accuracy: -0.0040  ,  0.3159  {'min_samples_split': 10, 'n_estimators': 150}
Fold 10 accuracy: 0.1032  ,  0.3911  {'min_samples_split': 5, 'n_estimators': 150}
 
Found  7  unique parameter combinations
 
Clf  0 Final Accuracy: 0.1589 +/- 0.5585
Clf  1 Final Accuracy: 0.1555 +/- 0.5519
Clf  2 Final Accuracy: 0.1228 +

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_split=1e-07, min_samples_leaf=1,
           min_samples_split=5, min_weight_fraction_leaf=0.0,
           n_estimators=100, n_jobs=1, oob_score=False, random_state=None,
           verbose=0, warm_start=False)

In [52]:
# Parameter search for an elastic net: with/without intercept crossed with a
# range of l1_ratio values, evaluated by rv.optimize_timeseries_reg on the
# feature matrix and target vector left over from the training cell.
my_grid = dict( fit_intercept = [ True, False ],
                l1_ratio      = [ 0.1, 0.3, 0.5, 0.7, 0.9 ] )
rv.optimize_timeseries_reg( ElasticNet(),
                            variable_values,
                            target_values,
                            my_grid )

Fold  1 accuracy: 0.3612  ,  -1.0870  {'l1_ratio': 0.1, 'fit_intercept': False}
Fold  2 accuracy: -1.7827  ,  -0.3216  {'l1_ratio': 0.1, 'fit_intercept': False}
Fold  3 accuracy: 0.2242  ,  -0.3600  {'l1_ratio': 0.1, 'fit_intercept': True}
Fold  4 accuracy: 0.0605  ,  0.1006  {'l1_ratio': 0.1, 'fit_intercept': True}
Fold  5 accuracy: 0.0573  ,  0.0787  {'l1_ratio': 0.1, 'fit_intercept': True}
Fold  6 accuracy: -2.2429  ,  -0.0640  {'l1_ratio': 0.1, 'fit_intercept': True}
Fold  7 accuracy: -0.0003  ,  0.0499  {'l1_ratio': 0.1, 'fit_intercept': True}
Fold  8 accuracy: -0.0546  ,  0.0565  {'l1_ratio': 0.1, 'fit_intercept': True}
Fold  9 accuracy: -1.1303  ,  -0.0286  {'l1_ratio': 0.1, 'fit_intercept': True}
Fold 10 accuracy: -0.0031  ,  0.0199  {'l1_ratio': 0.1, 'fit_intercept': True}
 
Found  2  unique parameter combinations
 
Clf  0 Final Accuracy: -0.2330 +/- 0.5800
Clf  1 Final Accuracy: -0.0031 +/- 0.7661
 
Using CLF with accuracy:   0.000000
CLF params:  {'normalize': False, 'warm_s

ElasticNet(alpha=1.0, copy_X=True, fit_intercept=False, l1_ratio=0.1,
      max_iter=1000, normalize=False, positive=False, precompute=False,
      random_state=None, selection='cyclic', tol=0.0001, warm_start=False)

In [53]:
# Parameter search for the MLP regressor: hidden layer width crossed with
# alpha = 10**e for exponents e = -5, -4.5, -4, -3.5, evaluated by
# rv.optimize_timeseries_reg on the feature matrix and target vector left over
# from the training cell.
alpha_exponents = np.arange( -5, -3, 0.5 )
my_grid = dict( hidden_layer_sizes = [ 100, 150, 200 ],
                alpha              = np.power( 10.0, alpha_exponents ) )
rv.optimize_timeseries_reg( MLPRegressor(),
                            variable_values,
                            target_values,
                            my_grid )

Fold  1 accuracy: -0.2858  ,  -1.3955  {'alpha': 0.0001, 'hidden_layer_sizes': 100}
Fold  2 accuracy: -2.0755  ,  -1.6481  {'alpha': 3.1622776601683795e-05, 'hidden_layer_sizes': 150}
Fold  3 accuracy: -0.3176  ,  -0.4443  {'alpha': 3.1622776601683795e-05, 'hidden_layer_sizes': 200}
Fold  4 accuracy: -1.9789  ,  -0.0365  {'alpha': 0.0001, 'hidden_layer_sizes': 200}
Fold  5 accuracy: 0.2285  ,  -0.2336  {'alpha': 1.0000000000000001e-05, 'hidden_layer_sizes': 200}
Fold  6 accuracy: -0.8961  ,  -0.0650  {'alpha': 3.1622776601683795e-05, 'hidden_layer_sizes': 200}
Fold  7 accuracy: -0.4071  ,  0.0873  {'alpha': 0.0001, 'hidden_layer_sizes': 150}
Fold  8 accuracy: -1.0916  ,  0.0813  {'alpha': 0.0001, 'hidden_layer_sizes': 200}
Fold  9 accuracy: -0.1278  ,  0.1539  {'alpha': 0.00031622776601683794, 'hidden_layer_sizes': 200}
Fold 10 accuracy: -0.1185  ,  0.2360  {'alpha': 0.00031622776601683794, 'hidden_layer_sizes': 200}
 
Found  8  unique parameter combinations
 
Clf  0 Final Accuracy: -0

MLPRegressor(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=100, learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=None,
       shuffle=True, solver='adam', tol=0.0001, validation_fraction=0.1,
       verbose=False, warm_start=False)