In [3]:
import numpy  as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import read_quote      as rq
import stock_functions as sf
import df_visualizations as dv
import remap_values as rv

import random

import sys
import time

In [4]:
from sklearn.svm            import SVR
from sklearn.ensemble       import RandomForestRegressor
from sklearn.linear_model   import ElasticNet
from sklearn.neural_network import MLPRegressor
from sklearn.ensemble       import BaggingRegressor

In [5]:
roll_nums = [5,15]#[3,5,7,10,12,15,18,20,22,25,28,30] # Generate an algorithm for predicting every few days
mom_nums  = [3,5,10,20,30]                     # Momentum has many good tracers
rsi_nums  = [10,15]                            # Good for some long term trends
band_nums = [5,10,15,20,25]                    # A few trace different areas well

In [6]:
inpFileList = ['aapl','acm' ,'awk' ,'amzn','awr' ,'ba'  ,'bac' ,'c'   ,'cat' ,'cop' ,'cvx' ,'dal' ,
               'dd'  ,'farm','fdp' ,'gnc' ,'hes' ,'ibm' ,'mas' ,'mcd' ,'mon' ,'msex','msft','nflx',
               'sbux','strl','tgt' ,'tsla','ups' ,'xom' ,'xpo' ,'vmc' ]

In [7]:
cat_dict = \
{    
    'aapl':'comp',
    'acm' :'cons',
    'amzn':'csmr',
    'awk' :'wate',
    'awr' :'wate',
    'ba'  :'aero',
    'bac' :'fina',
    'c'   :'fina',
    'cat' :'cons',
    'cop' :'ener',
    'cvx' :'ener',
    'dal' :'tran',
    'dd'  :'agri',
    'farm':'agri',
    'fdp' :'agri',
    'gnc' :'agri',
    'hes' :'ener',
    'ibm' :'comp',
    'mas' :'cons',
    'mcd' :'food',
    'mon' :'agri',
    'msex':'wate',
    'msft':'comp',
    'nflx':'ente',
    'sbux':'food',
    'strl':'cons',
    'tgt' :'csmr',
    'tsla':'ener',
    'ups' :'tran',
    'xom' :'ener',
    'xpo' :'tran',
    'vmc' :'cons'
}

In [2]:
# Read in each stock, and perform reduction
#    Break into train, test sets
#    Store train, test sets in arrays
# For each regressor predicting a certain num of days...
#    Manually cross validate, testing and scoring each set

In [10]:
# Mix up the list
random.shuffle( inpFileList )

# Set the fraction of data we will train over
percent_train = 0.7

In [20]:
# Store as lists of data for training, testing
quote_list = []


# Loop over inpFileList, so multiple stocks being trained
for inpFile in inpFileList:

    
    ##################################################
    ##############Read in the quote###################
    ##################################################

    
    fileName = 'quotes/' + inpFile + '.csv'

    inpCat = cat_dict[inpFile]

    print inpFile

    print '\tReading quote: ', fileName

    my_quote = rq.readQuote( fileName )

    print '\tDone.\n'

    
    
    
    

    ##################################################
    ############Generate Variables####################
    ##################################################

    print '\tGenerating variables...'

    # The variables we will use for the training data
    
    diffs = sf.generate_differentials   ( my_quote            ).drop('diff_v',axis=1)
    moms  = sf.generate_momentum_close  ( my_quote, mom_nums  )
    rsis  = sf.generate_rsi             ( my_quote, rsi_nums  )
    bands = sf.generate_bollinger_bands ( my_quote, band_nums )

    dates = sf.get_frac_year_vars       ( my_quote            )
    
    categ = sf.get_seasonal_stocks      ( inpCat, my_quote.shape[0] )
    categ.index = my_quote.index
    
    # Log of current price minus 1.5, gives proxy for price percentage movement
    l_cp_m = np.log10( my_quote['close'] ) - 1.5
    
    print '\tDone.\n'

    
    
    
    
    
    ##################################################
    #############Normalize Variables##################
    ##################################################

    print '\tNormalizing variables...'

    # Differentials in a day can be smart scaled
    diffs['diff_hl'] = np.log10( diffs['diff_hl'] )
    for col in diffs.columns:
        diffs[col] = rv.smart_scale( diffs, col, show_plot=False )

    # Momentums can also be smart scaled
    for col in moms.columns:
        moms[col] = rv.smart_scale( moms, col, show_plot=False )

    # RSIs have natural distribution centered at 0.5, scale accordingly
    for col in rsis.columns:
        rsis[col] = ( rsis[col] - 0.5 ) / rsis[col].std()

    # Bands also centered at 0.5, use stdev of bollinger band of 0.25 for scaling
    for col in bands.columns:
        bands[col] = ( bands[col] - 0.5 ) / 0.5


    print '\tDone.\n'

    var_df_list = [ diffs, moms, rsis, bands, dates, categ, l_cp_m ]
    all_train_variables = reduce( lambda left,right: left.join(right,how='inner'), var_df_list )


    
    
    
    
    ##################################################
    #############Generate Target Variables############
    ##################################################


    print '\tGenerating target variables...'

    # The target variables are stored in the data frame
    rolls = sf.generate_rolling_close   ( my_quote, roll_nums, onlyMean=True )
    for i in roll_nums:
        rolls['close_mean_'+str(i)] = ( rolls['close_mean_'+str(i)].shift(i) / rolls['close_mean_'+str(i)] - 1 )
    rolls = rolls.replace( [np.inf, -np.inf], np.nan )


    # The predicted value column heads
    target_list = rolls.columns.values


    print '\tDone.\n'
    
    
    
    
    
    
    
    ##################################################
    #################Train/Test data##################
    ##################################################

    
    # Drops rows containing na, and reverse order for training/testing
    big_df = rolls.join( all_train_variables, how='inner' )
    big_df = big_df[::-1]

    
    quote_list.append( big_df.copy() )


tgt
	Reading quote:  quotes/tgt.csv
	Done.

	Generating variables...
	Done.

	Normalizing variables...
	Done.

	Generating target variables...
	Done.

ups
	Reading quote:  quotes/ups.csv
	Done.

	Generating variables...
	Done.

	Normalizing variables...
	Done.

	Generating target variables...
	Done.

bac
	Reading quote:  quotes/bac.csv
	Done.

	Generating variables...
	Done.

	Normalizing variables...
	Done.

	Generating target variables...
	Done.

hes
	Reading quote:  quotes/hes.csv
	Done.

	Generating variables...
	Done.

	Normalizing variables...
	Done.

	Generating target variables...
	Done.

dd
	Reading quote:  quotes/dd.csv
	Done.

	Generating variables...
	Done.

	Normalizing variables...
	Done.

	Generating target variables...
	Done.

cat
	Reading quote:  quotes/cat.csv
	Done.

	Generating variables...
	Done.

	Normalizing variables...
	Done.

	Generating target variables...
	Done.

xom
	Reading quote:  quotes/xom.csv
	Done.

	Generating variables...
	Done.

	Normalizing variab

In [38]:
rf_estimators_base = 10 # Increase number of estimators each time
rf_estimators      = rf_estimators_base

nn_reg = MLPRegressor         ( hidden_layer_sizes=200, warm_start=True )
rf_reg = RandomForestRegressor( n_estimators=rf_estimators, min_samples_split=10, warm_start=True )
ba_reg = BaggingRegressor     ( n_estimators=rf_estimators, warm_start=True)
#en_reg = ElasticNet           ( warm_start= True )

test_list = []

# loop over all the training and testing stuffs 
for i in range( 0, len(quote_list) ):


    
    ###########################################
    ########Seperate train/test data###########
    ###########################################
    
    big_df = quote_list[i]
    
    # Break up train and test sets
    n_train = int( big_df.shape[0] * percent_train )

    # Split the train and test data
    
    train_df =        big_df[:n_train ]
    test_list.append( big_df[ n_train:] )

    
    # Pure set of y and x data
    # Need to trim bad values
    y_df = train_df     [   target_list         ]
    x_df = train_df.drop(   target_list, axis=1 )
    
#
# Do for each target list member, meaning need to declare regressors here
#
    #for target_col in target_list:
    # Generate a target values and variable values
    target_col = y_df.columns[-1]


    # Loop over target list and train multiple machine learning methods on it:
    # Drops rows containing na, and reverse order for training/testing
    x_train = y_df[target_col].dropna().to_frame().join( x_df, how='inner' ).drop( target_col, axis=1 ).values
    y_train = y_df[target_col].dropna().to_frame().values

# Need to train within parameter space
#    Test and score on other set
#    Select which parameters best

    # Only fit using all but last data
    if ( inpFile != inpFileList[-1] ):
    
        print '\tTraining data...'

        rf_reg.set_params( n_estimators=rf_estimators )
        ba_reg.set_params( n_estimators=rf_estimators )
        
        # Increase the number of trees for next time
        rf_estimators = rf_estimators + rf_estimators_base

        nn_reg.fit( variable_values, target_values )
        rf_reg.fit( variable_values, target_values )
        ba_reg.fit( variable_values, target_values )
#        en_reg.fit( variable_values, target_values )

        print '\tDone.'

        print 'Neural Network internal accuracy of : %7.4f' % ( nn_reg.score( variable_values, target_values ) )
        print 'Random Forest  internal accuracy of : %7.4f' % ( rf_reg.score( variable_values, target_values ) )
        print 'Bagging        internal accuracy of : %7.4f' % ( ba_reg.score( variable_values, target_values ) )
#        print 'Elastic Net    internal accuracy of : %7.4f' % ( en_reg.score( variable_values, target_values ) )
        print ''
        
    else:
        print 'Neural Network fit accuracy: %7.4f' % nn_reg.score( variable_values, target_values )
        print 'Random Forest  fit accuracy: %7.4f' % rf_reg.score( variable_values, target_values )
        print 'Bagging        fit accuracy: %7.4f' % ba_reg.score( variable_values, target_values )
#        print 'Elastic Net    fit accuracy: %7.4f' % en_reg.score( variable_values, target_values )

[[-4.4096038   1.82589606 -0.07850957 ...,  0.          0.          0.28182715]
 [ 1.59833526  1.87300394  0.87931431 ...,  0.          0.          0.2895102 ]
 [ 2.88633354  1.66182063 -0.39104352 ...,  0.          0.          0.29699039]
 ..., 
 [ 0.08072291 -1.88252355  0.14796161 ...,  0.          0.          0.27815125]
 [ 0.08050639 -2.88544371  0.08633639 ...,  0.          0.          0.27945218]
 [ 0.29492475 -1.50484833  0.65317489 ...,  0.          0.          0.28326023]]
[[-0.04629238]
 [-0.0383708 ]
 [-0.03407819]
 ..., 
 [ 0.0225609 ]
 [ 0.02096599]
 [ 0.01531514]]


In [71]:
plot_frame = pd.DataFrame( {'true':target_values} )
plot_frame['random_forest' ] = rf_reg.predict( variable_values ) - plot_frame['true']
plot_frame['neural_network'] = nn_reg.predict( variable_values ) - plot_frame['true']
plot_frame['bagging'       ] = ba_reg.predict( variable_values ) - plot_frame['true']
plot_frame['average'       ] = (plot_frame['random_forest' ]+
                                plot_frame['bagging'       ]+
                                plot_frame['neural_network'])/3

In [72]:
ll   = 0.5
reg1 = 'random_forest'
reg2 = 'neural_network'
reg3 = 'bagging'
reg4 = 'average'

ax1 = plot_frame.plot(kind='scatter', x='true', y=reg1, color='g', alpha=0.3, label=reg1 )    
ax2 = plot_frame.plot(kind='scatter', x='true', y=reg2, color='r', alpha=0.3, label=reg2, ax=ax1)
ax3 = plot_frame.plot(kind='scatter', x='true', y=reg3, color='b', alpha=0.3, label=reg3, ax=ax1)
ax4 = plot_frame.plot(kind='scatter', x='true', y=reg4, color='k', alpha=0.3, label=reg4, ax=ax1)

#ax1.plot( [-ll,ll], [-ll,ll], color='k' )
ax1.set_ybound( [-ll,ll] )
ax1.set_xlabel( 'True Value' )
ax1.set_ylabel( 'predicted-true' )
ax1.legend( loc=2 )

plt.show()