In [6]:
import numpy  as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import read_quote      as rq
import stock_functions as sf
import df_visualizations as dv
import remap_values as rv

import random
import pickle

import sys
import time

In [1]:
from sklearn.svm            import SVR
from sklearn.ensemble       import RandomForestRegressor
from sklearn.linear_model   import ElasticNet
from sklearn.neural_network import MLPRegressor
from sklearn.ensemble       import BaggingRegressor
from sklearn.ensemble       import AdaBoostRegressor

from sklearn.multioutput    import MultiOutputRegressor

from sklearn.utils          import shuffle

from sklearn.metrics        import mean_squared_error

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV

from sklearn.multioutput    import MultiOutputRegressor

In [2]:
roll_nums = [3,5,7,10,12,15,20,25]#[3,5,7,10,12,15,18,20,22,25,28,30] # Generate an algorithm for predicting every few days
mom_nums  = [3,5,8,10,15,20,25,30]                    # Momentum has many good tracers
rsi_nums  = [10,15,20,25,30]                          # Good for some long term trends
band_nums = [5,8,10,15,20,25]                         # A few trace different areas well

In [3]:
inpFileList = ['aapl','acm' ,'awk' ,'amzn','awr' ,'ba'  ,'bac' ,'c'   ,'cat' ,'cop' ,'cvx' ,'dal' ,
               'dd'  ,'farm','fdp' ,'gnc' ,'hes' ,'ibm' ,'mas' ,'mcd' ,'mon' ,'msex','msft','nflx',
               'sbux','strl','tgt' ,'tsla','ups' ,'xom' ,'xpo' ,'vmc' ]

In [7]:
scaled_df = sf.get_scaled_data( inpFileList, roll_nums, mom_nums, rsi_nums, band_nums )

In [8]:
scaled_df = sf.gen_pca_attributes( scaled_df, mom_nums, rsi_nums, band_nums, n_mom=3, n_rsi=2, n_ban=2 )

In [7]:
# Read in each stock, and perform reduction
#    Break into train, test sets
#    Store train, test sets in arrays
# For each regressor predicting a certain num of days...
#    Manually cross validate, testing and scoring each set

In [9]:
# Mix up the list
shuff_scaled_df = shuffle( scaled_df, random_state=0 )


# Seperate features from target data
close_list = ['close']
for i in roll_nums:
    close_list.append( 'close_mean_'+str(i) )

targets  = shuff_scaled_df[close_list]
features = shuff_scaled_df.drop( close_list, axis=1 )



n_elements = shuff_scaled_df.shape[0]

# Set the fraction of data we will train over
percent_train = 0.8

# Size of training set
n_train = int( percent_train * n_elements )


# Seperate training and test data
train_features = features[ : n_train   ]
test_features  = features[   n_train : ]

train_targets  =  targets[ : n_train   ]
test_targets   =  targets[   n_train : ]

In [10]:
# Gets the location of not null values
train_not_null_locations = train_targets.notnull().all( axis=1 )==1
test_not_null_locations  =  test_targets.notnull().all( axis=1 )==1

# Take features and targets that are not null
train_x = train_features[ train_not_null_locations ].values
test_x  =  test_features[  test_not_null_locations ].values

train_y = train_targets[ train_not_null_locations ].values
test_y  =  test_targets[  test_not_null_locations ].values

ad_mult_reg = MultiOutputRegressor( AdaBoostRegressor( base_estimator=RandomForestRegressor( n_estimators=20, max_features=0.7 ), random_state=0, 
                            loss='exponential', n_estimators=10 ) )
#ad_mult_reg = MultiOutputRegressor( AdaBoostRegressor( base_estimator=RandomForestRegressor( n_estimators=20, max_features=0.7 ), random_state=0, 
#                            loss='exponential', n_estimators=5 ) )

ad_mult_reg.fit( train_x, train_y )

MultiOutputRegressor(estimator=AdaBoostRegressor(base_estimator=RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features=0.7, max_leaf_nodes=None, min_impurity_split=1e-07,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=20, n_jobs=1,
           oob_score=False, random_state=None, verbose=0, warm_start=False),
         learning_rate=1.0, loss='exponential', n_estimators=10,
         random_state=0),
           n_jobs=1)

In [9]:
test_60 = ad_mult_reg.predict( test_x )

for i in range( 1, len( close_list[1:] ) ) :
    mse = mean_squared_error( test_y[i-1], test_60[i-1] )
    print '%13s MSE : %8.5f, Percent ME : %8.5f' % ( close_list[i], mse, np.sqrt(mse)/np.mean(test_y) )

 close_mean_3 MSE :  0.28736, Percent ME : 36.03992
 close_mean_5 MSE :  0.01817, Percent ME :  9.06243
 close_mean_7 MSE :  0.02099, Percent ME :  9.73959
close_mean_10 MSE :  0.00909, Percent ME :  6.41166
close_mean_12 MSE :  0.02232, Percent ME : 10.04452
close_mean_15 MSE :  0.02216, Percent ME : 10.00922


In [29]:
test_70 = ad_mult_reg.predict( test_x )

for i in range( 1, len( close_list[1:] ) ) :
    mse = mean_squared_error( test_y[i-1], test_70[i-1] )
    print '%13s MSE : %8.5f, Percent ME : %8.5f' % ( close_list[i], mse, np.sqrt(mse)/np.mean(test_y) )

 close_mean_3 MSE :  0.11175, Percent ME : 22.26275
 close_mean_5 MSE :  0.02046, Percent ME :  9.52490
 close_mean_7 MSE :  0.01597, Percent ME :  8.41539
close_mean_10 MSE :  0.00119, Percent ME :  2.29876
close_mean_12 MSE :  0.23849, Percent ME : 32.52288
close_mean_15 MSE :  0.26986, Percent ME : 34.59590


In [12]:
test_80 = ad_mult_reg.predict( test_x )

for i in range( 1, len( close_list[1:] )+1 ) :
    mse = mean_squared_error( test_y[i-1], test_80[i-1] )
    print '%13s MSE : %8.5f, Percent ME : %8.5f' % ( close_list[i], mse, np.sqrt(mse)/np.mean(test_y) )

 close_mean_3 MSE :  0.00034, Percent ME :  1.28400
 close_mean_5 MSE :  0.00052, Percent ME :  1.57940
 close_mean_7 MSE :  0.00769, Percent ME :  6.09548
close_mean_10 MSE :  0.00520, Percent ME :  5.01383
close_mean_12 MSE :  0.04772, Percent ME : 15.18780
close_mean_15 MSE :  0.01710, Percent ME :  9.09208
close_mean_20 MSE :  0.04250, Percent ME : 14.33349
close_mean_25 MSE :  0.10339, Percent ME : 22.35573


In [12]:
test_80_5 = ad_mult_reg.predict( test_x )

for i in range( 1, len( close_list[1:] )+1 ) :
    mse = mean_squared_error( test_y[i-1], test_80_5[i-1] )
    print '%13s MSE : %8.5f, Percent ME : %8.5f' % ( close_list[i], mse, np.sqrt(mse)/np.mean(test_y) )

 close_mean_3 MSE :  0.00075, Percent ME :  1.80002
 close_mean_5 MSE :  0.00311, Percent ME :  3.67110
 close_mean_7 MSE :  0.03112, Percent ME : 11.60937
close_mean_10 MSE :  0.00762, Percent ME :  5.74298
close_mean_12 MSE :  0.06341, Percent ME : 16.57087
close_mean_15 MSE :  0.01113, Percent ME :  6.94384
close_mean_25 MSE :  0.02668, Percent ME : 10.74902


In [26]:
reg_list = []

# Features are the same, but targets training on will vary
# As will which features missing ( due to ways rolling mean calculated )
# Skip the next days, close, but otherwise use rolling means
for col in close_list[1:]:

    # Gets the location of null values
    train_not_null_locations = train_targets[col].isnull()==0
    test_not_null_locations  =  test_targets[col].isnull()==0
    
    # Take features and targets that are not null
    train_x = train_features[ train_not_null_locations ].values
    test_x  =  test_features[  test_not_null_locations ].values
    
    train_y = train_targets[ train_not_null_locations ][col].values
    test_y  =  test_targets[  test_not_null_locations ][col].values
    
#    reg_list=( AdaBoostRegressor( base_estimator=RandomForestRegressor( n_estimators=20, max_features=0.7 ), random_state=0, 
#                            loss='exponential', n_estimators=5 ) )
    
    reg_list.append( AdaBoostRegressor( base_estimator=RandomForestRegressor( n_estimators=20, max_features=0.7 ), random_state=0, 
                            loss='exponential', n_estimators=10 ) )

    reg_list[-1].fit( train_x, train_y )
#    reg_list.fit( train_x, train_y )
    
#    ad_y = reg_list.predict( test_x )
    ad_y = reg_list[-1].predict( test_x )
    mse = mean_squared_error( ad_y, test_y )
    
    print '%13s MSE : %8.5f, Percent ME : %8.5f' % ( col, mse, np.sqrt(mse)/np.mean(test_y) )

 close_mean_3 MSE :  0.00101, Percent ME : 20.32958
 close_mean_5 MSE :  0.00127, Percent ME : 15.84783
 close_mean_7 MSE :  0.00152, Percent ME : 13.28560
close_mean_10 MSE :  0.00190, Percent ME : 10.66902
close_mean_12 MSE :  0.00214, Percent ME :  9.62483
close_mean_15 MSE :  0.00255, Percent ME :  8.32091
close_mean_25 MSE :  0.00397, Percent ME :  6.10079


reg_list=( AdaBoostRegressor( base_estimator=RandomForestRegressor( n_estimators=20, max_features=0.7 ), random_state=0, 
                            loss='exponential', n_estimators=10 ) )
                            
close_mean_3  MSE :  0.00100721061963

close_mean_5  MSE :  0.00126550114664

close_mean_15  MSE :  0.00254971575641

close_mean_25  MSE :  0.00397021282384

reg_list=( AdaBoostRegressor( base_estimator=RandomForestRegressor( n_estimators=20, max_features=0.6 ), random_state=0, 
                            loss='exponential', n_estimators=10 ) )

 close_mean_3 MSE :  0.00101, Percent ME : 20.33788
            
 close_mean_5 MSE :  0.00127, Percent ME : 15.85799
        
close_mean_15 MSE :  0.00254, Percent ME :  8.30605
        
close_mean_25 MSE :  0.00395, Percent ME :  6.08495

reg_list=( AdaBoostRegressor( base_estimator=RandomForestRegressor( n_estimators=20, max_features=0.8 ), random_state=0, 
                            loss='exponential', n_estimators=10 ) )

 close_mean_3 MSE :  0.00101, Percent ME : 20.39407
 
 close_mean_5 MSE :  0.00128, Percent ME : 15.91946
 
close_mean_15 MSE :  0.00253, Percent ME :  8.28805

close_mean_25 MSE :  0.00397, Percent ME :  6.09778

reg_list=( AdaBoostRegressor( base_estimator=RandomForestRegressor( n_estimators=30, max_features=0.7 ), random_state=0, 
                            loss='exponential', n_estimators=10 ) )

 close_mean_3 MSE :  0.00100, Percent ME : 20.30114
        
 close_mean_5 MSE :  0.00126, Percent ME : 15.80350
        
close_mean_15 MSE :  0.00254, Percent ME :  8.31250
        
close_mean_25 MSE :  0.00394, Percent ME :  6.08079

reg_list=( AdaBoostRegressor( base_estimator=RandomForestRegressor( n_estimators=10, max_features=0.7 ), random_state=0, 
                            loss='exponential', n_estimators=10 ) )

 close_mean_3 MSE :  0.00102, Percent ME : 20.50482
 
 close_mean_5 MSE :  0.00128, Percent ME : 15.91072
 
close_mean_15 MSE :  0.00259, Percent ME :  8.38302

close_mean_25 MSE :  0.00399, Percent ME :  6.11756

reg_list=( AdaBoostRegressor( base_estimator=RandomForestRegressor( n_estimators=20, max_features=0.7 ), random_state=0, 
                            loss='exponential', n_estimators=20 ) )
                            
 close_mean_3 MSE :  0.00100, Percent ME : 20.23163
 
 close_mean_5 MSE :  0.00126, Percent ME : 15.79081
 
close_mean_15 MSE :  0.00251, Percent ME :  8.25161

close_mean_25 MSE :  0.00391, Percent ME :  6.05090

reg_list=( AdaBoostRegressor( base_estimator=RandomForestRegressor( n_estimators=20, max_features=0.7 ), random_state=0, 
                            loss='exponential', n_estimators=5 ) )

 close_mean_3 MSE :  0.00102, Percent ME : 20.40833
            
 close_mean_5 MSE :  0.00128, Percent ME : 15.95991
        
close_mean_15 MSE :  0.00259, Percent ME :  8.38906
        
close_mean_25 MSE :  0.00405, Percent ME :  6.16066