In [37]:
import numpy as np
from sklearn.ensemble import RandomForestRegressor
import tensorflow as tf
import math, os, sys, datetime
from pandas_datareader import data as pdr
from datetime import date, timedelta
import yfinance as yf
yf.pdr_override()
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
from functools import reduce
from sklearn import preprocessing
from mlxtend.feature_selection import SequentialFeatureSelector as SFS
from sklearn.linear_model import LinearRegression
from utils.preprocessstock import preprocess
import tensorflow_probability as tfp
from ta import add_all_ta_features
from ta.utils import dropna
from sklearn.metrics import mean_squared_error  as MSE
from sklearn.metrics import explained_variance_score

#Tickers list
#We can add and delete any ticker from the list to get desired ticker live data
tickers=['AAPL']
# tickers=['AAPL','AMZN','FB','GOOG','MMM','MSFT','NFLX','NKE','NVDA','INTC','CSCO','WMT','TSLA','EBAY',
#          'ORCL','CBG.BK','OSP.BK','BBL.BK','SCB.BK','LH.BK','MINT.BK','PTT.BK','BANPU.BK','ADVANC.BK',
#          'TRUE.BK','AOT.BK','BEM.BK','BTS.BK','CPALL.BK','CPN.BK']
# tickers=['CBG.BK','OSP.BK','BBL.BK','SCB.BK','LH.BK','MINT.BK','PTT.BK','BANPU.BK','ADVANC.BK',
#          'TRUE.BK','AOT.BK','BEM.BK','BTS.BK','CPALL.BK','CPN.BK']
# 'AAPL','AMZN','FB','GOOG','MMM','MSFT','NFLX','NKE','NVDA','INTC','CSCO','WMT','TSLA','EBAY',
#         'ORCL','CBG.BK','OSP.BK','BBL.BK','SCB.BK','LH.BK','MINT.BK','PTT.BK','BANPU.BK','ADVANC.BK',
#          'TRUE.BK','AOT.BK','BEM.BK','BTS.BK','CPALL.BK','CPN.BK'

startdate = datetime.datetime(2017, 1, 13)
enddate = datetime.datetime(2021, 1, 1)
timesteps = 7

In [38]:
def evaluate(model, test_features, test_labels, getmax, getmin):
    predictions = model.predict(test_features)
    predictions = predictions.reshape(1,-1)[0]
    for i in range(0, len(predictions)):
        predictions[i] = (predictions[i]*(getmax-getmin))+getmin
    test_labels = test_labels.reshape(1,-1)[0]
    for i in range(0, len(test_labels)):
        test_labels[i] = (test_labels[i]*(getmax-getmin))+getmin   
    errors = abs(predictions - test_labels)
    mape = 100 * np.mean(errors / test_labels)
    accuracy = 100 - mape
    print('Model Performance')
    print('Average Error: {:0.4f} degrees.'.format(np.mean(errors)))
    print('RMSE: {:0.4f}' .format(math.sqrt(MSE(test_labels, predictions))))
    print('EVS: {:0.4f}' .format(explained_variance_score(test_labels, predictions)))
    print('Accuracy = {:0.2f}%.'.format(accuracy))
    
    return accuracy

In [57]:
from sklearn.model_selection import GridSearchCV
n_estimators = [int(x) for x in np.linspace(start = 100, stop = 500, num = 5)]
max_depth = [int(x) for x in np.linspace(10, 50, num = 5)]
max_depth.append(None)
parameters = {'n_estimators': n_estimators, 'max_depth':max_depth,
             'max_features':['auto'], 'min_samples_split':[2, 5], 'min_samples_leaf':[4],
             'bootstrap':[False]}
rf = RandomForestRegressor()
clf = GridSearchCV(rf, parameters)
print(parameters)

{'n_estimators': [100, 200, 300, 400, 500], 'max_depth': [10, 20, 30, 40, 50, None], 'max_features': ['auto'], 'min_samples_split': [2, 5], 'min_samples_leaf': [4], 'bootstrap': [False]}


In [58]:
# model_rf = RandomForestRegressor(n_estimators=1000 ,max_depth=10,random_state=0)
for stocks in tickers:
    # Load Data
    ticker = yf.Ticker(stocks)
    data = ticker.history(start=startdate, end=enddate) 
    data['next_Close'] = data['Close'].shift(-7)
    data = data.drop(columns=['Dividends', 'Stock Splits'])
    data = dropna(data)
    # Add Indicator
    data = add_all_ta_features(data, open="Open", high="High", low="Low", close="Close", volume="Volume", fillna=True)
    data = data.drop(columns=['Volume', 'Open', 'High', 'Low'])
    # Feature Selection
    y = data['next_Close']
    featureScores = pd.DataFrame(data[data.columns[1:]].corr()['next_Close'][:])
    x_list = []
    for i in range(0, len(featureScores)):
        if abs(featureScores.next_Close[i]) > 0.90:
            x_list.append(featureScores.index[i])
    X = data[x_list]
    X = X.drop(columns=['next_Close'])
    sfs1 = SFS(LinearRegression(), k_features=(1,5), forward=True, floating=False, cv=0)
    sfs1.fit(X, y)
    k_feature_names = list(sfs1.k_feature_names_)
    features = data[k_feature_names]
    # Perporcess
    min_max_scaler = preprocessing.MinMaxScaler()
    features = min_max_scaler.fit_transform(features)
    features = features[:len(features)//timesteps*timesteps].reshape((len(features)//timesteps, timesteps, 5))
    
    labels = data[['next_Close']]
    getmax = labels.max()
    getmin = labels.min()
    labels = min_max_scaler.fit_transform(labels)
    labels = labels[:len(labels)//timesteps*timesteps].reshape((len(labels)//timesteps, timesteps, 1))
    labels = np.squeeze(labels)
    
    train_test_split_factor = .80
    validation_split_factor = .20
    train_x, train_y, test_x, test_y = features[:math.floor(len(features)*train_test_split_factor)], labels[:math.floor(len(labels)*train_test_split_factor)], features[math.floor(len(features)*train_test_split_factor):], labels[math.floor(len(labels)*train_test_split_factor):]
    train_x, test_x = np.expand_dims(train_x, axis=1), np.expand_dims(test_x, axis=1)
    train_x, test_x = np.expand_dims(train_x, axis=-1), np.expand_dims(test_x, axis=-1)
    train_x = train_x.reshape(train_x.shape[0], 35).astype('float32')
    test_x = test_x.reshape(test_x.shape[0], 35).astype('float32')
    
    clf.fit(train_x, train_y)
    print(f'{"="*52}')
    print(f"Stock : {stocks}")
    print('Best parameters found:\n', clf.best_params_)

Stock : AAPL
Best parameters found:
 {'bootstrap': False, 'max_depth': None, 'max_features': 'auto', 'min_samples_leaf': 4, 'min_samples_split': 2, 'n_estimators': 200}


In [3]:
model_rf = RandomForestRegressor(n_estimators=1000 ,max_depth=10,random_state=0)
for stocks in tickers:
    # Load Data
    ticker = yf.Ticker(stocks)
    data = ticker.history(start=startdate, end=enddate) 
    data['next_Close'] = data['Close'].shift(-7)
    data = data.drop(columns=['Dividends', 'Stock Splits'])
    data = dropna(data)
    # Add Indicator
    data = add_all_ta_features(data, open="Open", high="High", low="Low", close="Close", volume="Volume", fillna=True)
    data = data.drop(columns=['Volume', 'Open', 'High', 'Low'])
    # Feature Selection
    y = data['next_Close']
    featureScores = pd.DataFrame(data[data.columns[1:]].corr()['next_Close'][:])
    x_list = []
    for i in range(0, len(featureScores)):
        if abs(featureScores.next_Close[i]) > 0.90:
            x_list.append(featureScores.index[i])
    X = data[x_list]
    X = X.drop(columns=['next_Close'])
    sfs1 = SFS(LinearRegression(), k_features=(1,5), forward=True, floating=False, cv=0)
    sfs1.fit(X, y)
    k_feature_names = list(sfs1.k_feature_names_)
    features = data[k_feature_names]
    # Perporcess
    min_max_scaler = preprocessing.MinMaxScaler()
    features = min_max_scaler.fit_transform(features)
    features = features[:len(features)//timesteps*timesteps].reshape((len(features)//timesteps, timesteps, 5))
    
    labels = data[['next_Close']]
    getmax = labels.max()
    getmin = labels.min()
    labels = min_max_scaler.fit_transform(labels)
    labels = labels[:len(labels)//timesteps*timesteps].reshape((len(labels)//timesteps, timesteps, 1))
    labels = np.squeeze(labels)
    
    train_test_split_factor = .80
    validation_split_factor = .20
    train_x, train_y, test_x, test_y = features[:math.floor(len(features)*train_test_split_factor)], labels[:math.floor(len(labels)*train_test_split_factor)], features[math.floor(len(features)*train_test_split_factor):], labels[math.floor(len(labels)*train_test_split_factor):]
    train_x, test_x = np.expand_dims(train_x, axis=1), np.expand_dims(test_x, axis=1)
    train_x, test_x = np.expand_dims(train_x, axis=-1), np.expand_dims(test_x, axis=-1)
    train_x = train_x.reshape(train_x.shape[0], 35).astype('float32')
    test_x = test_x.reshape(test_x.shape[0], 35).astype('float32')
    
    model_rf.fit(train_x, train_y)
    print(f'{"="*52}')
    print(f"Stock : {stocks}")
    accuracy = evaluate(model_rf, test_x, test_y, getmax, getmin)


Stock : AAPL
Model Performance
Average Error: 24.9755 degrees.
RMSE: 30.4394
EVS: 0.1271
Accuracy = 77.01%.
Stock : AMZN
Model Performance
Average Error: 841.5990 degrees.
RMSE: 948.8419
EVS: 0.0355
Accuracy = 72.19%.
Stock : FB
Model Performance
Average Error: 49.0097 degrees.
RMSE: 56.2235
EVS: 0.2897
Accuracy = 80.71%.
Stock : GOOG
Model Performance
Average Error: 126.1142 degrees.
RMSE: 171.5503
EVS: 0.3726
Accuracy = 91.80%.
Stock : MMM
Model Performance
Average Error: 7.7839 degrees.
RMSE: 10.8245
EVS: 0.3971
Accuracy = 94.53%.
Stock : MSFT
Model Performance
Average Error: 25.0017 degrees.
RMSE: 28.5398
EVS: 0.2852
Accuracy = 87.60%.
Stock : NFLX
Model Performance
Average Error: 82.0956 degrees.
RMSE: 92.5087
EVS: 0.2936
Accuracy = 83.14%.
Stock : NKE
Model Performance
Average Error: 15.1205 degrees.
RMSE: 19.7600
EVS: 0.3005
Accuracy = 86.68%.
Stock : NVDA
Model Performance
Average Error: 168.0008 degrees.
RMSE: 196.4580
EVS: 0.0180
Accuracy = 64.25%.
Stock : INTC
Model Performa

In [4]:
# save model
import joblib
joblib.dump(model_rf, 'weights/rf.sav')

['weights/rf.sav']