In [1]:
import yfinance as yf
import datetime
import os
import pandas as pd
import numpy as np
from finta import TA
import time

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report



In [2]:
"""
Defining some constants for data mining
"""

# NUM_DAYS is a look-back in calendar days: it is passed to datetime.timedelta when the
# download start date is computed.
NUM_DAYS = 1000     # The number of days of historical data to retrieve
INTERVAL = '1d'     # Sample rate of historical data
symbol = 'AMD'      # Symbol of the desired stock

# List of symbols for technical indicators
# Each entry must match the name of a function on finta's TA class, because the
# feature-engineering step looks the indicators up by these names.
INDICATORS = ['RSI', 'MACD', 'STOCH', 'ADL', 'ATR', 'MOM', 'MFI', 'ROC', 'OBV', 'CCI', 'EMV', 'VORTEX']

In [3]:
"""
Next we pull the historical data using yfinance
Rename the column names because finta uses the lowercase names
"""

# Download window: from NUM_DAYS calendar days ago up to now
start = datetime.date.today() - datetime.timedelta(days=NUM_DAYS)
end = datetime.datetime.today()

# Fetch the price history and lowercase the OHLCV column names, which is the
# naming finta expects
data = yf.download(symbol, start=start, end=end, interval=INTERVAL).rename(
    columns={
        'Open': 'open',
        "High": 'high',
        "Low": 'low',
        "Close": 'close',
        'Volume': 'volume',
    }
)

[*********************100%***********************]  1 of 1 completed


In [4]:
"""
Next we clean our data and perform feature engineering to create new technical indicator features that our
model can learn from
"""

def _exponential_smooth(data, alpha):
    """
    Function that exponentially smooths dataset so values are less 'rigid'
    :param alpha: weight factor to weight recent values more
    """
    
    return data.ewm(alpha=alpha).mean()
    
def _get_indicator_data(data):
    """
    Function that uses the finta API to calculate technical indicators used as the features

    Every name in INDICATORS is looked up as a function on finta's ``TA`` class, called
    with ``data``, and its output column(s) are joined onto the frame by index. Four
    close-to-EMA ratio columns are then added and the raw price/volume columns removed.

    :param data: DataFrame with lowercase 'open', 'high', 'low', 'close' and 'volume'
        columns ('Adj Close' is dropped too when present)
    :return: DataFrame holding 'close', the indicator columns and the
        'ema50', 'ema21', 'ema14' and 'ema5' ratio columns
    :raises AttributeError: if a name in INDICATORS is not an attribute of ``TA``
    """

    for indicator in INDICATORS:
        # Plain attribute lookup instead of eval(): no string is ever executed as code,
        # and an unknown indicator name fails with a clear AttributeError.
        ind_data = getattr(TA, indicator)(data)
        if not isinstance(ind_data, pd.DataFrame):
            ind_data = ind_data.to_frame()
        data = data.merge(ind_data, left_index=True, right_index=True)
    # Strip the trailing period from this indicator's column name
    data = data.rename(columns={"14 period EMV.": '14 period EMV'})

    # Ratio of the close to its exponentially weighted mean (not the raw moving average).
    # The positional argument of ewm() is pandas' `com` (center of mass), not a span.
    for period in (50, 21, 14, 5):
        data['ema' + str(period)] = data['close'] / data['close'].ewm(period).mean()

    # Remove columns that won't be used as features. errors='ignore' keeps this working
    # when the downloaded frame has no 'Adj Close' column.
    return data.drop(
        columns=['open', 'high', 'low', 'volume', 'Adj Close'],
        errors='ignore',
    )
    
def _produce_prediction(data, window=10):
    """
    Function that produces the 'truth' values
    At a given row, it looks 'window' rows ahead to see if the price increased (1) or decreased (0)
    :param window: number of days, or rows to look ahead to see what the price did
    """

    prediction = (data.shift(-window)['close'] >= data['close'])
    prediction = prediction.iloc[:-window]
    data['pred'] = prediction.astype(int)
    
    return data

# Feature pipeline: smooth the raw prices, add the indicator features, then attach
# the label that looks 15 rows ahead.
data = _produce_prediction(
    _get_indicator_data(_exponential_smooth(data, 0.9)),
    window=15,
)
# 'close' was only needed to derive features and labels; afterwards discard every
# row that still has a missing value.
data = data.drop(columns=['close']).dropna()

In [5]:
def _split_data(data):

        """
        Function to partition the data into the train and test set
        :return:
        """

        y = data['pred']
        features = [x for x in data.columns if x not in ['pred']]
        X = data[features]

        X_train, X_test, y_train, y_test = train_test_split(X, y, train_size= 2 * len(X) // 3)
        return X_train, X_test, y_train, y_test

# Split the engineered table into train/test features (X) and labels (y)
X_train, X_test, y_train, y_test = _split_data(data)

In [6]:
"""
Next we create and train our random forest classifier
"""

def _train_random_forest(X_train, y_train, X_tesst, y_test):

    """
    Function that uses random forest classifier to train the model

    Fits the classifier on the training split, predicts the test split and prints the
    classification report, the confusion matrix and the feature importances.

    :param X_train: training feature matrix
    :param y_train: training labels; must expose ``.values`` (e.g. a pandas Series)
    :param X_tesst: test feature matrix (the misspelled name is kept so that existing
        keyword callers keep working)
    :param y_test: true labels for the rows of the test feature matrix
    :return: None, the results are only printed
    """

    rf = RandomForestClassifier(n_jobs=-1, n_estimators=85, random_state=65)
    rf.fit(X_train, y_train.values.ravel())
    # Predict on the test set that was passed in. The body used to read a global named
    # X_test, silently ignoring this argument.
    prediction = rf.predict(X_tesst)

    print(classification_report(y_test, prediction))
    print(confusion_matrix(y_test, prediction))
    print(rf.feature_importances_)
    
# Fit the classifier on the training split and print its evaluation on the test split
_train_random_forest(X_train, y_train, X_test, y_test)

              precision    recall  f1-score   support

         0.0       0.89      0.76      0.82        78
         1.0       0.88      0.95      0.91       142

    accuracy                           0.88       220
   macro avg       0.89      0.85      0.87       220
weighted avg       0.88      0.88      0.88       220

[[ 59  19]
 [  7 135]]
[0.04012027 0.07203283 0.06833689 0.03587289 0.11520538 0.08357104
 0.04883955 0.05141975 0.03324876 0.03757946 0.07535557 0.03432065
 0.06868308 0.0291268  0.02444684 0.05515354 0.04284054 0.04107011
 0.04277604]
