# Extraction of stock prices from Yahoo Finance and calculation of technical indicators

In [1]:
# import libraries
import numpy as np 
import pandas as pd 
import yfinance as yf
from datetime import date, timedelta, datetime
import ta

<p style="text-align:justify">Extract stock prices from Yahoo Finance through the <code>yfinance</code> library. We set the <code>period</code> parameter to <code>max</code> to retrieve the full price history of a specific symbol.</p>

In [None]:
def fetch_data(symbols, **kwargs):
    """
    Download historical market data for one or more ticker symbols.

    Parameters
    ===========
    symbols        : list or str
                     company symbols (e.g GME, MSFT, etc); a single
                     symbol may also be passed as a plain string
    **kwargs       : startDate/endDate/period
                     If `period` is given it is forwarded to yfinance.
                     Valid periods are: 1d, 5d, 1mo, 3mo, 
                          6mo, 1y, 2y, 5y, 10y, ytd, max
                     Otherwise all keyword arguments are handed to
                     `_set_duration` to obtain the start and end date.

    Returns:
    Dataframe with the rows of every fetched symbol stacked on top of
    each other, a `Symbol` column holding the upper-cased ticker, and
    the `Dividends` / `Stock Splits` columns removed when present

    Raises:
    ValueError if `symbols` yields no symbol at all
    """
    # a bare string would otherwise be iterated character by character
    if isinstance(symbols, str):
        symbols = [symbols]

    # get start and end time for historical data
    period, startDate, endDate = [None] * 3
    if 'period' in kwargs:
        # take the caller's value; a self-assignment here used to leave
        # `period` at None, silently ignoring the requested period
        period = kwargs['period']
    else:
        # NOTE(review): _set_duration is not defined in this section;
        # presumably it returns a (start, end) pair -- confirm
        startDate, endDate = _set_duration(**kwargs)

    stock_list = []
    MAX_API_CALLS = 1800 # defined by yahoo finance

    # loop through all the symbols, one history request per symbol
    for api_calls, symbol in enumerate(symbols, start=1):
        ticker = symbol.upper()
        df_ticker = yf.Ticker(ticker).history(period=period,
                                              start=startDate,
                                              end=endDate)
        df_ticker['Symbol'] = ticker
        stock_list.append(df_ticker)

        # stop once the request budget is used up; data fetched so far is kept
        if api_calls >= MAX_API_CALLS:
            print(f"""You have reached max limit
            for API calls ({MAX_API_CALLS})""")
            break

    # pd.concat raises an opaque ValueError on an empty list, so fail early
    # with the same exception type and a clearer message
    if not stock_list:
        raise ValueError("fetch_data requires at least one symbol")

    df_stocks = pd.concat(stock_list, axis=0)

    # remove unnecessary columns; tolerate frames that do not carry them
    df_stocks = df_stocks.drop(columns=['Dividends', 'Stock Splits'],
                               errors='ignore')

    return df_stocks

In [None]:
# Request TSLA price data, asking for the 'max' period.
df = fetch_data(['TSLA'], period='max')

<p style="text-align:justify">Let us also compute the most common technical indicators used to formulate a trading strategy. We can use them as a baseline when backtesting against our machine learning model. Our machine learning model could also work in conjunction with these technical indicators.</p>

In [None]:
def compute_rsi(df, window=14, field='Close'):
    """
    Computes the RSI of a single field over a rolling window.

    Parameters:
    df : dataframe of 1 symbol containing field of choice
    window : number of rows passed to the RSI indicator
    field : column the RSI is computed from

    Returns:
    single-column dataframe named `RSI(<window>)`, indexed by the
    datetime-converted, ascending index of the input
    """
    # work on a private copy so the caller's frame is never modified
    ordered = df.copy()

    # the index must be datetime-typed before it can be sorted chronologically
    ordered.index = pd.to_datetime(ordered.index)
    ordered = ordered.sort_index(ascending=True)

    indicator = ta.momentum.RSIIndicator(ordered[field], window=window)
    column_name = f'RSI({window})'

    return indicator.rsi().to_frame(name=column_name)

def compute_crossover(df, duration=(7, 20), field='Close'):
    """
    Computes a short and a long simple moving average of one field.

    Parameters:
    df : dataframe of 1 symbol containing field of choice
    duration : tuple of (short, long) rolling window sizes
    field : column the moving averages are computed from

    Returns:
    dataframe with the columns `ma<short>` and `ma<long>`, in that order
    """
    # work on a private copy so the caller's frame is never modified
    frame = df.copy()

    short_span, long_span = duration

    # add one rolling-mean column per window and remember its label
    labels = []
    for span in (short_span, long_span):
        label = f'ma{span}'
        frame[label] = frame[field].rolling(window=span).mean()
        labels.append(label)

    return frame[labels]

def compute_macd(df, field='Close'):
    """
    Computes the MACD line and its signal line for one field.

    Parameters:
    df : dataframe of 1 symbol containing field of choice
    field : column the MACD is computed from

    Returns:
    dataframe with the columns `MACD` and `signal_line`
    """
    # work on a private copy so the caller's frame is never modified
    frame = df.copy()

    # the indicator is built with the library's default parameters
    indicator = ta.trend.MACD(frame[field])
    enriched = frame.assign(MACD=indicator.macd(),
                            signal_line=indicator.macd_signal())

    return enriched[['MACD', 'signal_line']]
  
def compute_bollinger_band(df, field='Close'):
    """
    Computes the Bollinger bands of one field.

    Parameters:
    df : dataframe of 1 symbol containing field of choice
    field : column the moving average and bands are computed from

    Returns:
    dataframe with the columns `bollinger_high`, `bollinger_low`
    and `center_line`, in that order
    """
    # work on a private copy so the caller's frame is never modified
    frame = df.copy()

    # the indicator is built with the library's default parameters
    bands = ta.volatility.BollingerBands(frame[field])
    enriched = frame.assign(center_line=bands.bollinger_mavg(),
                            bollinger_high=bands.bollinger_hband(),
                            bollinger_low=bands.bollinger_lband())

    return enriched[['bollinger_high', 'bollinger_low', 'center_line']]

def compute_stochastics(df, field='Close'):
    """
    Computes the stochastic oscillator (%K) and its signal (%D).

    Parameters:
    df : dataframe of 1 symbol containing `High`, `Low` and the
         field of choice
    field : column used as the closing price; high and low always
            come from the `High` and `Low` columns

    Returns:
    dataframe with the columns `%K` and `%D`
    """
    # work on a private copy so the caller's frame is never modified
    frame = df.copy()

    oscillator = ta.momentum.StochasticOscillator(
        high=frame['High'],
        low=frame['Low'],
        close=frame[field],
    )

    # the column names are not valid identifiers, hence the dict unpacking
    enriched = frame.assign(**{
        '%K': oscillator.stoch(),
        '%D': oscillator.stoch_signal(),
    })

    return enriched[['%K', '%D']]

def get_technical_indicators(df, field='Close'):
    """
    Add the columns of every technical indicator to a copy of `df`.

    The indicators are computed in this order: RSI, moving-average
    crossover, MACD, Bollinger bands and stochastics.

    Parameters
    ==========
    df        :  pd.DataFrame 
                 1 symbol containing field of choice
    field     :  str
                 field to perform technical indicators computation

    Returns:
    copy of `df` with the indicator columns added; the input frame
    is left untouched
    """
    df = df.copy()
    indicator_functions = [compute_rsi, compute_crossover, compute_macd, 
                         compute_bollinger_band, compute_stochastics]

    for ind_func in indicator_functions:
        try:
            indicator = ind_func(df, field=field)
        except IndexError:
            # best-effort: an indicator that raises IndexError is skipped
            # NOTE(review): confirm which inputs actually trigger this
            continue

        # write each indicator back exactly once, aligned on the index;
        # the write-back used to sit in a nested loop that re-assigned
        # every earlier indicator on each pass
        df.loc[df.index.isin(indicator.index),
               indicator.columns] = indicator

    return df

In [None]:
# Add the indicator columns, then drop every row that still has a missing value.
df = get_technical_indicators(df).dropna()

<p style="text-align:justify">We then combine the stock data with the average tone scores, which were extracted from the Amazon datasets and processed using Amazon EMR.</p>

In [None]:
keyword = 'tesla'

# Read the tone scores stored for the chosen keyword.
tone = pd.read_csv('data/{}.csv'.format(keyword))

# DATEADDED is converted to text and sliced by position:
# characters 0-3 are the year, 4-5 the month, the remainder the day.
tone['Date'] = tone['DATEADDED'].astype('str').apply(
    lambda stamp: datetime(year=int(stamp[:4]),
                           month=int(stamp[4:6]),
                           day=int(stamp[6:]))
)

# The raw column is no longer needed once `Date` exists.
tone = tone.drop(columns='DATEADDED')

In [9]:
# NOTE(review): df_full is not assigned in any cell shown in this section;
# presumably it is the merge of `df` and `tone` -- confirm.
df_full

Unnamed: 0,Date,Open,High,Low,Close,Volume,RSI(14),ma7,ma20,MACD,signal_line,bollinger_high,bollinger_low,center_line,%K,%D,target,Daily Average Tone
0,2015-02-18,40.834000,41.234001,40.520000,40.891998,13568000,43.815253,41.772285,41.6841,-0.172222,-0.068098,44.590532,38.777668,41.6841,34.720485,33.892348,1,4.000000
1,2015-02-19,41.000000,42.487999,40.750000,42.341999,25770500,52.319192,41.607428,41.8355,-0.108378,-0.076154,44.540819,39.130182,41.8355,57.236016,42.111793,1,0.350000
2,2015-02-20,42.155998,43.520000,41.962002,43.422001,29910500,57.481252,41.630857,41.9904,0.029032,-0.055117,44.686553,39.294247,41.9904,74.006224,55.320908,0,0.021739
3,2015-02-23,43.132000,43.639999,41.265999,41.467999,42499000,47.468180,41.474857,42.0509,-0.019517,-0.047997,44.641062,39.460738,42.0509,43.664584,58.302275,0,0.181818
4,2015-02-24,41.458000,41.458000,40.340000,40.821999,33018000,44.696118,41.509999,42.0265,-0.108865,-0.060171,44.653059,39.399941,42.0265,33.633533,50.434781,0,1.162791
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1041,2019-04-10,55.348000,55.675999,54.577999,55.212002,35306500,47.103524,55.485715,55.0718,-0.755063,-0.929796,58.404505,51.739094,55.0718,51.786167,46.503653,0,0.187500
1042,2019-04-11,53.660000,54.099998,53.119999,53.683998,49179500,42.992569,54.986857,54.8564,-0.828947,-0.909627,57.954951,51.757848,54.8564,33.469178,42.683622,0,-1.000000
1043,2019-04-12,54.043999,54.389999,53.366001,53.540001,33730000,42.615112,54.298000,54.7791,-0.888875,-0.905476,57.927617,51.630583,54.7791,19.983146,35.079497,0,-0.137931
1044,2019-04-15,53.726002,53.776001,51.726002,53.276001,50193000,41.888993,54.258001,54.7480,-0.946756,-0.913732,57.942665,51.553335,54.7480,20.644637,24.698987,1,-0.650000


In [None]:
# Write the combined dataset to data/<keyword>_completed.csv without the index column.
df_full.to_csv('data/{}_completed.csv'.format(keyword), index=False)