<a href="https://colab.research.google.com/github/luck058/yfinance-prediction/blob/main/import_data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [32]:
%%capture
!pip install yfinance

In [33]:
import yfinance as yf
import pandas as pd
import numpy as np

In [34]:
def help():
    print('# Set up data fetcher using:')
    print('data_fetcher = YahooFinanceDataFetcher(self, ticker_symbol, period="max", interval="1d")\n')
    print('# Get historical data using:')
    print('df_data = data_fetcher.get_data(verbose=False)\n')
    print('# Get financials using:')
    print('df_financials = data_fetcher.get_financials(verbose=False)\n')
    print('# Get actions using:')
    print('df_actions = data_fetcher.get_actions(verbose=False)\n')

In [35]:
class YahooFinanceDataFetcher:
    """All functions defined in this notebook are added to this class"""
    def __init__(self, ticker_symbol, period="max", interval="1d", train_frac=0.8):
        self.ticker_symbol = ticker_symbol
        self.ticker = yf.Ticker(ticker_symbol)
        self.period = period
        self.interval = interval
        # Fraction of the data which will be used for training (last section will be reserved for testing)
        self.train_frac = train_frac
        self.X = None
        self.y = None

In [46]:
def get_raw_data(self, verbose=False):
    """Get historical data from Yahoo Finance. If reload=False, gets saved data from GitHub if available"""
    data = self.ticker.history(period=self.period, interval=self.interval)
    df_data = pd.DataFrame(data)

    if verbose:
        display("Historical Data:")
        display(df_data.head())
        print(df_data.shape)

    return df_data

YahooFinanceDataFetcher.get_raw_data = get_raw_data

In [37]:
def get_financials(self, verbose=False):
    financials = self.ticker.financials
    df_financials = pd.DataFrame(financials).T

    if verbose:
        print("Financials:")
        display(df_financials)
        print(df_financials.shape)

    return df_financials

YahooFinanceDataFetcher.get_financials = get_financials

In [38]:
def get_actions(self, verbose=False):
    actions = self.ticker.actions
    df_actions = pd.DataFrame(actions)

    if verbose:
        print("Actions:")
        display(df_actions)
        print(df_actions.shape)

    return df_actions

YahooFinanceDataFetcher.get_actions = get_actions

In [39]:
def create_lag(df, column, lookback, include_zero=True):
    """
    Shifts 'column' a maximum of 'lookback' timesteps into the past, creating a new column for each 2^n steps
    ie. Creates column 'column-1', 'column-2', 'column-4' etc. until reaching 'lookback'
    If include_zero, will also include 'column-0' (original column)
    """
    assert column in df.columns
    if include_zero:
        df[f'{column}-0'] = df[column]

    for lag in range(1, lookback):
        if np.log2(lag) % 1 == 0:
            df[f'{column}-{lag}'] = df[column].shift(lag)

In [47]:
def add_max(df, column, lookback, name_append=None):
    """Add the max value in the last 'lookback' timesteps to df"""
    assert column in df.columns
    df[f'{column} max{name_append}'] = df[column].rolling(lookback).max()

In [None]:
def add_min(df, column, lookback, name_append=None):
    """Add the min value in the last 'lookback' timesteps to df"""
    assert column in df.columns
    df[f'{column} min{name_append}'] = df[column].rolling(lookback).min()

In [None]:
def add_mean(df, column, lookback, name_append=None):
    """Add the mean value in the last 'lookback' timesteps to df"""
    assert column in df.columns
    df[f'{column} mean{name_append}'] = df[column].rolling(lookback).mean()

In [48]:
from re import A
def get_X_y(self, reload=False, test=False, verbose=False,
            lookback_y=10,
            lookback_other=10,
            short_lookback=10,
            add_short_max=["Open-0", "High", "Low", "Close"],
            add_short_min=["Open-0", "High", "Low", "Close"],
            add_short_mean=["Open-0", "High", "Low", "Close"],
            add_max=["Open-0", "High", "Low", "Close"],
            add_min=["Open-0", "High", "Low", "Close"],
            add_mean=["Open-0", "High", "Low", "Close"],
            ):
    """Return X and y dataframes"""
    # If the X and y have already been created
    if self.X is not None and self.y is not None:
        train_length = int(len(self.X) * self.train_frac)
        test_length = len(self.X) - train_length
        if test:
            return self.X.tail(test_length), self.y.tail(test_length)
        else:
            return self.X.head(train_length), self.y.head(train_length)

    df = self.get_raw_data(verbose=verbose)
    y_column = "Open"
    # Shift y by 1 to prevent lookahead bias (predict tomorrow's y)
    df[y_column].shift(1)

    original_columns = df.columns
    # Add previous days' values
    create_lag(df, y_column, lookback=lookback_y, include_zero=True)
    for column in original_columns:
        create_lag(df, column, lookback=lookback_other, include_zero=False)

    for column in add_short_max:
        add_max(df, column, lookback=short_lookback, name_append=f"_max{short_lookback}")
    for column in add_short_min:
        add_min(df, column, lookback=short_lookback, name_append=f"_min-{short_lookback}")
    for column in add_short_mean:
        add_mean(df, column, lookback=short_lookback, name_append=f"_mean-{short_lookback}")

    for column in add_max:
        add_max(df, column, lookback=lookback_other, name_append=f"_max-{lookback_other}")
    for column in add_min:
        add_min(df, column, lookback=lookback_other, name_append=f"_min-{lookback_other}")
    for column in add_mean:
        add_mean(df, column, lookback=lookback_other, name_append=f"_mean-{lookback_other}")


    # Drop rows with NaN values
    df.dropna(inplace=True)
    display(df)

    return None, None





YahooFinanceDataFetcher.get_X_y = get_X_y

data_fetcher = YahooFinanceDataFetcher("AAPL")

X, y = data_fetcher.get_X_y(verbose=True)

display(X, y)

'Historical Data:'

Unnamed: 0_level_0,Open,High,Low,Close,Volume,Dividends,Stock Splits
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1980-12-12 00:00:00-05:00,0.098834,0.099264,0.098834,0.098834,469033600,0.0,0.0
1980-12-15 00:00:00-05:00,0.094108,0.094108,0.093678,0.093678,175884800,0.0,0.0
1980-12-16 00:00:00-05:00,0.087232,0.087232,0.086802,0.086802,105728000,0.0,0.0
1980-12-17 00:00:00-05:00,0.088951,0.089381,0.088951,0.088951,86441600,0.0,0.0
1980-12-18 00:00:00-05:00,0.09153,0.091959,0.09153,0.09153,73449600,0.0,0.0


TypeError: 'tuple' object is not callable

In [None]:
# data_fetcher = YahooFinanceDataFetcher("AAPL", period="1d")

# df_data = data_fetcher.get_data()

# display(df_data)

# Archive

In [None]:
# def save_data(self, verbose=False):
#     """Saves data to csv file. Saves as 'ticker-period-interval.csv'"""
#     file_name = f"{self.ticker_symbol}-{self.period}-{self.interval}.csv"
#     df = self.get_raw_data(reload=True, verbose=False)
#     df.to_csv(file_name)
#     if verbose:
#         print(f"Saved data as {file_name}")

# YahooFinanceDataFetcher.save_data = save_data

In [None]:
# def get_raw_data(self, reload=False, verbose=False):
#     """Get historical data from Yahoo Finance. If reload=False, gets saved data from GitHub if available"""
#     if not reload:
#         # Try in order: get data from current file structure, get data from GitHub, get data from yfinance
#         try:
#             if verbose:
#                 print("Loading saved data")
#             data = pd.read_csv(f"{self.ticker_symbol}-{self.period}-{self.interval}.csv")
#         except FileNotFoundError:
#             try:
#                 if verbose:
#                     print("No saved data found. Connecting to GitHub")
#                 !git clone https://github.com/luck058/yfinance-prediction
#                 %cd yfinance-prediction
#                 data = pd.read_csv(f"{self.ticker_symbol}-{self.period}-{self.interval}.csv")
#             except FileNotFoundError:
#                 if verbose:
#                     print("No saved data found. Reloading data from Yahoo Finance")
#                 data = self.ticker.history(period=self.period, interval=self.interval)
#     else:
#         if verbose:
#             print("Reloading data from Yahoo Finance. This will not be saved. Use save_data() to save to GitHub")
#         data = self.ticker.history(period=self.period, interval=self.interval)

#     df_data = pd.DataFrame(data)

#     if verbose:
#         display("Historical Data:")
#         display(df_data.head())

#     return df_data

# YahooFinanceDataFetcher.get_raw_data = get_raw_data