<a href="https://colab.research.google.com/github/luck058/yfinance-prediction/blob/main/import_data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
%%capture
!pip install yfinance

In [2]:
import yfinance as yf
import pandas as pd
import numpy as np

In [3]:
def help():
    """Print a short usage guide for YahooFinanceDataFetcher.

    Note: inside this notebook's namespace the name shadows the builtin
    ``help``; it is kept unchanged so existing callers keep working.
    """
    print('# Set up data fetcher using:')
    # The constructor is called without `self`; train_frac is its fourth argument
    print('data_fetcher = YahooFinanceDataFetcher(ticker_symbol, period="max", interval="1d", train_frac=0.8)\n')
    print('# Get historical data using:')
    # The method is named get_raw_data (there is no get_data)
    print('df_data = data_fetcher.get_raw_data(verbose=False)\n')
    print('# Get financials using:')
    print('df_financials = data_fetcher.get_financials(verbose=False)\n')
    print('# Get actions using:')
    print('df_actions = data_fetcher.get_actions(verbose=False)\n')
    print('# Get train (test=False) or test (test=True) features and targets using:')
    print('X, y = data_fetcher.get_X_y(test=False, verbose=False)\n')

In [4]:
class YahooFinanceDataFetcher:
    """Holds the Yahoo Finance ticker and settings for one ticker symbol.

    Only the constructor is defined in the class body; later notebook cells
    attach get_raw_data, get_financials, get_actions and get_X_y to this class.
    """

    def __init__(self, ticker_symbol, period="max", interval="1d", train_frac=0.8):
        # Settings passed to ticker.history() when the raw data is requested
        self.period, self.interval = period, interval
        # Fraction of the rows used for training; the last section is reserved for testing
        self.train_frac = train_frac
        # Cache for the feature frame / target series, filled in by get_X_y()
        self.X = self.y = None
        self.ticker_symbol = ticker_symbol
        self.ticker = yf.Ticker(ticker_symbol)

In [5]:
def get_raw_data(self, verbose=False):
    """Get historical data from Yahoo Finance.

    Calls ``self.ticker.history`` with the period and interval stored on the
    instance.

    Args:
        verbose: If True, print a header, display the first rows and print
            the shape of the data.

    Returns:
        pandas.DataFrame: an independent copy of the history data.
    """
    history = self.ticker.history(period=self.period, interval=self.interval)
    # copy=True: pd.DataFrame(frame) on its own shares the underlying data with
    # `frame`, and get_X_y modifies the frame returned here in place.
    df_data = pd.DataFrame(history, copy=True)

    if verbose:
        # print (not display) so the header is shown without quotes, matching
        # the headers of get_financials / get_actions
        print("Historical Data:")
        display(df_data.head())
        print(df_data.shape)

    return df_data

# Attach the function above to the class so it can be called as data_fetcher.get_raw_data(...)
YahooFinanceDataFetcher.get_raw_data = get_raw_data

In [6]:
def get_financials(self, verbose=False):
    """Return the ticker's financials as a transposed DataFrame.

    Reads ``self.ticker.financials``, wraps it in a DataFrame and transposes
    it (rows and columns swapped). With verbose=True the header, the whole
    frame and its shape are shown as well.
    """
    transposed = pd.DataFrame(self.ticker.financials).T

    if not verbose:
        return transposed

    print("Financials:")
    display(transposed)
    print(transposed.shape)
    return transposed

# Attach the function above to the class so it can be called as data_fetcher.get_financials(...)
YahooFinanceDataFetcher.get_financials = get_financials

In [7]:
def get_actions(self, verbose=False):
    """Return the ticker's actions as a DataFrame.

    Reads ``self.ticker.actions`` and wraps it in a DataFrame. With
    verbose=True the header, the whole frame and its shape are shown as well.
    """
    actions_frame = pd.DataFrame(self.ticker.actions)

    if not verbose:
        return actions_frame

    print("Actions:")
    display(actions_frame)
    print(actions_frame.shape)
    return actions_frame

# Attach the function above to the class so it can be called as data_fetcher.get_actions(...)
YahooFinanceDataFetcher.get_actions = get_actions

In [8]:
def create_lag(df, column, lookback, include_zero=True):
    """
    Add lagged copies of 'column' to df in place (nothing is returned).

    One new column '{column}-{lag}' is created for every power-of-two lag
    (1, 2, 4, ...) that is strictly smaller than 'lookback'; it holds 'column'
    shifted 'lag' rows into the past, so its first 'lag' rows are missing values.
    If include_zero, an unshifted copy '{column}-0' is added first.
    """
    assert column in df.columns

    if include_zero:
        df[f'{column}-0'] = df[column]

    # A positive integer is a power of two exactly when it has a single bit set
    power_of_two_lags = [step for step in range(1, lookback) if (step & (step - 1)) == 0]
    for step in power_of_two_lags:
        df[f'{column}-{step}'] = df[column].shift(step)

In [9]:
def add_max(df, column, lookback, name_append=None):
    """Add the max value in the last 'lookback' timesteps to df (in place).

    The new column is named '{column} max{name_append}'. It is computed with
    df[column].rolling(lookback).max(); for an integer 'lookback' the first
    lookback-1 rows are NaN because the window is not yet full.

    name_append: optional suffix for the new column name. None (the default)
    adds no suffix.
    """
    assert column in df.columns
    # Formatting None directly would put the literal text "None" into the column name
    suffix = "" if name_append is None else name_append
    df[f'{column} max{suffix}'] = df[column].rolling(lookback).max()

In [10]:
def add_min(df, column, lookback, name_append=None):
    """Add the min value in the last 'lookback' timesteps to df (in place).

    The new column is named '{column} min{name_append}'. It is computed with
    df[column].rolling(lookback).min(); for an integer 'lookback' the first
    lookback-1 rows are NaN because the window is not yet full.

    name_append: optional suffix for the new column name. None (the default)
    adds no suffix.
    """
    assert column in df.columns
    # Formatting None directly would put the literal text "None" into the column name
    suffix = "" if name_append is None else name_append
    df[f'{column} min{suffix}'] = df[column].rolling(lookback).min()

In [11]:
def add_mean(df, column, lookback, name_append=None):
    """Add the mean value in the last 'lookback' timesteps to df (in place).

    The new column is named '{column} mean{name_append}'. It is computed with
    df[column].rolling(lookback).mean(); for an integer 'lookback' the first
    lookback-1 rows are NaN because the window is not yet full.

    name_append: optional suffix for the new column name. None (the default)
    adds no suffix.
    """
    assert column in df.columns
    # Formatting None directly would put the literal text "None" into the column name
    suffix = "" if name_append is None else name_append
    df[f'{column} mean{suffix}'] = df[column].rolling(lookback).mean()

In [28]:
def get_X_y(self, reload=False, test=False, verbose=False,
            lookback_y=10,
            lookback_other=10,
            short_lookback=3,
            add_short_max_cols=["Open-0", "High", "Low", "Close"],
            add_short_min_cols=["Open-0", "High", "Low", "Close"],
            add_short_mean_cols=["Open-0", "High", "Low", "Close"],
            add_max_cols=["Open-0", "High", "Low", "Close"],
            add_min_cols=["Open-0", "High", "Low", "Close"],
            add_mean_cols=["Open-0", "High", "Low", "Close"],
            ):
    """Return X and y dataframes for either the training or the testing section.

    The target y is the "Open" value of the following row. X holds the other raw
    columns, lagged copies of every raw column, and rolling max/min/mean columns.

    X and y are built once and cached on the instance (self.X / self.y). Later
    calls reuse the cache and ignore the feature arguments unless reload is set.

    Args:
        reload: Rebuild X and y even if they are already cached.
        test: If True return the most recent rows (test section), otherwise the
            older rows (train section). The split point is self.train_frac.
        verbose: Print/display progress, the raw data and the full X and y.
        lookback_y: Lag bound for the target column "Open" (see create_lag).
        lookback_other: Lag bound for all other raw columns, and window size of
            the long rolling features.
        short_lookback: Window size of the short rolling features.
        add_short_max_cols / add_short_min_cols / add_short_mean_cols: Columns
            that get a rolling max / min / mean over 'short_lookback' rows.
        add_max_cols / add_min_cols / add_mean_cols: Columns that get a rolling
            max / min / mean over 'lookback_other' rows.
            (The list defaults are only iterated, never modified.)

    Returns:
        tuple: (X, y) restricted to the requested section.
    """
    # If the X and y have not already been created (or a rebuild was requested)
    if self.X is None or self.y is None or reload:
        if verbose:
            print("Creating X and y dataframes")

        df = self.get_raw_data(verbose=verbose)
        y_column = "Open"

        # Snapshot the raw column names before any feature columns are added
        original_columns = list(df.columns)

        # Add previous days' values. The target column is lagged with lookback_y only;
        # lagging it again with lookback_other would override a smaller lookback_y.
        create_lag(df, y_column, lookback=lookback_y, include_zero=True)
        for column in original_columns:
            if column == y_column:
                continue
            create_lag(df, column, lookback=lookback_other, include_zero=False)

        # (function, columns, window) for every rolling feature group, in output order:
        # first the short window (eg. maximum open price in the last 3 days),
        # then the long window of 'lookback_other' rows.
        rolling_features = (
            (add_max, add_short_max_cols, short_lookback),
            (add_min, add_short_min_cols, short_lookback),
            (add_mean, add_short_mean_cols, short_lookback),
            (add_max, add_max_cols, lookback_other),
            (add_min, add_min_cols, lookback_other),
            (add_mean, add_mean_cols, lookback_other),
        )
        for add_feature, columns, window in rolling_features:
            for column in columns:
                add_feature(df, column, lookback=window, name_append=f"-{window}")

        # Shift y by 1 to prevent lookahead bias (predict tomorrow's y)
        df[y_column] = df[y_column].shift(-1)

        # Drop rows with NaN values (incomplete lags/windows at the start, missing target at the end)
        df = df.dropna()

        self.X = df.drop(columns=[y_column])
        self.y = df[y_column]

    if verbose:
        display(self.X)
        display(self.y)

    train_length = int(len(self.X) * self.train_frac)

    # Only return test (most recent data) / train (older data) depending on what is needed.
    # Positional slicing keeps the two sections complementary for any train_length.
    if test:
        return self.X.iloc[train_length:], self.y.iloc[train_length:]
    return self.X.iloc[:train_length], self.y.iloc[:train_length]


# Attach the function above to the class so it can be called as data_fetcher.get_X_y(...)
YahooFinanceDataFetcher.get_X_y = get_X_y

In [29]:
# data_fetcher = YahooFinanceDataFetcher("AAPL", period="1mo", interval="1d")

# X, y = data_fetcher.get_X_y(reload=True, verbose=False)

# print("----------------------------")

# display(X)
# display(y)


----------------------------


Unnamed: 0_level_0,High,Low,Close,Volume,Dividends,Stock Splits,Open-0,Open-1,Open-2,Open-4,...,Low max-10,Close max-10,Open-0 min-10,High min-10,Low min-10,Close min-10,Open-0 mean-10,High mean-10,Low mean-10,Close mean-10
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2025-01-21 00:00:00-05:00,224.419998,219.380005,222.639999,98070400,0.0,0.0,224.0,232.119995,237.350006,234.75,...,243.199997,245.0,224.0,224.419998,219.380005,222.639999,236.560999,238.122,233.011,235.318999
2025-01-22 00:00:00-05:00,224.119995,219.789993,223.830002,64126500,0.0,0.0,219.789993,224.0,232.119995,234.639999,...,241.350006,242.699997,219.789993,224.119995,219.380005,222.639999,234.108998,235.800999,230.67,233.201999
2025-01-23 00:00:00-05:00,227.029999,222.300003,223.660004,60234800,0.0,0.0,224.740005,219.789993,224.0,237.350006,...,240.050003,242.699997,219.789993,224.119995,219.380005,222.639999,232.284999,233.948999,228.764999,231.346999
2025-01-24 00:00:00-05:00,225.630005,221.410004,222.779999,54697900,0.0,0.0,224.779999,224.740005,219.789993,232.119995,...,234.429993,237.869995,219.789993,224.119995,219.380005,222.639999,230.570999,232.140999,226.900999,229.354999
2025-01-27 00:00:00-05:00,232.149994,223.979996,229.860001,94863400,0.0,0.0,224.020004,224.779999,224.740005,224.0,...,234.429993,237.869995,219.789993,224.119995,219.380005,222.639999,228.972,231.339998,225.998999,228.655998
2025-01-28 00:00:00-05:00,240.190002,230.809998,238.259995,75707600,0.0,0.0,230.850006,224.020004,224.779999,219.789993,...,234.429993,238.259995,219.789993,224.119995,219.380005,222.639999,228.704001,231.891998,226.107999,229.041998
2025-01-29 00:00:00-05:00,239.860001,234.009995,239.360001,45486100,0.0,0.0,234.119995,230.850006,224.020004,224.740005,...,234.429993,239.360001,219.789993,224.119995,219.380005,222.639999,228.641,232.265999,226.261998,229.649998
2025-01-30 00:00:00-05:00,240.789993,237.210007,237.589996,55658300,0.0,0.0,238.669998,234.119995,230.850006,224.779999,...,237.210007,239.360001,219.789993,224.119995,219.380005,222.639999,229.044,232.448997,226.539999,229.621999


Unnamed: 0_level_0,Open
Date,Unnamed: 1_level_1
2025-01-21 00:00:00-05:00,219.789993
2025-01-22 00:00:00-05:00,224.740005
2025-01-23 00:00:00-05:00,224.779999
2025-01-24 00:00:00-05:00,224.020004
2025-01-27 00:00:00-05:00,230.850006
2025-01-28 00:00:00-05:00,234.119995
2025-01-29 00:00:00-05:00,238.669998
2025-01-30 00:00:00-05:00,247.190002


# Archive

In [None]:
# def save_data(self, verbose=False):
#     """Saves data to csv file. Saves as 'ticker-period-interval.csv'"""
#     file_name = f"{self.ticker_symbol}-{self.period}-{self.interval}.csv"
#     df = self.get_raw_data(reload=True, verbose=False)
#     df.to_csv(file_name)
#     if verbose:
#         print(f"Saved data as {file_name}")

# YahooFinanceDataFetcher.save_data = save_data

In [None]:
# def get_raw_data(self, reload=False, verbose=False):
#     """Get historical data from Yahoo Finance. If reload=False, gets saved data from GitHub if available"""
#     if not reload:
#         # Try in order: get data from current file structure, get data from GitHub, get data from yfinance
#         try:
#             if verbose:
#                 print("Loading saved data")
#             data = pd.read_csv(f"{self.ticker_symbol}-{self.period}-{self.interval}.csv")
#         except FileNotFoundError:
#             try:
#                 if verbose:
#                     print("No saved data found. Connecting to GitHub")
#                 !git clone https://github.com/luck058/yfinance-prediction
#                 %cd yfinance-prediction
#                 data = pd.read_csv(f"{self.ticker_symbol}-{self.period}-{self.interval}.csv")
#             except FileNotFoundError:
#                 if verbose:
#                     print("No saved data found. Reloading data from Yahoo Finance")
#                 data = self.ticker.history(period=self.period, interval=self.interval)
#     else:
#         if verbose:
#             print("Reloading data from Yahoo Finance. This will not be saved. Use save_data() to save to GitHub")
#         data = self.ticker.history(period=self.period, interval=self.interval)

#     df_data = pd.DataFrame(data)

#     if verbose:
#         display("Historical Data:")
#         display(df_data.head())

#     return df_data

# YahooFinanceDataFetcher.get_raw_data = get_raw_data