<a href="https://colab.research.google.com/github/luck058/yfinance-prediction/blob/main/import_data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [32]:
%%capture
!pip install yfinance

In [33]:
import yfinance as yf
import pandas as pd
import numpy as np

In [34]:
def help():
    print('# Set up data fetcher using:')
    print('data_fetcher = YahooFinanceDataFetcher(self, ticker_symbol, period="max", interval="1d")\n')
    print('# Get historical data using:')
    print('df_data = data_fetcher.get_data(verbose=False)\n')
    print('# Get financials using:')
    print('df_financials = data_fetcher.get_financials(verbose=False)\n')
    print('# Get actions using:')
    print('df_actions = data_fetcher.get_actions(verbose=False)\n')

In [35]:
class YahooFinanceDataFetcher:
    """All functions defined in this notebook are added to this class"""
    def __init__(self, ticker_symbol, period="max", interval="1d", train_frac=0.8):
        self.ticker_symbol = ticker_symbol
        self.ticker = yf.Ticker(ticker_symbol)
        self.period = period
        self.interval = interval
        # Fraction of the data which will be used for training (last section will be reserved for testing)
        self.train_frac = train_frac
        self.X = None
        self.y = None

In [46]:
def get_raw_data(self, verbose=False):
    """Get historical data from Yahoo Finance. If reload=False, gets saved data from GitHub if available"""
    data = self.ticker.history(period=self.period, interval=self.interval)
    df_data = pd.DataFrame(data)

    if verbose:
        display("Historical Data:")
        display(df_data.head())
        print(df_data.shape())

    return df_data

YahooFinanceDataFetcher.get_raw_data = get_raw_data

In [37]:
def get_financials(self, verbose=False):
    financials = self.ticker.financials
    df_financials = pd.DataFrame(financials).T

    if verbose:
        print("Financials:")
        display(df_financials)
        print(df_financials.shape())

    return df_financials

YahooFinanceDataFetcher.get_financials = get_financials

In [38]:
def get_actions(self, verbose=False):
    actions = self.ticker.actions
    df_actions = pd.DataFrame(actions)

    if verbose:
        print("Actions:")
        display(df_actions)
        print(df_actions.shape())

    return df_actions

YahooFinanceDataFetcher.get_actions = get_actions

In [39]:
def create_lag(df, column, lookback, include_zero=True):
    """
    Shifts 'column' a maximum of 'lookback' timesteps into the past, creating a new column for each 2^n steps
    ie. Creates column 'column-1', 'column-2', 'column-4' etc. until reaching 'lookback'
    If include_zero, will also include 'column-0' (original column)
    """
    assert column in df.columns
    if include_zero:
        df[f'{column}-0'] = df[column]

    for lag in range(1, lookback):
        if np.log2(lag) % 1 == 0:
            df[f'{column}-{lag}'] = df[column].shift(lag)

In [47]:
def add_max(df, column, lookback, name_append=None):
    """Add the max value in the last 'lookback' timesteps to df"""
    assert column in df.columns
    df[f'{column} max{name_append}'] = df[column].rolling(lookback).max()

In [None]:
def add_min(df, column, lookback, name_append=None):
    """Add the min value in the last 'lookback' timesteps to df"""
    assert column in df.columns
    df[f'{column} min{name_append}'] = df[column].rolling(lookback).min()

In [None]:
def add_min_max(df, column, lookback, name_append=None):
    """Add the min and max value in the last 'lookback' timesteps to df"""
    assert column in df.columns
    add_max(df, column, lookback, name_append=name_append)
    add_min(df, column, lookback, name_append=name_append)

In [None]:
def add_mean(df, column, lookback, name_append=None):
    """Add the mean value in the last 'lookback' timesteps to df"""
    assert column in df.columns
    df[f'{column} mean{name_append}'] = df[column].rolling(lookback).mean()

In [45]:
def get_X_y(self, reload=False, test=False, verbose=False,
            lookback_y=10,
            lookback_other=10
            ):
    """Return X and y dataframes"""
    # If the X and y have already been created
    if self.X is not None and self.y is not None:
        train_length = int(len(self.X) * self.train_frac)
        test_length = len(self.X) - train_length
        if test:
            return self.X.tail(test_length), self.y.tail(test_length)
        else:
            return self.X.head(train_length), self.y.head(train_length)

    df = self.get_raw_data(verbose=verbose)
    y_column = "Open"
    # Shift y by 1 to prevent lookahead bias (predict tomorrow's y)
    df[y_column].shift(1)

    original_columns = df.columns
    # Add previous days' values
    create_lag(df, y_column, lookback=lookback_y, include_zero=True)
    for column in original_columns:
        create_lag(df, column, lookback=lookback_other, include_zero=False)

    # Drop rows with NaN values
    df.dropna(inplace=True)
    display(df)

    return None, None





YahooFinanceDataFetcher.get_X_y = get_X_y

data_fetcher = YahooFinanceDataFetcher("AAPL")

X, y = data_fetcher.get_X_y(verbose=True)

display(X, y)

'Historical Data:'

Unnamed: 0_level_0,Open,High,Low,Close,Volume,Dividends,Stock Splits
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1980-12-12 00:00:00-05:00,0.098834,0.099264,0.098834,0.098834,469033600,0.0,0.0
1980-12-15 00:00:00-05:00,0.094108,0.094108,0.093678,0.093678,175884800,0.0,0.0
1980-12-16 00:00:00-05:00,0.087232,0.087232,0.086802,0.086802,105728000,0.0,0.0
1980-12-17 00:00:00-05:00,0.088951,0.089381,0.088951,0.088951,86441600,0.0,0.0
1980-12-18 00:00:00-05:00,0.09153,0.091959,0.09153,0.09153,73449600,0.0,0.0


<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 11124 entries, 1980-12-12 00:00:00-05:00 to 2025-01-30 00:00:00-05:00
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Open          11124 non-null  float64
 1   High          11124 non-null  float64
 2   Low           11124 non-null  float64
 3   Close         11124 non-null  float64
 4   Volume        11124 non-null  int64  
 5   Dividends     11124 non-null  float64
 6   Stock Splits  11124 non-null  float64
dtypes: float64(6), int64(1)
memory usage: 695.2 KB
None


Unnamed: 0_level_0,Open,High,Low,Close,Volume,Dividends,Stock Splits,Open-0,Open-1,Open-2,...,Volume-4,Volume-8,Dividends-1,Dividends-2,Dividends-4,Dividends-8,Stock Splits-1,Stock Splits-2,Stock Splits-4,Stock Splits-8
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1980-12-24 00:00:00-05:00,0.111726,0.112156,0.111726,0.111726,48003200,0.0,0.0,0.111726,0.106140,0.101842,...,73449600.0,469033600.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1980-12-26 00:00:00-05:00,0.122039,0.122469,0.122039,0.122039,55574400,0.0,0.0,0.122039,0.111726,0.106140,...,48630400.0,175884800.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1980-12-29 00:00:00-05:00,0.123758,0.124188,0.123758,0.123758,93161600,0.0,0.0,0.123758,0.122039,0.111726,...,37363200.0,105728000.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1980-12-30 00:00:00-05:00,0.121180,0.121180,0.120750,0.120750,68880000,0.0,0.0,0.121180,0.123758,0.122039,...,46950400.0,86441600.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1980-12-31 00:00:00-05:00,0.117742,0.117742,0.117313,0.117313,35750400,0.0,0.0,0.117742,0.121180,0.123758,...,48003200.0,73449600.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2025-01-24 00:00:00-05:00,224.779999,225.630005,221.410004,222.779999,54697900,0.0,0.0,224.779999,224.740005,219.789993,...,68488300.0,49630700.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2025-01-27 00:00:00-05:00,224.020004,232.149994,223.979996,229.860001,94863400,0.0,0.0,224.020004,224.779999,224.740005,...,98070400.0,39435300.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2025-01-28 00:00:00-05:00,230.850006,240.190002,230.809998,238.259995,75707600,0.0,0.0,230.850006,224.020004,224.779999,...,64126500.0,39832000.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2025-01-29 00:00:00-05:00,234.119995,239.860001,234.009995,239.360001,45375500,0.0,0.0,234.119995,230.850006,224.020004,...,60234800.0,71759100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


None

None

In [None]:
# data_fetcher = YahooFinanceDataFetcher("AAPL", period="1d")

# df_data = data_fetcher.get_data()

# display(df_data)

# Archive

In [None]:
# def save_data(self, verbose=False):
#     """Saves data to csv file. Saves as 'ticker-period-interval.csv'"""
#     file_name = f"{self.ticker_symbol}-{self.period}-{self.interval}.csv"
#     df = self.get_raw_data(reload=True, verbose=False)
#     df.to_csv(file_name)
#     if verbose:
#         print(f"Saved data as {file_name}")

# YahooFinanceDataFetcher.save_data = save_data

In [None]:
# def get_raw_data(self, reload=False, verbose=False):
#     """Get historical data from Yahoo Finance. If reload=False, gets saved data from GitHub if available"""
#     if not reload:
#         # Try in order: get data from current file structure, get data from GitHub, get data from yfinance
#         try:
#             if verbose:
#                 print("Loading saved data")
#             data = pd.read_csv(f"{self.ticker_symbol}-{self.period}-{self.interval}.csv")
#         except FileNotFoundError:
#             try:
#                 if verbose:
#                     print("No saved data found. Connecting to GitHub")
#                 !git clone https://github.com/luck058/yfinance-prediction
#                 %cd yfinance-prediction
#                 data = pd.read_csv(f"{self.ticker_symbol}-{self.period}-{self.interval}.csv")
#             except FileNotFoundError:
#                 if verbose:
#                     print("No saved data found. Reloading data from Yahoo Finance")
#                 data = self.ticker.history(period=self.period, interval=self.interval)
#     else:
#         if verbose:
#             print("Reloading data from Yahoo Finance. This will not be saved. Use save_data() to save to GitHub")
#         data = self.ticker.history(period=self.period, interval=self.interval)

#     df_data = pd.DataFrame(data)

#     if verbose:
#         display("Historical Data:")
#         display(df_data.head())

#     return df_data

# YahooFinanceDataFetcher.get_raw_data = get_raw_data