# Automating Stock Market Data Extraction & Load
### Extraction from yfinance module, load to Excel workbook

In [1]:
# Importing the relevant modules and libraries

import yfinance as yf # Module containing stock market data --> where we'll extract our figures from 
import pandas as pd   # General purpose data library
import openpyxl       # Work with Excel documents
import datetime       # Working with + formatting datetime objects

In [93]:
# Capturing today's date as a pandas Timestamp (midnight, no timezone) so it can be compared against the extraction's index later on
# 'yesterday' holds the previous calendar day as a formatted string, under the YYYY-MM-DD format

today = pd.Timestamp(datetime.date.today())
yesterday = (today - pd.Timedelta(days=1)).strftime('%Y-%m-%d')

>In order to extract data from the intended stock tickers, we first need to __define what tickers we want to fetch data for__;
>* This information will then be stored in the `stocks_list` list-object

In [96]:
# Tickers to download; the same list is looped over again when writing to the workbook
stocks_list = [
    'AAPL',
    'MSFT',
]

## Extraction  - `yfinance` module

In [99]:
# Downloading the last 5 trading days of daily figures for every ticker in stocks_list.
# The default column layout - a two-level (Price, Ticker) MultiIndex - is kept on purpose:
# multi_level_index=False flattens the columns when a single ticker is requested, and the
# reshaping done later in transform_df() relies on both column levels being present.
# With two or more tickers the result is the same either way.
df = yf.download(tickers=stocks_list, period='5d')

[*********************100%***********************]  2 of 2 completed


In [101]:
df

Price,Close,Close,High,High,Low,Low,Open,Open,Volume,Volume
Ticker,AAPL,MSFT,AAPL,MSFT,AAPL,MSFT,AAPL,MSFT,AAPL,MSFT
Date,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2
2025-04-17,196.979996,367.779999,198.830002,374.320007,194.419998,366.890015,197.199997,373.75,51334300,20943700
2025-04-21,193.160004,359.119995,193.800003,364.480011,189.809998,355.670013,193.270004,362.820007,46742500,20807300
2025-04-22,199.740005,366.820007,201.589996,367.769989,195.970001,359.859985,196.119995,363.380005,52976400,19485000
2025-04-23,204.600006,374.390015,208.0,380.390015,202.800003,373.019989,206.0,376.059998,52863100,20530000
2025-04-24,207.869995,387.119995,208.100006,387.350006,202.940002,375.190002,204.884995,375.98999,23872309,10493581


### Quick Thought on Dates:
>Leave the `period` parameter set to '5d' within yf.download()
>Before the data is reshaped with `.unstack()` / `.stack()` in the `transform_df()` function, check whether `df.index[-1]` (a `datetime`-like object - the most recent date in the extraction) is equal to the `today` variable:
>* If it IS equal, then keep the day before THAT only (df.index[-2])
>* If it ISN'T equal, then keep df.index[-1] only

In [120]:
# Picking which row of the extraction to keep (as explained above):
#   -2 --> today's date is the last one in the extraction, so that row may still hold live (unfinalised) figures;
#          we step back to the trading day before it
#   -1 --> the last date in the extraction is NOT today (e.g. the extraction ran before the market opened),
#          so the last row already holds finalised figures and can be used as is

day_to_fetch = -2 if today == df.index[-1] else -1

In [112]:
## Keeping only the intended date: indexing df.index with a list returns a one-element Index,
## so .loc hands back a one-row DataFrame (not a Series)
## Note: df is overwritten here, so re-running this cell acts on the already-reduced frame

df = df.loc[df.index[[day_to_fetch]]]
df

Price,Close,Close,High,High,Low,Low,Open,Open,Volume,Volume
Ticker,AAPL,MSFT,AAPL,MSFT,AAPL,MSFT,AAPL,MSFT,AAPL,MSFT
Date,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2
2025-04-23,204.600006,374.390015,208.0,380.390015,202.800003,373.019989,206.0,376.059998,52863100,20530000


In [57]:
# Ad-hoc check (hard-coded date): does a parsed date string compare equal to the last entry of df's index?
pd.to_datetime('2025-04-24') == df.index[-1]

True

In [55]:
# Inspecting the last entry of df's index (value and type)
df.index[-1]

Timestamp('2025-04-24 00:00:00')

## Transformation - Cleaning the Extracted Data

In [14]:
def transform_df(param_df):
    '''
    This function will take a df in as an input (param_df), and apply all necessary changes to it;
    
    Steps to be taken are:
    - Create a copy of the dataframe for added safety
    - Using a combination of .unstack(), .stack(), and .reset_index() to convert the originally-MultiIndex df into a simpler, neater format
    - Drop the 'Volume' column as it is not necessary for our end goal
    - Rename both axes (0 and 1 - rows and columns) to None, to make the dataframe look neater with no unnecessary axis names
    - Round all columns to 2 decimal places
    - Reorder the columns so they match the destination Excel file
    
    Finally, this function will end by returning the altered dataframe
    '''
    df_copy = param_df.copy()                                         # Creating a copy of the original df
    df_copy = df_copy.unstack().to_frame().unstack(level=0)\
    .stack(level=0, future_stack=True).reset_index('Date')\
    .reset_index(level=1, drop=True)                                  # Using .unstack() and .stack() to get the originally-MultiIndex df to the right format
                                                                      # Also using future_stack=True in .stack() to prevent a FutureWarning --> avoiding deprecation in future pandas versions!
    
    df_copy.drop(columns='Volume', inplace=True)                      # Dropping the Volume column (unnecessary)
    
    df_copy.rename_axis(index=None, columns=None, inplace=True)       # Renaming both the index and olumn axes to None --> dropping unnecessary axis names

    for i in df_copy.drop(columns='Date').columns:
        df_copy[i] = df_copy[i].apply(lambda x: round(x, 2))          # Looping through the columns in our df (except 'Date') and using a lambda function to round all figures to 2 decimals
                                                                      # Note: the 'Date' column isn't actually dropped from the df as we're not using inplace=True --> it is only a 'temporary'
                                                                      # drop so we don't attempt to round the 'Date' column --> can't round a datetime object

    df_copy = df_copy[['Date', 'Open', 'High', 'Low', 'Close']]       # Reordering the columns so they match the order in the destination Excel file

    return df_copy                                                    # Finally, returning the altered df

# Note: Need to Rewrite the Markdown Below

>Having defined the full transform_df function above with all necessary changes, we __now apply it to our original dataframe__ and __permanently change it__
>
>However, so as to __avoid working with a stale, previously-altered version of the dataframe__, we will first make sure the __altered version of the dataframe does not yet exist__
>
>* The newly-altered df will be saved in `new_df`;
>* Hence, we will first check if `'new_df' in locals()`:
>    * `locals()` is a dictionary-type object which contains all the existing variables in the file, keyed by their names (hence the quotes around `'new_df'`)
>* If the newly-altered dataframe __has already been created__, checking `'new_df' in locals()` will return __`True`:__
>    * Should that be the case, the existing `new_df` is deleted (`del new_df`), so that no leftover copy from a previous run is kept around
>* If `'new_df' in locals()` returns False - and hence the dataframe has not yet been created - there is nothing to delete
>* In both cases, we then effectively __create new_df__ by applying `transform_df()` to the original dataframe

In [18]:
# Clearing any new_df left over from a previous run before rebuilding it from the current df
if 'new_df' in locals():
    del new_df

new_df = transform_df(df)

>We now __check new_df__ to make sure it __looks exactly as intended:__

In [21]:
new_df

Unnamed: 0,Date,Open,High,Low,Close
AAPL,2025-04-22,196.07,201.55,196.0,199.74
MSFT,2025-04-22,363.38,367.76,359.86,366.82


>**Success!**
>* The altered DataFrame is now in a __much simpler and neater format to work with__, where the __stock tickers__ are the <u>index labels</u> and the <u>only level of columns</u> are the __Date__ of the extracted figures and the __4 figures__ themselves (Open, High, Low, Close)
>* These 4 figures are __what we want to load into the destination Excel workbook__
>* The Date will be used __simply to match the correct row on the destination file__ -- The data will be <u>appended where the Date in new_df matches the Date column in the destination file</u>

## Load to Excel - WIP

>The __`Load to Excel`__ stage will be comprised of two main stages:
>
>**1. Reading the existing destination file:**
>    * When appending the data to the destination file, we will need to append it to the correct row;
>    * The correct row will be the one where the Date matches that of the previous stock data extraction;
>        * As in, the destination file contains a Date column (which is aliased as the ticker name, lower-cased) --> this Date column is already populated with future dates. We will want to append the data onto the file only on the specific row where the Date matches the one in our previously done extraction;
>    * As such, the reading of the destination file is crucial to __work out the correct row number to append the data to later on__
>
>**2. Loading the data to the existing destination file:**
>* Once we've worked out what the correct row number to append our data to is, we will then move forward with the load of the data into the destination file, onto the correct row

### Reading the Destination File

>**Key Note 1:**
>* The destination file contains __separate sheets for each stock__
>* The __sheet names__ are the <u>stock tickers, lower-cased</u> (e.g. the sheet with data for the AAPL ticker is named 'aapl')
>* We will access each sheet by __looping through the elements__ in `stocks_list` (the list of tickers we downloaded data from, defined at the start of the script) 
>
>**Steps:**
>
>* Save the file path within the `path` variable; 
>* Use `.read_excel()` to access each sheet in the document;
>    * __sheet_name__ must be passed and set to the ticker name (__lower-cased string__);
>    * __parse_dates__ must be set to `True`, date_format must be set to `%Y-%m-%d`
>* Fetch the correct row number to later append the data to;
>    * Do so by __working out where the Date value in the destination file <u>matches</u> the Date value of our extracted data__
> 
>**Key Note 2:**
>* We will be using `boolean masking` to work out the row where the Dates match in the Excel file --> once the row where they match has been found, we will use the `.index` attribute, followed by the [0] subscripting operator --> thus getting back an integer, which represents the row number in question.
>* **However:**
>    * Due to the formatting of the destination file (__headers take up 2 rows rather than just one__) AND the fact that __Python is 0-indexed whereas Excel starts each sheet at row 1__, we will then need to __add 2 to the integer we get back from the boolean masking operation__
>    * E.g. if we find that the Dates match on row 5, that means that, __on the destination file__, the Dates would __actually match on row 7__ (5+2)
>
>**Key Note 3:**
>* The separate sheets for each stock ticker follow the __exact same structure__, they're essentially __exact copies of each other__, just with <u>different figures populating the cells</u>
>* As such, the correct row number will be __the exact same for _ALL_ sheets__
>* Due to this, we will __only need to read _ONE_ of the sheets__ in the destination file, as this will give us the correct row number for ___ALL___

In [28]:
# File name of the destination workbook (a relative path, so it is resolved against the current working directory)

final_file = 'CFDv1 (2020-2025).xlsx'

In [30]:
# Using .read_excel() to load ONE sheet of the destination file (the first ticker's) into a DataFrame
# No `header` argument is passed, so pandas uses the sheet's first row as the column labels

xl = pd.read_excel(final_file,                              # Path for destination file
                   sheet_name=stocks_list[0].lower(), # Fetching only the first ticker in stocks_list (as we only need to read 1 sheet for this stage)
                                                      # Also lowering the string as names are lowered in the destination file 
                   parse_dates=True,                  # Parsing dates so we can work with them as datetime objects
                   date_format='%Y-%m-%d')            # Defining the format dates are stored in on the file

In [32]:
# Looking at the first 10 rows of the destination file

xl.head(10)

Unnamed: 0,aapl,opp LTT 21,opp LTT 52,indic sl 21,indic sl52,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8,Unnamed: 9,...,Unnamed: 31,10.1,7.3,profit 4,Unnamed: 35,Unnamed: 36,Unnamed: 37,Unnamed: 38,Unnamed: 39,Unnamed: 40
0,,0,0,10,21,,INDICADORES,,,,...,,65.967332,-12.345492,53.621839,,,,,,
1,date,open,high,low,close,ATR,b52,s52,b21,s21,...,LTT trend,LTT s21 ref,LTT s21 valid,profit_b21,TT_s21,low<s21,e_s21,sl_s21,sl_s21_ATR,exit_s21
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,,,,,,,,,,,...,,,,,,,,,,
6,,,,,,,,,,,...,,,,,,,,,,
7,,,,,,,,,,,...,,,,,,,,,,
8,,,,,,,,,,,...,,,,,,,,,,
9,2019-10-01 00:00:00,56.267502,57.055,56.049999,56.147499,0.071786,,,,,...,,,,,,,,,,


>Even though the file is structured in a strange manner when read using Pandas, __we will not be transforming it or cleaning it up in any way__;
>* This is because we want to keep the file __exactly as it is__ so we can work out what the correct row number to append our data is, __without having changed it through any cleaning or transformations__
>
>__This is crucial to ensure accuracy in the integer we get back representing the correct row number__

In [35]:
# Using boolean masking to find the row of `xl` whose date matches the Date of the extracted data,
# then taking the first matching index label as an integer and storing it in row_number.
#
# Both lookups are derived from stocks_list[0] - the same ticker whose sheet was read into `xl` -
# instead of hard-coding 'aapl' / 'AAPL', so changing the first ticker doesn't break this cell.
# (The date column is expected to be headed by the lower-cased ticker name, as it is on the 'aapl' sheet.)

reference_ticker = stocks_list[0]
extraction_date = new_df.loc[reference_ticker, 'Date']              # single .loc call rather than chained indexing
matching_rows = xl[xl[reference_ticker.lower()] == extraction_date].index

if matching_rows.empty:
    # Same exception type that .index[0] would raise on an empty result, with a message that says what went wrong
    raise IndexError(f'No row dated {extraction_date} found in the {reference_ticker.lower()!r} sheet')

row_number = matching_rows[0]
row_number

1422

>**Reminder:**
>* As mentioned above, the matching row sits at __Excel row `row_number` + 2__ (e.g. getting back 5 from the boolean masking operation means the correct Excel row is actually 7)
>* However, the `startrow` argument of `.to_excel()` is __0-indexed__, so the value we store and pass on is __`row_number` + 1__ (e.g. 5 becomes 6, which is Excel row 7)

>Additionally, we will now store the revised row number (having added 1 to it) in `correct_row`
>* To ensure we don't accidentally add 1 to it several times and thus damage the reliability of this operation, we will:
>* Check if correct_row exists already
>    * If it doesn't:
>        * We will create it and add 1 to `row_number`, storing `row_number` + 1 in `correct_row`
>    * If it does:
>        * We will check if correct_row - row_number == 1:
>            * In doing this, we are making sure that we have in fact only added 1 to the original row_number and not more than that
>            * If that returns __True__ (`correct_row` exists already _AND_ the difference between it and `row_number` __is in fact 1__), we will __do nothing else__ (`pass`)
>            * If that returns __False__ (`correct_row` exists already _BUT_ the difference between it and `row_number` __is NOT 1__), we will once more __execute `correct_row` = `row_number` + 1__ to __ensure it is the correct number__

In [37]:
# row_number + 1 is the 0-based position of the matching row on the sheet (read_excel used the sheet's
# first row as the header), which is the form the `startrow` argument of .to_excel() expects.
# correct_row is only (re)assigned when it is missing or is not exactly row_number + 1,
# so running this cell several times can never stack the offset.
if not ('correct_row' in locals() and (correct_row - row_number) == 1):
    correct_row = row_number + 1

print(f'Difference between correct_number and the original row_number: {correct_row - row_number}')
print(f'Correct row: {correct_row}')

Difference between correct_number and the original row_number: 1
Correct row: 1423


>Now that we've successfully worked out what the correct row number is to append our data to, we will move on to the `loading to destination file` stage! 

>**Key Note:**
>* The separate sheets for each stock ticker follow the __exact same structure__, they're essentially __exact copies of each other__, just with <u>different figures populating the cells</u>
>* As such, the correct row number will be __the exact same for _ALL_ sheets__
>* Due to this, we __needed to read only _ONE_ sheet__ in the previous stage, as this will give us the correct row number for ___ALL___

### Loading the Data to Excel

>Intermediate file first

In [59]:
intermediate_file = 'output_file.xlsx' # File name of the intermediate workbook (relative path)

In [63]:
# Writing each ticker's figures (every column except 'Date') to its own sheet of the intermediate workbook.
# Each sheet receives a single row starting at cell A1, with neither header nor index.
# mode='w' means the intermediate workbook is created from scratch on every run.
with pd.ExcelWriter(intermediate_file, mode='w') as writer:
    for ticker in stocks_list:
        # .loc[ticker] returns the ticker's figures as a Series --> turned back into a one-row DataFrame
        data_to_append = new_df.drop(columns='Date').loc[ticker].to_frame().transpose()
        data_to_append.to_excel(
            writer,
            sheet_name=ticker,
            index=False,
            header=False,
            engine='openpyxl',
        )

>__Success in appending to intermediate file!__
>
>Now moving on to copying the intended cells from the intermediate and pasting into the desired range within the destination file 

In [45]:
# wb = openpyxl.load_workbook(path)

In [91]:
# Opening both workbooks with openpyxl: the intermediate file is the source of the figures,
# the final file is the workbook they get pasted into
intermediate_workbook = openpyxl.load_workbook(intermediate_file)
final_workbook = openpyxl.load_workbook(final_file)

In [111]:
# Copying the 4 figures of each ticker from the intermediate workbook (cells A1:D1 of the ticker's sheet)
# into the matching sheet of the final workbook, starting at column B of the target row.

start_col_dest_file = 2 # Paste the data starting from column B in the destination file (openpyxl columns are 1-indexed)

# correct_row is computed as row_number + 1, i.e. the 0-based position of the target row on the sheet.
# openpyxl's .cell() uses 1-based row numbers, so one more is added --> otherwise the figures
# would land on the row ABOVE the one whose Date matches.
target_row = correct_row + 1

for ticker in stocks_list:
    source_sheet = intermediate_workbook[ticker]
    target_sheet = final_workbook[ticker.lower()] # sheet names in the final destination file are the lower-cased ticker names

    source_range = source_sheet['A1:D1'] # The range to copy from in the intermediate file will always be [A1:D1] for every single sheet

    for col_offset, source_cell in enumerate(source_range[0]): # source_range[0] gets the first (and only) row; col_offset runs from 0 to 3
        target_sheet.cell(row=target_row,
                          column=start_col_dest_file + col_offset,
                          value=source_cell.value)

# Saving once, after every sheet has been updated, instead of re-writing the whole workbook for each ticker
final_workbook.save(final_file)

## One-File Solution!!!

In [39]:
# Writing each ticker's figures straight into the existing destination workbook:
#   mode='a'                   --> open the existing file instead of replacing it
#   if_sheet_exists='overlay'  --> write into the existing sheet without deleting its current contents
#   startrow / startcol        --> 0-indexed position of the first cell written (startcol=1 is column B)
#
# The workbook is saved and closed automatically when the `with` block ends, so no explicit
# save()/close() call on the underlying workbook is needed (the writer would save it again on exit anyway).
# No `engine` is passed to .to_excel(): that argument is ignored when an ExcelWriter object is given.
with pd.ExcelWriter(final_file, mode='a', if_sheet_exists='overlay') as writer:
    for ticker in stocks_list:
        data_to_append = pd.DataFrame(new_df.drop(columns='Date').loc[ticker]).transpose()
        data_to_append.to_excel(
            excel_writer=writer,
            sheet_name=ticker.lower(), # sheet names in the destination file are the lower-cased ticker names
            index=False,
            header=False,
            startrow=correct_row,
            startcol=1,
        )

# wb.close()

* Attempt 2

In [None]:
# Scratch attempt: writing each ticker's figures at the target row through an already-existing `writer`
# NOTE(review): `writer` is not created in this cell; it can only refer to an ExcelWriter left over from another cell,
#               and a writer opened through a `with` block is already closed once that block has ended.
# NOTE(review): sheet_name is NOT lower-cased here, unlike the other cells that write to the destination file - confirm.
for ticker in stocks_list:
    data_to_append = pd.DataFrame(new_df.drop(columns='Date').loc[ticker]).transpose()
    data_to_append.to_excel(excel_writer=writer, engine='xlsxwriter', sheet_name=ticker, index=False, header=False, startrow=correct_row, startcol=1)

# Thoughts

>Note that creating an ExcelWriter object with a file name that already exists will result in the contents of the existing file being erased (this applies to the default `mode='w'`; with `mode='a'` the existing workbook is opened and kept).

> Because of the above note, it might be a good idea to first ship each dataframe to an entirely new excel file, and THEN use pywin32 to copy the cells and paste them on the ACTUAL destination file

# Tests down below

In [93]:
# Scratch test: appending a DataFrame as a sheet named 'aapl' to an existing workbook
# NOTE(review): `path` and `aapl_df` are not defined anywhere in the visible notebook - this cell cannot run from a fresh kernel as written.
with pd.ExcelWriter(path, mode="a", engine="openpyxl") as writer:
    aapl_df.to_excel(writer, sheet_name='aapl')  

KeyboardInterrupt: 

In [560]:
# Scratch test: printing the figures stored under each ticker label, separated by a blank line
# NOTE(review): this expects `df` to be indexed by ticker, whereas the `df` built earlier in this notebook is indexed by Date - confirm which frame was intended.
for stock in stocks_list:
    print(df.loc[stock])
    print('\n')

Close    196.98
High     198.83
Low      194.42
Open     197.20
Name: AAPL, dtype: float64


Close    241.37
High     244.34
Low      237.68
Open     243.47
Name: TSLA, dtype: float64


