In [1]:
import concurrent.futures
from utils.get_ticker_10k_filings import get_ticker_10k_filings
from utils.collect_ticker_files import collect_ticker_files
from utils.delete_txt_files import delete_txt_files
from utils.parse_html_file_mda import parse_html_file_mda
import os
import pandas as pd


# New Function: Process a single HTML file
def process_html_file(html_file, ticker):
    """Parse the MD&A section out of one downloaded 10-K HTML filing.

    The filing year is derived from the fifth "/"-separated component of
    the path, which is expected to look like ``<CIK>-<YY>-<sequence>``
    (three or more "-"-separated parts, the second being a two-digit year).

    Args:
        html_file: Path of the filing. Paths that do not end in ".html"
            are ignored.
        ticker: Ticker symbol stored in the returned record.

    Returns:
        A dict with keys "ticker", "year" (four-digit int) and
        "mda_section" (whatever ``parse_html_file_mda`` returns), or None
        when the file is ignored, its path does not have the expected
        layout, or parsing fails.
    """
    if not html_file.endswith(".html"):
        return None

    # Position of the "<CIK>-<YY>-<sequence>" directory within the path.
    accession_index = 4

    path_parts = html_file.split("/")
    if len(path_parts) <= accession_index:
        # Shallower path than expected: skip instead of raising IndexError.
        print(f"Skipping file with unexpected format: {html_file}")
        return None

    cik_year_acc = path_parts[accession_index].split("-")
    if len(cik_year_acc) < 3:
        print(f"Skipping file with unexpected format: {html_file}")
        return None

    two_digit_year = cik_year_acc[1]
    if len(two_digit_year) != 2 or not two_digit_year.isdecimal():
        # A malformed year must not abort the whole batch with ValueError.
        print(f"Skipping file with unexpected format: {html_file}")
        return None

    # Convert the two-digit year to four digits: 51-99 -> 19xx, 00-50 -> 20xx.
    short_year = int(two_digit_year)
    year = (1900 if short_year > 50 else 2000) + short_year

    try:
        parsed_data = parse_html_file_mda(html_file)
    except Exception as e:
        # Best-effort: one unparseable filing should not stop the others.
        print(f"Could not parse {html_file} due to error: {e}")
        return None

    return {
        "ticker": ticker,
        # "cik" is left out on purpose: the internal ticker-to-CIK mapping
        # is used instead to reduce errors.
        "year": year,
        # "accession_number" is left out until its uniqueness is verified.
        "mda_section": parsed_data,
    }


# Modified Function: Process ticker 10-K data with parallel processing
def process_ticker_10k_data(ticker):
    """Download and parse all 10-K filings for one ticker.

    Steps: download the filings, collect the downloaded file paths, hand
    them to ``delete_txt_files`` (presumably removes the raw .txt copies;
    confirm against that helper), then parse the files concurrently with
    ``process_html_file``.

    Args:
        ticker: Ticker symbol to process.

    Returns:
        A list of the dicts returned by ``process_html_file``, in the order
        the files were listed; files that were skipped or failed to parse
        are omitted. An empty list is returned when the download step fails.
    """
    try:
        get_ticker_10k_filings(ticker)
    except Exception as e:
        print(f"Error occurred while downloading filings for {ticker}: {e}")
        # Empty list (not dict) so both paths return the same type.
        return []

    # collect_ticker_files() maps every ticker to its files; keep only ours.
    ticker_files = collect_ticker_files().get(ticker, [])
    delete_txt_files(ticker_files)

    # Parallel processing of HTML files. Results are gathered into a list:
    # the records carry no "accession_number" key, so keying a dict on it
    # raised KeyError for every successfully parsed filing.
    with concurrent.futures.ThreadPoolExecutor() as executor:
        results = executor.map(
            lambda file: process_html_file(file, ticker), ticker_files
        )
        return [result for result in results if result is not None]


import pandas as pd

# Driver: parse 10-K filings for the first few tickers listed in
# company_tickers.json and export every parsed record to one CSV file.

# Cap on how many tickers are processed per run (the loop previously
# stopped via a manual counter after two tickers).
MAX_TICKERS = 2

# Read the JSON file into a DataFrame
df = pd.read_json("company_tickers.json", orient="index")
tickers = df["ticker"].tolist()

# Collect one DataFrame per ticker and concatenate once at the end;
# concatenating inside the loop re-copies all accumulated rows each time.
ticker_frames = []
for ticker in tickers[:MAX_TICKERS]:
    ticker_data = process_ticker_10k_data(ticker)
    if ticker_data:
        # Convert the list of dictionaries to a DataFrame
        ticker_df = pd.DataFrame(ticker_data)
        print(f"Processed {len(ticker_df)} 10-K filings for {ticker}")
        ticker_frames.append(ticker_df)

# pd.concat raises on an empty list, so fall back to an empty DataFrame
# when no ticker produced any data.
all_tickers_df = (
    pd.concat(ticker_frames, ignore_index=True) if ticker_frames else pd.DataFrame()
)

# Once all tickers are processed, export to CSV
all_tickers_df.to_csv("tickers_10k_data.csv", index=False)

Collecting Tickers: 100%|██████████| 4/4 [00:00<00:00, 1640.00ticker/s]


Files are ready for AMZN
Files are ready for AAPL
Files are ready for GOOGL
Files are ready for MSFT
Processed 22 10-K filings for AAPL
Error occurred while downloading filing for accession number {}: {} 0001032210-00-001961 404 Client Error: Not Found for url: https://www.sec.gov/Archives/edgar/data/789019/000103221000001961/0001.txt


Collecting Tickers: 100%|██████████| 4/4 [00:00<00:00, 1586.05ticker/s]


Files are ready for AMZN
Files are ready for AAPL
Files are ready for GOOGL
Files are ready for MSFT
Processed 22 10-K filings for MSFT
