In [4]:
import concurrent.futures
from datetime import datetime
from utils.get_ticker_10k_filings import get_ticker_10k_filings
from utils.collect_ticker_files import collect_ticker_files
from utils.delete_txt_files import delete_txt_files
from utils.parse_html_file_mda import parse_html_file_mda
import os
import pandas as pd


# New Function: Process a single HTML file
def process_html_file(html_file, ticker):
    """Parse the MD&A section of one downloaded 10-K HTML file.

    The filing year is read from the fifth ``/``-separated component of
    ``html_file``, which is expected to contain at least three
    ``-``-separated fields with a two-digit year in the second one.
    Two-digit years above 50 are mapped to 19xx, all others to 20xx.

    Args:
        html_file: Path of the filing, using ``/`` as the separator.
        ticker: Ticker symbol stored in the returned record.

    Returns:
        A dict with the keys ``ticker``, ``year`` (int), ``mda_section`` and
        ``processed_timestamp``, or ``None`` when the file is not ``.html``,
        the path does not match the expected layout, or parsing fails.
    """
    if not html_file.endswith(".html"):
        return None

    path_parts = html_file.split("/")
    # A path with fewer than five components cannot carry the directory name
    # the year is read from; skip it instead of raising IndexError.
    if len(path_parts) < 5:
        print(f"Skipping file with unexpected format: {html_file}")
        return None

    cik_year_acc = path_parts[4].split("-")
    if len(cik_year_acc) < 3:
        print(f"Skipping file with unexpected format: {html_file}")
        return None

    two_digit_year = cik_year_acc[1]
    try:
        century = "19" if int(two_digit_year) > 50 else "20"
        year = int(century + two_digit_year)
    except ValueError:
        # A non-numeric year field is a malformed path, not a fatal error:
        # one bad file must not abort the whole batch.
        print(f"Skipping file with unexpected format: {html_file}")
        return None

    try:
        parsed_data = parse_html_file_mda(html_file)
    except Exception as e:
        # Best effort: report the failure and let the caller drop this file.
        print(f"Could not parse {html_file} due to error: {e}")
        return None

    return {
        "ticker": ticker,
        "year": year,
        "mda_section": parsed_data,
        "processed_timestamp": datetime.now(),
    }


# Modified Function: Process ticker 10-K data with parallel processing
def process_ticker_10k_data(ticker):
    """Download the 10-K filings for ``ticker`` and parse them in parallel.

    Args:
        ticker: Ticker symbol whose filings should be fetched and parsed.

    Returns:
        A list with one record per successfully parsed HTML file (the dicts
        produced by ``process_html_file``), in the order the files were
        collected. An empty list is returned when the download step fails.
    """
    try:
        get_ticker_10k_filings(ticker)
    except Exception as e:
        print(f"Error occurred while downloading filings for {ticker}: {e}")
        # Same (list) type as the success path, so callers see one type.
        return []

    ticker_files = collect_ticker_files().get(ticker, [])
    delete_txt_files(ticker_files)

    # Parse the files in a thread pool; executor.map yields results in
    # input order.
    with concurrent.futures.ThreadPoolExecutor() as executor:
        results = executor.map(
            lambda file: process_html_file(file, ticker), ticker_files
        )
        # Collect straight into a list. Keying the records by their
        # processed_timestamp would silently drop any two filings that
        # happened to finish on the same clock tick.
        return [result for result in results if result is not None]


# Function to process a single ticker (to be used in parallel processing)
def process_single_ticker(ticker):
    """Return the parsed 10-K records for ``ticker`` as a DataFrame.

    An empty DataFrame is returned when ``process_ticker_10k_data`` yields
    nothing for the ticker; otherwise the record count is printed and a
    frame built from the records is returned.
    """
    filings = process_ticker_10k_data(ticker)

    # Nothing parsed: hand back an empty frame so callers can still concat.
    if not filings:
        return pd.DataFrame()

    frame = pd.DataFrame(filings)
    print(f"Processed {len(frame)} 10-K filings for {ticker}")
    return frame


# Load the ticker table and pull out the ticker symbols.
df = pd.read_json("company_tickers.json", orient="index")
tickers = df["ticker"].tolist()

# Number of tickers handled in this run; raise it to process a larger batch.
MAX_TICKERS = 3

# Process the tickers in parallel. Results are read back in submission order
# so the rows of the exported CSV come out in the same order on every run
# (as_completed would yield them in whatever order the workers finish).
with concurrent.futures.ThreadPoolExecutor() as executor:
    futures = [
        executor.submit(process_single_ticker, ticker)
        for ticker in tickers[:MAX_TICKERS]
    ]
    all_tickers_data_frames = [future.result() for future in futures]

# Combine the per-ticker frames. pd.concat raises ValueError on an empty
# list, so fall back to an empty frame when there was nothing to process.
all_tickers_df = (
    pd.concat(all_tickers_data_frames, ignore_index=True)
    if all_tickers_data_frames
    else pd.DataFrame()
)

# Once all tickers are processed, export to CSV.
all_tickers_df.to_csv("tickers_10k_data.csv", index=False)

Processed 2 10-K filings for MSFT

Collecting Tickers: 100%|██████████| 4/4 [00:00<00:00, 997.28ticker/s]

Files are ready for AMZN
Files are ready for AAPL
Files are ready for GOOGL
Files are ready for MSFT





Processed 8 10-K filings for GOOGL
Error occurred while downloading filing for accession number {}: {} 0001032210-00-001961 404 Client Error: Not Found for url: https://www.sec.gov/Archives/edgar/data/789019/000103221000001961/0001.txt


Collecting Tickers: 100%|██████████| 4/4 [00:00<00:00, 1206.65ticker/s]


Files are ready for AMZN
Files are ready for AAPL
Files are ready for GOOGL
Files are ready for MSFT
Processed 22 10-K filings for AAPL


Collecting Tickers: 100%|██████████| 4/4 [00:00<00:00, 1373.61ticker/s]

Files are ready for AMZN
Files are ready for AAPL
Files are ready for GOOGL
Files are ready for MSFT





Processed 22 10-K filings for MSFT


In [1]:
import pandas as pd

# Load the ticker table from the JSON file in the working directory.
df = pd.read_json(path_or_buf="company_tickers.json", orient="index")

In [4]:
# Keep only the first two rows of the ticker table as a small batch; the bare
# name on the last line renders the batch as the cell output.
tickers_batch = df.head(2)
tickers_batch

Unnamed: 0,cik_str,ticker,title
0,320193,AAPL,Apple Inc.
1,789019,MSFT,MICROSOFT CORP


In [9]:
# Walk the batch row by row, printing each row's type and then its ticker
# symbol on separate lines.
for index, row in tickers_batch.iterrows():
    print(type(row), row["ticker"], sep="\n")

<class 'pandas.core.series.Series'>
AAPL
<class 'pandas.core.series.Series'>
MSFT
