In [1]:
import os
import json
import re
import pandas as pd
from bs4 import BeautifulSoup
from supabase import create_client
from shutil import rmtree
from dotenv import load_dotenv

from utils.get_ticker_10k_filings import get_ticker_10k_filings
from utils.collect_ticker_files import collect_ticker_files
from utils.new_10k_reports_to_supabase import new_10k_reports_to_supabase
from utils.find_general_section import find_general_section
from utils.delete_txt_files import delete_txt_files
from utils.parse_html_file import parse_html_file

# Supabase API keys
# load_dotenv() is called before the lookups below; presumably it fills
# os.environ from a local .env file (python-dotenv) — TODO confirm where that
# file is expected to live.
load_dotenv()
# Indexing os.environ raises KeyError when a variable is unset, so a missing
# credential stops the notebook on these lines.
SUPABASE_URL = os.environ["SUPABASE_URL"]
SUPABASE_KEY = os.environ["SUPABASE_KEY"]
# Module-level client; process_ticker_10k_data hands it to
# new_10k_reports_to_supabase for the upload.
Client = create_client(SUPABASE_URL, SUPABASE_KEY)


def process_ticker_10k_data(ticker):
    """Download, parse and upload every 10-K filing found for one ticker.

    The filings are fetched with ``get_ticker_10k_filings``, the downloaded
    files are looked up through ``collect_ticker_files``, each ``.html`` file
    is parsed with ``parse_html_file`` and the collected rows are passed to
    ``new_10k_reports_to_supabase`` together with the module-level ``Client``.
    Finally the local ``data`` folder is removed if it exists.

    A file that cannot be processed (unexpected path layout, non-numeric
    year, parse error, parsed data that cannot be serialised to JSON) is
    reported with ``print`` and skipped; it never aborts the whole ticker.

    Args:
        ticker: Ticker symbol. Used as the key into the mapping returned by
            ``collect_ticker_files`` and stored in every row.

    Returns:
        dict: maps the third dash-separated component of the filing folder
        name to a row dict with the keys ``ticker``, ``cik``, ``year``
        (int), ``accession_number`` and ``parsed_data`` (a JSON string).
        Empty when no file could be processed.

    NOTE(review): rows are keyed by that third component only, so two files
    that share it (for example several .html files in one folder) overwrite
    each other and the last one wins — confirm this is intended.
    """
    # Download 10-K filings
    get_ticker_10k_filings(ticker)
    ticker_files = collect_ticker_files().get(ticker, [])

    # Delete .txt files to save space
    delete_txt_files(ticker_files)

    # Rows collected for this ticker, keyed by accession-number component
    all_parsed_data = {}

    for html_file in ticker_files:
        if not html_file.endswith(".html"):
            continue

        # NOTE(review): assumes "/"-separated paths whose fifth component is
        # the filing folder named "<cik>-<year>-<accession>" — TODO confirm
        # against the downloader's layout. Paths that are too short are
        # treated as "unexpected format" instead of raising IndexError.
        path_parts = html_file.split("/")
        cik_year_acc = path_parts[4].split("-") if len(path_parts) > 4 else []

        # Exactly three components are required: the unpacking below would
        # raise ValueError for four or more, not only for fewer than three.
        if len(cik_year_acc) != 3:
            print(f"Skipping file with unexpected format: {html_file}")
            continue

        cik, year_text, accession_number = cik_year_acc

        # Validate the cheap part first so a bad folder name never costs a
        # full HTML parse.
        try:
            year = int(year_text)
        except ValueError:
            print(f"Skipping file with invalid year format in {html_file}")
            continue

        try:
            parsed_data = parse_html_file(html_file)
        except Exception as e:
            print(f"Could not parse {html_file} due to error: {e}")
            continue

        # json.dumps raises TypeError for unsupported objects and ValueError
        # for circular references; report those as what they are instead of
        # as an "invalid year" (ValueError) or an uncaught crash (TypeError).
        try:
            parsed_json = json.dumps(parsed_data)
        except (TypeError, ValueError) as e:
            print(f"Could not serialize parsed data for {html_file} due to error: {e}")
            continue

        all_parsed_data[accession_number] = {
            "ticker": ticker,
            "cik": cik,
            "year": year,
            "accession_number": accession_number,
            "parsed_data": parsed_json,
        }

    # Insert parsed data into Supabase
    new_10k_reports_to_supabase(list(all_parsed_data.values()), Client)

    # Clear the data folder after processing. The folder may be missing when
    # nothing was downloaded for this ticker; that must not raise.
    if os.path.isdir("data"):
        rmtree("data")

    return all_parsed_data


# Example usage
# Replace with your loop over tickers
# Maps each ticker to the dict returned by process_ticker_10k_data.
all_tickers_data = {}
tickers = ["AAPL", "GOOG"]  # Add your list of tickers here

# Tickers are processed one at a time; an exception raised for any ticker
# propagates and stops the loop, leaving the earlier results in
# all_tickers_data.
for ticker in tickers:
    all_tickers_data[ticker] = process_ticker_10k_data(ticker)

2023-09-12 15:12:57,401:INFO - Initializing default bucket(InMemoryBucket) with rates: [limit=10/1000]
2023-09-12 15:12:57,401:INFO - (sync)leaking bucket: <pyrate_limiter.buckets.in_memory_bucket.InMemoryBucket object at 0x12647e850>, 0 items
2023-09-12 15:12:59,407:INFO - (sync)leaking bucket: <pyrate_limiter.buckets.in_memory_bucket.InMemoryBucket object at 0x12647e850>, 4 items
2023-09-12 15:13:01,412:INFO - (sync)leaking bucket: <pyrate_limiter.buckets.in_memory_bucket.InMemoryBucket object at 0x12647e850>, 8 items
2023-09-12 15:13:03,418:INFO - (sync)leaking bucket: <pyrate_limiter.buckets.in_memory_bucket.InMemoryBucket object at 0x12647e850>, 8 items
2023-09-12 15:13:05,423:INFO - (sync)leaking bucket: <pyrate_limiter.buckets.in_memory_bucket.InMemoryBucket object at 0x12647e850>, 9 items
2023-09-12 15:13:07,425:INFO - (sync)leaking bucket: <pyrate_limiter.buckets.in_memory_bucket.InMemoryBucket object at 0x12647e850>, 10 items
2023-09-12 15:13:09,430:INFO - (sync)leaking bucke

Files are ready for AAPL


2023-09-12 15:13:11,442:INFO - (sync)leaking bucket: <pyrate_limiter.buckets.in_memory_bucket.InMemoryBucket object at 0x12647e850>, 7 items
2023-09-12 15:13:13,558:INFO - (sync)leaking bucket: <pyrate_limiter.buckets.in_memory_bucket.InMemoryBucket object at 0x12647e850>, 0 items
2023-09-12 15:13:15,657:INFO - (sync)leaking bucket: <pyrate_limiter.buckets.in_memory_bucket.InMemoryBucket object at 0x12647e850>, 0 items
2023-09-12 15:13:15,679:INFO - HTTP Request: GET https://brezxtvmghfjdcbpdpaa.supabase.co/rest/v1/reports_10k?select=%2A "HTTP/1.1 200 OK"
2023-09-12 15:13:15,870:INFO - HTTP Request: POST https://brezxtvmghfjdcbpdpaa.supabase.co/rest/v1/reports_10k "HTTP/1.1 201 Created"
2023-09-12 15:13:17,662:INFO - (sync)leaking bucket: <pyrate_limiter.buckets.in_memory_bucket.InMemoryBucket object at 0x12647e850>, 4 items
2023-09-12 15:13:19,668:INFO - (sync)leaking bucket: <pyrate_limiter.buckets.in_memory_bucket.InMemoryBucket object at 0x12647e850>, 7 items
Collecting Tickers: 10

Files are ready for GOOG


2023-09-12 15:13:21,676:INFO - (sync)leaking bucket: <pyrate_limiter.buckets.in_memory_bucket.InMemoryBucket object at 0x12647e850>, 8 items
2023-09-12 15:13:23,575:INFO - HTTP Request: GET https://brezxtvmghfjdcbpdpaa.supabase.co/rest/v1/reports_10k?select=%2A "HTTP/1.1 200 OK"
2023-09-12 15:13:23,674:INFO - HTTP Request: POST https://brezxtvmghfjdcbpdpaa.supabase.co/rest/v1/reports_10k "HTTP/1.1 201 Created"


2023-09-12 15:13:23,717:INFO - (sync)leaking bucket: <pyrate_limiter.buckets.in_memory_bucket.InMemoryBucket object at 0x12647e850>, 0 items
2023-09-12 15:13:25,722:INFO - (sync)leaking bucket: <pyrate_limiter.buckets.in_memory_bucket.InMemoryBucket object at 0x12647e850>, 0 items
2023-09-12 15:13:27,728:INFO - (sync)leaking bucket: <pyrate_limiter.buckets.in_memory_bucket.InMemoryBucket object at 0x12647e850>, 0 items
2023-09-12 15:13:29,734:INFO - (sync)leaking bucket: <pyrate_limiter.buckets.in_memory_bucket.InMemoryBucket object at 0x12647e850>, 0 items
2023-09-12 15:13:31,739:INFO - (sync)leaking bucket: <pyrate_limiter.buckets.in_memory_bucket.InMemoryBucket object at 0x12647e850>, 0 items
2023-09-12 15:13:33,745:INFO - (sync)leaking bucket: <pyrate_limiter.buckets.in_memory_bucket.InMemoryBucket object at 0x12647e850>, 0 items
2023-09-12 15:13:35,746:INFO - (sync)leaking bucket: <pyrate_limiter.buckets.in_memory_bucket.InMemoryBucket object at 0x12647e850>, 0 items
2023-09-12 15

In [4]:
import pandas as pd

# Full run: process every ticker listed in the local company_tickers.json.
# orient='index' treats each top-level JSON key as one row label.
df = pd.read_json('company_tickers.json', orient='index')
all_tickers_data = {}
failed_tickers = {}  # ticker -> repr of the exception that stopped it
tickers = df['ticker'].tolist()

for ticker in tickers:
    try:
        all_tickers_data[ticker] = process_ticker_10k_data(ticker)
    except Exception as e:
        # This loop covers thousands of tickers and every iteration does
        # network I/O; one failing ticker must not discard the rest of the
        # run. The failure is recorded so it can be inspected and retried.
        failed_tickers[ticker] = repr(e)
        print(f"Failed to process {ticker}: {e}")


10909

In [6]:
# Load the ticker list on its own and report how many rows it has.
import pandas as pd 
# orient='index' treats each top-level JSON key as one row label.
df = pd.read_json('company_tickers.json', orient='index')
print(len(df))

10909


In [7]:
# Preview the first rows to check the columns (cik_str, ticker, title).
df.head()

Unnamed: 0,cik_str,ticker,title
0,320193,AAPL,Apple Inc.
1,789019,MSFT,MICROSOFT CORP
2,1652044,GOOGL,Alphabet Inc.
3,1018724,AMZN,AMAZON COM INC
4,1045810,NVDA,NVIDIA CORP


In [9]:
# Count distinct CIKs. In the recorded run this is lower than the row count
# (8350 vs 10909), so some CIKs are listed under more than one ticker.
df['cik_str'].nunique()

8350