In [1]:
import os
import asyncio
import logging
import warnings
import json
from datetime import datetime
from glob import glob
from typing import List, Dict, Optional
import datamule as dm
import pandas as pd
from selectolax.parser import HTMLParser
from config import CONFIG  # Import the config dictionary

# Suppress warnings
warnings.filterwarnings("ignore")

# Get today's date
today = datetime.today()

# Format the date in 'YYYY-MM-DD' format
today_date = today.strftime('%Y-%m-%d')

tickers = CONFIG['TICKERS']
start = CONFIG['START_DATE']
end = CONFIG['END_DATE']
base_dir = CONFIG['BASE_DIR']

class SECDownloader:
    def __init__(self):
        self.downloader = dm.Downloader()
        logging.basicConfig(level=logging.INFO)
        self.logger = logging.getLogger(__name__)
        
    def set_user_agent(self, user_agent: str) -> None:
        """Set SEC user agent information."""
        try:
            self.downloader.set_headers(user_agent)
        except Exception as e:
            self.logger.error(f"Failed to set user agent: {str(e)}")
            raise
            
    async def download_filings(self, ticker: str, start_date: str, end_date: str, output_dir: str) -> None:
        """Download SEC filings with proper error handling."""
        try:
            await self.downloader.download(
                ticker=ticker,
                form=['10-K', '10-Q', '8-K'],  # Specify forms explicitly
                date=(start_date, end_date),
                output_dir=output_dir,
                return_urls=False  # Ensure we're downloading files
            )
        except ValueError as e:
            self.logger.error(f"Value error downloading filings for {ticker}: {str(e)}")
        except Exception as e:
            self.logger.error(f"Error downloading filings for {ticker}: {str(e)}")

    async def download_concepts(self, ticker: str, output_dir: str) -> None:
        """Download company concepts data with proper error handling."""
        try:
            await self.downloader.download_company_concepts(
                ticker=ticker,
                output_dir=output_dir
            )
        except Exception as e:
            self.logger.error(f"Error downloading company concepts for {ticker}: {str(e)}")

    async def process_ticker(self, ticker: str, start: str, end: str, base_dir: str) -> None:
        """Process a single ticker's downloads."""
        try:
            # Create directory structure
            ticker_dir = os.path.join(base_dir, ticker)
            filings_dir = os.path.join(ticker_dir, 'filings')
            concepts_dir = os.path.join(ticker_dir, 'company_concepts')
            
            os.makedirs(filings_dir, exist_ok=True)
            os.makedirs(concepts_dir, exist_ok=True)

            # Download both filings and concepts concurrently
            await asyncio.gather(
                self.download_filings(ticker, start, end, filings_dir),
                self.download_concepts(ticker, concepts_dir)
            )
            
        except Exception as e:
            self.logger.error(f"Failed to process ticker {ticker}: {str(e)}")

    async def download_all_data(self, tickers: List[str], start: str, end: str, base_dir: str = 'sec_data') -> None:
        """Download all SEC data for given tickers."""
        tasks = []
        for ticker in tickers:
            task = self.process_ticker(ticker, start, end, base_dir)
            tasks.append(task)
        
        await asyncio.gather(*tasks)

# Initialize downloader
sec_downloader = SECDownloader()

try:
    # Set user agent (required by SEC)
    sec_downloader.set_user_agent("Your Name your@email.com")
    
    # Create and run async download task
    async def run_downloads():
        await sec_downloader.download_all_data(tickers, start, end, base_dir)
        
    asyncio.run(run_downloads())
    
except KeyboardInterrupt:
    sec_downloader.logger.warning("\nDownload interrupted by user")
except Exception as e:
    sec_downloader.logger.error(f"Fatal error: {str(e)}")
finally:
    sec_downloader.logger.info("Download process completed")

Downloading files:   0%|                                                                            | 0/1 [00:00<?, ?it/s]




Downloading files:   0%|                                                                            | 0/1 [00:00<?, ?it/s]

[A




Downloading files: 100%|████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  1.65it/s]

[A

Downloading files: 100%|████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  1.65it/s]


ERROR:__main__:Error downloading company concepts for REGN: object NoneType can't be used in 'await' expression



Successfully downloaded 1 out of 1 URLs





Fetching URLs:   0%|                                                                               | 0/10 [00:00<?, ?it/s]

[A





Fetching URLs:   0%|                                                                               | 0/10 [00:00<?, ?it/s]

[A[A

Downloading files: 100%|████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  1.15it/s]

Downloading files: 100%|████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  1.15it/s]





Successfully downloaded 1 out of 1 URLs





Fetching URLs:  10%|███████                                                                | 1/10 [00:00<00:03,  2.39it/s]

[A




Fetching URLs:  30%|█████████████████████▎                                                 | 3/10 [00:00<00:01,  6.33it/s]

[A





Fetching URLs:  10%|███████                                                                | 1/10 [00:00<00:05,  1.60it/s]

[A[A




Fetching URLs:  90%|███████████████████████████████████████████████████████████████▉       | 9/10 [00:00<00:00, 16.98it/s]

[A

Fetching URLs: 100%|██████████████████████████████████████████████████████████████████████| 10/10 [00:00<00:00, 12.00it/s]




https://efts.sec.gov/LATEST/search-index?ciks=0000875320&forms=10-K%2C10-Q%2C8-K&startdt=2024-01-01&enddt=2024-06-01
Total filings: 7


Downloading files:   0%|                                                                            | 0/7 [00:00<?, ?it/s]





Fetching URLs:  20%|██████████████▏                                                        | 2/10 [00:00<00:02,  2.67it/s]

[A[A





Fetching URLs:  50%|███████████████████████████████████▌                                   | 5/10 [00:00<00:00,  7.03it/s]

[A[A

Downloading files:  14%|█████████▋                                                          | 1/7 [00:00<00:01,  3.99it/s]





Fetching URLs:  70%|█████████████████████████████████████████████████▋                     | 7/10 [00:01<00:00,  8.69it/s]

[A[A

Downloading files:  86%|██████████████████████████████████████████████████████████▎         | 6/7 [00:00<00:00, 20.68it/s]

Downloading files: 100%|████████████████████████████████████████████████████████████████████| 7/7 [00:00<00:00, 14.44it/s]








Fetching URLs:  90%|███████████████████████████████████████████████████████████████▉       | 9/10 [00:01<00:00,  8.00it/s]

[A[A


Successfully downloaded 7 out of 7 URLs


Fetching URLs: 100%|██████████████████████████████████████████████████████████████████████| 10/10 [00:01<00:00,  6.41it/s]




https://efts.sec.gov/LATEST/search-index?ciks=0000872589&forms=10-K%2C10-Q%2C8-K&startdt=2024-01-01&enddt=2024-06-01
Total filings: 6


Downloading files:   0%|                                                                            | 0/6 [00:00<?, ?it/s]

Downloading files:  17%|███████████▎                                                        | 1/6 [00:00<00:01,  3.50it/s]

Downloading files:  83%|████████████████████████████████████████████████████████▋           | 5/6 [00:00<00:00, 11.79it/s]

Downloading files: 100%|████████████████████████████████████████████████████████████████████| 6/6 [00:00<00:00, 10.28it/s]


ERROR:__main__:Error downloading filings for REGN: object NoneType can't be used in 'await' expression


ERROR:__main__:Error downloading company concepts for VRTX: object NoneType can't be used in 'await' expression


ERROR:__main__:Error downloading filings for VRTX: object NoneType can't be used in 'await' expression


INFO:__main__:Download process completed



Successfully downloaded 6 out of 6 URLs
