#### Is there new filing?
#### Compare 10Q vs prior quarter
#### Compare 10K vs 10K prior quarter
#### portions of 10K similar to 3Q 10Q, compare K vs Q
#### https://www.intelligize.com/wp-content/uploads/2023/04/Using-the-Redline-Tool-23.pdf
#### https://capedge.com/transcript/1727196/2024Q2/SRRK
#### multiple newsdrops in the morning --> 3.05 Pacific time , 4.05 , 5.05, 6.15 AM ; 1.05PM , 1.20PM , 1.45PM , 2.45PM , 4PM --> conference presentation or earnings call 

1) Check SEC website , conference presentation or earnings call 
2) Check CapEdge for new transcripts filing

In [2]:
import os
import asyncio
import logging
import warnings
import json
from datetime import datetime
from glob import glob
from typing import List, Dict, Optional
import datamule as dm
import pandas as pd
from selectolax.parser import HTMLParser

# Suppress warnings
warnings.filterwarnings("ignore")

# Get today's date
today = datetime.today()

# Format the date in 'YYYY-MM-DD' format
today_date = today.strftime('%Y-%m-%d')


In [3]:
class SECDownloader:
    def __init__(self):
        self.downloader = dm.Downloader()
        logging.basicConfig(level=logging.INFO)
        self.logger = logging.getLogger(__name__)
        
    def set_user_agent(self, user_agent: str) -> None:
        """Set SEC user agent information."""
        try:
            self.downloader.set_headers(user_agent)
        except Exception as e:
            self.logger.error(f"Failed to set user agent: {str(e)}")
            raise
            
    async def download_filings(self, ticker: str, start_date: str, end_date: str, output_dir: str) -> None:
        """Download SEC filings with proper error handling."""
        try:
            await self.downloader.download(
                ticker=ticker,
                form=['10-K', '10-Q', '8-K'],  # Specify forms explicitly
                date=(start_date, end_date),
                output_dir=output_dir,
                return_urls=False  # Ensure we're downloading files
            )
        except ValueError as e:
            self.logger.error(f"Value error downloading filings for {ticker}: {str(e)}")
        except Exception as e:
            self.logger.error(f"Error downloading filings for {ticker}: {str(e)}")

    async def download_concepts(self, ticker: str, output_dir: str) -> None:
        """Download company concepts data with proper error handling."""
        try:
            await self.downloader.download_company_concepts(
                ticker=ticker,
                output_dir=output_dir
            )
        except Exception as e:
            self.logger.error(f"Error downloading company concepts for {ticker}: {str(e)}")

    async def process_ticker(self, ticker: str, start: str, end: str, base_dir: str) -> None:
        """Process a single ticker's downloads."""
        try:
            # Create directory structure
            ticker_dir = os.path.join(base_dir, ticker)
            filings_dir = os.path.join(ticker_dir, 'filings')
            concepts_dir = os.path.join(ticker_dir, 'company_concepts')
            
            os.makedirs(filings_dir, exist_ok=True)
            os.makedirs(concepts_dir, exist_ok=True)

            # Download both filings and concepts concurrently
            await asyncio.gather(
                self.download_filings(ticker, start, end, filings_dir),
                self.download_concepts(ticker, concepts_dir)
            )
            
        except Exception as e:
            self.logger.error(f"Failed to process ticker {ticker}: {str(e)}")

    async def download_all_data(self, tickers: List[str], start: str, end: str, base_dir: str = 'sec_data') -> None:
        """Download all SEC data for given tickers."""
        tasks = []
        for ticker in tickers:
            task = self.process_ticker(ticker, start, end, base_dir)
            tasks.append(task)
        
        await asyncio.gather(*tasks)

# Initialize parameters
tickers = ['AAPL', 'MSFT', 'GOOGL']
start = '2024-01-01'
end = '2024-06-01'#today_date
base_dir = 'sec_data'

# Initialize downloader
sec_downloader = SECDownloader()

try:
    # Set user agent (required by SEC)
    sec_downloader.set_user_agent("Your Name your@email.com")
    
    # Create and run async download task
    async def run_downloads():
        await sec_downloader.download_all_data(tickers, start, end, base_dir)
        
    asyncio.run(run_downloads())
    
except KeyboardInterrupt:
    sec_downloader.logger.warning("\nDownload interrupted by user")
except Exception as e:
    sec_downloader.logger.error(f"Fatal error: {str(e)}")
finally:
    sec_downloader.logger.info("Download process completed")

Downloading files:   0%|          | 0/1 [00:00<?, ?it/s]
[A

[A[A


[A[A[A



[A[A[A[A
Downloading files: 100%|██████████| 1/1 [00:00<00:00,  1.09it/s]
ERROR:__main__:Error downloading company concepts for GOOGL: object NoneType can't be used in 'await' expression


[A[A


Successfully downloaded 1 out of 1 URLs


Downloading files: 100%|██████████| 1/1 [00:01<00:00,  1.13s/it]


[A[A


Downloading files: 100%|██████████| 1/1 [00:01<00:00,  1.30s/it]



Successfully downloaded 1 out of 1 URLs

Successfully downloaded 1 out of 1 URLs






[A[A[A[A

[A[A


[A[A[A

[A[A


Fetching URLs: 100%|██████████| 10/10 [00:01<00:00,  8.71it/s]


https://efts.sec.gov/LATEST/search-index?ciks=0000789019&forms=10-K%2C10-Q%2C8-K&startdt=2024-01-01&enddt=2024-06-01
Total filings: 6


Downloading files:   0%|          | 0/6 [00:00<?, ?it/s]



[A[A[A[A


[A[A[A



[A[A[A[A



Downloading files:  17%|█▋        | 1/6 [00:00<00:02,  2.48it/s]


Downloading files:  67%|██████▋   | 4/6 [00:00<00:00,  9.28it/s]



[A[A[A[A


Downloading files: 100%|██████████| 6/6 [00:00<00:00,  7.22it/s]



[A[A[A


Successfully downloaded 6 out of 6 URLs






[A[A[A[A


Fetching URLs: 100%|██████████| 10/10 [00:02<00:00,  4.45it/s]


https://efts.sec.gov/LATEST/search-index?ciks=0001652044&forms=10-K%2C10-Q%2C8-K&startdt=2024-01-01&enddt=2024-06-01
Total filings: 5


Downloading files:   0%|          | 0/5 [00:00<?, ?it/s]



Fetching URLs: 100%|██████████| 10/10 [00:02<00:00,  4.26it/s]


https://efts.sec.gov/LATEST/search-index?ciks=0000320193&forms=10-K%2C10-Q%2C8-K&startdt=2024-01-01&enddt=2024-06-01
Total filings: 6


Downloading files: 100%|██████████| 5/5 [00:00<00:00, 10.04it/s]
ERROR:__main__:Error downloading filings for GOOGL: object NoneType can't be used in 'await' expression
ERROR:__main__:Error downloading company concepts for MSFT: object NoneType can't be used in 'await' expression
ERROR:__main__:Error downloading filings for MSFT: object NoneType can't be used in 'await' expression
ERROR:__main__:Error downloading company concepts for AAPL: object NoneType can't be used in 'await' expression
Downloading files: 100%|██████████| 6/6 [00:00<00:00, 14.37it/s]
ERROR:__main__:Error downloading filings for AAPL: object NoneType can't be used in 'await' expression
INFO:__main__:Download process completed



Successfully downloaded 5 out of 5 URLs

Successfully downloaded 6 out of 6 URLs


In [4]:
class SECFilingLoader:
    def __init__(self, base_dir: str = 'sec_data', concepts_dir: str = 'company_concepts'):
        self.base_dir = base_dir
        self.concepts_dir = concepts_dir  # Directory where Company Concepts are stored
        logging.basicConfig(level=logging.INFO)
        self.logger = logging.getLogger(__name__)

    def extract_text_from_html(self, file_path: str) -> str:
        """Extract clean text from HTML filing using selectolax (faster than BeautifulSoup)."""
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                html_content = f.read()
            
            # Parse HTML using selectolax
            tree = HTMLParser(html_content)
            
            # Remove script and style elements
            for tag in tree.css('script'):
                tag.decompose()
            for tag in tree.css('style'):
                tag.decompose()
            
            # Extract text with newlines between elements
            if tree.body:
                text = tree.body.text(separator='\n')
                # Clean up extra whitespace
                text = ' '.join(text.split())
                return text
            return ""
            
        except Exception as e:
            self.logger.error(f"Error extracting text from {file_path}: {str(e)}")
            return ""

    def classify_filing_type(self, content: str, filename: str) -> str:
        """Classify the filing type as 10-Q or 10-K based on the content or filename."""
        if '10-K' in content or '10-K' in filename:
            return '10-K'
        elif '10-Q' in content or '10-Q' in filename:
            return '10-Q'
        else:
            return 'Other'  # You could also use 'Unknown' if you prefer

    def load_filings(self, ticker: str) -> pd.DataFrame:
        """Load SEC filings data with full text content for a specific ticker."""
        try:
            filings_path = os.path.join(self.base_dir, ticker, 'filings')
            filing_files = glob(os.path.join(filings_path, '*.htm'))
            
            if not filing_files:
                raise FileNotFoundError(f"No filings found for {ticker}")
            
            filings_data = []
            for file_path in filing_files:
                try:
                    filename = os.path.basename(file_path)
                    accession_num, filing_date = filename.replace('.htm', '').split('_')
                    
                    # Read and clean the file content
                    with open(file_path, 'r', encoding='utf-8') as f:
                        content = f.read()
                        
                    # Parse HTML and extract clean text
                    tree = HTMLParser(content)
                    
                    # Remove script and style elements
                    for tag in tree.css('script'):
                        tag.decompose()
                    for tag in tree.css('style'):
                        tag.decompose()
                    
                    # Extract clean text
                    if tree.body:
                        clean_text = tree.body.text(separator='\n')
                        # Clean up extra whitespace
                        clean_text = ' '.join(clean_text.split())
                    else:
                        clean_text = ""
                    
                    # Classify filing type
                    #filing_type = self.classify_filing_type(clean_text, filename)
                    
                    filing_info = {
                        'ticker': ticker,
                        'accession_number': accession_num,
                        'filing_date': filing_date,
                        'file_path': file_path,
                        'file_size': os.path.getsize(file_path),
                        'content': clean_text,
                        'content_length': len(clean_text),
                        #'filing_type': filing_type  # Add filing type column
                    }
                    filings_data.append(filing_info)
                    
                except Exception as e:
                    self.logger.error(f"Error processing file {file_path}: {str(e)}")
                    continue
            
            # Convert to DataFrame
            df = pd.DataFrame(filings_data)
            
            # Convert dates and format columns
            df['filing_date'] = pd.to_datetime(df['filing_date'])
            df['file_size'] = df['file_size'] / 1024  # Convert to KB
            
            # Sort by filing date
            df = df.sort_values('filing_date', ascending=False)
            
            return df
            
        except Exception as e:
            self.logger.error(f"Error loading filings for {ticker}: {str(e)}")
            return pd.DataFrame()

    def load_company_concepts(self, ticker: str) -> pd.DataFrame:
        """Load Company Concepts for a specific ticker."""
        try:
            concepts_file_path = os.path.join(self.concepts_dir, f'{ticker}_concepts.csv')
            
            if not os.path.exists(concepts_file_path):
                raise FileNotFoundError(f"No company concepts found for {ticker}")
            
            df = pd.read_csv(concepts_file_path)
            df['ticker'] = ticker  # Add ticker column for reference
            return df
            
        except Exception as e:
            self.logger.error(f"Error loading company concepts for {ticker}: {str(e)}")
            return pd.DataFrame()

    def load_all_filings(self, tickers: Optional[List[str]] = None) -> Dict[str, pd.DataFrame]:
        """Load filings and company concepts data with text for multiple tickers."""
        all_data = []  # List to hold the DataFrames for all tickers
        if tickers is None:
            # Get all tickers from the base directory
            tickers = [d for d in os.listdir(self.base_dir) 
                      if os.path.isdir(os.path.join(self.base_dir, d))]
        
        for ticker in tickers:
            self.logger.info(f"Loading filings and company concepts for {ticker}...")
            
            # Load SEC filings
            filings_df = self.load_filings(ticker)
            
            # Load Company Concepts
            concepts_df = self.load_company_concepts(ticker)
            
            # Merge both dataframes if they are not empty
            if not filings_df.empty and not concepts_df.empty:
                merged_df = pd.merge(filings_df, concepts_df, on='ticker', how='left')
                all_data.append(merged_df)
                self.logger.info(f"Loaded {len(filings_df)} filings and concepts for {ticker}")
            elif not filings_df.empty:
                all_data.append(filings_df)
                self.logger.info(f"Loaded {len(filings_df)} filings for {ticker}")
            elif not concepts_df.empty:
                all_data.append(concepts_df)
                self.logger.info(f"Loaded company concepts for {ticker}")
        
        # Concatenate all dataframes into a single DataFrame
        all_data_df = pd.concat(all_data, ignore_index=True)
        
        # Return the merged data for all tickers as a single DataFrame
        return all_data_df

# Example usage
loader = SECFilingLoader('sec_data', 'company_concepts')

# Load data for specific tickers
tickers = ['AAPL', 'MSFT', 'GOOGL']

# Load all filings and company concepts and get one large DataFrame
all_data_df = loader.load_all_filings(tickers)

required_cols = ["ticker","filing_date","content"]#["ticker","filing_date","filing_type","content"]
all_data_df_min = all_data_df[required_cols]
all_data_df_min

INFO:__main__:Loading filings and company concepts for AAPL...
ERROR:__main__:Error loading company concepts for AAPL: No company concepts found for AAPL
INFO:__main__:Loaded 6 filings for AAPL
INFO:__main__:Loading filings and company concepts for MSFT...
ERROR:__main__:Error loading company concepts for MSFT: No company concepts found for MSFT
INFO:__main__:Loaded 6 filings for MSFT
INFO:__main__:Loading filings and company concepts for GOOGL...
ERROR:__main__:Error loading company concepts for GOOGL: No company concepts found for GOOGL
INFO:__main__:Loaded 5 filings for GOOGL


Unnamed: 0,ticker,filing_date,content
0,AAPL,2024-05-03,false 2024 Q2 0000320193 --09-28 P1Y P1Y P1Y P...
1,AAPL,2024-05-03,true true true true true true true true false ...
2,AAPL,2024-05-02,false 0000320193 0000320193 2024-05-02 2024-05...
3,AAPL,2024-02-28,true true true true true true true true NASDAQ...
4,AAPL,2024-02-02,false 2024 Q1 0000320193 --09-28 P1Y P1Y P1Y h...
5,AAPL,2024-02-01,false 0000320193 0000320193 2024-02-01 2024-02...
6,MSFT,2024-04-25,0000789019 false 0000789019 2024-04-25 2024-04...
7,MSFT,2024-04-25,0000789019 Q3 --06-30 false http://fasb.org/us...
8,MSFT,2024-03-08,0000789019 0000789019 2024-01-17 2024-01-17 00...
9,MSFT,2024-01-30,false 0000789019 0000789019 msft:NotesThreePoi...


In [5]:
# # import os
# # import logging
# # import pandas as pd
# # from glob import glob
# # from selectolax.parser import HTMLParser
# # from typing import Optional, List, Dict

# # class SECFilingLoader:
# #     def __init__(self, base_dir: str = 'sec_data', concepts_dir: str = 'company_concepts'):
# #         self.base_dir = base_dir
# #         self.concepts_dir = concepts_dir  # Directory where Company Concepts are stored
# #         logging.basicConfig(level=logging.INFO)
# #         self.logger = logging.getLogger(__name__)

# #     def extract_text_from_html(self, file_path: str) -> str:
# #         """Extract clean text from HTML filing using selectolax (faster than BeautifulSoup)."""
# #         try:
# #             with open(file_path, 'r', encoding='utf-8') as f:
# #                 html_content = f.read()
            
# #             # Parse HTML using selectolax
# #             tree = HTMLParser(html_content)
            
# #             # Remove script and style elements
# #             for tag in tree.css('script'):
# #                 tag.decompose()
# #             for tag in tree.css('style'):
# #                 tag.decompose()
            
# #             # Extract text with newlines between elements
# #             if tree.body:
# #                 text = tree.body.text(separator='\n')
# #                 # Clean up extra whitespace
# #                 text = ' '.join(text.split())
# #                 return text
# #             return ""
            
# #         except Exception as e:
# #             self.logger.error(f"Error extracting text from {file_path}: {str(e)}")
# #             return ""

# #     def classify_filing_type(self, content: str, filename: str) -> str:
# #         """Classify the filing type as 10-Q or 10-K based on the content or filename."""
# #         if '10-K' in content or '10-K' in filename:
# #             return '10-K'
# #         elif '10-Q' in content or '10-Q' in filename:
# #             return '10-Q'
# #         else:
# #             return 'Other'  # You could also use 'Unknown' if you prefer

# #     def load_filings(self, ticker: str) -> pd.DataFrame:
# #         """Load SEC filings data with full text content for a specific ticker."""
# #         try:
# #             filings_path = os.path.join(self.base_dir, ticker, 'filings')
# #             filing_files = glob(os.path.join(filings_path, '*.htm'))
            
# #             if not filing_files:
# #                 raise FileNotFoundError(f"No filings found for {ticker}")
            
# #             filings_data = []
# #             for file_path in filing_files:
# #                 try:
# #                     filename = os.path.basename(file_path)
# #                     accession_num, filing_date = filename.replace('.htm', '').split('_')
                    
# #                     # Read and clean the file content
# #                     with open(file_path, 'r', encoding='utf-8') as f:
# #                         content = f.read()
                        
# #                     # Parse HTML and extract clean text
# #                     tree = HTMLParser(content)
                    
# #                     # Remove script and style elements
# #                     for tag in tree.css('script'):
# #                         tag.decompose()
# #                     for tag in tree.css('style'):
# #                         tag.decompose()
                    
# #                     # Extract clean text
# #                     if tree.body:
# #                         clean_text = tree.body.text(separator='\n')
# #                         # Clean up extra whitespace
# #                         clean_text = ' '.join(clean_text.split())
# #                     else:
# #                         clean_text = ""
                    
# #                     # Classify filing type
# #                     filing_type = self.classify_filing_type(clean_text, filename)
                    
# #                     filing_info = {
# #                         'ticker': ticker,
# #                         'accession_number': accession_num,
# #                         'filing_date': filing_date,
# #                         'file_path': file_path,
# #                         'file_size': os.path.getsize(file_path),
# #                         'content': clean_text,
# #                         'content_length': len(clean_text),
# #                         'filing_type': filing_type  # Add filing type column
# #                     }
# #                     filings_data.append(filing_info)
                    
# #                 except Exception as e:
# #                     self.logger.error(f"Error processing file {file_path}: {str(e)}")
# #                     continue
            
# #             # Convert to DataFrame
# #             df = pd.DataFrame(filings_data)
            
# #             # Convert dates and format columns
# #             df['filing_date'] = pd.to_datetime(df['filing_date'])
# #             df['file_size'] = df['file_size'] / 1024  # Convert to KB
            
# #             # Sort by filing date
# #             df = df.sort_values('filing_date', ascending=False)
            
# #             return df
            
# #         except Exception as e:
# #             self.logger.error(f"Error loading filings for {ticker}: {str(e)}")
# #             return pd.DataFrame()

# #     def load_company_concepts(self, ticker: str) -> pd.DataFrame:
# #         """Load Company Concepts for a specific ticker."""
# #         try:
# #             concepts_file_path = os.path.join(self.concepts_dir, f'{ticker}_concepts.csv')
            
# #             if not os.path.exists(concepts_file_path):
# #                 raise FileNotFoundError(f"No company concepts found for {ticker}")
            
# #             df = pd.read_csv(concepts_file_path)
# #             df['ticker'] = ticker  # Add ticker column for reference
# #             return df
            
# #         except Exception as e:
# #             self.logger.error(f"Error loading company concepts for {ticker}: {str(e)}")
# #             return pd.DataFrame()

# #     def load_all_filings(self, tickers: Optional[List[str]] = None) -> Dict[str, pd.DataFrame]:
# #         """Load filings and company concepts data with text for multiple tickers."""
# #         all_data = []  # List to hold the DataFrames for all tickers
# #         if tickers is None:
# #             # Get all tickers from the base directory
# #             tickers = [d for d in os.listdir(self.base_dir) 
# #                       if os.path.isdir(os.path.join(self.base_dir, d))]
        
# #         for ticker in tickers:
# #             self.logger.info(f"Loading filings and company concepts for {ticker}...")
            
# #             # Load SEC filings
# #             filings_df = self.load_filings(ticker)
            
# #             # Load Company Concepts
# #             concepts_df = self.load_company_concepts(ticker)
            
# #             # Merge both dataframes if they are not empty
# #             if not filings_df.empty and not concepts_df.empty:
# #                 merged_df = pd.merge(filings_df, concepts_df, on='ticker', how='left')
# #                 all_data.append(merged_df)
# #                 self.logger.info(f"Loaded {len(filings_df)} filings and concepts for {ticker}")
# #             elif not filings_df.empty:
# #                 all_data.append(filings_df)
# #                 self.logger.info(f"Loaded {len(filings_df)} filings for {ticker}")
# #             elif not concepts_df.empty:
# #                 all_data.append(concepts_df)
# #                 self.logger.info(f"Loaded company concepts for {ticker}")
        
# #         # Concatenate all dataframes into a single DataFrame
# #         all_data_df = pd.concat(all_data, ignore_index=True)
        
# #         # Return the merged data for all tickers as a single DataFrame
# #         return all_data_df

# # # Example usage
# # loader = SECFilingLoader('sec_data', 'company_concepts')

# # # Load data for specific tickers
# # tickers = ['AAPL', 'MSFT', 'GOOGL']

# # # Load all filings and company concepts and get one large DataFrame
# # all_data_df = loader.load_all_filings(tickers)

# # # Example analysis
# # print(f"\nTotal rows in all_data_df: {len(all_data_df)}")
# # print(f"Average text length: {all_data_df['content_length'].mean():.0f} characters")  # Use 'content_length' here
# # print(f"Date range: {all_data_df['filing_date'].min()} to {all_data_df['filing_date'].max()}")

# # # Optionally, preview the first few rows
# # print("\nFirst few rows of all_data_df:")
# # print(all_data_df[['ticker', 'accession_number', 'filing_date', 'filing_type', 'content_length']].head())

# ############################################
# import os
# import logging
# import pandas as pd
# import json
# from glob import glob
# from selectolax.parser import HTMLParser
# from typing import Optional, List, Dict

# class SECFilingLoader:
#     def __init__(self, base_dir: str = 'sec_data', concepts_dir: str = 'company_concepts'):
#         self.base_dir = base_dir
#         self.concepts_dir = concepts_dir  # Directory where Company Concepts are stored
#         logging.basicConfig(level=logging.INFO)
#         self.logger = logging.getLogger(__name__)

#     def extract_text_from_html(self, file_path: str) -> str:
#         """Extract clean text from HTML filing using selectolax (faster than BeautifulSoup)."""
#         try:
#             with open(file_path, 'r', encoding='utf-8') as f:
#                 html_content = f.read()
            
#             # Parse HTML using selectolax
#             tree = HTMLParser(html_content)
            
#             # Remove script and style elements
#             for tag in tree.css('script'):
#                 tag.decompose()
#             for tag in tree.css('style'):
#                 tag.decompose()
            
#             # Extract text with newlines between elements
#             if tree.body:
#                 text = tree.body.text(separator='\n')
#                 # Clean up extra whitespace
#                 text = ' '.join(text.split())
#                 return text
#             return ""
            
#         except Exception as e:
#             self.logger.error(f"Error extracting text from {file_path}: {str(e)}")
#             return ""

#     def classify_filing_type(self, content: str, filename: str) -> str:
#         """Classify the filing type as 10-Q or 10-K based on the content or filename."""
#         if '10-K' in content or '10-K' in filename:
#             return '10-K'
#         elif '10-Q' in content or '10-Q' in filename:
#             return '10-Q'
#         else:
#             return 'Other'  # You could also use 'Unknown' if you prefer

#     def load_filings(self, ticker: str) -> pd.DataFrame:
#         """Load SEC filings data with full text content for a specific ticker."""
#         try:
#             filings_path = os.path.join(self.base_dir, ticker, 'filings')
#             filing_files = glob(os.path.join(filings_path, '*.htm'))
            
#             if not filing_files:
#                 raise FileNotFoundError(f"No filings found for {ticker}")
            
#             filings_data = []
#             for file_path in filing_files:
#                 try:
#                     filename = os.path.basename(file_path)
#                     accession_num, filing_date = filename.replace('.htm', '').split('_')
                    
#                     # Read and clean the file content
#                     with open(file_path, 'r', encoding='utf-8') as f:
#                         content = f.read()
                        
#                     # Parse HTML and extract clean text
#                     tree = HTMLParser(content)
                    
#                     # Remove script and style elements
#                     for tag in tree.css('script'):
#                         tag.decompose()
#                     for tag in tree.css('style'):
#                         tag.decompose()
                    
#                     # Extract clean text
#                     if tree.body:
#                         clean_text = tree.body.text(separator='\n')
#                         # Clean up extra whitespace
#                         clean_text = ' '.join(clean_text.split())
#                     else:
#                         clean_text = ""
                    
#                     # Classify filing type
#                     filing_type = self.classify_filing_type(clean_text, filename)
                    
#                     filing_info = {
#                         'ticker': ticker,
#                         'accession_number': accession_num,
#                         'filing_date': filing_date,
#                         'file_path': file_path,
#                         'file_size': os.path.getsize(file_path),
#                         'content': clean_text,
#                         'content_length': len(clean_text),
#                         'filing_type': filing_type  # Add filing type column
#                     }
#                     filings_data.append(filing_info)
                    
#                 except Exception as e:
#                     self.logger.error(f"Error processing file {file_path}: {str(e)}")
#                     continue
            
#             # Convert to DataFrame
#             df = pd.DataFrame(filings_data)
            
#             # Convert dates and format columns
#             df['filing_date'] = pd.to_datetime(df['filing_date'])
#             df['file_size'] = df['file_size'] / 1024  # Convert to KB
            
#             # Sort by filing date
#             df = df.sort_values('filing_date', ascending=False)
            
#             return df
#         except Exception as e:
#             self.logger.error(f"Error loading filings for {ticker}: {str(e)}")
#             return pd.DataFrame()

#     def load_company_concepts(self, ticker: str) -> pd.DataFrame:
#         """Load Company Concepts for a specific ticker."""
#         try:
#             # Define a mapping of tickers to their CIK (Central Index Key) for easier lookup
#             cik_mapping = {
#                 'AAPL': '0000320193',
#                 'MSFT': '0000789019',
#                 'GOOGL': '0001652044',
#                 # Add more mappings here...
#             }
            
#             # Look up the CIK for the given ticker
#             cik = cik_mapping.get(ticker, None)
#             if not cik:
#                 raise ValueError(f"No CIK found for ticker {ticker}")

#             # Construct the path to the company concepts JSON file for the given ticker
#             concepts_file_path = os.path.join(self.concepts_dir, f'CIK{cik}.json')
            
#             if not os.path.exists(concepts_file_path):
#                 raise FileNotFoundError(f"No company concepts file found for {ticker} at {concepts_file_path}")
            
#             # Load and parse the JSON file
#             with open(concepts_file_path, 'r', encoding='utf-8') as f:
#                 concepts_data = json.load(f)

#             # Extract relevant data from the JSON structure
#             company_info = {
#                 'cik': concepts_data.get('cik', ''),
#                 'entity_name': concepts_data.get('entityName', ''),
#                 'facts': concepts_data.get('facts', {})
#             }

#             # Flatten the 'facts' part of the data
#             facts = company_info['facts']
#             facts_flattened = {k: v for sub_dict in facts.values() for k, v in sub_dict.items()}

#             # Merge the general company information with the flattened facts
#             company_info.update(facts_flattened)

#             # Convert the collected information into a DataFrame
#             df = pd.DataFrame([company_info])

#             # Add ticker as a reference column
#             df['ticker'] = ticker
            
#             return df
#         except Exception as e:
#             self.logger.error(f"Error loading company concepts for {ticker}: {str(e)}")
#             return pd.DataFrame()

#     def load_all_data(self, tickers: Optional[List[str]] = None) -> Dict[str, pd.DataFrame]:
#         """Load filings and company concepts data for multiple tickers and return two separate DataFrames."""
#         all_filings_data = []  # List to hold filings data for all tickers
#         all_concepts_data = []  # List to hold company concepts data for all tickers

#         if tickers is None:
#             # Get all tickers from the base directory
#             tickers = [d for d in os.listdir(self.base_dir) if os.path.isdir(os.path.join(self.base_dir, d))]
        
#         for ticker in tickers:
#             self.logger.info(f"Loading filings and company concepts for {ticker}...")
            
#             # Load SEC filings
#             filings_df = self.load_filings(ticker)
#             if not filings_df.empty:
#                 all_filings_data.append(filings_df)
#                 self.logger.info(f"Loaded {len(filings_df)} filings for {ticker}")
            
#             # Load Company Concepts
#             concepts_df = self.load_company_concepts(ticker)
#             if not concepts_df.empty:
#                 all_concepts_data.append(concepts_df)
#                 self.logger.info(f"Loaded company concepts for {ticker}")
        
#         # Concatenate all dataframes into a single DataFrame for each category
#         filings_df = pd.concat(all_filings_data, ignore_index=True) if all_filings_data else pd.DataFrame()
#         company_concepts_df = pd.concat(all_concepts_data, ignore_index=True) if all_concepts_data else pd.DataFrame()
        
#         return filings_df, company_concepts_df

# # Example usage
# loader = SECFilingLoader('sec_data', 'company_concepts')

# # Load data for specific tickers
# tickers = ['AAPL', 'MSFT', 'GOOGL']

# # Load all filings and company concepts and get two separate DataFrames
# filings_df, company_concepts_df = loader.load_all_data(tickers)

# # Example analysis
# print(f"\nTotal rows in filings_df: {len(filings_df)}")
# print(f"Total rows in company_concepts_df: {len(company_concepts_df)}")

# # Optionally, preview the first few rows
# print("\nFirst few rows of filings_df:")
# print(filings_df[['ticker', 'accession_number', 'filing_date', 'filing_type', 'content_length']].head())

# print("\nFirst few rows of company_concepts_df:")
# print(company_concepts_df[['ticker', 'entity_name', 'cik']].head())
