# **Using EDGAR tools to download the 10k filings of the companies**

In [None]:
!pip install edgartools

In [2]:
from edgar import set_identity

set_identity("Mr Example examplemaster@gmail.com")

In [3]:
from edgar import Company

apple = Company("AAPL")

In [4]:
apple.financials #Financial Summary

                                                                                                                   
  [1;38;5;39m                                             Balance Sheet                                             [0m          
                                                                                                                   
   [1m [0m[1m                                          [0m[1m [0m [1m [0m[1m      2023-09-30[0m[1m [0m [1m [0m[1m      2022-09-24[0m[1m [0m [1m [0m[1m      2021-09-25[0m[1m [0m           
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━           
    [1;38;5;39mASSETS:                                   [0m                                                                     
    [1;38;5;39mCURRENT ASSETS:                           [0m                                                                     
      Cash and Cash Equivalents             

# **Getting the 10K filings**

In [5]:
filings = apple.get_filings(form="10-K") #Provides the 10K filings from 1995 to the latest
filings

╭──────────────────────────────────────── Filings for Apple Inc. [320193] ────────────────────────────────────────╮
│                                                                                                                 │
│  [1m [0m[1m  [0m[1m [0m [1;38;5;71m [0m[1;38;5;71mform[0m[1;38;5;71m [0m [1;38;5;39m [0m[1;38;5;39mfiled     [0m[1;38;5;39m [0m [1m [0m[1maccession_number    [0m[1m [0m [1m [0m[1mxbrl[0m[1m [0m                                                         │
│  ──────────────────────────────────────────────────────                                                         │
│  [1m [0m[1m0 [0m[1m [0m [1;38;5;71m [0m[1;38;5;71m10-K[0m[1;38;5;71m [0m [1;38;5;39m [0m[1;38;5;39m2023-11-03[0m[1;38;5;39m [0m [1m [0m[1m0000320193-23-000106[0m[1m [0m [1m [0m[1m✓   [0m[1m [0m                                                         │
│   1   [38;5;71m [0m[38;5;71m10-K[0m[38;5;71m [0m [38;5;39m [0m[38;5;39m20

In [6]:
#Converting the filing to pandas
df = filings.to_pandas()
df

Unnamed: 0,accession_number,filing_date,reportDate,acceptanceDateTime,act,form,fileNumber,items,size,isXBRL,isInlineXBRL,primaryDocument,primaryDocDescription
0,0000320193-23-000106,2023-11-03,2023-09-30,2023-11-02T18:08:27.000Z,34.0,10-K,001-36743,,9569569,1,1,aapl-20230930.htm,10-K
1,0000320193-22-000108,2022-10-28,2022-09-24,2022-10-27T18:01:14.000Z,34.0,10-K,001-36743,,10332356,1,1,aapl-20220924.htm,10-K
2,0000320193-21-000105,2021-10-29,2021-09-25,2021-10-28T18:04:28.000Z,34.0,10-K,001-36743,,10502096,1,1,aapl-20210925.htm,10-K
3,0000320193-20-000096,2020-10-30,2020-09-26,2020-10-29T18:06:25.000Z,34.0,10-K,001-36743,,12502600,1,1,aapl-20200926.htm,10-K
4,0000320193-19-000119,2019-10-31,2019-09-28,2019-10-30T18:12:36.000Z,34.0,10-K,001-36743,,12861616,1,1,a10-k20199282019.htm,10-K
5,0000320193-18-000145,2018-11-05,2018-09-29,2018-11-05T08:01:40.000Z,34.0,10-K,001-36743,,12275572,1,0,a10-k20189292018.htm,10-K
6,0000320193-17-000070,2017-11-03,2017-09-30,2017-11-03T08:01:37.000Z,34.0,10-K,001-36743,,14071062,1,0,a10-k20179302017.htm,10-K
7,0001628280-16-020309,2016-10-26,2016-09-24,2016-10-26T16:42:16.000Z,34.0,10-K,001-36743,,13277662,1,0,a201610-k9242016.htm,10-K
8,0001193125-15-356351,2015-10-28,2015-09-26,2015-10-28T16:31:09.000Z,34.0,10-K,001-36743,,9594425,1,0,d17062d10k.htm,FORM 10-K
9,0001193125-14-383437,2014-10-27,2014-09-27,2014-10-27T17:11:55.000Z,34.0,10-K,000-10030,,12082626,1,0,d783162d10k.htm,10-K


In [7]:
#Retrieving the 5 latest filings (For app usecase)
five_latest_10K = filings.head(5)
five_latest_10K

╭──────────────────────────────────────── Filings for Apple Inc. [320193] ────────────────────────────────────────╮
│                                                                                                                 │
│  [1m [0m[1m [0m[1m [0m [1;38;5;71m [0m[1;38;5;71mform[0m[1;38;5;71m [0m [1;38;5;39m [0m[1;38;5;39mfiled     [0m[1;38;5;39m [0m [1m [0m[1maccession_number    [0m[1m [0m [1m [0m[1mxbrl[0m[1m [0m                                                          │
│  ─────────────────────────────────────────────────────                                                          │
│  [1m [0m[1m0[0m[1m [0m [1;38;5;71m [0m[1;38;5;71m10-K[0m[1;38;5;71m [0m [1;38;5;39m [0m[1;38;5;39m2023-11-03[0m[1;38;5;39m [0m [1m [0m[1m0000320193-23-000106[0m[1m [0m [1m [0m[1m✓   [0m[1m [0m                                                          │
│   1  [38;5;71m [0m[38;5;71m10-K[0m[38;5;71m [0m [38;5;39m [0m[38;5;39m202

In [8]:
fl = five_latest_10K.to_pandas()
fl

Unnamed: 0,accession_number,filing_date,reportDate,acceptanceDateTime,act,form,fileNumber,items,size,isXBRL,isInlineXBRL,primaryDocument,primaryDocDescription
0,0000320193-23-000106,2023-11-03,2023-09-30,2023-11-02T18:08:27.000Z,34,10-K,001-36743,,9569569,1,1,aapl-20230930.htm,10-K
1,0000320193-22-000108,2022-10-28,2022-09-24,2022-10-27T18:01:14.000Z,34,10-K,001-36743,,10332356,1,1,aapl-20220924.htm,10-K
2,0000320193-21-000105,2021-10-29,2021-09-25,2021-10-28T18:04:28.000Z,34,10-K,001-36743,,10502096,1,1,aapl-20210925.htm,10-K
3,0000320193-20-000096,2020-10-30,2020-09-26,2020-10-29T18:06:25.000Z,34,10-K,001-36743,,12502600,1,1,aapl-20200926.htm,10-K
4,0000320193-19-000119,2019-10-31,2019-09-28,2019-10-30T18:12:36.000Z,34,10-K,001-36743,,12861616,1,1,a10-k20199282019.htm,10-K


In [None]:
#Code to download the 10K filings as pdfs
!pip install pdfkit
!pip install weasyprint

In [15]:
import requests
import os
import time
from weasyprint import HTML

def download_primary_documents(df, download_dir):
    """Downloads primary documents as PDF files from SEC EDGAR filings,
       converting HTML content to PDF using WeasyPrint.

    Args:
        df (pd.DataFrame): DataFrame containing filing information with columns:
            - accession_number: Accession number of the filing.
            - primaryDocument: Name of the primary document file.
        download_dir (str): Directory to save downloaded PDF files.

    Returns:
        None
    """

    base_url = "https://www.sec.gov/Archives/edgar/data/"

    headers = {
        'User-Agent': "Mozilla/5.0"
    }  # Headers to mimic a browser

    for index, row in df.iterrows():
        accession_number = row['accession_number']
        primary_document = row['primaryDocument']

        # Extract company CIK from accession number
        cik = accession_number.split("-")[0]

        # Construct URL directly using primaryDocument
        doc_url = f"{base_url}{cik}/{accession_number.replace('-', '')}/{primary_document}"

        # Download and convert to PDF
        try:
            response = requests.get(doc_url, headers=headers)
            response.raise_for_status()  # Raise an error for bad status codes

            html = HTML(string=response.text)
            filename = f"{accession_number}_{primary_document.replace('.htm', '.pdf')}"
            html.write_pdf(os.path.join(download_dir, filename))

            print(f"Downloaded {filename}")
        except Exception as e:
            print(f"Error downloading or converting {doc_url}: {e}")

        time.sleep(5)  # Add a 5-second delay between requests

# Assuming your DataFrame is named 'df' (replace with your actual DataFrame)
df = fl
download_dir = "content"  # Directory to save downloaded PDF files

if not os.path.exists(download_dir):
    os.makedirs(download_dir)

download_primary_documents(df, download_dir)

ERROR:weasyprint:Relative URI reference without a base URI: <img src="aapl-20230930_g1.jpg">
ERROR:weasyprint:Relative URI reference without a base URI: <img src="aapl-20230930_g2.jpg">


Downloaded 0000320193-23-000106_aapl-20230930.pdf


ERROR:weasyprint:Relative URI reference without a base URI: <img src="aapl-20220924_g1.jpg">
ERROR:weasyprint:Relative URI reference without a base URI: <img src="aapl-20220924_g2.jpg">


Downloaded 0000320193-22-000108_aapl-20220924.pdf


ERROR:weasyprint:Relative URI reference without a base URI: <img src="aapl-20210925_g1.jpg">
ERROR:weasyprint:Relative URI reference without a base URI: <img src="aapl-20210925_g2.jpg">


Downloaded 0000320193-21-000105_aapl-20210925.pdf


ERROR:weasyprint:Relative URI reference without a base URI: <img src="aapl-20200926_g1.jpg">
ERROR:weasyprint:Relative URI reference without a base URI: <img src="aapl-20200926_g2.jpg">


Downloaded 0000320193-20-000096_aapl-20200926.pdf


ERROR:weasyprint:Relative URI reference without a base URI: <img src="g66145g66i38.jpg">
ERROR:weasyprint:Relative URI reference without a base URI: <img src="chart-06f17b4ddd9352dc8dc.jpg">


Downloaded 0000320193-19-000119_a10-k20199282019.pdf


# **Google 10K Filings**

In [16]:
Google = Company("GOOG")

In [17]:
Google.financials

                                                                                                                   
  [1;38;5;39m                                             Balance Sheet                                             [0m          
                                                                                                                   
   [1m [0m[1m                                          [0m[1m [0m [1m [0m[1m      2023-12-31[0m[1m [0m [1m [0m[1m      2022-12-31[0m[1m [0m [1m [0m[1m      2021-12-31[0m[1m [0m           
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━           
    [1;38;5;39mASSETS:                                   [0m                                                                     
    [1;38;5;39mCURRENT ASSETS:                           [0m                                                                     
      Cash and Cash Equivalents             

In [18]:
filings = Google.get_filings(form="10-K")
filings

╭────────────────────────────────────── Filings for Alphabet Inc. [1652044] ──────────────────────────────────────╮
│                                                                                                                 │
│  [1m [0m[1m [0m[1m [0m [1;38;5;71m [0m[1;38;5;71mform[0m[1;38;5;71m [0m [1;38;5;39m [0m[1;38;5;39mfiled     [0m[1;38;5;39m [0m [1m [0m[1maccession_number    [0m[1m [0m [1m [0m[1mxbrl[0m[1m [0m                                                          │
│  ─────────────────────────────────────────────────────                                                          │
│  [1m [0m[1m0[0m[1m [0m [1;38;5;71m [0m[1;38;5;71m10-K[0m[1;38;5;71m [0m [1;38;5;39m [0m[1;38;5;39m2024-01-31[0m[1;38;5;39m [0m [1m [0m[1m0001652044-24-000022[0m[1m [0m [1m [0m[1m✓   [0m[1m [0m                                                          │
│   1  [38;5;71m [0m[38;5;71m10-K[0m[38;5;71m [0m [38;5;39m [0m[38;5;39m202

In [19]:
filings.to_pandas()

Unnamed: 0,accession_number,filing_date,reportDate,acceptanceDateTime,act,form,fileNumber,items,size,isXBRL,isInlineXBRL,primaryDocument,primaryDocDescription
0,0001652044-24-000022,2024-01-31,2023-12-31,2024-01-30T21:43:43.000Z,34,10-K,001-37580,,13927595,1,1,goog-20231231.htm,10-K
1,0001652044-23-000016,2023-02-03,2022-12-31,2023-02-02T21:23:45.000Z,34,10-K,001-37580,,15264470,1,1,goog-20221231.htm,10-K
2,0001652044-22-000019,2022-02-02,2021-12-31,2022-02-01T21:08:02.000Z,34,10-K,001-37580,,15044932,1,1,goog-20211231.htm,10-K
3,0001652044-21-000010,2021-02-03,2020-12-31,2021-02-02T20:12:25.000Z,34,10-K,001-37580,,14948616,1,1,goog-20201231.htm,10-K
4,0001652044-20-000008,2020-02-04,2019-12-31,2020-02-03T21:03:59.000Z,34,10-K,001-37580,,16583126,1,1,goog10-k2019.htm,10-K
5,0001652044-19-000004,2019-02-05,2018-12-31,2019-02-04T21:06:38.000Z,34,10-K,001-37580,,14192672,1,0,goog10-kq42018.htm,10-K
6,0001652044-18-000007,2018-02-06,2017-12-31,2018-02-05T20:46:29.000Z,34,10-K,001-37580,,14110886,1,0,goog10-kq42017.htm,FORM 10-K
7,0001652044-17-000008,2017-02-03,2016-12-31,2017-02-02T18:13:47.000Z,34,10-K,001-37580,,16051851,1,0,goog10-kq42016.htm,FORM 10-K
8,0001652044-16-000012,2016-02-11,2015-12-31,2016-02-11T16:38:35.000Z,34,10-K,001-37580,,19416002,1,0,goog10-k2015.htm,FORM 10-K


In [20]:
five_latest_10K = filings.head(5)
five_latest_10K

╭────────────────────────────────────── Filings for Alphabet Inc. [1652044] ──────────────────────────────────────╮
│                                                                                                                 │
│  [1m [0m[1m [0m[1m [0m [1;38;5;71m [0m[1;38;5;71mform[0m[1;38;5;71m [0m [1;38;5;39m [0m[1;38;5;39mfiled     [0m[1;38;5;39m [0m [1m [0m[1maccession_number    [0m[1m [0m [1m [0m[1mxbrl[0m[1m [0m                                                          │
│  ─────────────────────────────────────────────────────                                                          │
│  [1m [0m[1m0[0m[1m [0m [1;38;5;71m [0m[1;38;5;71m10-K[0m[1;38;5;71m [0m [1;38;5;39m [0m[1;38;5;39m2024-01-31[0m[1;38;5;39m [0m [1m [0m[1m0001652044-24-000022[0m[1m [0m [1m [0m[1m✓   [0m[1m [0m                                                          │
│   1  [38;5;71m [0m[38;5;71m10-K[0m[38;5;71m [0m [38;5;39m [0m[38;5;39m202

In [21]:
fl = five_latest_10K.to_pandas()
fl

Unnamed: 0,accession_number,filing_date,reportDate,acceptanceDateTime,act,form,fileNumber,items,size,isXBRL,isInlineXBRL,primaryDocument,primaryDocDescription
0,0001652044-24-000022,2024-01-31,2023-12-31,2024-01-30T21:43:43.000Z,34,10-K,001-37580,,13927595,1,1,goog-20231231.htm,10-K
1,0001652044-23-000016,2023-02-03,2022-12-31,2023-02-02T21:23:45.000Z,34,10-K,001-37580,,15264470,1,1,goog-20221231.htm,10-K
2,0001652044-22-000019,2022-02-02,2021-12-31,2022-02-01T21:08:02.000Z,34,10-K,001-37580,,15044932,1,1,goog-20211231.htm,10-K
3,0001652044-21-000010,2021-02-03,2020-12-31,2021-02-02T20:12:25.000Z,34,10-K,001-37580,,14948616,1,1,goog-20201231.htm,10-K
4,0001652044-20-000008,2020-02-04,2019-12-31,2020-02-03T21:03:59.000Z,34,10-K,001-37580,,16583126,1,1,goog10-k2019.htm,10-K


In [22]:
import requests
import os
import time
from weasyprint import HTML

def download_primary_documents(df, download_dir):
    """Downloads primary documents as PDF files from SEC EDGAR filings,
       converting HTML content to PDF using WeasyPrint.

    Args:
        df (pd.DataFrame): DataFrame containing filing information with columns:
            - accession_number: Accession number of the filing.
            - primaryDocument: Name of the primary document file.
        download_dir (str): Directory to save downloaded PDF files.

    Returns:
        None
    """

    base_url = "https://www.sec.gov/Archives/edgar/data/"

    headers = {
        'User-Agent': "Mozilla/5.0"
    }  # Headers to mimic a browser

    for index, row in df.iterrows():
        accession_number = row['accession_number']
        primary_document = row['primaryDocument']

        # Extract company CIK from accession number
        cik = accession_number.split("-")[0]

        # Construct URL directly using primaryDocument
        doc_url = f"{base_url}{cik}/{accession_number.replace('-', '')}/{primary_document}"

        # Download and convert to PDF
        try:
            response = requests.get(doc_url, headers=headers)
            response.raise_for_status()  # Raise an error for bad status codes

            html = HTML(string=response.text)
            filename = f"{accession_number}_{primary_document.replace('.htm', '.pdf')}"
            html.write_pdf(os.path.join(download_dir, filename))

            print(f"Downloaded {filename}")
        except Exception as e:
            print(f"Error downloading or converting {doc_url}: {e}")

        time.sleep(5)  # Add a 5-second delay between requests

# Assuming your DataFrame is named 'df' (replace with your actual DataFrame)
df = fl
download_dir = "GOOGLE"  # Directory to save downloaded PDF files

if not os.path.exists(download_dir):
    os.makedirs(download_dir)

download_primary_documents(df, download_dir)

ERROR:weasyprint:Relative URI reference without a base URI: <img src="goog-20231231_g1.jpg">
ERROR:weasyprint:Relative URI reference without a base URI: <img src="goog-20231231_g2.jpg">


Downloaded 0001652044-24-000022_goog-20231231.pdf


ERROR:weasyprint:Relative URI reference without a base URI: <img src="goog-20221231_g1.jpg">
ERROR:weasyprint:Relative URI reference without a base URI: <img src="goog-20221231_g2.jpg">


Downloaded 0001652044-23-000016_goog-20221231.pdf


ERROR:weasyprint:Relative URI reference without a base URI: <img src="goog-20211231_g1.jpg">
ERROR:weasyprint:Relative URI reference without a base URI: <img src="goog-20211231_g2.jpg">


Downloaded 0001652044-22-000019_goog-20211231.pdf


ERROR:weasyprint:Relative URI reference without a base URI: <img src="goog-20201231_g1.jpg">
ERROR:weasyprint:Relative URI reference without a base URI: <img src="goog-20201231_g2.jpg">


Downloaded 0001652044-21-000010_goog-20201231.pdf


ERROR:weasyprint:Relative URI reference without a base URI: <img src="chart-684fc4fa3c835ff7905.jpg">
ERROR:weasyprint:Relative URI reference without a base URI: <img src="chart-82b0fb3f451859028be.jpg">


Downloaded 0001652044-20-000008_goog10-k2019.pdf


# **Amazon 10k filings**

In [23]:
am = Company("AMZN")

In [24]:
am.financials

                                                                                                                   
  [1;38;5;39m                                             Balance Sheet                                             [0m          
                                                                                                                   
   [1m [0m[1m                                          [0m[1m [0m [1m [0m[1m      2023-12-31[0m[1m [0m [1m [0m[1m      2022-12-31[0m[1m [0m [1m [0m[1m      2021-12-31[0m[1m [0m           
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━           
    [1;38;5;39mASSETS:                                   [0m                                                                     
    [1;38;5;39mCURRENT ASSETS:                           [0m                                                                     
      Cash and Cash Equivalents             

In [25]:
filings = am.get_filings(form="10-K")
filings

╭───────────────────────────────────── Filings for AMAZON COM INC [1018724] ──────────────────────────────────────╮
│                                                                                                                 │
│  [1m [0m[1m  [0m[1m [0m [1;38;5;71m [0m[1;38;5;71mform[0m[1;38;5;71m [0m [1;38;5;39m [0m[1;38;5;39mfiled     [0m[1;38;5;39m [0m [1m [0m[1maccession_number    [0m[1m [0m [1m [0m[1mxbrl[0m[1m [0m                                                         │
│  ──────────────────────────────────────────────────────                                                         │
│  [1m [0m[1m0 [0m[1m [0m [1;38;5;71m [0m[1;38;5;71m10-K[0m[1;38;5;71m [0m [1;38;5;39m [0m[1;38;5;39m2024-02-02[0m[1;38;5;39m [0m [1m [0m[1m0001018724-24-000008[0m[1m [0m [1m [0m[1m✓   [0m[1m [0m                                                         │
│   1   [38;5;71m [0m[38;5;71m10-K[0m[38;5;71m [0m [38;5;39m [0m[38;5;39m20

In [26]:
filings.to_pandas()

Unnamed: 0,accession_number,filing_date,reportDate,acceptanceDateTime,act,form,fileNumber,items,size,isXBRL,isInlineXBRL,primaryDocument,primaryDocDescription
0,0001018724-24-000008,2024-02-02,2023-12-31,2024-02-01T18:48:30.000Z,34.0,10-K,000-22513,,12110804,1,1,amzn-20231231.htm,10-K
1,0001018724-23-000004,2023-02-03,2022-12-31,2023-02-02T18:27:34.000Z,34.0,10-K,000-22513,,12809287,1,1,amzn-20221231.htm,10-K
2,0001018724-22-000005,2022-02-04,2021-12-31,2022-02-03T18:46:51.000Z,34.0,10-K,000-22513,,13441589,1,1,amzn-20211231.htm,10-K
3,0001018724-21-000004,2021-02-03,2020-12-31,2021-02-02T19:44:10.000Z,34.0,10-K,000-22513,,12093058,1,1,amzn-20201231.htm,10-K
4,0001018724-20-000004,2020-01-31,2019-12-31,2020-01-30T20:46:13.000Z,34.0,10-K,000-22513,,13336980,1,1,amzn-20191231x10k.htm,10-K
5,0001018724-19-000004,2019-02-01,2018-12-31,2019-01-31T20:22:40.000Z,34.0,10-K,000-22513,,11900762,1,0,amzn-20181231x10k.htm,10-K
6,0001018724-18-000005,2018-02-02,2017-12-31,2018-02-01T20:41:15.000Z,34.0,10-K,000-22513,,12211807,1,0,amzn-20171231x10k.htm,10-K
7,0001018724-17-000011,2017-02-10,2016-12-31,2017-02-09T17:56:36.000Z,34.0,10-K,000-22513,,11741549,1,0,amzn-20161231x10k.htm,FORM 10-K
8,0001018724-16-000172,2016-01-29,2015-12-31,2016-01-28T19:38:11.000Z,34.0,10-K,000-22513,,11352950,1,0,amzn-20151231x10k.htm,FORM 10-K
9,0001018724-15-000006,2015-01-30,2014-12-31,2015-01-29T19:38:08.000Z,34.0,10-K,000-22513,,15432046,1,0,amzn-20141231x10k.htm,FORM 10-K


In [27]:
five_latest_10K = filings.head(5)
five_latest_10K

╭───────────────────────────────────── Filings for AMAZON COM INC [1018724] ──────────────────────────────────────╮
│                                                                                                                 │
│  [1m [0m[1m [0m[1m [0m [1;38;5;71m [0m[1;38;5;71mform[0m[1;38;5;71m [0m [1;38;5;39m [0m[1;38;5;39mfiled     [0m[1;38;5;39m [0m [1m [0m[1maccession_number    [0m[1m [0m [1m [0m[1mxbrl[0m[1m [0m                                                          │
│  ─────────────────────────────────────────────────────                                                          │
│  [1m [0m[1m0[0m[1m [0m [1;38;5;71m [0m[1;38;5;71m10-K[0m[1;38;5;71m [0m [1;38;5;39m [0m[1;38;5;39m2024-02-02[0m[1;38;5;39m [0m [1m [0m[1m0001018724-24-000008[0m[1m [0m [1m [0m[1m✓   [0m[1m [0m                                                          │
│   1  [38;5;71m [0m[38;5;71m10-K[0m[38;5;71m [0m [38;5;39m [0m[38;5;39m202

In [28]:
fl = five_latest_10K.to_pandas()
fl

Unnamed: 0,accession_number,filing_date,reportDate,acceptanceDateTime,act,form,fileNumber,items,size,isXBRL,isInlineXBRL,primaryDocument,primaryDocDescription
0,0001018724-24-000008,2024-02-02,2023-12-31,2024-02-01T18:48:30.000Z,34,10-K,000-22513,,12110804,1,1,amzn-20231231.htm,10-K
1,0001018724-23-000004,2023-02-03,2022-12-31,2023-02-02T18:27:34.000Z,34,10-K,000-22513,,12809287,1,1,amzn-20221231.htm,10-K
2,0001018724-22-000005,2022-02-04,2021-12-31,2022-02-03T18:46:51.000Z,34,10-K,000-22513,,13441589,1,1,amzn-20211231.htm,10-K
3,0001018724-21-000004,2021-02-03,2020-12-31,2021-02-02T19:44:10.000Z,34,10-K,000-22513,,12093058,1,1,amzn-20201231.htm,10-K
4,0001018724-20-000004,2020-01-31,2019-12-31,2020-01-30T20:46:13.000Z,34,10-K,000-22513,,13336980,1,1,amzn-20191231x10k.htm,10-K


In [29]:
import requests
import os
import time
from weasyprint import HTML

def download_primary_documents(df, download_dir):
    """Downloads primary documents as PDF files from SEC EDGAR filings,
       converting HTML content to PDF using WeasyPrint.

    Args:
        df (pd.DataFrame): DataFrame containing filing information with columns:
            - accession_number: Accession number of the filing.
            - primaryDocument: Name of the primary document file.
        download_dir (str): Directory to save downloaded PDF files.

    Returns:
        None
    """

    base_url = "https://www.sec.gov/Archives/edgar/data/"

    headers = {
        'User-Agent': "Mozilla/5.0"
    }  # Headers to mimic a browser

    for index, row in df.iterrows():
        accession_number = row['accession_number']
        primary_document = row['primaryDocument']

        # Extract company CIK from accession number
        cik = accession_number.split("-")[0]

        # Construct URL directly using primaryDocument
        doc_url = f"{base_url}{cik}/{accession_number.replace('-', '')}/{primary_document}"

        # Download and convert to PDF
        try:
            response = requests.get(doc_url, headers=headers)
            response.raise_for_status()  # Raise an error for bad status codes

            html = HTML(string=response.text)
            filename = f"{accession_number}_{primary_document.replace('.htm', '.pdf')}"
            html.write_pdf(os.path.join(download_dir, filename))

            print(f"Downloaded {filename}")
        except Exception as e:
            print(f"Error downloading or converting {doc_url}: {e}")

        time.sleep(5)  # Add a 5-second delay between requests

# Assuming your DataFrame is named 'df' (replace with your actual DataFrame)
df = fl
download_dir = "content"  # Directory to save downloaded PDF files

if not os.path.exists(download_dir):
    os.makedirs(download_dir)

download_primary_documents(df, download_dir)

Downloaded 0001018724-24-000008_amzn-20231231.pdf
Downloaded 0001018724-23-000004_amzn-20221231.pdf
Downloaded 0001018724-22-000005_amzn-20211231.pdf
Downloaded 0001018724-21-000004_amzn-20201231.pdf
Downloaded 0001018724-20-000004_amzn-20191231x10k.pdf
