In [1]:
import pandas as pd
import yfinance as yf
import numpy as np
from edgar import *
from bs4 import BeautifulSoup
import requests
from lxml import  etree
import json
import re

#### I want to scrape all of the list of 500 componies in S&P500 from wikipedia

In [2]:
# The URL of the website I am looking for
web_url = 'https://en.wikipedia.org/wiki/List_of_S%26P_500_companies'
#I want the see the response of the host website so I can use BeautifulSoup package
response = requests.get(web_url)
#Creating the soup object where I am giving to it the response
soup_ob = BeautifulSoup(response.content)
#I am collecting all the HTML table data on the wikipedia page 
#and have noticed that the first table is the one I want
data_table = soup_ob.find_all('table','wikitable')[0]
# var. all_value contains the rowvalues of the companies
all_value = data_table.find_all('tr')

In [3]:
SP500 = pd.DataFrame(columns = ['Symbol','Security','Sector','Sub-Sector','HQ Location','CIK'])
ix = 0
for row in all_value[1:]:
    values = row.find_all('td')
    Symbol = values[0].text
    Security = values[1].text
    Sector = values[2].text
    Sub_Sector = values[3].text
    HQ = values[4].text
    CIK = values[6].text
    SP500.loc[ix] = [Symbol,Security,Sector,Sub_Sector,HQ,CIK]
    ix +=1
SP500['Symbol'] = SP500['Symbol'].str.replace('\n','')
SP500.head()


Unnamed: 0,Symbol,Security,Sector,Sub-Sector,HQ Location,CIK
0,MMM,3M,Industrials,Industrial Conglomerates,"Saint Paul, Minnesota",66740
1,AOS,A. O. Smith,Industrials,Building Products,"Milwaukee, Wisconsin",91142
2,ABT,Abbott,Health Care,Health Care Equipment,"North Chicago, Illinois",1800
3,ABBV,AbbVie,Health Care,Biotechnology,"North Chicago, Illinois",1551152
4,ACN,Accenture,Information Technology,IT Consulting & Other Services,"Dublin, Ireland",1467373


#### Now I have the list of all of the S&P500 listed companies, their CIK key, Symbol and Security. 
- I can use the Symbol to download the price data from Yahoo Finance
- I can also download SEC fillings using the CIK code. This is possible thanks the Electronic Data Gathering, Analysis, and Retrieval (EDGAR) provided by the SEC
- The Sector and Sub-Sector is used to organize the data according to sector if we need any further data.

In this part I am initializing the EDGAR package, where I need to set my identity to SEC for trancperancy purposes, so I can download the legal documents online. 

In [4]:
#set my identity for the SEC (First name last name and email)
name = "Maseeh"
surname = "Faizan"
email = "maseehfaizan@gmail.com"
headers = {'User-Agent':f'{name} {surname} {email}'}

In [5]:
#Initially I need to keep the link and this link is xml for now but you can change it to html later. 
form = 'https://www.sec.gov/Archives/edgar/data/320193/000032019322000063/wf-form4_165248105838188.xml'
res = requests.get(form,headers=headers)
#This is an xml document so you need to be able to read it afterwards
print(res.content)

b'<?xml version="1.0"?>\n<ownershipDocument>\n\n    <schemaVersion>X0306</schemaVersion>\n\n    <documentType>4</documentType>\n\n    <periodOfReport>2022-05-06</periodOfReport>\n\n    <notSubjectToSection16>0</notSubjectToSection16>\n\n    <issuer>\n        <issuerCik>0000320193</issuerCik>\n        <issuerName>Apple Inc.</issuerName>\n        <issuerTradingSymbol>AAPL</issuerTradingSymbol>\n    </issuer>\n\n    <reportingOwner>\n        <reportingOwnerId>\n            <rptOwnerCik>0001182047</rptOwnerCik>\n            <rptOwnerName>BELL JAMES A</rptOwnerName>\n        </reportingOwnerId>\n        <reportingOwnerAddress>\n            <rptOwnerStreet1>ONE APPLE PARK WAY</rptOwnerStreet1>\n            <rptOwnerStreet2></rptOwnerStreet2>\n            <rptOwnerCity>CUPERTINO</rptOwnerCity>\n            <rptOwnerState>CA</rptOwnerState>\n            <rptOwnerZipCode>95014</rptOwnerZipCode>\n            <rptOwnerStateDescription></rptOwnerStateDescription>\n        </reportingOwnerAddress>\

This part of the code is helping me get the data of the company name, it's ticker and cik string that are available it the datastring. Using this I will be able to download any files in SEC's database. 
Notice that everything in the datastream is a dictionary, this means that I can loop through these dictionaries and find the actual data I am looking for

In [6]:
#Whenever you are requesting something from the sec include your identity or else the code will crash
tic_to_cik = requests.get('https://www.sec.gov/files/company_tickers.json',headers=headers).json()
for i in range(5):
    print(i,tic_to_cik[f'{i}'])

0 {'cik_str': 789019, 'ticker': 'MSFT', 'title': 'MICROSOFT CORP'}
1 {'cik_str': 320193, 'ticker': 'AAPL', 'title': 'Apple Inc.'}
2 {'cik_str': 1045810, 'ticker': 'NVDA', 'title': 'NVIDIA CORP'}
3 {'cik_str': 1018724, 'ticker': 'AMZN', 'title': 'AMAZON COM INC'}
4 {'cik_str': 1652044, 'ticker': 'GOOGL', 'title': 'Alphabet Inc.'}


In [7]:
cik_lookup = dict([(val['ticker'], val['cik_str']) for key, val in tic_to_cik.items()])
cik = cik_lookup['NVDA']
cik

1045810

##### Remember that in the SEC files, they are all 10 digits long and if the CIK key is smaller (which is usually the case) EDGAR will simply add bunch of zeros in front. That is what we are doing at {cik:0>10} and turning it into json file.

{cik:0>10} is a string formatting expression. Let's break it down:

{cik}: This part of the expression is a placeholder for the variable cik within the f-string.
:0>10: This is the formatting specifier.
0: This specifies the character to use for padding. In this case, it's 0.
>: This is the alignment specifier. It specifies that the content should be right-aligned.
10: This specifies the width of the field. It means that the resulting string should be at least 10 characters wide. If the length of the variable cik is less than 10, it will be padded with zeros on the left to meet the specified width.


In [8]:
edgar_filings = requests.get(f"https://data.sec.gov/submissions/CIK{cik:0>10}.json", headers=headers).json()
edgar_filings.keys()

dict_keys(['cik', 'entityType', 'sic', 'sicDescription', 'insiderTransactionForOwnerExists', 'insiderTransactionForIssuerExists', 'name', 'tickers', 'exchanges', 'ein', 'description', 'website', 'investorWebsite', 'category', 'fiscalYearEnd', 'stateOfIncorporation', 'stateOfIncorporationDescription', 'addresses', 'phone', 'flags', 'formerNames', 'filings'])

We have date time in *filingdate and reportDate* but pandas doesn't know that they are date variables. That is why we need to change those into datetime variables

In [9]:
recents = pd.DataFrame(edgar_filings['filings']['recent'])
recents.head()

Unnamed: 0,accessionNumber,filingDate,reportDate,acceptanceDateTime,act,form,fileNumber,filmNumber,items,size,isXBRL,isInlineXBRL,primaryDocument,primaryDocDescription
0,0001045810-24-000041,2024-02-27,2024-02-23,2024-02-27T17:08:35.000Z,,4,,,,10191,0,0,xslF345X05/wk-form4_1709071698.xml,FORM 4
1,0001045810-24-000040,2024-02-27,2024-02-23,2024-02-27T17:06:39.000Z,,4,,,,8001,0,0,xslF345X05/wk-form4_1709071584.xml,FORM 4
2,0001045810-24-000038,2024-02-27,2023-12-13,2024-02-27T17:04:29.000Z,,4,,,,9290,0,0,xslF345X05/wk-form4_1709071452.xml,FORM 4
3,0001045810-24-000037,2024-02-27,2024-02-23,2024-02-27T17:02:34.000Z,,4,,,,40523,0,0,xslF345X05/wk-form4_1709071330.xml,FORM 4
4,0001968582-24-000130,2024-02-23,,2024-02-23T16:13:20.000Z,33.0,144,000-23985,24671308.0,,4535,0,0,xsl144X01/primary_doc.xml,


In this part we have file number but most importantly the form name.
- Form 4 is Insider trade; Someone inside the company is selling or buying the shares and need to disclose those
- Form 144 is a normal sale transaction that will surpace some amount, above $50'000 or more than 5000 shares. 
- Form 10-K is the file that has the end of the year transactions.

Furthermore we have accessionNumber with a few '-' that we don't want when we are retrieving the data.

In [10]:
recents['reportDate'] = pd.to_datetime(recents['reportDate'])
recents['filingDate'] = pd.to_datetime(recents['filingDate'])

recents.head(10)

Unnamed: 0,accessionNumber,filingDate,reportDate,acceptanceDateTime,act,form,fileNumber,filmNumber,items,size,isXBRL,isInlineXBRL,primaryDocument,primaryDocDescription
0,0001045810-24-000041,2024-02-27,2024-02-23,2024-02-27T17:08:35.000Z,,4,,,,10191,0,0,xslF345X05/wk-form4_1709071698.xml,FORM 4
1,0001045810-24-000040,2024-02-27,2024-02-23,2024-02-27T17:06:39.000Z,,4,,,,8001,0,0,xslF345X05/wk-form4_1709071584.xml,FORM 4
2,0001045810-24-000038,2024-02-27,2023-12-13,2024-02-27T17:04:29.000Z,,4,,,,9290,0,0,xslF345X05/wk-form4_1709071452.xml,FORM 4
3,0001045810-24-000037,2024-02-27,2024-02-23,2024-02-27T17:02:34.000Z,,4,,,,40523,0,0,xslF345X05/wk-form4_1709071330.xml,FORM 4
4,0001968582-24-000130,2024-02-23,NaT,2024-02-23T16:13:20.000Z,33.0,144,000-23985,24671308.0,,4535,0,0,xsl144X01/primary_doc.xml,
5,0002007317-24-000190,2024-02-23,NaT,2024-02-23T15:55:40.000Z,33.0,144,000-23985,24670973.0,,8017,0,0,xsl144X01/primary_doc.xml,
6,0002007317-24-000189,2024-02-23,NaT,2024-02-23T15:45:30.000Z,33.0,144,000-23985,24670937.0,,4915,0,0,xsl144X01/primary_doc.xml,
7,0002007317-24-000185,2024-02-23,NaT,2024-02-23T14:05:36.000Z,33.0,144,000-23985,24670076.0,,4308,0,0,xsl144X01/primary_doc.xml,
8,0001045810-24-000029,2024-02-21,2024-01-28,2024-02-21T16:36:57.000Z,34.0,10-K,000-23985,24660316.0,,11813809,1,1,nvda-20240128.htm,10-K
9,0001045810-24-000028,2024-02-21,2024-02-21,2024-02-21T16:22:09.000Z,34.0,8-K,000-23985,24659885.0,"2.02,9.01",803595,1,1,nvda-20240221.htm,8-K


In [11]:
tenk = recents[(recents['form']=='10-K')]

In [12]:
def make_url(cik, row):
    accessionNumber = row['accessionNumber'].replace("-", "")
    return f"https://www.sec.gov/Archives/edgar/data/{cik}/{accessionNumber}/{row['accessionNumber']}.txt"
make_url(cik,tenk.iloc[0])

'https://www.sec.gov/Archives/edgar/data/1045810/000104581024000029/0001045810-24-000029.txt'

In [13]:
tenk.iloc[0]

accessionNumber              0001045810-24-000029
filingDate                    2024-02-21 00:00:00
reportDate                    2024-01-28 00:00:00
acceptanceDateTime       2024-02-21T16:36:57.000Z
act                                            34
form                                         10-K
fileNumber                              000-23985
filmNumber                               24660316
items                                            
size                                     11813809
isXBRL                                          1
isInlineXBRL                                    1
primaryDocument                 nvda-20240128.htm
primaryDocDescription                        10-K
Name: 8, dtype: object

Parsing *HTML* data is easy, BeautifulSoup package will break the *HTML* code down and will helpt you structure the code more easily

In [14]:
req = requests.get(make_url(cik, tenk.iloc[0]), headers=headers)
soup = BeautifulSoup(req.content,'html.parser')
#I am looking into the first 300 characters 
req.content[0:500]

b'<SEC-DOCUMENT>0001045810-24-000029.txt : 20240221\n<SEC-HEADER>0001045810-24-000029.hdr.sgml : 20240221\n<ACCEPTANCE-DATETIME>20240221163657\nACCESSION NUMBER:\t\t0001045810-24-000029\nCONFORMED SUBMISSION TYPE:\t10-K\nPUBLIC DOCUMENT COUNT:\t\t114\nCONFORMED PERIOD OF REPORT:\t20240128\nFILED AS OF DATE:\t\t20240221\nDATE AS OF CHANGE:\t\t20240221\n\nFILER:\n\n\tCOMPANY DATA:\t\n\t\tCOMPANY CONFORMED NAME:\t\t\tNVIDIA CORP\n\t\tCENTRAL INDEX KEY:\t\t\t0001045810\n\t\tSTANDARD INDUSTRIAL CLASSIFICATION:\tSEMICONDUCTORS & RELATED DEVIC'

In [29]:
docs = soup.find_all(re.compile("table", re.IGNORECASE))

In [31]:
print(docs[0:100])

[<table style="border-collapse:collapse;display:inline-table;margin-bottom:5pt;vertical-align:text-bottom;width:93.750%"><tr><td style="width:1.0%"></td><td style="width:2.650%"></td><td style="width:0.1%"></td><td style="width:1.0%"></td><td style="width:95.150%"></td><td style="width:0.1%"></td></tr><tr><td colspan="3" style="padding:2px 1pt;text-align:left;vertical-align:top"><span style="color:#000000;font-family:'NVIDIA Sans',sans-serif;font-size:9pt;font-weight:400;line-height:100%"><ix:nonnumeric contextref="c-1" format="ixt:fixed-true" id="f-2" name="dei:DocumentAnnualReport">☒</ix:nonnumeric></span></td><td colspan="3" style="padding:2px 1pt;text-align:left;vertical-align:top"><span style="color:#000000;font-family:'NVIDIA Sans',sans-serif;font-size:9pt;font-weight:700;line-height:100%">ANNUAL REPORT PURSUANT TO SECTION 13 OR 15(d) OF THE SECURITIES EXCHANGE ACT OF 1934</span></td></tr></table>, <table style="border-collapse:collapse;display:inline-table;margin-bottom:5pt;vert

In [17]:
data_table = soup.find_all('table')[10]
value = data_table.find_all('td')
# var. all_value contains the rowvalues of the companies

IndexError: list index out of range

In [None]:
docs = soup.find_all(re.compile("ownershipDocument", re.IGNORECASE))

In [None]:
print(docs)