In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import json

# Define the URL for the SEC EDGAR search page
url = "https://www.sec.gov/files/company_tickers_exchange.json"

headers = {
   "User-Agent": "your.email@email.com"
}
# Fetch the HTML content of the search page
response = requests.get(url,headers=headers)
html_content = response.content

# Parse the HTML content using BeautifulSoup
soup = BeautifulSoup(html_content, 'html.parser')

json_url = f"https://data.sec.gov/api/xbrl/companyconcept/CIK0001652044/us-gaap/AccountsPayableCurrent.json"
json_response = requests.get(json_url,headers=headers)
json_content = json_response.json()
print(json_content)
print(soup.prettify())



{'cik': 1652044, 'taxonomy': 'us-gaap', 'tag': 'AccountsPayableCurrent', 'label': 'Accounts Payable, Current', 'description': "Carrying value as of the balance sheet date of liabilities incurred (and for which invoices have typically been received) and payable to vendors for goods and services received that are used in an entity's business. Used to reflect the current portion of the liabilities (due within one year or within the normal operating cycle if longer).", 'entityName': 'Alphabet Inc.', 'units': {'USD': [{'end': '2014-12-31', 'val': 1715000000, 'accn': '0001652044-15-000005', 'fy': 2015, 'fp': 'Q3', 'form': '10-Q', 'filed': '2015-10-29'}, {'end': '2014-12-31', 'val': 1715000000, 'accn': '0001652044-16-000012', 'fy': 2015, 'fp': 'FY', 'form': '10-K', 'filed': '2016-02-11'}, {'end': '2014-12-31', 'val': 1715000000, 'accn': '0001652044-16-000022', 'fy': 2015, 'fp': 'FY', 'form': '8-K', 'filed': '2016-05-03', 'frame': 'CY2014Q4I'}, {'end': '2015-09-30', 'val': 1549000000, 'accn': 

In [2]:


# Parse the JSON content from the BeautifulSoup object
json_data = json.loads(soup.text)

# Convert the JSON data into a pandas DataFrame
df_cik_lookup = pd.DataFrame(json_data['data'], columns=json_data['fields'])


# df_cik_lookup['cik']=df_cik_lookup['cik'].astype(str).zfill(10)
df_cik_lookup

Unnamed: 0,cik,name,ticker,exchange
0,320193,Apple Inc.,AAPL,Nasdaq
1,1045810,NVIDIA CORP,NVDA,Nasdaq
2,789019,MICROSOFT CORP,MSFT,Nasdaq
3,1652044,Alphabet Inc.,GOOGL,Nasdaq
4,1018724,AMAZON COM INC,AMZN,Nasdaq
...,...,...,...,...
10190,2024203,Cayson Acquisition Corp,CAPNU,Nasdaq
10191,2025065,Black Spade Acquisition II Co,BSIIU,Nasdaq
10192,2025065,Black Spade Acquisition II Co,BSIIW,Nasdaq
10193,2025341,Andretti Acquisition Corp. II,POLEU,Nasdaq


In [3]:
json_data_list = []

# Define headers for the request
headers = {
    'User-Agent': 'Your Name (your_email@example.com)',
    'Accept-Encoding': 'gzip, deflate',
    'Host': 'data.sec.gov'
}

# Loop through the first 100 CIKs
for i in range(100):
    cik = df_cik_lookup['cik'][i]
    cik=str(cik).zfill(10)
    print(cik)
    json_url = f"https://data.sec.gov/api/xbrl/companyconcept/CIK{cik}/us-gaap/AccountsPayableCurrent.json"
    json_response = requests.get(json_url, headers=headers)
    
    # Check if the request was successful
    if json_response.status_code == 200:
        json_content = json_response.json()
        json_data_list.append(json_content)
    else:
        print(f"Failed to fetch data for CIK: {cik}")

# Convert the list of JSON content into a DataFrame
df_json_content = pd.DataFrame(json_data_list)
df_json_content.to_csv('data.csv', index=False)


0000320193
0001045810
0000789019
0001652044
0001018724
0001326801
0001067983
Failed to fetch data for CIK: 0001067983
0001046179
Failed to fetch data for CIK: 0001046179
0001730168
0000059478
0001318605
0000104169
0000019617
Failed to fetch data for CIK: 0000019617
0000313838
0000034088
Failed to fetch data for CIK: 0000034088
0000731766
Failed to fetch data for CIK: 0000731766
0001403161
0000884394
Failed to fetch data for CIK: 0000884394
0000353278
Failed to fetch data for CIK: 0000353278
0001341439
0001141391
0000354950
0000080424
0000909832
0000200406
0001551152
0000937966
0000070858
Failed to fetch data for CIK: 0000070858
0001065280
0000021344
Failed to fetch data for CIK: 0000021344
0000002488
0001108524
0000310158
0000093410
0001000184
Failed to fetch data for CIK: 0001000184
0001577552
Failed to fetch data for CIK: 0001577552
0001283699
Failed to fetch data for CIK: 0001283699
0000901832
Failed to fetch data for CIK: 0000901832
0000077476
0001094517
0001114448
Failed to fetch 

In [None]:
df_json_content

In [5]:
df_units=pd.json_normalize(df_json_content['units'])

In [None]:
df_units

In [None]:
# Define a function to extract the 'val' field from the JSON data
def extract_val(json_data):
    normalized_data = pd.json_normalize(json_data)
    filtered_data = normalized_data[normalized_data['form'] == '10-K']
    return filtered_data['val'].tolist()

def extract_filled(json_data):
    normalized_data = pd.json_normalize(json_data)
    filtered_data = normalized_data[normalized_data['form'] == '10-K']
    return filtered_data['end'].tolist()

# Apply the function to the 'USD' column and create a new column 'value'
df_json_content['value'] = df_units['USD'].apply(lambda x: extract_val(x) if isinstance(x, list) else [])
df_json_content['dates_filed'] = df_units['USD'].apply(lambda x: extract_filled(x) if isinstance(x, list) else [])

# Display the updated DataFrame
df_json_content


In [None]:

import matplotlib.pyplot as plt

# Plot graphs of value for the first 5 companies
for i in range(5):
    company_name = df_json_content['entityName'][i]
    values = df_json_content['value'][i]
    time = df_json_content['dates_filed'][i]
    
    plt.figure(figsize=(10, 5))
    plt.plot(time, values, marker='o')
    plt.title(f'Value Over Time for {company_name}')
    plt.xlabel('Time')
    plt.ylabel('Value')
    plt.xticks(rotation=45)  # Rotate x-axis labels for better readability
    plt.ylim(min(values) * 0.9, max(values) * 1.1)  # Set y-axis limits with some padding
    plt.grid(True)
    plt.show()

In [None]:
json_data_list = []

# Define headers for the request
headers = {
    'User-Agent': 'Your Name (your_email@example.com)',
    'Accept-Encoding': 'gzip, deflate',
    'Host': 'data.sec.gov'
}

# Loop through the first 100 CIKs
for i in range(100):
    cik = df_cik_lookup['cik'][i]
    cik=str(cik).zfill(10)
    print(cik)
    json_url = f"https://data.sec.gov/api/xbrl/companyfacts/CIK{cik}.json"
    json_response = requests.get(json_url, headers=headers)
    
    # Check if the request was successful
    if json_response.status_code == 200:
        json_content = json_response.json()
        json_data_list.append(json_content)
    else:
        print(f"Failed to fetch data for CIK: {cik}")

# Convert the list of JSON content into a DataFrame
df_json_companyfacts = pd.DataFrame(json_data_list)
df_json_companyfacts.to_csv('data2.csv', index=False)

In [None]:
df_json_companyfacts

In [19]:
df_facts=pd.json_normalize(df_json_companyfacts['facts'])

In [None]:
df_facts

In [None]:
df_facts.columns

In [None]:
url="https://data.sec.gov/submissions/CIK0000822663.json"
# Fetch the JSON content from the URL
response = requests.get(url,headers=headers)
json_content = response.json()

# Convert JSON content to a string
json_str = json.dumps(json_content)

# Parse the string using BeautifulSoup
soup = BeautifulSoup(json_str, 'html.parser')

# Print the parsed content
print(soup.prettify())


In [None]:
print(json_content.keys())

In [None]:
print(json_content['fiscalYearEnd'])