In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import json

# URL of the SEC ticker/exchange lookup file. It is a JSON document, not an HTML page.
url = "https://www.sec.gov/files/company_tickers_exchange.json"

# Identify the client to the SEC; replace the placeholder with a real contact address.
headers = {
    "User-Agent": "your.email@email.com"
}

# Download the lookup file. A timeout stops the cell from hanging forever and
# raise_for_status() surfaces HTTP errors here instead of as a parse error later.
response = requests.get(url, headers=headers, timeout=30)
response.raise_for_status()
html_content = response.content  # raw JSON bytes; name kept because later cells may use it

# Wrap the payload in BeautifulSoup; kept because later cells may still reference `soup`.
soup = BeautifulSoup(html_content, 'html.parser')

# Sample company-concept request (AccountsPayableCurrent for CIK 0001652044)
json_url = "https://data.sec.gov/api/xbrl/companyconcept/CIK0001652044/us-gaap/AccountsPayableCurrent.json"
json_response = requests.get(json_url, headers=headers, timeout=30)
json_response.raise_for_status()
json_content = json_response.json()
print(json_content)
print(soup.prettify())

In [None]:


# Decode the lookup payload straight from the downloaded bytes.
# Reading it back through BeautifulSoup's `.text` would pass the JSON through an
# HTML parser, which can rewrite entity-like or tag-like text before it is decoded.
json_data = json.loads(html_content)

# 'fields' holds the column names and 'data' holds the rows
df_cik_lookup = pd.DataFrame(json_data['data'], columns=json_data['fields'])

# CIKs are zero-padded to 10 digits where the request URLs are built; to pad the
# whole column instead, use df_cik_lookup['cik'].astype(str).str.zfill(10).
df_cik_lookup

In [None]:
json_data_list = []

# Headers sent with every data.sec.gov request in this cell
headers = {
    'User-Agent': 'Your Name (your_email@example.com)',
    'Accept-Encoding': 'gzip, deflate',
    'Host': 'data.sec.gov'
}

# Fetch the AccountsPayableCurrent concept for at most the first 100 CIKs.
# .head(100) avoids a KeyError when the lookup table has fewer than 100 rows.
for cik in df_cik_lookup['cik'].head(100):
    cik = str(cik).zfill(10)  # the URL takes the CIK zero-padded to 10 digits
    print(cik)
    json_url = f"https://data.sec.gov/api/xbrl/companyconcept/CIK{cik}/us-gaap/AccountsPayableCurrent.json"
    # The timeout keeps one stalled request from hanging the whole loop
    json_response = requests.get(json_url, headers=headers, timeout=30)

    # Keep only successful responses; anything else is reported and skipped
    if json_response.status_code == 200:
        json_content = json_response.json()
        json_data_list.append(json_content)
    else:
        print(f"Failed to fetch data for CIK: {cik}")

# One row per successful response; nested values (such as 'units') are written
# to the CSV as their string representation.
df_json_content = pd.DataFrame(json_data_list)
df_json_content.to_csv('data.csv', index=False)


In [None]:
# Inspect the collected company-concept responses (one row per successful request)
df_json_content

In [5]:
# Flatten each response's 'units' entry into columns (the 'USD' column is used below)
df_units=pd.json_normalize(df_json_content['units'])

In [None]:
# Inspect the flattened units table
df_units

In [None]:
# Define a function to extract the 'val' field from the JSON data
def extract_val(json_data):
    """Return the ``val`` of every fact that was reported on a 10-K form.

    :param json_data: list of fact dicts, each expected to carry 'form' and 'val'
    :return: list of values in their original order; an empty list when the
        input is empty or carries no 'form'/'val' fields
    """
    normalized_data = pd.json_normalize(json_data)
    # An empty fact list normalizes to a frame without columns, so guard the
    # lookups below instead of raising KeyError.
    if not {'form', 'val'}.issubset(normalized_data.columns):
        return []
    filtered_data = normalized_data[normalized_data['form'] == '10-K']
    return filtered_data['val'].tolist()

def extract_filled(json_data):
    """Return the ``end`` date of every fact that was reported on a 10-K form.

    NOTE(review): despite the function name, this reads the 'end' field, not
    'filed' - confirm which date the callers actually want.

    :param json_data: list of fact dicts, each expected to carry 'form' and 'end'
    :return: list of 'end' values in their original order; an empty list when
        the input is empty or carries no 'form'/'end' fields
    """
    normalized_data = pd.json_normalize(json_data)
    # An empty fact list normalizes to a frame without columns, so guard the
    # lookups below instead of raising KeyError.
    if not {'form', 'end'}.issubset(normalized_data.columns):
        return []
    filtered_data = normalized_data[normalized_data['form'] == '10-K']
    return filtered_data['end'].tolist()

# Derive the 10-K values and their dates from the 'USD' facts of every company.
# Cells that are not lists (no USD facts for that row) become empty lists.
usd_facts = df_units['USD']
df_json_content['value'] = usd_facts.apply(
    lambda facts: extract_val(facts) if isinstance(facts, list) else []
)
df_json_content['dates_filed'] = usd_facts.apply(
    lambda facts: extract_filled(facts) if isinstance(facts, list) else []
)

# Show the frame with the two new columns
df_json_content


In [None]:

import matplotlib.pyplot as plt

# Plot the 10-K values over time for at most the first 5 companies
for _, company in df_json_content.head(5).iterrows():
    company_name = company['entityName']
    values = company['value']
    time = company['dates_filed']

    # Rows without USD 10-K facts carry empty lists; min()/max() would raise on them
    if not values:
        print(f"No 10-K values to plot for {company_name}")
        continue

    fig, ax = plt.subplots(figsize=(10, 5))
    ax.plot(time, values, marker='o')
    ax.set_title(f'Value Over Time for {company_name}')
    ax.set_xlabel('Time')
    ax.set_ylabel('Value')
    ax.tick_params(axis='x', rotation=45)  # Rotate x-axis labels for better readability

    # Pad the y-axis by 10% of the data range. Scaling min/max by 0.9/1.1 would
    # clip negative values; fall back to the magnitude (or 1) for flat series.
    low, high = min(values), max(values)
    padding = 0.1 * (high - low) or 0.1 * abs(high) or 1.0
    ax.set_ylim(low - padding, high + padding)

    ax.grid(True)
    plt.show()
    plt.close(fig)  # release the figure so repeated runs do not accumulate open figures

In [None]:
json_data_list = []

# Headers sent with every data.sec.gov request in this cell
headers = {
    'User-Agent': 'Your Name (your_email@example.com)',
    'Accept-Encoding': 'gzip, deflate',
    'Host': 'data.sec.gov'
}

# Fetch the full company-facts document for at most the first 100 CIKs.
# .head(100) avoids a KeyError when the lookup table has fewer than 100 rows.
for cik in df_cik_lookup['cik'].head(100):
    cik = str(cik).zfill(10)  # the URL takes the CIK zero-padded to 10 digits
    print(cik)
    json_url = f"https://data.sec.gov/api/xbrl/companyfacts/CIK{cik}.json"
    # The timeout keeps one stalled request from hanging the whole loop
    json_response = requests.get(json_url, headers=headers, timeout=30)

    # Keep only successful responses; anything else is reported and skipped
    if json_response.status_code == 200:
        json_content = json_response.json()
        json_data_list.append(json_content)
    else:
        print(f"Failed to fetch data for CIK: {cik}")

# One row per successful response; nested values (such as 'facts') are written
# to the CSV as their string representation.
df_json_companyfacts = pd.DataFrame(json_data_list)
df_json_companyfacts.to_csv('data2.csv', index=False)

In [None]:
# Inspect the collected company-facts responses (one row per successful request)
df_json_companyfacts

In [19]:
# Flatten every company's nested 'facts' entry into flat columns
df_facts=pd.json_normalize(df_json_companyfacts['facts'])

In [None]:
# Inspect the flattened facts table
df_facts

In [None]:
# List the flattened column names
df_facts.columns

In [None]:
url="https://data.sec.gov/submissions/CIK0000822663.json"
# Fetch the submissions record; `headers` is the dict defined in the download cell above.
# Fail fast on HTTP errors and never wait indefinitely on the server.
response = requests.get(url, headers=headers, timeout=30)
response.raise_for_status()
json_content = response.json()

# Pretty-print as indented JSON. An HTML pretty-printer is the wrong tool here:
# it adds no structure to JSON text and HTML-escapes characters such as & and <.
json_str = json.dumps(json_content, indent=2)
print(json_str)


In [None]:
# List the top-level keys of the submissions payload
print(json_content.keys())

In [None]:
# Show the 'fiscalYearEnd' field of the submissions payload
print(json_content['fiscalYearEnd'])

In [6]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import json

In [7]:
# CIK to analyse, zero-padded to 10 digits as the request URLs expect
# (1652044 is reported as 'Alphabet Inc.' in the downloaded payload below)
cik = '0001652044'

In [8]:
def make_edgar_request(url, timeout=30):
    """
    Make a GET request to EDGAR (Electronic Data Gathering, Analysis and Retrieval)

    :param url: request URL
    :param timeout: seconds to wait for the server before giving up
    :return: the response object; the HTTP status is not checked here
    """
    headers = {
        # NOTE(review): this is a browser-style User-Agent, while the other cells in
        # this notebook send a contact-style one - confirm which the SEC expects.
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36",
        # "br" is not advertised: requests can only decode Brotli when an optional
        # package is installed, and the other cells also ask for gzip/deflate only.
        "Accept-Encoding": "gzip, deflate",
    }
    # Without a timeout a stalled connection would block the notebook indefinitely
    return requests.get(url, headers=headers, timeout=timeout)

In [9]:
def download_financial_data(cik):
    """
    Download the XBRL company-facts document for a company.

    Each request returns the entire history. The payload is tagged with ``_id``
    (the CIK) and ``url``; nothing is written to a database by this function.

    :param cik: company cik, in the zero-padded form used in the request URL
    :return: the decoded JSON payload, or None when the response is not valid JSON
    """
    url = f"https://data.sec.gov/api/xbrl/companyfacts/CIK{cik}.json"
    response = make_edgar_request(url)

    try:
        data = response.json()
    # ETFs, funds, trusts do not have financial information
    except ValueError:
        # JSON decoding errors are ValueError subclasses; other failures propagate
        print(f"ERROR {cik} - {response} - {url}")
        # company_from_cik is an optional helper: use it only when it is defined,
        # so a missing helper cannot mask the real failure with a NameError.
        describe_company = globals().get("company_from_cik")
        if callable(describe_company):
            print(describe_company(cik))
        return None

    data["_id"] = cik
    data["url"] = url
    return data

In [11]:
# Download the company-facts document for the selected CIK
data=download_financial_data(cik)

In [12]:
# Inspect the raw payload (a large nested dict)
data

{'cik': 1652044,
 'entityName': 'Alphabet Inc.',
 'facts': {'dei': {'EntityPublicFloat': {'label': 'Entity Public Float',
    'description': "The aggregate market value of the voting and non-voting common equity held by non-affiliates computed by reference to the price at which the common equity was last sold, or the average bid and asked price of such common equity, as of the last business day of the registrant's most recently completed second fiscal quarter.",
    'units': {'USD': [{'end': '2015-06-30',
       'val': 0,
       'accn': '0001652044-16-000012',
       'fy': 2015,
       'fp': 'FY',
       'form': '10-K',
       'filed': '2016-02-11'},
      {'end': '2015-06-30',
       'val': 0,
       'accn': '0001652044-16-000022',
       'fy': 2015,
       'fp': 'FY',
       'form': '8-K',
       'filed': '2016-05-03',
       'frame': 'CY2015Q2I'},
      {'end': '2016-06-30',
       'val': 413800000000,
       'accn': '0001652044-17-000008',
       'fy': 2016,
       'fp': 'FY',
    

In [13]:
# Normalize the 'facts' field in the data dictionary into a single-row frame.
# sep='_' joins the nested keys, giving column names such as
# 'us-gaap_AccountsPayableCurrent_units_USD'.
# NOTE: this rebinds `df_facts`, which an earlier cell used for the multi-company frame.
df_facts = pd.json_normalize(data['facts'], sep='_')

# Display the DataFrame
df_facts

Unnamed: 0,dei_EntityPublicFloat_label,dei_EntityPublicFloat_description,dei_EntityPublicFloat_units_USD,us-gaap_AccountsPayableCurrent_label,us-gaap_AccountsPayableCurrent_description,us-gaap_AccountsPayableCurrent_units_USD,us-gaap_AccountsReceivableNetCurrent_label,us-gaap_AccountsReceivableNetCurrent_description,us-gaap_AccountsReceivableNetCurrent_units_USD,us-gaap_AccruedIncomeTaxesCurrent_label,...,us-gaap_NetIncomeLossAvailableToCommonStockholdersDiluted_units_USD,us-gaap_UndistributedEarningsLossAvailableToCommonShareholdersBasic_label,us-gaap_UndistributedEarningsLossAvailableToCommonShareholdersBasic_description,us-gaap_UndistributedEarningsLossAvailableToCommonShareholdersBasic_units_USD,us-gaap_WeightedAverageNumberOfDilutedSharesOutstanding_label,us-gaap_WeightedAverageNumberOfDilutedSharesOutstanding_description,us-gaap_WeightedAverageNumberOfDilutedSharesOutstanding_units_shares,us-gaap_WeightedAverageNumberOfSharesOutstandingBasic_label,us-gaap_WeightedAverageNumberOfSharesOutstandingBasic_description,us-gaap_WeightedAverageNumberOfSharesOutstandingBasic_units_shares
0,Entity Public Float,The aggregate market value of the voting and n...,"[{'end': '2015-06-30', 'val': 0, 'accn': '0001...","Accounts Payable, Current",Carrying value as of the balance sheet date of...,"[{'end': '2014-12-31', 'val': 1715000000, 'acc...","Accounts Receivable, after Allowance for Credi...","Amount, after allowance for credit loss, of ri...","[{'end': '2014-12-31', 'val': 9383000000, 'acc...","Accrued Income Taxes, Current",...,"[{'start': '2023-01-01', 'end': '2023-06-30', ...",Undistributed Earnings (Loss) Available to Com...,Amount of undistributed earnings (loss) alloca...,"[{'start': '2023-01-01', 'end': '2023-06-30', ...","Weighted Average Number of Shares Outstanding,...",The average number of shares or units issued a...,"[{'start': '2023-01-01', 'end': '2023-06-30', ...","Weighted Average Number of Shares Outstanding,...","Number of [basic] shares or units, after adjus...","[{'start': '2023-01-01', 'end': '2023-06-30', ..."


In [14]:
# Function to extract year-wise data
def extract_yearly_data(df, column_prefix):
    """Explode the list-valued fact columns of ``df`` into one row per fact.

    Only columns whose name starts with ``column_prefix`` and whose cell holds a
    list are considered. Every fact dict is copied and extended with:

    * ``year``  - the first four characters of the fact's 'end' value
    * ``label`` - the name of the column the fact came from

    :param df: frame whose matching cells hold lists of fact dicts
    :param column_prefix: prefix selecting the columns to explode
    :return: DataFrame with one row per fact, in row/column/list order
    """
    records = []
    for _, row in df.iterrows():
        for column_name in df.columns:
            if not column_name.startswith(column_prefix):
                continue
            facts = row[column_name]
            if not isinstance(facts, list):
                continue
            records.extend(
                {**fact, 'year': fact['end'][:4], 'label': column_name}
                for fact in facts
            )
    return pd.DataFrame(records)

# Extract year-wise data for 'us-gaap' columns: one row per reported fact,
# with 'label' naming the source column and 'year' taken from the 'end' date
df_yearly_us_gaap = extract_yearly_data(df_facts, 'us-gaap')

# Display the new DataFrame
df_yearly_us_gaap

Unnamed: 0,end,val,accn,fy,fp,form,filed,year,label,frame,start
0,2014-12-31,1.715000e+09,0001652044-15-000005,2015,Q3,10-Q,2015-10-29,2014,us-gaap_AccountsPayableCurrent_units_USD,,
1,2014-12-31,1.715000e+09,0001652044-16-000012,2015,FY,10-K,2016-02-11,2014,us-gaap_AccountsPayableCurrent_units_USD,,
2,2014-12-31,1.715000e+09,0001652044-16-000022,2015,FY,8-K,2016-05-03,2014,us-gaap_AccountsPayableCurrent_units_USD,CY2014Q4I,
3,2015-09-30,1.549000e+09,0001652044-15-000005,2015,Q3,10-Q,2015-10-29,2015,us-gaap_AccountsPayableCurrent_units_USD,CY2015Q3I,
4,2015-12-31,1.931000e+09,0001652044-16-000012,2015,FY,10-K,2016-02-11,2015,us-gaap_AccountsPayableCurrent_units_USD,,
...,...,...,...,...,...,...,...,...,...,...,...
16660,2024-06-30,1.249500e+10,0001652044-24-000079,2024,Q2,10-Q,2024-07-24,2024,us-gaap_WeightedAverageNumberOfDilutedSharesOu...,CY2024Q2,2024-04-01
16661,2023-06-30,1.272500e+10,0001652044-24-000079,2024,Q2,10-Q,2024-07-24,2023,us-gaap_WeightedAverageNumberOfSharesOutstandi...,,2023-01-01
16662,2023-06-30,1.266800e+10,0001652044-24-000079,2024,Q2,10-Q,2024-07-24,2023,us-gaap_WeightedAverageNumberOfSharesOutstandi...,CY2023Q2,2023-04-01
16663,2024-06-30,1.237900e+10,0001652044-24-000079,2024,Q2,10-Q,2024-07-24,2024,us-gaap_WeightedAverageNumberOfSharesOutstandi...,,2024-01-01


In [15]:
# Keep only facts reported on 10-K forms. The same 'end' date and value can appear
# more than once because it is reported again under a later accession number.
df_10k = df_yearly_us_gaap[df_yearly_us_gaap['form'] == '10-K']
df_10k

Unnamed: 0,end,val,accn,fy,fp,form,filed,year,label,frame,start
1,2014-12-31,1.715000e+09,0001652044-16-000012,2015,FY,10-K,2016-02-11,2014,us-gaap_AccountsPayableCurrent_units_USD,,
4,2015-12-31,1.931000e+09,0001652044-16-000012,2015,FY,10-K,2016-02-11,2015,us-gaap_AccountsPayableCurrent_units_USD,,
9,2015-12-31,1.931000e+09,0001652044-17-000008,2016,FY,10-K,2017-02-03,2015,us-gaap_AccountsPayableCurrent_units_USD,CY2015Q4I,
13,2016-12-31,2.041000e+09,0001652044-17-000008,2016,FY,10-K,2017-02-03,2016,us-gaap_AccountsPayableCurrent_units_USD,,
17,2016-12-31,2.041000e+09,0001652044-18-000007,2017,FY,10-K,2018-02-06,2016,us-gaap_AccountsPayableCurrent_units_USD,CY2016Q4I,
...,...,...,...,...,...,...,...,...,...,...,...
16618,2022-12-31,1.347500e+10,0001652044-24-000022,2023,FY,10-K,2024-01-31,2022,us-gaap_Depreciation_units_USD,CY2022,2022-01-01
16621,2023-12-31,1.194600e+10,0001652044-24-000022,2023,FY,10-K,2024-01-31,2023,us-gaap_Depreciation_units_USD,CY2023,2023-01-01
16624,2021-12-31,2.500000e-02,0001652044-24-000022,2023,FY,10-K,2024-01-31,2021,us-gaap_EffectiveIncomeTaxRateReconciliationFd...,CY2021,2021-01-01
16625,2022-12-31,5.400000e-02,0001652044-24-000022,2023,FY,10-K,2024-01-31,2022,us-gaap_EffectiveIncomeTaxRateReconciliationFd...,CY2022,2022-01-01


In [16]:
# Group the 10-K filings by year.
# 'year' is the first four characters of each fact's 'end' date (set in
# extract_yearly_data), not the 'fy' column.
df_10k_yearly = df_10k.groupby('year')

# Display the grouped DataFrame, one plain-text table per year
for year, group in df_10k_yearly:
    print(f"Year: {year}")
    print(group)
    print("\n")

Year: 2012
              end           val                  accn    fy  fp  form  \
1481   2012-12-31  1.477800e+10  0001652044-16-000012  2015  FY  10-K   
15759  2012-12-31  7.157000e+10  0001652044-16-000012  2015  FY  10-K   
16092  2012-12-31  1.907000e+09  0001652044-16-000012  2015  FY  10-K   

            filed  year                                              label  \
1481   2016-02-11  2012  us-gaap_CashAndCashEquivalentsAtCarryingValue_...   
15759  2016-02-11  2012               us-gaap_StockholdersEquity_units_USD   
16092  2016-02-11  2012          us-gaap_UnrecognizedTaxBenefits_units_USD   

      frame start  
1481    NaN   NaN  
15759   NaN   NaN  
16092   NaN   NaN  


Year: 2013
              end           val                  accn    fy  fp  form  \
473    2013-12-31  3.343000e+09  0001652044-16-000012  2015  FY  10-K   
560    2013-12-31  4.490000e+08  0001652044-16-000012  2015  FY  10-K   
568    2013-12-31  3.343000e+09  0001652044-16-000012  2015  FY  10-K  