In [4]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import json

# Define the URL for the SEC EDGAR search page
url = "https://www.sec.gov/files/company_tickers_exchange.json"

headers = {
   "User-Agent": "your.email@email.com"
}
# Fetch the HTML content of the search page
response = requests.get(url,headers=headers)
html_content = response.content

# Parse the HTML content using BeautifulSoup
soup = BeautifulSoup(html_content, 'html.parser')

json_url = f"https://data.sec.gov/api/xbrl/companyconcept/CIK0001652044/us-gaap/AccountsPayableCurrent.json"
json_response = requests.get(json_url,headers=headers)
json_content = json_response.json()
print(json_content)
print(soup.prettify())

{'cik': 1652044, 'taxonomy': 'us-gaap', 'tag': 'AccountsPayableCurrent', 'label': 'Accounts Payable, Current', 'description': "Carrying value as of the balance sheet date of liabilities incurred (and for which invoices have typically been received) and payable to vendors for goods and services received that are used in an entity's business. Used to reflect the current portion of the liabilities (due within one year or within the normal operating cycle if longer).", 'entityName': 'Alphabet Inc.', 'units': {'USD': [{'end': '2014-12-31', 'val': 1715000000, 'accn': '0001652044-15-000005', 'fy': 2015, 'fp': 'Q3', 'form': '10-Q', 'filed': '2015-10-29'}, {'end': '2014-12-31', 'val': 1715000000, 'accn': '0001652044-16-000012', 'fy': 2015, 'fp': 'FY', 'form': '10-K', 'filed': '2016-02-11'}, {'end': '2014-12-31', 'val': 1715000000, 'accn': '0001652044-16-000022', 'fy': 2015, 'fp': 'FY', 'form': '8-K', 'filed': '2016-05-03', 'frame': 'CY2014Q4I'}, {'end': '2015-09-30', 'val': 1549000000, 'accn': 

In [5]:


# Parse the JSON content from the BeautifulSoup object
json_data = json.loads(soup.text)

# Convert the JSON data into a pandas DataFrame
df_cik_lookup = pd.DataFrame(json_data['data'], columns=json_data['fields'])


# df_cik_lookup['cik']=df_cik_lookup['cik'].astype(str).zfill(10)
df_cik_lookup

Unnamed: 0,cik,name,ticker,exchange
0,320193,Apple Inc.,AAPL,Nasdaq
1,1045810,NVIDIA CORP,NVDA,Nasdaq
2,789019,MICROSOFT CORP,MSFT,Nasdaq
3,1018724,AMAZON COM INC,AMZN,Nasdaq
4,1652044,Alphabet Inc.,GOOGL,Nasdaq
...,...,...,...,...
9941,2031561,Aldel Financial II Inc.,ALDFU,Nasdaq
9942,2032379,Willow Lane Acquisition Corp.,WLACU,Nasdaq
9943,2032950,Horizon Space Acquisition II Corp.,HSPTU,Nasdaq
9944,2039058,Jackson Acquisition Co II,JACS-UN,NYSE


In [None]:
json_data_list = []

# Define headers for the request
headers = {
    'User-Agent': 'Your Name (your_email@example.com)',
    'Accept-Encoding': 'gzip, deflate',
    'Host': 'data.sec.gov'
}

# Loop through the first 100 CIKs
for i in range(100):
    cik = df_cik_lookup['cik'][i]
    cik=str(cik).zfill(10)
    print(cik)
    json_url = f"https://data.sec.gov/api/xbrl/companyconcept/CIK{cik}/us-gaap/AccountsPayableCurrent.json"
    json_response = requests.get(json_url, headers=headers)
    
    # Check if the request was successful
    if json_response.status_code == 200:
        json_content = json_response.json()
        json_data_list.append(json_content)
    else:
        print(f"Failed to fetch data for CIK: {cik}")

# Convert the list of JSON content into a DataFrame
df_json_content = pd.DataFrame(json_data_list)
df_json_content.to_csv('data.csv', index=False)


In [None]:
df_json_content

In [5]:
df_units=pd.json_normalize(df_json_content['units'])

In [None]:
df_units

In [None]:
# Define a function to extract the 'val' field from the JSON data
def extract_val(json_data):
    normalized_data = pd.json_normalize(json_data)
    filtered_data = normalized_data[normalized_data['form'] == '10-K']
    return filtered_data['val'].tolist()

def extract_filled(json_data):
    normalized_data = pd.json_normalize(json_data)
    filtered_data = normalized_data[normalized_data['form'] == '10-K']
    return filtered_data['end'].tolist()

# Apply the function to the 'USD' column and create a new column 'value'
df_json_content['value'] = df_units['USD'].apply(lambda x: extract_val(x) if isinstance(x, list) else [])
df_json_content['dates_filed'] = df_units['USD'].apply(lambda x: extract_filled(x) if isinstance(x, list) else [])

# Display the updated DataFrame
df_json_content


In [None]:

import matplotlib.pyplot as plt

# Plot graphs of value for the first 5 companies
for i in range(5):
    company_name = df_json_content['entityName'][i]
    values = df_json_content['value'][i]
    time = df_json_content['dates_filed'][i]
    
    plt.figure(figsize=(10, 5))
    plt.plot(time, values, marker='o')
    plt.title(f'Value Over Time for {company_name}')
    plt.xlabel('Time')
    plt.ylabel('Value')
    plt.xticks(rotation=45)  # Rotate x-axis labels for better readability
    plt.ylim(min(values) * 0.9, max(values) * 1.1)  # Set y-axis limits with some padding
    plt.grid(True)
    plt.show()

In [None]:
json_data_list = []

# Define headers for the request
headers = {
    'User-Agent': 'Your Name (your_email@example.com)',
    'Accept-Encoding': 'gzip, deflate',
    'Host': 'data.sec.gov'
}

# Loop through the first 100 CIKs
for i in range(100):
    cik = df_cik_lookup['cik'][i]
    cik=str(cik).zfill(10)
    print(cik)
    json_url = f"https://data.sec.gov/api/xbrl/companyfacts/CIK{cik}.json"
    json_response = requests.get(json_url, headers=headers)
    
    # Check if the request was successful
    if json_response.status_code == 200:
        json_content = json_response.json()
        json_data_list.append(json_content)
    else:
        print(f"Failed to fetch data for CIK: {cik}")

# Convert the list of JSON content into a DataFrame
df_json_companyfacts = pd.DataFrame(json_data_list)
df_json_companyfacts.to_csv('data2.csv', index=False)

In [None]:
df_json_companyfacts

In [19]:
df_facts=pd.json_normalize(df_json_companyfacts['facts'])

In [None]:
df_facts

In [None]:
df_facts.columns

In [None]:
url="https://data.sec.gov/submissions/CIK0000822663.json"
# Fetch the JSON content from the URL
response = requests.get(url,headers=headers)
json_content = response.json()

# Convert JSON content to a string
json_str = json.dumps(json_content)

# Parse the string using BeautifulSoup
soup = BeautifulSoup(json_str, 'html.parser')

# Print the parsed content
print(soup.prettify())


In [None]:
print(json_content.keys())

In [None]:
print(json_content['fiscalYearEnd'])

In [1]:
import mongodb as mdb

mdb.get_mongodb_client()



MongoClient(host=['cluster0-shard-00-01.rrvyc.mongodb.net:27017', 'cluster0-shard-00-00.rrvyc.mongodb.net:27017', 'cluster0-shard-00-02.rrvyc.mongodb.net:27017'], document_class=dict, tz_aware=False, connect=False, authsource='admin', replicaset='atlas-95o7b3-shard-0', tls=True)

In [2]:
mdb.download_cik_ticker_map()

JSONDecodeError: Expecting value: line 5 column 1 (char 4)

In [2]:
tiker = "GOOGL"
cik = mdb.cik_from_ticker(tiker)
cik

'0001652044'

In [5]:
mdb.download_submissions_documents(cik,"10-K",5)

form
form
2023-02-03 (10-K): https://www.sec.gov/Archives/edgar/data/1652044/000165204423000016/goog-20221231.htm
form


In [4]:
mdb.download_all_cik_submissions(cik)

In [5]:
mdb.download_submissions_documents(cik,("10-k"),5)

In [2]:
import pandas as pd

In [3]:
df=pd.read_excel("transposed_df_v2 (3).xlsx")

In [4]:
df

Unnamed: 0,cik,entityName,Fiscal Year,Sector,Industry,Assets,CostsAndExpenses,GrossProfit,IncomeTaxExpenseBenefit,Liabilities,LiabilitiesAndStockholdersEquity,NetIncomeLoss,OperatingExpenses,ProfitLoss,PropertyPlantAndEquipmentGross,ResearchAndDevelopmentExpense,StockholdersEquity
0,4457,U-Haul Holding Company,2011,Consumer Discretionary,Rental/Leasing Companies,4191433000,1.808468e+09,1.418316e+09,34567000.0,3.198413e+09,4191433000,65623000.0,1.022061e+09,5.025663e+08,3.435980e+09,2.130717e+08,9.930200e+08
1,4457,U-Haul Holding Company,2013,Consumer Discretionary,Rental/Leasing Companies,4191433000,1.873664e+09,1.418316e+09,105739000.0,3.618231e+09,4654051000,183575000.0,1.026577e+09,5.025663e+08,3.787822e+09,2.130717e+08,1.035820e+09
2,4457,U-Haul Holding Company,2014,Consumer Discretionary,Rental/Leasing Companies,5306601000,2.059404e+09,1.418316e+09,143779000.0,4.471610e+09,5998978000,264708000.0,1.193934e+09,5.025663e+08,5.141717e+09,2.130717e+08,1.527368e+09
3,4457,U-Haul Holding Company,2015,Consumer Discretionary,Rental/Leasing Companies,5998978000,2.205038e+09,1.418316e+09,195131000.0,4.987816e+09,6872175000,342391000.0,1.313674e+09,5.025663e+08,6.047493e+09,2.130717e+08,1.884359e+09
4,4457,U-Haul Holding Company,2016,Consumer Discretionary,Rental/Leasing Companies,6855600000,2.411507e+09,1.418316e+09,204677000.0,5.857882e+09,8109288000,356741000.0,1.479409e+09,5.025663e+08,7.151244e+09,2.130717e+08,2.251406e+09
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4681,1955520,Knife River Corporation,2023,Industrials,Mining & Quarrying of Nonmetallic Minerals (No...,2181824000,5.577896e+09,3.469490e+08,43459000.0,1.265730e+09,2294319000,129755000.0,7.829959e+08,2.260820e+08,4.367430e+09,5.960000e+05,8.782440e+08
4682,1957132,"SHARKNINJA, INC.",2023,Consumer Discretionary,Consumer Electronics/Appliances,3294891000,1.540417e+10,1.438184e+09,83213000.0,1.466602e+09,3294891000,331113000.0,9.999270e+08,5.025663e+08,3.442930e+09,2.006410e+08,1.457785e+09
4683,1967680,VERALTO CORPORATION,2023,Industrials,Electrical Products,4840000000,5.577896e+09,2.713000e+09,186000000.0,1.309400e+09,4825000000,861000000.0,6.382667e+08,-4.414758e+07,7.070000e+08,2.440000e+08,3.235000e+09
4684,1973266,"TKO GROUP HOLDINGS, INC.",2023,Consumer Discretionary,Services-Misc. Amusement & Recreation,3580730000,3.591167e+09,1.418316e+09,15769000.0,3.001906e+09,3580730000,-35227000.0,6.408070e+08,2.736250e+08,2.337480e+08,2.339243e+08,5.689160e+08


In [4]:
df['cik'] = df['cik'].apply(lambda x: mdb.add_trailing_to_cik(x))

In [6]:
cik=df['cik'].iloc[0]

In [7]:
cik

'0000004457'

In [8]:
for cik in df['cik']:
    mdb.download_submissions_documents(cik,("10-K"),5)

form
2024-05-30 (10-K): https://www.sec.gov/Archives/edgar/data/4457/000095017024066736/uhal-20240331.htm
form
2023-06-02 (10-K): https://www.sec.gov/Archives/edgar/data/4457/000000445723000052/uhal-20230331.htm
form
2022-05-25 (10-K): https://www.sec.gov/Archives/edgar/data/4457/000000445722000041/uhal-20220331.htm
form
2021-05-26 (10-K): https://www.sec.gov/Archives/edgar/data/4457/000000445721000040/uhal-20210331.htm
form
2020-05-27 (10-K): https://www.sec.gov/Archives/edgar/data/4457/000000445720000053/uhal-20200331.htm
form
2019-05-29 (10-K): https://www.sec.gov/Archives/edgar/data/4457/000000445719000024/march201910k.htm
form
form
form
form
form
form
form
form
form
form
form
form
form
form
form
form
form
form
form
form
form
form
form
form
form
form
form
form
form
form
form
form
form
form
form
form
form
form
form
form
form
form
form
form
form
form
form
form
form
form
form
form
form
form
form
form
form
form
form
form
form
form
form
form
form
form
form
2024-02-20 (10-K): https://www

In [None]:
mdb.download_submissions_documents()

In [9]:
cik

'0001652044'

In [12]:
filing_url = 'https://www.sec.gov/Archives/edgar/data/1652044/000165204423000016/goog-20221231.htm'
doc = mdb.get_collection("submissions").find({"_id":filing_url }).next()
doc

StopIteration: 

In [18]:
import requests

url='https://www.sec.gov/Archives/edgar/data/1652044/000165204423000016/goog-20221231.htm'
headers={
    "User-Agent": "your.email@email.com"
}
response= requests.get(url,headers)
r = response.text

In [1]:
import mongodb as mdb
import datetime

url='https://www.sec.gov/Archives/edgar/data/1652044/000165204423000016/goog-20221231.htm'
mdb.download_document(url,cik,'10-K',None,None)

NameError: name 'cik' is not defined

In [3]:
r=response.text

In [4]:
r

'<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">\n<html xmlns="http://www.w3.org/1999/xhtml">\n<head>\n<meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />\n<title>SEC.gov | Request Rate Threshold Exceeded</title>\n<style>\nhtml {height: 100%}\nbody {height: 100%; margin:0; padding:0;}\n#header {background-color:#003968; color:#fff; padding:15px 20px 10px 20px;font-family:Arial, Helvetica, sans-serif; font-size:20px; border-bottom:solid 5px #000;}\n#footer {background-color:#003968; color:#fff; padding:15px 20px;font-family:Arial, Helvetica, sans-serif; font-size:20px;}\n#content {max-width:650px;margin:60px auto; padding:0 20px 100px 20px; background-image:url(seal_bw.png);background-repeat:no-repeat;background-position:50% 100%;}\nh1 {font-family:Georgia, Times, serif; font-size:20px;}\nh2 {text-align:center; font-family:Georgia, Times, serif; font-size:20px; width:100%; border-bottom:solid #99

In [17]:
response

<Response [200]>

In [1]:
import mongodb as mdb

mdb.get_mongodb_client()

MongoClient(host=['cluster0-shard-00-02.rrvyc.mongodb.net:27017', 'cluster0-shard-00-01.rrvyc.mongodb.net:27017', 'cluster0-shard-00-00.rrvyc.mongodb.net:27017'], document_class=dict, tz_aware=False, connect=False, authsource='admin', replicaset='atlas-95o7b3-shard-0', tls=True)

In [2]:
filing_url = 'https://www.sec.gov/Archives/edgar/data/1652044/000165204423000016/goog-20221231.htm'
doc = mdb.get_collection("documents").find({"_id":filing_url }).next()
doc

{'_id': 'https://www.sec.gov/Archives/edgar/data/1652044/000165204423000016/goog-20221231.htm',
 'html': '<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">\n<html xmlns="http://www.w3.org/1999/xhtml">\n<head>\n<meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />\n<title>SEC.gov | Request Rate Threshold Exceeded</title>\n<style>\nhtml {height: 100%}\nbody {height: 100%; margin:0; padding:0;}\n#header {background-color:#003968; color:#fff; padding:15px 20px 10px 20px;font-family:Arial, Helvetica, sans-serif; font-size:20px; border-bottom:solid 5px #000;}\n#footer {background-color:#003968; color:#fff; padding:15px 20px;font-family:Arial, Helvetica, sans-serif; font-size:20px;}\n#content {max-width:650px;margin:60px auto; padding:0 20px 100px 20px; background-image:url(seal_bw.png);background-repeat:no-repeat;background-position:50% 100%;}\nh1 {font-family:Georgia, Times, serif; font-size:20px;}\nh2 {

In [4]:
r=mdb.make_edgar_request(filing_url)

In [5]:
r.text

'<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">\n<html xmlns="http://www.w3.org/1999/xhtml">\n<head>\n<meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />\n<title>SEC.gov | Request Rate Threshold Exceeded</title>\n<style>\nhtml {height: 100%}\nbody {height: 100%; margin:0; padding:0;}\n#header {background-color:#003968; color:#fff; padding:15px 20px 10px 20px;font-family:Arial, Helvetica, sans-serif; font-size:20px; border-bottom:solid 5px #000;}\n#footer {background-color:#003968; color:#fff; padding:15px 20px;font-family:Arial, Helvetica, sans-serif; font-size:20px;}\n#content {max-width:650px;margin:60px auto; padding:0 20px 100px 20px; background-image:url(seal_bw.png);background-repeat:no-repeat;background-position:50% 100%;}\nh1 {font-family:Georgia, Times, serif; font-size:20px;}\nh2 {text-align:center; font-family:Georgia, Times, serif; font-size:20px; width:100%; border-bottom:solid #99

In [1]:
import mongodb as mdb
import requests
filing_url='https://www.sec.gov/Archives/edgar/data/1652044/000165204423000016/goog-20221231.htm'
headers={
    "User-Agent": "radnom@ten.edu",
    "Accept-Encoding": "gzip, deflate"
    }
r=requests.get(filing_url,headers=headers)

In [3]:
import mongodb as mdb
filing_url='https://www.sec.gov/Archives/edgar/data/1652044/000165204423000016/goog-20221231.htm'

r=mdb.make_edgar_request(filing_url)

In [4]:
r.text

'<?xml version="1.0" ?><!--XBRL Document Created with Wdesk from Workiva--><!--Copyright 2023 Workiva--><!--r:94db13ab-d0fb-433a-a7d1-96ca74a2a87d,g:b8c6572a-40d9-4f2f-b3df-2328dc788b5b,d:a96e4fb0476549c99dc3a2b2368f643f--><html xmlns:country="http://xbrl.sec.gov/country/2022" xmlns:iso4217="http://www.xbrl.org/2003/iso4217" xmlns="http://www.w3.org/1999/xhtml" xmlns:ixt="http://www.xbrl.org/inlineXBRL/transformation/2020-02-12" xmlns:goog="http://www.google.com/20221231" xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xmlns:dei="http://xbrl.sec.gov/dei/2022" xmlns:ix="http://www.xbrl.org/2013/inlineXBRL" xmlns:srt="http://fasb.org/srt/2022" xmlns:xbrli="http://www.xbrl.org/2003/instance" xmlns:ixt-sec="http://www.sec.gov/inlineXBRL/transformation/2015-08-31" xmlns:us-gaap="http://fasb.org/us-gaap/2022" xmlns:link="http://www.xbrl.org/2003/linkbase" xmlns:xbrldi="http://xbrl.org/2006/xbrldi" xml:lang="en-US"><head><meta http-equiv="Conte

In [2]:
import mongodb as mdb
ticker="GOOGL"
cik = mdb.cik_from_ticker(ticker)
cik

'0001652044'

In [3]:
filing_url='https://www.sec.gov/Archives/edgar/data/1652044/000165204423000016/goog-20221231.htm'
mdb.download_document(filing_url,cik,'10-K',None,None)

In [1]:
import mongodb as mdb

mdb.get_mongodb_client()

MongoClient(host=['cluster0-shard-00-01.rrvyc.mongodb.net:27017', 'cluster0-shard-00-00.rrvyc.mongodb.net:27017', 'cluster0-shard-00-02.rrvyc.mongodb.net:27017'], document_class=dict, tz_aware=False, connect=False, authsource='admin', replicaset='atlas-95o7b3-shard-0', tls=True)

In [1]:
import pandas as pd
import mongodb as mdb

df=pd.read_excel("transposed_df_v2 (3).xlsx")
df['cik'] = df['cik'].apply(lambda x: mdb.add_trailing_to_cik(x))

In [2]:
for cik in df['cik'][:50]:
    mdb.download_submissions_documents(cik, ("10-K"), 5)

form
form
form
form
form
form
form
form
form
form
form
form
form
form
form
form
form
form
form
form
form
form
form
form
form
form
form
form
form
form
form
form
form
form
form
form
form
form
form
form
form
form
form
form
form
form
form
form
form
form
form
form
form
form
form
form
form
form
form
form
form
form
form
form
form
form
form
form
form
form
form
form
form
2024-02-20 (10-K): https://www.sec.gov/Archives/edgar/data/5513/000000551324000015/unm-20231231.htm
form
2023-02-23 (10-K): https://www.sec.gov/Archives/edgar/data/5513/000000551323000034/unm-20221231.htm
form
2022-02-25 (10-K): https://www.sec.gov/Archives/edgar/data/5513/000000551322000030/unm-20211231.htm
form
2021-02-17 (10-K): https://www.sec.gov/Archives/edgar/data/5513/000000551321000015/unm-20201231.htm
form
2020-02-18 (10-K): https://www.sec.gov/Archives/edgar/data/5513/000000551320000027/unm-20191231.htm
form
2019-02-19 (10-K): https://www.sec.gov/Archives/edgar/data/5513/000000551319000004/unm12312018-10xk.htm
form
f

In [3]:
import mongodb as mdb

for doc in mdb.get_collection("documents").find():
    mdb.parse_document(doc)

form type: 		10-K
cik                     0000004457
name        U-Haul Holding Co /NV/
ticker                        UHAL
exchange                      NYSE
Name: 829, dtype: object
form type: 		10-K
cik                     0000004457
name        U-Haul Holding Co /NV/
ticker                        UHAL
exchange                      NYSE
Name: 829, dtype: object
form type: 		10-K
cik                     0000004457
name        U-Haul Holding Co /NV/
ticker                        UHAL
exchange                      NYSE
Name: 829, dtype: object
form type: 		10-K
cik                     0000004457
name        U-Haul Holding Co /NV/
ticker                        UHAL
exchange                      NYSE
Name: 829, dtype: object
form type: 		10-K
cik                     0000004457
name        U-Haul Holding Co /NV/
ticker                        UHAL
exchange                      NYSE
Name: 829, dtype: object
form type: 		10-K
cik                     0000004457
name        U-Haul Holding Co /N

In [None]:
import analyzer as an
for parsed_doc in mdb.get_collection("parsed_documents").find():
    an.sections_summary(parsed_doc)

business original_len: 25020 use gpt-3.5-turbo-16k w/ chain refine


  llm = ChatOpenAI(model_name=model, openai_api_key=parser.get("open_ai", "api_key"))
  res = chain.run(docs)


business original_len: 25020 summary_len: 1747 reduction: 93.02% cost: 0.021$ duration:8.0s used gpt-3.5-turbo-16k w/ chain refine
risk original_len: 82337 use gpt-3.5-turbo-16k w/ chain refine
risk original_len: 82337 summary_len: 2437 reduction: 97.04% cost: 0.07$ duration:16.4s used gpt-3.5-turbo-16k w/ chain refine
property original_len: 328 use gpt-3.5-turbo w/ chain refine
property original_len: 328 summary_len: 207 reduction: 36.89% cost: 0.0002$ duration:1.0s used gpt-3.5-turbo w/ chain refine
legal original_len: 272 use gpt-3.5-turbo w/ chain refine
legal original_len: 272 summary_len: 137 reduction: 49.63% cost: 0.0002$ duration:0.9s used gpt-3.5-turbo w/ chain refine
other original_len: 493 use gpt-3.5-turbo w/ chain refine
other original_len: 493 summary_len: 318 reduction: 35.5% cost: 0.0003$ duration:1.1s used gpt-3.5-turbo w/ chain refine

Total Cost: 0.09170000000000002$, Total duration: 27.5s
business original_len: 27877 use gpt-3.5-turbo-16k w/ chain refine
business o

In [1]:
import mongodb as mdb
filing_url = 'https://www.sec.gov/Archives/edgar/data/1652044/000165204423000016/goog-20221231.htm'

doc = mdb.get_collection("documents").find({"_id":filing_url}).next()

In [2]:
mdb.parse_document(doc)

form type: 		10-K
cik            0001652044
name        Alphabet Inc.
ticker              GOOGL
exchange           Nasdaq
Name: 4, dtype: object


In [3]:
parsed_doc = mdb.get_collection("parsed_documents").find({"_id":filing_url}).next()

In [4]:
import analyzer as an

restructured_doc=an.restructure_parsed_10k(parsed_doc)

In [5]:
section_text = restructured_doc["business"]["text"]
section_text

'ITEM 1. BUSINESS Overview As our founders Larry and Sergey wrote in the original founders\' letter, "Google is not a conventional company. We do not intend to become one." That unconventional spirit has been a driving force throughout our history, inspiring us to tackle big problems and invest in moonshots, such as our long-term opportunities in artificial intelligence (AI). We continue this work under the leadership of Alphabet and Google CEO Sundar Pichai. Alphabet is a collection of businesses -- the largest of which is Google. We report Google in two segments, Google Services and Google Cloud; we also report all non-Google businesses collectively as Other Bets. Alphabet\'s structure is about helping each of our businesses prosper through strong leaders and independence. Access and technology for everyone The Internet is one of the world\'s most powerful equalizers; it propels ideas, people and businesses large and small. Our mission to organize the world\'s information and make it

In [6]:
len(section_text)

25020

In [7]:
chain_type = "refine"
model = "gpt-4o-mini"
verbose = True

# get summary from openAI model
print(f"business original_len: {len(section_text)} use {model} w/ chain {chain_type}")
summary, cost = an.summarize_section(section_text, model, chain_type, verbose)

business original_len: 25020 use gpt-4o-mini w/ chain refine


  llm = ChatOpenAI(model_name=model, openai_api_key=parser.get("open_ai", "api_key"))
  res = chain.run(docs)




[1m> Entering new RefineDocumentsChain chain...[0m


[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3mWrite a concise summary of the following:


"ITEM 1. BUSINESS Overview As our founders Larry and Sergey wrote in the original founders' letter, "Google is not a conventional company. We do not intend to become one." That unconventional spirit has been a driving force throughout our history, inspiring us to tackle big problems and invest in moonshots, such as our long-term opportunities in artificial intelligence (AI). We continue this work under the leadership of Alphabet and Google CEO Sundar Pichai. Alphabet is a collection of businesses -- the largest of which is Google. We report Google in two segments, Google Services and Google Cloud; we also report all non-Google businesses collectively as Other Bets. Alphabet's structure is about helping each of our businesses prosper through strong leaders and independence. Access and technology for everyone T

In [8]:
print(f"BULLET POINTS")
for el in summary:
    print(el)
print(f"cost: {cost} in USD")

BULLET POINTS
Alphabet Inc., led by CEO Sundar Pichai, remains committed to its founding mission of organizing the world's information and making it universally accessible
The company operates primarily through Google, which is divided into two main segments: Google Services and Google Cloud, along with a collection of other businesses termed "Other Bets." Alphabet emphasizes innovation, particularly in artificial intelligence (AI), which enhances core products like Google Search and YouTube and is integral to new solutions in Google Cloud
The company prioritizes privacy and security, continuously improving user control over their data.

As of December 31, 2022, Alphabet had 190,234 employees and supports protected labor rights through work councils and employee representation in certain countries
The company fosters a healthy and open dialogue with employees, utilizing multiple internal communication channels
Alphabet also contracts with specialized service providers for areas requiri

In [10]:
an.sections_summary(parsed_doc)

business original_len: 25020 use gpt-3.5-turbo-16k w/ chain refine
business original_len: 25020 summary_len: 1681 reduction: 93.28% cost: 0.0209$ duration:4.8s used gpt-3.5-turbo-16k w/ chain refine
risk original_len: 82337 use gpt-3.5-turbo-16k w/ chain refine
risk original_len: 82337 summary_len: 3234 reduction: 96.07% cost: 0.0716$ duration:18.7s used gpt-3.5-turbo-16k w/ chain refine
property original_len: 328 use gpt-3.5-turbo w/ chain refine
property original_len: 328 summary_len: 214 reduction: 34.760000000000005% cost: 0.0002$ duration:0.9s used gpt-3.5-turbo w/ chain refine
legal original_len: 272 use gpt-3.5-turbo w/ chain refine
legal original_len: 272 summary_len: 137 reduction: 49.63% cost: 0.0002$ duration:0.9s used gpt-3.5-turbo w/ chain refine
other original_len: 493 use gpt-3.5-turbo w/ chain refine
other original_len: 493 summary_len: 318 reduction: 35.5% cost: 0.0003$ duration:1.0s used gpt-3.5-turbo w/ chain refine

Total Cost: 0.0932$, Total duration: 26.6s


In [1]:
import mongodb as mdb
for cik in df['cik'][:50]:
    mdb.download_submissions_documents(cik, ("10-K"), 5)

NameError: name 'df' is not defined