In [21]:
import re
import time

import pandas as pd
from edgar import set_identity
from edgar import *

In [22]:
# Register the requester identity with the edgar library before any filings
# are fetched; this must be set to comply with SEC regulations.
# NOTE(review): the name/email is hardcoded here — consider reading it from an
# environment variable so it is not committed with the notebook.

set_identity("Neal Lockhart nhl3388@uncw.edu")

In [36]:
# Ticker symbols whose latest 10-K filings are fetched and chunked below.
companies = ['HUBS', 'SNOW', 'CRWD', 'ZS', 'NET', 'AMD', 'NVDA', 'BASE', 'NOW', 'S']

### Risks

In [23]:
# Worked example: fetch the most recent 10-K filing for a single ticker (HUBS).
tenk = Company("HUBS").get_filings(form="10-K").latest(1)

In [24]:
# Split the filing into a list of text chunks (see the output of the next cell).
sections = tenk.sections()

In [25]:
# Display the raw chunk list for inspection.
sections

['10-K',
 'UNITED STATES',
 'SECURITIES AND EXCHANGE COMMISSION\n Washington, D.C. 20549',
 'FORM 10-K',
 '(MARK ONE)',
 '☒   ANNUAL REPORT PURSUANT TO SECTION 13 OR 15(d) OF THE SECURITIES EXCHANGE ACT OF 1934',
 'FOR THE FISCAL YEAR ENDED DECEMBER 31, 2023',
 'OR',
 '☐   TRANSITION REPORT PURSUANT TO SECTION 13 OR 15(d) OF THE SECURITIES EXCHANGE ACT OF 1934',
 'FOR THE TRANSITION PERIOD FROM TO',
 'Commission File Number 001-36680\n \nHubSpot, Inc.\n (Exact name of registrant as specified in its charter)',
 'Delaware                                                        20-2632791                         \n(State or other jurisdiction ofincorporation or organization)   (I.R.S. EmployerIdentification No.)',
 'Two Canal Park',
 'Cambridge, Massachusetts, 02141\n (Address of principal executive offices)\n (888) 482-7768\n (Registrant’s telephone number, including area code)\n Securities registered pursuant to Section 12(b) of the Act:',
 'Title of each class                        Tra

In [5]:
def get_risk_section(companies: list):
    """Collect the "Item 1A. Risk Factors" chunks from each company's latest 10-K.

    For every ticker, the latest 10-K is fetched and split into text chunks
    with ``sections()``. The chunks are scanned in order: every chunk that
    matches the "item 1a ... risk factors" heading overwrites the start index
    (so the last match before the end marker wins), and the first chunk that
    matches "item 1b ... unresolved" (without also matching the start heading)
    ends the scan. The chunks in ``[start, end)`` are lower-cased, stripped of
    newlines, bullet characters and "table of contents" markers, and have runs
    of whitespace collapsed to a single space.

    Parameters
    ----------
    companies : list
        Ticker symbols passed to ``Company``.

    Returns
    -------
    pandas.DataFrame
        One row per cleaned chunk, with columns ``ticker`` and
        ``chunked_risk``. Companies for which either boundary heading is not
        found are skipped after printing a warning.
    """
    # `\s+` after "item" (rather than a literal space) lets headings written
    # with a non-breaking space or several spaces match as well. The unescaped
    # `.` is kept from the original pattern and matches any single character.
    start_pattern = re.compile(r"item\s+1a.\s*risk factors")
    end_pattern = re.compile(r"item\s+1b.\s*unresolved")

    data = {
        "ticker": [],
        "chunked_risk": []
    }

    for company in companies:
        # Short pause before each lookup (presumably to stay polite to the
        # SEC servers — confirm against the rate limits you need to respect).
        time.sleep(.1)
        tenk = Company(company).get_filings(form="10-K").latest(1)
        raw_chunked_tenk = tenk.sections()
        index_start = None
        index_end = None

        for i, chunk in enumerate(raw_chunked_tenk):
            lowered = chunk.lower()
            # The start check comes first on purpose: a chunk containing both
            # headings counts as a start, not an end.
            if start_pattern.search(lowered):
                index_start = i
            elif end_pattern.search(lowered):
                index_end = i
                break

        if index_start is None or index_end is None:
            print("Warning: Could not find risk section for", company)
            continue

        for chunk in raw_chunked_tenk[index_start:index_end]:
            text = chunk.lower()
            text = text.replace("\n", " ")
            text = text.replace("•", " ")
            text = text.replace("table of contents", "")
            # Collapse the whitespace runs left behind by the replacements,
            # matching what get_mgmt_discussion_section produces.
            text = " ".join(text.split())

            data['ticker'].append(company)
            data['chunked_risk'].append(text)

    return pd.DataFrame(data)

### Management's Discussion

In [40]:
def get_mgmt_discussion_section(companies: list):
    """Collect the "Item 7. Management's Discussion" chunks from each latest 10-K.

    For every ticker, the latest 10-K is fetched and split into text chunks
    with ``sections()``. Scanning in order, each chunk matching the
    "item 7 ... management" heading overwrites the start index (the last match
    before the end marker wins); the first chunk that matches
    "item 7a ... qualitative/quantitative" without matching the start heading
    ends the scan. Chunks in ``[start, end)`` are lower-cased, stripped of
    newlines, bullet characters and "table of contents" markers, and have
    whitespace runs collapsed to single spaces.

    Parameters
    ----------
    companies : list
        Ticker symbols passed to ``Company``.

    Returns
    -------
    pandas.DataFrame
        One row per cleaned chunk, with columns ``ticker`` and
        ``chunked_mgmt_discussion``. Companies for which either boundary
        heading is not found are skipped after printing a warning.
    """
    # `\s+` after "item" (instead of a literal space) also accepts headings
    # written with a non-breaking space or repeated spaces. The unescaped `.`
    # is kept from the original pattern and matches any single character.
    heading_start = re.compile(r"item\s+7.\s*management")
    heading_end = re.compile(r"item\s+7a.\s*(qualitative|quantitative)")

    data = {
        "ticker": [],
        "chunked_mgmt_discussion": []
    }

    for company in companies:
        # Short pause before each lookup (presumably to stay polite to the
        # SEC servers — confirm against the rate limits you need to respect).
        time.sleep(.1)
        tenk = Company(company).get_filings(form="10-K").latest(1)
        raw_chunked_tenk = tenk.sections()
        index_start = None
        index_end = None

        for position, chunk in enumerate(raw_chunked_tenk):
            lowered = chunk.lower()
            # Start is tested before end, so a chunk that contains both
            # headings is treated as a start.
            if heading_start.search(lowered):
                index_start = position
            elif heading_end.search(lowered):
                index_end = position
                break

        if index_start is None or index_end is None:
            print("Warning: Could not find management's discussion section for", company)
            continue

        for chunk in raw_chunked_tenk[index_start:index_end]:
            text = chunk.lower()
            text = text.replace("\n", " ")
            text = text.replace("•", " ")
            text = text.replace("table of contents", "")
            # Collapse whitespace runs left behind by the replacements above.
            text = " ".join(text.split())

            data['ticker'].append(company)
            data['chunked_mgmt_discussion'].append(text)

    return pd.DataFrame(data)

In [41]:
# Build the management's-discussion table for every ticker in `companies`.
df = get_mgmt_discussion_section(companies)

In [42]:
# Preview the result: one row per cleaned text chunk, labelled with its ticker.
df

Unnamed: 0,ticker,chunked_mgmt_discussion
0,HUBS,item 7. management’s discussion and analysis o...
1,HUBS,the following discussion and analysis of our f...
2,HUBS,company overview we provide a customer platfor...
3,HUBS,"our engagement hubs include marketing hub, sal..."
4,HUBS,global economic conditions our results of oper...
...,...,...
551,S,operating activities our largest source of ope...
552,S,"our purchase obligations as of january 31, 202..."
553,S,off-balance sheet arrangements we did not have...
554,S,the critical accounting policies requiring est...


In [43]:
# Save the chunks as CSV without the DataFrame index column.
# NOTE(review): the relative path assumes a ../data directory already exists — confirm.
df.to_csv("../data/mgmt_discussion.csv", index=False)