# SEC Comments on Reporting Threshold for Institutional Investment Managers

**SAXA 3**

Mike Johnson | Kris Lederer | Sebastian Martinez | Ryan Mathis | Khushi Patel

In [47]:
# Import libraries
import requests
from bs4 import BeautifulSoup
import re
import time
from datetime import datetime
import os
from urllib.parse import urlparse
import PyPDF2
from io import BytesIO
import pandas as pd

****

## 1.0 Issue a request for the webpage

In [11]:
url = "https://www.sec.gov/comments/s7-08-20/s70820.htm"

# SEC requires you to identify yourself with a proper User-Agent
# Format: "Company/App Name Admin Contact (email@example.com)"
headers = {
    'User-Agent': 'Georgetown University Mike Johnson mrj60@georgetown.edu'  # Replace with your actual info
}

response = requests.get(url, headers=headers)
print(f"HTTP Response Code: {response.status_code}")
print(f"Response Status: {response.reason}")

HTTP Response Code: 200
Response Status: OK


## 2.0 Get all comment URLS

In [None]:
if response.status_code == 200:
    soup = BeautifulSoup(response.content, 'html.parser')
    
    # Find all the links to comments (they're in <a> tags within the table)
    comments = []
    
    # The page structure has dates followed by commenter names with links
    # Look for all <a> tags that have href starting with /comments/
    for link in soup.find_all('a', href=True):
        href = link['href']
        if href.startswith('/comments/s7-08-20/'):
            commenter_name = link.get_text(strip=True)
            full_url = f"https://www.sec.gov{href}"
            
            parent = link.parent
            if parent:
                # Look for text before the link in the same parent
                text_content = parent.get_text()
                
            comments.append({
                'commenter': commenter_name,
                'url': full_url
            })
    
    # Create a DataFrame for easy viewing
    df = pd.DataFrame(comments)    
    
else:
    print(f"Error: {response.status_code}")

In [22]:
# Remove first two rows. These are not comments.
df = df.loc[2:, ]

# Remove memorandums. These are not comments.
df = df[~df['commenter'].str.contains('Memorandum', case = False)]

df

Unnamed: 0,commenter,url
2,Nathan B.,https://www.sec.gov/comments/s7-08-20/s70820-5...
3,Dave Cazza,https://www.sec.gov/comments/s7-08-20/s70820-5...
4,Brandon Smith,https://www.sec.gov/comments/s7-08-20/s70820-5...
5,N. Abanes,https://www.sec.gov/comments/s7-08-20/s70820-5...
6,Hussam Eldin Ahmed Elsheikh,https://www.sec.gov/comments/s7-08-20/s70820-1...
...,...,...
2318,James Finn,https://www.sec.gov/comments/s7-08-20/s70820-7...
2319,Benjamin J. Creasy,https://www.sec.gov/comments/s7-08-20/s70820-7...
2320,Bhavin Patel,https://www.sec.gov/comments/s7-08-20/s70820-7...
2321,"Markunas Michael Markunas, Chief Compliance Of...",https://www.sec.gov/comments/s7-08-20/s70820-7...


## 3.0 Extract Comments

In [42]:
# Define function to scrape text from each comment
def scrape_url_text(url, timeout=10):
    """Scrape text from a URL, handling both HTML and PDF formats."""
    try:
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
        }
        response = requests.get(url, headers=headers, timeout=timeout)
        response.raise_for_status()
        
        content_type = response.headers.get('Content-Type', '').lower()
        
        if '.pdf' in url.lower() or 'application/pdf' in content_type:
            # Handle PDF
            pdf_file = BytesIO(response.content)
            pdf_reader = PyPDF2.PdfReader(pdf_file)
            text = ''
            for page in pdf_reader.pages:
                text += page.extract_text() + '\n'
            
            # Remove multiple newlines and replace with single space
            text = ' '.join(text.split())
            
            return text.strip()
        else:
            # Handle HTML - extract only <p> tags from body
            soup = BeautifulSoup(response.content, 'html.parser')
            
            # Find the body tag
            body = soup.find('body')
            
            # If no body tag exists, fall back to the entire document
            if body is None:
                body = soup
            
            # Find all <p> tags within the body
            paragraphs = body.find_all('p')
            
            # Extract text from each paragraph
            text_parts = []
            for p in paragraphs:
                # Remove script and style elements from each paragraph
                for script in p(["script", "style"]):
                    script.decompose()
                
                p_text = p.get_text(strip=True)
                if p_text:  # Only add non-empty paragraphs
                    text_parts.append(p_text)
            
            # Join all paragraphs with spaces
            text = ' '.join(text_parts)
            
            # Remove multiple newlines and replace with single space
            text = ' '.join(text.split())
            
            return text
            
    except Exception as e:
        return f"Error: {str(e)}"

In [48]:

def add_text_to_df(df, url_column='url', delay=1, save_checkpoint_every=100, checkpoint_file='df_checkpoint.csv'):
    """
    Add a 'text' column by scraping all URLs with checkpoint saving.
    
    Args:
        df: Your dataframe
        url_column: Name of column with URLs
        delay: Seconds between requests (1-2 recommended)
        save_checkpoint_every: Save progress every N rows
        checkpoint_file: Where to save checkpoints
    """
    # Add text column if it doesn't exist
    if 'text' not in df.columns:
        df['text'] = ''
    
    total = len(df)
    start_time = datetime.now()
    
    for idx, row in df.iterrows():
        # Skip if already scraped (useful for resuming)
        if df.at[idx, 'text'] and df.at[idx, 'text'] != '':
            continue
            
        url = row[url_column]
        print(f"Scraping {idx + 1}/{total}: {url}")
        df.at[idx, 'text'] = scrape_url_text(url)
        
        # Save checkpoint periodically
        if (idx + 1) % save_checkpoint_every == 0:
            df.to_csv(checkpoint_file, index=False)
            elapsed = datetime.now() - start_time
            estimated_total = elapsed / (idx + 1) * total
            remaining = estimated_total - elapsed
            print(f"✓ Checkpoint saved. Progress: {idx + 1}/{total}. Est. time remaining: {remaining}")
        
        # Delay between requests
        if idx < total - 1:
            time.sleep(delay)
    
    # Final save
    df.to_csv(checkpoint_file, index=False)
    print(f"\n✓ Complete! Scraped {total} URLs in {datetime.now() - start_time}")
    
    return df

In [49]:
# Get text
df = add_text_to_df(df, delay=1.5, save_checkpoint_every=100)

Scraping 3/2321: https://www.sec.gov/comments/s7-08-20/s70820-555642.htm
Scraping 4/2321: https://www.sec.gov/comments/s7-08-20/s70820-540942.htm
Scraping 5/2321: https://www.sec.gov/comments/s7-08-20/s70820-542682.htm
Scraping 6/2321: https://www.sec.gov/comments/s7-08-20/s70820-543042.htm
Scraping 7/2321: https://www.sec.gov/comments/s7-08-20/s70820-188839-358825.htm
Scraping 8/2321: https://www.sec.gov/comments/s7-08-20/s70820-326502.htm
Scraping 9/2321: https://www.sec.gov/comments/s7-08-20/s70820-326369.htm
Scraping 10/2321: https://www.sec.gov/comments/s7-08-20/s70820-20153230-320713.pdf
Scraping 11/2321: https://www.sec.gov/comments/s7-08-20/s70820-9131708-247287.htm
Scraping 12/2321: https://www.sec.gov/comments/s7-08-20/s70820-8547759-230713.htm
Scraping 13/2321: https://www.sec.gov/comments/s7-08-20/s70820-228724.htm
Scraping 14/2321: https://www.sec.gov/comments/s7-08-20/s70820-8186008-227162.htm
Scraping 15/2321: https://www.sec.gov/comments/s7-08-20/s70820-8135467-226560.h

unknown widths : 
[0, IndirectObject(31, 0, 2370155380272)]
unknown widths : 
[0, IndirectObject(34, 0, 2370155380272)]
unknown widths : 
[0, IndirectObject(37, 0, 2370155380272)]
unknown widths : 
[0, IndirectObject(40, 0, 2370155380272)]
unknown widths : 
[0, IndirectObject(43, 0, 2370155380272)]
unknown widths : 
[0, IndirectObject(46, 0, 2370155380272)]


Scraping 75/2321: https://www.sec.gov/comments/s7-08-20/s70820-223848.htm
Scraping 76/2321: https://www.sec.gov/comments/s7-08-20/s70820-223893.htm
Scraping 77/2321: https://www.sec.gov/comments/s7-08-20/s70820-7859977-223854.pdf
Scraping 78/2321: https://www.sec.gov/comments/s7-08-20/s70820-7860002-223862.pdf
Scraping 79/2321: https://www.sec.gov/comments/s7-08-20/s70820-7859999-223888.pdf
Scraping 80/2321: https://www.sec.gov/comments/s7-08-20/s70820-7860017-223887.pdf
Scraping 81/2321: https://www.sec.gov/comments/s7-08-20/s70820-7859978-223855.pdf
Scraping 82/2321: https://www.sec.gov/comments/s7-08-20/s70820-7859985-223860.htm
Scraping 83/2321: https://www.sec.gov/comments/s7-08-20/s70820-223850.htm
Scraping 84/2321: https://www.sec.gov/comments/s7-08-20/s70820-7860060-223901.pdf
Scraping 85/2321: https://www.sec.gov/comments/s7-08-20/s70820-7860047-223898.pdf
Scraping 86/2321: https://www.sec.gov/comments/s7-08-20/s70820-7860059-223900.pdf
Scraping 87/2321: https://www.sec.gov/co

Multiple definitions in dictionary at byte 0x7d for key /Subtype
Multiple definitions in dictionary at byte 0x7a for key /Subtype
Multiple definitions in dictionary at byte 0x6ab4 for key /Subtype
Multiple definitions in dictionary at byte 0x7a for key /Subtype
Multiple definitions in dictionary at byte 0x4f8d for key /Subtype
Multiple definitions in dictionary at byte 0x7a for key /Subtype
Multiple definitions in dictionary at byte 0x360b for key /Subtype


Scraping 213/2321: https://www.sec.gov/comments/s7-08-20/s70820-7813710-223702.pdf
Scraping 214/2321: https://www.sec.gov/comments/s7-08-20/s70820-7813697-223698.htm
Scraping 215/2321: https://www.sec.gov/comments/s7-08-20/s70820-7813706-223678.pdf
Scraping 216/2321: https://www.sec.gov/comments/s7-08-20/s70820-7806897-223672.pdf
Scraping 217/2321: https://www.sec.gov/comments/s7-08-20/s70820-7806885-223668.pdf
Scraping 218/2321: https://www.sec.gov/comments/s7-08-20/s70820-7806899-223673.pdf
Scraping 219/2321: https://www.sec.gov/comments/s7-08-20/s70820-7807388-223697.pdf
Scraping 220/2321: https://www.sec.gov/comments/s7-08-20/s70820-7801790-223654.pdf
Scraping 221/2321: https://www.sec.gov/comments/s7-08-20/s70820-7801771-223648.pdf
Scraping 222/2321: https://www.sec.gov/comments/s7-08-20/s70820-7801797-223636.pdf
Scraping 223/2321: https://www.sec.gov/comments/s7-08-20/s70820-7801866-223639.pdf
Scraping 224/2321: https://www.sec.gov/comments/s7-08-20/s70820-7801859-223666.pdf
Scra

Multiple definitions in dictionary at byte 0xec2e4 for key /Info
Multiple definitions in dictionary at byte 0xec2f1 for key /Info
Multiple definitions in dictionary at byte 0xec2fe for key /Info


Scraping 325/2321: https://www.sec.gov/comments/s7-08-20/s70820-7787333-223529.pdf
Scraping 326/2321: https://www.sec.gov/comments/s7-08-20/s70820-7741244-223131.pdf
Scraping 327/2321: https://www.sec.gov/comments/s7-08-20/s70820-7717968-222993.pdf
Scraping 328/2321: https://www.sec.gov/comments/s7-08-20/s70820-7709057-222930.pdf
Scraping 329/2321: https://www.sec.gov/comments/s7-08-20/s70820-7717917-222989.pdf
Scraping 330/2321: https://www.sec.gov/comments/s7-08-20/s70820-7707475-222889.pdf
Scraping 331/2321: https://www.sec.gov/comments/s7-08-20/s70820-7705548-222848.htm
Scraping 332/2321: https://www.sec.gov/comments/s7-08-20/s70820-7697296-222767.htm
Scraping 333/2321: https://www.sec.gov/comments/s7-08-20/s70820-7700719-222807.htm
Scraping 334/2321: https://www.sec.gov/comments/s7-08-20/s70820-7700718-222787.htm
Scraping 335/2321: https://www.sec.gov/comments/s7-08-20/s70820-7700720-222788.htm
Scraping 336/2321: https://www.sec.gov/comments/s7-08-20/s70820-7697303-222771.htm
Scra

Multiple definitions in dictionary at byte 0x660b4 for key /Info
Multiple definitions in dictionary at byte 0x660c1 for key /Info
Multiple definitions in dictionary at byte 0x660ce for key /Info


Scraping 383/2321: https://www.sec.gov/comments/s7-08-20/s70820-584215-1684362.pdf


Multiple definitions in dictionary at byte 0x61663 for key /Info
Multiple definitions in dictionary at byte 0x61670 for key /Info
Multiple definitions in dictionary at byte 0x6167d for key /Info


Scraping 384/2321: https://www.sec.gov/comments/s7-08-20/s70820-7662255-222507.htm
Scraping 385/2321: https://www.sec.gov/comments/s7-08-20/s70820-222489.htm
Scraping 386/2321: https://www.sec.gov/comments/s7-08-20/s70820-7662257-222490.htm
Scraping 387/2321: https://www.sec.gov/comments/s7-08-20/s70820-7657665-222469.htm
Scraping 388/2321: https://www.sec.gov/comments/s7-08-20/s70820-7653751-222408.pdf
Scraping 389/2321: https://www.sec.gov/comments/s7-08-20/s70820-222396.htm
Scraping 390/2321: https://www.sec.gov/comments/s7-08-20/s70820-7653432-222369.htm
Scraping 391/2321: https://www.sec.gov/comments/s7-08-20/s70820-222387.htm
Scraping 392/2321: https://www.sec.gov/comments/s7-08-20/s70820-222378.htm
Scraping 393/2321: https://www.sec.gov/comments/s7-08-20/s70820-7653447-222375.htm
Scraping 394/2321: https://www.sec.gov/comments/s7-08-20/s70820-7653446-222374.htm
Scraping 395/2321: https://www.sec.gov/comments/s7-08-20/s70820-7653433-222370.pdf
Scraping 396/2321: https://www.sec.g

In [None]:
# Drop duplicate column
df = df.drop(columns = ['comment_text'])

df

Unnamed: 0,commenter,url,text
2,Nathan B.,https://www.sec.gov/comments/s7-08-20/s70820-5...,"Aug. 20, 2023 Dear SEC, I am writing in OPPOSI..."
3,Dave Cazza,https://www.sec.gov/comments/s7-08-20/s70820-5...,"Aug. 15, 2023 Dear SEC: As a U.S. household in..."
4,Brandon Smith,https://www.sec.gov/comments/s7-08-20/s70820-5...,"Aug. 15, 2023 Hi, I am a household investor, a..."
5,N. Abanes,https://www.sec.gov/comments/s7-08-20/s70820-5...,"Aug. 15, 2023 As an international and individu..."
6,Hussam Eldin Ahmed Elsheikh,https://www.sec.gov/comments/s7-08-20/s70820-1...,"May 14, 2023 I am opposed to this regulation c..."
...,...,...,...
2318,James Finn,https://www.sec.gov/comments/s7-08-20/s70820-7...,"July 11, 2020 The proposed increase in the thr..."
2319,Benjamin J. Creasy,https://www.sec.gov/comments/s7-08-20/s70820-7...,"July 11, 2020 As an active retail investor, I ..."
2320,Bhavin Patel,https://www.sec.gov/comments/s7-08-20/s70820-7...,"July 11, 2020 There should not no actual reaso..."
2321,"Markunas Michael Markunas, Chief Compliance Of...",https://www.sec.gov/comments/s7-08-20/s70820-7...,"July 10, 2020 On behalf of the B. Riley family..."


In [None]:
# Noticing some columns that have no text. These appear to be instances where the comment is a scanned pdf. 
# Unless we want to incorporate some OCR. We can likely skip these.

# Let's get a count to understand the scope
(df['text'].str.len() == 0).sum()

np.int64(32)

## 4.0 Export to csv

In [54]:
df.to_csv('sec_comments.csv', index = False)