<a href="https://colab.research.google.com/github/olwynodpatterson/FYP/blob/main/10kForms_Project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Scraping 10k fillings

In [None]:
import requests
from bs4 import BeautifulSoup
import os
import time
import random

from google.colab import drive

# Function to retrieve a list of tickers from a given URL
def get_ticker_list(url):
    headers = {'User-Agent': 'Mozilla/5.0'}
    response = requests.get(url, headers=headers)
    # Check if the request was successful
    if response.status_code == 200:
        tickers = response.text.splitlines()
        return tickers
    else:
        # Raise an exception if the request failed
        raise Exception(f"Failed to retrieve data. Status code: {response.status_code}")

# Function to choose a random CIK (Central Index Key) from the list of tickers
def choose_random_cik(tickers):
    # Choose a random ticker and associated CIK
    ticker, cik = random.choice(tickers).split('\t')
    return cik

# Function to get the URLs of 10-K filings for a given CIK
def get_10k_filing_urls(cik):
    time.sleep(3) # Delay to avoid overwhelming the server
    # Fetch the URLs for 10-K filings
    filings_url = f"https://www.sec.gov/cgi-bin/browse-edgar?action=getcompany&CIK={cik}&type=10-K&dateb=&owner=exclude&count=10"
    headers = {'User-Agent': 'Mozilla/5.0'}
    response = requests.get(filings_url, headers=headers)
    # Check response status
    if response.status_code != 200:
        print("Error fetching filings:", response.status_code)
        return None

    soup = BeautifulSoup(response.text, 'html.parser')
    table = soup.find('table', class_='tableFile2')
    if not table:
        return None

    # Parse the table to get the filing links
    for row in table.find_all('tr')[1:]:
        cols = row.find_all('td')
        if len(cols) > 1:
            documents_page_link = 'https://www.sec.gov' + cols[1].a['href']
            submission_text_file_link = documents_page_link.replace('-index.htm', '.txt')
            return submission_text_file_link  # Return the first link found

    return None

def dehtml(html_content):
    # Use BeautifulSoup to parse the HTML content
    soup = BeautifulSoup(html_content, 'html.parser')

    # Extract text from the parsed HTML
    text = soup.get_text()

    return text

# Mount Google Drive
drive.mount('/content/drive')

def download_file(url, cik, folder='/content/drive/My Drive/10k_filings'):
    # Define the Google Drive folder path for the specific CIK
    drive_folder_path = os.path.join(folder, cik)

    # Create the folder if it doesn't exist
    if not os.path.exists(drive_folder_path):
        os.makedirs(drive_folder_path)

    # Correct the file paths to include the CIK-specific folder
    html_file_path = os.path.join(drive_folder_path, f'{cik}_10k_form.html')
    text_file_path = os.path.join(drive_folder_path, f'{cik}_10k_form.txt')

    # Make a request to download the file
    headers = {'User-Agent': 'Mozilla/5.0'}
    response = requests.get(url, headers=headers, stream=True)
    if response.status_code == 200:
        with open(html_file_path, 'wb') as html_file:
            for chunk in response.iter_content(chunk_size=8192):
                html_file.write(chunk)
        print(f"Downloaded HTML: {html_file_path}")

        # Read the HTML content
        with open(html_file_path, 'r', encoding='utf-8') as html_file:
            html_content = html_file.read()

        # Convert HTML to plain text
        plain_text = dehtml(html_content)

        # Save the plain text
        with open(text_file_path, 'w', encoding='utf-8') as text_file:
            text_file.write(plain_text)
        print(f"Converted to Text: {text_file_path}")
    else:
        print(f"Failed to download file. Status code: {response.status_code}")


# Main function to drive the program
def main():
    url = 'https://www.sec.gov/include/ticker.txt'
    tickers = get_ticker_list(url)
    i=0
    while i <2:
        cik =  choose_random_cik(tickers)
        print(f"Selected CIK: {cik}")
        filing_url = get_10k_filing_urls(cik)
        print(f"Filing URL: {filing_url}")
        if filing_url:
            download_file(filing_url, cik)
            time.sleep(1)  # Respectful delay between requests
            i += 1
        else:
            print(f"No 10-K filings found for CIK {cik}.")

if __name__ == "__main__":
    main()



Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Selected CIK: 1818089
Filing URL: https://www.sec.gov/Archives/edgar/data/1818089/000110465923081092/0001104659-23-081092.txt
Downloaded HTML: /content/drive/My Drive/10k_filings/1818089/1818089_10k_form.html
Converted to Text: /content/drive/My Drive/10k_filings/1818089/1818089_10k_form.txt
Selected CIK: 711665
Filing URL: https://www.sec.gov/Archives/edgar/data/711665/000161577419005072/0001615774-19-005072.txt
Downloaded HTML: /content/drive/My Drive/10k_filings/711665/711665_10k_form.html
Converted to Text: /content/drive/My Drive/10k_filings/711665/711665_10k_form.txt


# Finding the reports and converting to PDF


In [None]:
from bs4 import BeautifulSoup
import os
import pdfkit  # Ensure pdfkit is installed

# Function to clean up HTML content
def clean_html(html_content):
    # Replace non-breaking spaces with regular spaces
    cleaned_content = html_content.replace(u'\xa0', u' ')
    return cleaned_content

# Directory containing your 10-K filings
base_directory_path = '/content/drive/My Drive/10k_filings'

# List of reports to search for
reports = [
    'Balance Sheet',
    'Cash Flows',
    'Statements of Operations',
    'Statement of Operations',
    'Statements of Changes in Stockholder’s Equity'
]

# Loop through all subdirectories in the base directory
for subdir in os.listdir(base_directory_path):
    subdir_path = os.path.join(base_directory_path, subdir)

    # Check if the path is a directory to avoid trying to open files directly
    if os.path.isdir(subdir_path):
        # Loop through all files in the subdirectory
        for filename in os.listdir(subdir_path):
            # Check if the file is an HTML file
            if filename.endswith('.html'):
                print(f"Processing {filename}")
                file_path = os.path.join(subdir_path, filename)

                # Open and read the file
                with open(file_path, 'r', encoding='utf-8') as file:
                    html_content = file.read()

                # Clean the HTML content
                cleaned_html = clean_html(html_content)

                # Parse the cleaned HTML content
                soup = BeautifulSoup(cleaned_html, 'html.parser')

                for report_name in reports:
                    # Find the <a> tag that contains the current report name
                    a_tag = soup.find('a', string=lambda text: text and report_name in text)
                    if a_tag and a_tag.has_attr('href'):
                        href = a_tag['href']

                        # Check if it's an internal link
                        if href.startswith('#'):
                            target_id = href[1:]  # Remove the '#' at the beginning
                            target_element = soup.find(id=target_id)

                            if target_element:
                                # Find the next table after the target element
                                next_table = target_element.find_next('table')
                                if next_table:
                                    # Convert just the table or the section containing the table to a string
                                    target_html_string = str(next_table)

                                    # Generate a dynamic PDF file path based on the HTML file name and report name
                                    pdf_file_name = f"{filename.replace('.html', '')}_{report_name.replace(' ', '_').lower()}.pdf"
                                    pdf_file_path = os.path.join(subdir_path, pdf_file_name)

                                    # Specify PDF options, including UTF-8 encoding
                                    options = {
                                        'encoding': "UTF-8"
                                    }

                                    # Convert the targeted HTML content to PDF with specified options
                                    pdfkit.from_string(target_html_string, pdf_file_path, options=options)
                                    print(f"{report_name} table saved to {pdf_file_path}")

                                else:
                                    print(f"No next table found for {report_name} in {filename}.")

                            else:
                                print(f"Target element not found for {report_name} in {filename}.")
                        else:
                            print(f"The link in {filename} for {report_name} is not an internal link.")


Processing 1005229_10k_form.html
Balance Sheet table saved to /content/drive/My Drive/10k_filings/1005229/1005229_10k_form_balance_sheet.pdf
Cash Flows table saved to /content/drive/My Drive/10k_filings/1005229/1005229_10k_form_cash_flows.pdf
Statements of Operations table saved to /content/drive/My Drive/10k_filings/1005229/1005229_10k_form_statements_of_operations.pdf
Processing 1850906_10k_form.html
Balance Sheet table saved to /content/drive/My Drive/10k_filings/1850906/1850906_10k_form_balance_sheet.pdf
Cash Flows table saved to /content/drive/My Drive/10k_filings/1850906/1850906_10k_form_cash_flows.pdf
Statements of Operations table saved to /content/drive/My Drive/10k_filings/1850906/1850906_10k_form_statements_of_operations.pdf
Processing 1516513_10k_form.html
Balance Sheet table saved to /content/drive/My Drive/10k_filings/1516513/1516513_10k_form_balance_sheet.pdf
Cash Flows table saved to /content/drive/My Drive/10k_filings/1516513/1516513_10k_form_cash_flows.pdf
Statements 

# Extracting the Tables to Text


In [None]:
!pip install pymupdf

Collecting pymupdf
  Downloading PyMuPDF-1.23.22-cp310-none-manylinux2014_x86_64.whl (4.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.4/4.4 MB[0m [31m18.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting PyMuPDFb==1.23.22 (from pymupdf)
  Downloading PyMuPDFb-1.23.22-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (30.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m30.6/30.6 MB[0m [31m49.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: PyMuPDFb, pymupdf
Successfully installed PyMuPDFb-1.23.22 pymupdf-1.23.22


In [None]:
import fitz  # PyMuPDF
import os

# Base directory containing your PDFs within subfolders
base_directory_path = '/content/drive/My Drive/10k_filings'

# Function to extract text from a PDF and save it as a text file
def extract_text_from_pdf(pdf_path, output_directory):
    try:
        # Open the PDF
        doc = fitz.open(pdf_path)

        # Read text from each page and combine
        text = ""
        for page in doc:
            text += page.get_text()

        # Generate a text file path based on the PDF file name
        text_file_path = os.path.join(output_directory, f"{os.path.basename(pdf_path).replace('.pdf', '')}.txt")

        # Save the extracted text to a file
        with open(text_file_path, 'w', encoding='utf-8') as text_file:
            text_file.write(text)
        print(f"Extracted text saved to {text_file_path}")
    except Exception as e:
        print(f"An error occurred while processing {pdf_path}: {e}")

# Walk through the directory structure
for root, dirs, files in os.walk(base_directory_path):
    for file in files:
        if file.lower().endswith('.pdf'):
            pdf_path = os.path.join(root, file)
            print(f"Processing {pdf_path}")
            extract_text_from_pdf(pdf_path, root)


Processing /content/drive/My Drive/10k_filings/1005229/1005229_10k_form_balance_sheet.pdf
Extracted text saved to /content/drive/My Drive/10k_filings/1005229/1005229_10k_form_balance_sheet.txt
Processing /content/drive/My Drive/10k_filings/1005229/1005229_10k_form_cash_flows.pdf
Extracted text saved to /content/drive/My Drive/10k_filings/1005229/1005229_10k_form_cash_flows.txt
Processing /content/drive/My Drive/10k_filings/1005229/1005229_10k_form_statements_of_operations.pdf
Extracted text saved to /content/drive/My Drive/10k_filings/1005229/1005229_10k_form_statements_of_operations.txt
Processing /content/drive/My Drive/10k_filings/1850906/1850906_10k_form_balance_sheet.pdf
Extracted text saved to /content/drive/My Drive/10k_filings/1850906/1850906_10k_form_balance_sheet.txt
Processing /content/drive/My Drive/10k_filings/1850906/1850906_10k_form_cash_flows.pdf
Extracted text saved to /content/drive/My Drive/10k_filings/1850906/1850906_10k_form_cash_flows.txt
Processing /content/drive

# Combining statements to one file

In [3]:
import os

def merge_text_files_in_directory(directory_path):
    # Extract CIK or a unique identifier from the directory path
    # This is a placeholder logic; you'll need to replace it with actual logic to extract the CIK
    cik = directory_path.split(os.sep)[-1]  # Assuming the last part of the directory path is the CIK or unique ID

    # Define the output file name based on the CIK
    output_file_name = f"{cik}_extracted_10k.txt"

    # Find all .txt files in the directory
    text_files = [f for f in os.listdir(directory_path) if f.endswith('.txt')]

    # Skip directories without text files
    if not text_files:
        return

    # Path for the merged output file
    output_path = os.path.join(directory_path, output_file_name)

    with open(output_path, 'w', encoding='utf-8') as outfile:
        for fname in text_files:
            file_path = os.path.join(directory_path, fname)
            with open(file_path, 'r', encoding='utf-8') as infile:
                outfile.write(infile.read())
                # Add a newline between files' content for readability
                outfile.write("\n")

    print(f"Merged file created at {output_path}")

base_directory_path = '/content/drive/My Drive/10k_filings'

# Walk through the directory structure
for root, dirs, files in os.walk(base_directory_path):
    merge_text_files_in_directory(root)


Merged file created at /content/drive/My Drive/10k_filings/1005229/1005229_extracted_10k.txt
Merged file created at /content/drive/My Drive/10k_filings/1850906/1850906_extracted_10k.txt
Merged file created at /content/drive/My Drive/10k_filings/1516513/1516513_extracted_10k.txt
Merged file created at /content/drive/My Drive/10k_filings/1851194/1851194_extracted_10k.txt
Merged file created at /content/drive/My Drive/10k_filings/1043000/1043000_extracted_10k.txt


# Dataframe creation

In [4]:
import pandas as pd
import os

def create_dataframe_from_files(directory_path):
    data = []  # List to store the file names and their text contents

    # Walk through the directory and subdirectories
    for root, dirs, files in os.walk(directory_path):
        for file in files:
            # Check if the file matches the pattern {cik}_extracted_10k.txt
            if file.endswith('_extracted_10k.txt'):
                file_path = os.path.join(root, file)
                # Open and read the content of the file
                with open(file_path, 'r', encoding='utf-8') as file_content:
                    text = file_content.read()
                    # Append the filename (without extension) and text to the list
                    data.append({'filename': os.path.splitext(file)[0], 'text': text})

    # Create a DataFrame from the list
    df = pd.DataFrame(data, columns=['filename', 'text'])
    return df

base_directory_path = '/content/drive/My Drive/10k_filings'
df = create_dataframe_from_files(base_directory_path)

# Display the DataFrame to verify
print(df.head())

# Optional: Save the DataFrame to a CSV file
df.to_csv('/content/drive/My Drive/10k_filings/extracted_10k_dataframe.csv', index=False)


                filename                                               text
0  1005229_extracted_10k   \nMarch 31,\n \n2023\n2022\n \n(In thousands,...
1  1850906_extracted_10k   \n \nDecember 31,\n \n \n \n2022\n  \n2021\n ...
2  1516513_extracted_10k  As of March 31,\n2023\n2022\nAssets\nCurrent a...
3  1851194_extracted_10k   \n \nDecember 31,\n \n \n \n2022\n  \n2021\n ...
4  1043000_extracted_10k   \nDecember 31,\n \n2022\n2021\nASSETS\nCurren...
