In [None]:
!pip install datasets

In [None]:
import html
import os
import re
import textwrap

import requests
import torch
import transformers
from datasets import Dataset
from huggingface_hub import login
from sqlalchemy import create_engine, MetaData, Table, insert, select
from transformers import AutoTokenizer

  from tqdm.autonotebook import tqdm, trange


# Extract MD&A Section

In [None]:
# @title Regex
# Compiled patterns shared by preprocess() and extract_mda().
# All of them are case-insensitive; none sets re.DOTALL, so `.` never crosses
# a newline in any pattern below.

# Matches an "Item 7. Management's Discussion and Analysis of ..." heading
# (either ordering of "Financial Condition" / "Results of Operations") and then
# lazily consumes everything, newlines included, up to (but not including) the
# next "Item 8" or "Item 7A".  Note the single literal space in "s Discussion".
MDA = re.compile(
    r'\bItem\s*7[\.\s]+Management[’\']s Discussion\s*and\s*Analysis\s*of\s*(?:(Financial\s*Condition\s*and\s*Results\s*of\s*Operations)|(Results\s*of\s*Operations\s*and\s*Financial\s*Condition))\b(.|\n)*?(?=\bItem\s*(8|7A))',
    re.IGNORECASE
)
# HTML tables.  NOTE(review): without re.DOTALL this only matches a table whose
# opening and closing tags sit on the same line — confirm that is intended.
TABLES_1 = re.compile(r'<TABLE.*?</TABLE>', re.I)
# Patterns whose matches preprocess() replaces with a newline.
GARBAGE = {
    # <DOCUMENT> ... <TYPE>GRAPHIC|ZIP|... ... </DOCUMENT>, all on one line.
    'ascii': re.compile(r'<DOCUMENT>.*?<TYPE>(GRAPHIC|ZIP|EXCEL|JSON|PDF|XML|EX).*?</DOCUMENT>', re.I),
    # <GRAPHIC ...> ... </GRAPHIC> style pairs (closing tag must repeat the opening name).
    'ascii_alt': re.compile(r'<(GRAPHIC|ZIP|EXCEL|JSON|PDF|XML|EX).*?>.*?</\1>', re.I),
    # End-of-header marker and the PEM footer line.
    'header_footer': re.compile(r'(</SEC-HEADER>|-----END PRIVACY-ENHANCED MESSAGE-----)', re.I),
    # Opening/closing div, font, tr, td, p and span tags (the tags only, not their content).
    'html_tags': re.compile(r'<(div|font|tr|td|p|span).*?>|</(font|div|tr|td|p|span)>', re.I),
}
# Non-breaking space (U+00A0) or zero-width space (U+200B).
NON_BREAKING_ZERO_WIDTH = re.compile(r'\xa0|\u200b', re.I)
# Three or more consecutive "newline + optional whitespace" groups.
NORMALIZE_NEW_LINES = re.compile(r'(\n\s*){3,}', re.I)

# Literal "[TABLE]" / "[/TABLE]" placeholder markers.
TABLES_2 = re.compile(r'\[/?TABLE\]', re.I)
# "Table of Contents" / "Return to Table of Contents" standing alone on a line.
TOC_1 = re.compile(r'\nTable\s*of\s*Contents\n', re.I)
TOC_2 = re.compile(r'\nReturn\s*to\s*Table\s*of\s*Contents\n', re.I)
# Runs of two or more underscores, hyphens or equals signs.
REPEATED_SYMBOLS = re.compile(r'(\_{2,}|\-{2,}|={2,})', re.I)
# Any run of one or more whitespace characters.
NORMALIZE_SPACES = re.compile(r'(\s{1,})', re.I)
# One space, a punctuation mark (captured), then TWO spaces.
# NOTE(review): extract_mda() applies NORMALIZE_SPACES first, after which no two
# consecutive spaces remain, so this pattern cannot match there — confirm the
# intended pattern or ordering.
SURROUNDING_SPACES = re.compile(r' (,|;|\.|’|®)  ', re.I)
# Everything from the start of the string (no re.MULTILINE) through the first
# `" -->` sequence.
HTML_COMMENTS = re.compile(r'^(.*?)" -->', re.I)
# Any remaining <...> tag.
HTML_XML_TAGS = re.compile(r'<[^>]*>', re.I)

In [None]:
import html

def preprocess(content):
  """Clean the raw text of a 10-K filing before the MD&A search.

  Steps, in order: replace every GARBAGE match with a newline, unescape HTML
  entities, drop TABLES_1 matches, turn TABLES_2 markers and
  non-breaking/zero-width spaces into newlines, collapse long runs of blank
  lines, and finally blank out a fixed list of "Item 8" cross-reference
  phrases (presumably so in-text references do not look like the start of the
  Item 8 section — confirm).

  :param content: raw filing text.
  :return: the cleaned text.
  """
  cleaned = content
  for pattern in GARBAGE.values():
    cleaned = pattern.sub('\n', cleaned)

  cleaned = html.unescape(cleaned)
  cleaned = TABLES_2.sub('\n', TABLES_1.sub("", cleaned))
  cleaned = NON_BREAKING_ZERO_WIDTH.sub('\n', cleaned).strip()
  cleaned = NORMALIZE_NEW_LINES.sub('\n\n', cleaned).strip()

  # Applied strictly in this order: the first phrase is a prefix of some later
  # ones, so reordering would change the result.
  cross_references = (
      "Part II, Item",
      "Part\nII, Item\n8",
      "Part 2, Item 8",
      'Item 8, "Financial Statements and Supplementary Data."',
      '“Part II, Item 8: Financial Statements and Supplementary Data.”',
      'Item 8. Financial Statements and Supplementary Data',
  )
  for phrase in cross_references:
    cleaned = cleaned.replace(phrase, " ")

  print("10-K text preprocessed")

  return cleaned

In [None]:
def extract_mda(content):
    """Return the MD&A section found in a raw 10-K filing.

    The filing is cleaned with preprocess(), every MDA match is collected and
    the longest one is kept (the earliest wins on a tie).  The chosen text is
    then normalised: table-of-contents lines, repeated rule characters,
    whitespace runs and leftover markup are removed.

    :param content: raw filing text.
    :return: the cleaned MD&A text, or '' when no match is found.
    """
    cleaned = TABLES_2.sub('', preprocess(content))

    # max() returns the first maximal element, matching a strict ">" scan.
    section = max((m.group(0) for m in MDA.finditer(cleaned)), key=len, default='')

    # (pattern, replacement) pairs, applied in this exact order.
    cleanup_steps = (
        (TOC_1, ' '),
        (TOC_2, ' '),
        (REPEATED_SYMBOLS, ' '),
        (NORMALIZE_SPACES, ' '),
        (SURROUNDING_SPACES, r'\1 '),
        (HTML_COMMENTS, ''),
        (HTML_XML_TAGS, ''),
    )
    for pattern, replacement in cleanup_steps:
        section = pattern.sub(replacement, section)

    print(f'\nMDA extracted')

    return section

In [None]:
def extract_mda_from_10K(filings, timeout=30):
  """Download every filing listed in `filings` and extract its MD&A section.

  :param filings: table with a "URLs" column holding one filing URL per row
                  (used like a pandas DataFrame — TODO confirm against caller).
  :param timeout: seconds to wait for each HTTP request; without a timeout a
                  stalled connection would block the notebook indefinitely.
  :return: list of MD&A strings, one per row and in row order.  A row whose
           download returns a non-success HTTP status yields '' — the same
           value extract_mda() gives when no section is found — so the list
           stays aligned with `filings`.

  Relies on a module-level `headers` dict being defined before the call.
  """
  mdas = []
  for url in filings["URLs"]:
    response = requests.get(url, headers=headers, timeout=timeout)
    if not response.ok:
      # Do not run the extractor over an error page; keep a placeholder instead.
      print(f"Skipping {url}: HTTP {response.status_code}")
      mdas.append('')
      continue
    mdas.append(extract_mda(response.text))

  return mdas

# Chunk and Summarize Text

In [None]:
# Define the function to chunk the text based on token count
def chunk_by_tokens(text, max_tokens=4000):
    """Split `text` into consecutive pieces of at most `max_tokens` tokens.

    Uses the module-level `tokenizer`, which must be loaded before the call.

    :param text: the text to split.
    :param max_tokens: maximum number of tokens per chunk; must be positive.
    :return: list of decoded text chunks in original order ([] for empty text).
    :raises ValueError: if `max_tokens` is not positive (the chunk window would
                        never advance).
    """
    if max_tokens <= 0:
        raise ValueError("max_tokens must be a positive integer")

    # add_special_tokens=False keeps the tokenizer from inserting its own
    # special tokens (e.g. a beginning-of-text marker), which would otherwise
    # be decoded back into the text of the first chunk.
    tokens = tokenizer.encode(text, add_special_tokens=False)

    return [
        tokenizer.decode(tokens[start:start + max_tokens])
        for start in range(0, len(tokens), max_tokens)
    ]

In [None]:
def format_chat_template(document):
    """Build the chat-style prompt asking the model to summarize `document`.

    :param document: text to place in the "### Input:" part of the prompt.
    :return: a one-element list holding a single user message dict with
             "role" and "content" keys.
    """
    instruction = "Please summarize the input document."
    prompt = f"Below is an instruction that describes a task. Write a response that appropriately completes the request.\n\n### Instruction:\n{instruction}\n\n### Input:\n{document}\n\n### Response:\n"
    user_message = {"role": "user", "content": prompt}
    return [user_message]

In [None]:
def generate_summary(filepath, max_summary_tokens=500):
  """Summarize the text file at `filepath` chunk by chunk.

  The file is split with chunk_by_tokens(), each chunk is sent through the
  module-level `pipe`, and the per-chunk replies are joined with spaces.

  :param filepath: path of the text file to summarize.
  :param max_summary_tokens: total generation budget, shared evenly between
                             the chunks (defaults to 500).
  :return: the combined summary, or "" when the file yields no chunks.
  """
  # Decode explicitly as UTF-8 instead of the platform default; undecodable
  # bytes are replaced so one bad character does not abort the run.
  with open(filepath, "r", encoding="utf-8", errors="replace") as f:
    content = f.read()

  chunks = chunk_by_tokens(content)
  num_chunks = len(chunks)
  if num_chunks == 0:
    return ""

  # Clamp to 1: with many chunks the rounded share would reach 0, which is not
  # a usable generation length.
  tokens_per_chunk = max(1, round(max_summary_tokens / num_chunks))

  # Summarize each chunk
  output_chunks = []
  for i, chunk in enumerate(chunks, start=1):
    prompt = format_chat_template(chunk)
    output = pipe(prompt, max_new_tokens=tokens_per_chunk)
    # The last message of the returned conversation is taken as the reply.
    output_chunks.append(output[0]["generated_text"][-1]["content"])
    torch.cuda.empty_cache()
    print(f"Processed {i}/{num_chunks} chunks.")

  # Combine the per-chunk summaries
  return " ".join(output_chunks)

In [None]:
def custom_wrap_text(text, max_line_length):
  """Re-flow `text` so that no line is longer than `max_line_length`.

  :param text: the text to wrap.
  :param max_line_length: maximum width of each output line.
  :return: the wrapped lines joined with newline characters.
  """
  return textwrap.fill(text, width=max_line_length)

In [None]:
# Function to upload summaries
def upload_summaries(company_id, year, summary_text):
    """
    Insert one summary row into the `company_ten_k_summaries` table.

    Uses the module-level `engine` and `company_ten_k_summaries` objects.  The
    insert runs inside a transaction; any exception is caught and reported
    with print() instead of being raised.

    :param company_id: value stored in the CompanyID column.
    :param year: value stored in the Year column.
    :param summary_text: value stored in the SummaryText column.
    """
    row = {"CompanyID": company_id, "Year": year, "SummaryText": summary_text}
    try:
        # One statement opens the connection and begins the transaction; the
        # transaction is finished before the connection is released.
        with engine.connect() as connection, connection.begin():
            connection.execute(insert(company_ten_k_summaries).values(**row))

        print("Summaries uploaded successfully.")
    except Exception as e:
        print(f"Error uploading summaries: {e}")

In [None]:
if __name__=="__main__":
  # Driver cell: summarize every extracted MD&A file in `mda_dir` and store one
  # summary row per company in the database.

  # Credentials are read from the environment so they never live in the
  # notebook or its history.  Any token or database password that was
  # previously committed in this cell must be revoked and rotated.
  HF_token = os.environ.get("HF_TOKEN")
  DATABASE_URL = os.environ.get("DATABASE_URL")
  if not HF_token or not DATABASE_URL:
    raise RuntimeError(
        "Set the HF_TOKEN and DATABASE_URL environment variables before running this cell."
    )
  login(token=HF_token)

  # Let's load the model and the tokenizer
  model_id = "DISLab/SummLlama3.2-3B"
  pipe = transformers.pipeline(
      "text-generation",
      model=model_id,
      torch_dtype=torch.bfloat16,
      device_map="auto",
  )
  tokenizer = AutoTokenizer.from_pretrained(model_id)

  engine = create_engine(DATABASE_URL)

  # Reflect the two tables from the live database schema.
  metadata = MetaData()
  company_info_table = Table('company_information', metadata, autoload_with=engine)
  company_ten_k_summaries = Table(
      "company_ten_k_summaries", metadata, autoload_with=engine
  )

  mda_dir = "../mdas_extra"   # path to extracted mdas

  # int() rejects a mistyped year immediately instead of at insert time.
  year = int(input("What year are these documents from? "))

  for filename in sorted(os.listdir(mda_dir)):
    filepath = os.path.join(mda_dir, filename)
    if not os.path.isfile(filepath):
      # Skip sub-directories such as notebook checkpoint folders.
      continue

    # File names are expected to start with "<TICKER>_".
    ticker = filename.split("_")[0]

    # Resolve the company first so no GPU time is spent on files that cannot
    # be stored.
    # NOTE(review): assumes company_information exposes the key as "CompanyID",
    # matching the column written by upload_summaries — confirm against schema.
    query = select(company_info_table.c.CompanyID).where(
        company_info_table.c.Ticker == ticker
    )
    with engine.connect() as connection:
      company_id = connection.execute(query).scalar()
    if company_id is None:
      print(f"No company found for ticker {ticker!r}; skipping {filename}.")
      continue

    summary = generate_summary(filepath)
    summary_formatted = custom_wrap_text(summary, 100)
    upload_summaries(company_id, year, summary_formatted)
    torch.cuda.empty_cache()

  torch.cuda.empty_cache()

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]