In [1]:
import pandas as pd
import numpy as np
import json
import openai
import os
%pip install chromadb
import chromadb
from chromadb.api.models.Collection import Collection
import sqlite3
%pip install torch
import torch
%pip install langchain-core
%pip install langchain-community
%pip install langchain-huggingface
%pip install langchain-chroma
from langchain_chroma import Chroma
from langchain_huggingface import HuggingFaceEmbeddings



In [2]:
# Mount Google Drive so the notebook can read and write the files under /content/drive/MyDrive
from google.colab import drive
# NOTE(review): force_remount=True presumably re-mounts even when Drive is already mounted in this session — confirm against the Colab docs
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


# Define Inputs

# Define Functions

## Data Import

Define a function to load the list of filing records saved for the specified airline, year, and period

In [3]:
def load_filings(airline, year, period):
    """Read the saved SEC filings JSON for one airline/year/period from Google Drive.

    The file name is built by concatenating ``airline``, ``year`` and ``period``
    in front of the ``_filings.json`` suffix. The parsed JSON content is returned
    unchanged.
    """
    filings_path = f"/content/drive/MyDrive/ColabData/SEC_Filings/{airline}{year}{period}_filings.json"
    with open(filings_path, "r", encoding="utf-8") as handle:
        return json.load(handle)

## Create and Store Embeddings

Define a function to separate the filings into text and metadata

In [4]:
def split_metadata(meta):
    """Return the metadata part of a filing record.

    Every entry except "text" is kept, with its value converted to a string;
    ``None`` values become the literal string "None".
    """
    cleaned = {}
    for key, value in meta.items():
        if key == "text":
            continue
        cleaned[key] = "None" if value is None else str(value)
    return cleaned

def prepare_texts_and_metadatas(filings):
    """
    Split filing records into two parallel lists.

    The first list holds each record's "text" value; the second holds the
    remaining fields of the same record, cleaned by split_metadata.
    """
    document_texts = [record["text"] for record in filings]
    document_metadatas = list(map(split_metadata, filings))
    return document_texts, document_metadatas

Define a function to initialize ChromaDB

In [5]:
import chromadb

def initialize_chromadb(persist_directory):
    """
    Open a persistent (on-disk) ChromaDB client.

    Args:
        persist_directory: Path handed to ``chromadb.PersistentClient`` as ``path``;
            the other functions in this notebook reopen the same path to reach
            collections created earlier.

    Returns:
        The ``chromadb.PersistentClient`` bound to ``persist_directory``.
    """
    return chromadb.PersistentClient(path=persist_directory)

Define a function to process and load documents into a Chroma database

In [6]:
#from langchain_huggingface import HuggingFaceEmbeddings
#from langchain_chroma import Chroma
#from langchain_core.documents import Document
from chromadb.utils import embedding_functions

def embeddings_collection(filings, persist_directory, collection_name, batch_size=100):
    """
    Embed the filings and store them in a freshly created, persistent Chroma collection.

    Any existing collection named ``collection_name`` in ``persist_directory`` is
    deleted first, so every call rebuilds the collection from scratch.

    Args:
        filings: Filing records; each must have a "text" entry, and all other
            entries are stored as stringified metadata.
        persist_directory: Directory of the on-disk ChromaDB database.
        collection_name: Name of the collection to (re)create.
        batch_size: Number of documents embedded and added per step; must be >= 1.

    Returns:
        The Chroma collection holding the documents, embeddings and metadata.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1. A negative value would
            otherwise skip every batch while still reporting success.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be at least 1, got {batch_size}")

    # Separate the filings into text and metadata
    texts, metadatas = prepare_texts_and_metadatas(filings)

    # Define embedding function
    embedding_function = embedding_functions.SentenceTransformerEmbeddingFunction(model_name="all-MiniLM-L6-v2")

    # Open the persistent (on-disk) ChromaDB client for this directory
    client = initialize_chromadb(persist_directory)

    # Start from an empty collection: drop any previous one with the same name
    if collection_name in [c.name for c in client.list_collections()]:
        client.delete_collection(name=collection_name)
    collection = client.create_collection(name=collection_name)

    # Add documents to the collection batch by batch
    total_batches = (len(texts) + batch_size - 1) // batch_size
    for i in range(0, len(texts), batch_size):
        # Split the texts and metadatas into batches
        batch_texts = texts[i:i+batch_size]
        batch_metadatas = metadatas[i:i+batch_size]
        # Generate embeddings for the batch; use the record's own "id" when present,
        # otherwise fall back to a positional ID that is unique across batches
        batch_embeddings = embedding_function(batch_texts)
        batch_ids = [meta.get("id", f"doc_{i+j}") for j, meta in enumerate(batch_metadatas)]
        collection.add(
            ids=batch_ids,
            embeddings=batch_embeddings,
            documents=batch_texts,
            metadatas=batch_metadatas
        )
        print(f"Added batch {i//batch_size + 1} of {total_batches} batches ({len(batch_texts)} documents).")
    print(f"Added {len(texts)} documents to the collection.")

    return collection

## Retrieve Relevant Documents from Entire Filing Library with RAG

Define function to convert Chroma collection to a vectorstore for retrieval

In [7]:
def get_retriever(persist_directory, collection_name, k):
    """Build a LangChain retriever over a persisted Chroma collection.

    Queries are embedded with the sentence-transformers all-MiniLM-L6-v2 model,
    and each query returns the ``k`` best-matching documents.
    """
    # The query embedder has to be the same model the stored documents were embedded with
    query_embedder = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
    # Attach LangChain's Chroma wrapper to the existing on-disk collection
    store = Chroma(
        collection_name=collection_name,
        embedding_function=query_embedder,
        persist_directory=persist_directory,
    )
    search_settings = {"k": k}
    return store.as_retriever(search_kwargs=search_settings)

Define function to retrieve relevant filings

In [8]:
def retrieve_relevant_filings(query, persist_directory, collection_name, fraction=0.05):
    """
    Return the text of the stored documents most relevant to ``query``.

    Args:
        query: Natural-language query used for the similarity search.
        persist_directory: Directory of the on-disk ChromaDB database.
        collection_name: Name of the collection to search.
        fraction: Share of the collection to retrieve (default 5%); at least one
            document is always requested.

    Returns:
        A list with the ``page_content`` of each retrieved document.
    """
    # Open the collection only to learn how many documents it holds
    client = initialize_chromadb(persist_directory)
    collection = client.get_collection(name=collection_name)
    k = max(1, int(fraction * collection.count()))  # At least 1 result, otherwise `fraction` of the documents
    retriever = get_retriever(persist_directory, collection_name, k)
    docs = retriever.invoke(query)
    return [doc.page_content for doc in docs]

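Define a function to create the start and end dates of the filing search window (the end date extends past the period close to capture filings released afterward)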

In [30]:
import calendar
from datetime import datetime
from dateutil.relativedelta import relativedelta
def define_period_dates(year, period):
    """Return the (start_date, end_date) window used to collect filings for a period.

    ``period`` is "FY" for the full year; otherwise its last character is read as
    the quarter number. The start date is the first day of the period. The end
    date is the last day of the period pushed forward to catch filings released
    after the close: two months for "FY", one month plus one day for a quarter.
    """
    full_year = period == "FY"
    if full_year:
        first_month, last_month = 1, 12
    else:
        last_month = 3 * int(period[-1])
        first_month = last_month - 2

    # Last calendar day of the closing month (handles leap years)
    last_day = calendar.monthrange(year, last_month)[1]

    window_start = datetime(year, first_month, 1)
    period_close = datetime(year, last_month, last_day)

    # relativedelta applies the month shift before the day shift
    filing_lag = relativedelta(months=2) if full_year else relativedelta(months=1, days=1)

    return window_start, period_close + filing_lag

## Generate Summary with LLM

In [None]:
# Never store the key in the notebook: take it from the OPENAI_API_KEY environment
# variable, and fall back to a hidden interactive prompt when it is not set.
from getpass import getpass
openai.api_key = os.environ.get("OPENAI_API_KEY") or getpass("OpenAI API key: ")

In [24]:
# Define function to use the OpenAI API to generate insights based on the most relevant portions of the retrieved filings
def summarize_sec_filings(airline, year, period, persist_directory, collection_name):
    """
    Summarize the most relevant portions of a period's SEC filings with OpenAI GPT.

    Args:
        airline: Airline identifier used in the query and prompt.
        year: Reporting year.
        period: "FY" or a quarter label such as "Q2".
        persist_directory: Directory of the on-disk ChromaDB database.
        collection_name: Name of the collection holding the period's filings.

    Returns:
        The text of the model's response.
    """
    # define_period_dates returns the filing search window, whose end is padded past
    # the period close. The query must name the real close of the period instead,
    # so derive it from the window start (3 months per quarter, 12 for the full year).
    start_date, _ = define_period_dates(year, period)
    months_in_period = 12 if period == "FY" else 3
    period_end = start_date + relativedelta(months=months_in_period, days=-1)
    # Define overall query to guide relevant document retrieval and summarization
    query = f"{airline} {year}{period} financial results and operational highlights for the period ended {period_end:%Y-%m-%d}."
    # Retrieve relevant documents
    relevant_docs = retrieve_relevant_filings(query, persist_directory, collection_name)
    # Combine into a single string
    context = "\n\n".join(relevant_docs)
    context = context[:125000]  # character cap that limits how much text is passed to the API

    # Define the summarization prompt
    prompt = f"""
    You are an expert financial analyst summarizing SEC filings for {airline} from {year}{period}.
    Below are relevant filings for the query: {query}

    {context}

    Analyze all SEC filings , including all 10-Q, 10-K, 8-K filings, annual reports, and other filings.
    Provide the top insights for the year and period specified. Focus on the data from {year}{period} and ignore discussion of previous periods unless it provides meaningful context for current results. Provide up to 10 insights. Insights should be related to key developments in the following areas: financial, operational, commercial stratgy, labor, executive personnel, and route network. Do NOT include a topic if there is no relevant data or if there is nothing meaningful to report.
    Do NOT under any circumstances fabricate names, dates, or numerical figures. Ensure the values are present in the underlying data. A fabrication is content not present in the SEC filings including but not limited to any mention of 'John Doe' or 'Jane Doe'.
    Be sure to highlight any major events and their impacts and provide additional context.
    Format the response in a structured list format grouped by topic. Present insights in chronological order as best as possible. Length of each item should fully detail the insight while being easy to read and digest. Include relevant names when discussing personnel matters. Include accurate figures when discussing financial or other metrics.
    End the response with a single paragraph "Wrap Up".
    """

    # Send request to OpenAI GPT
    response = openai.chat.completions.create(
        model="gpt-4o-mini",
        messages=[
            {"role": "system", "content": "You are an expert financial analyst summarizing SEC filings and presenting them for public consumption. Accuracy is paramount, but you should provide interesting and revelatory insights. Language and style should be a cross between an investment analyst report and business media reporting."},
            {"role": "user", "content": prompt}
        ],
        temperature=0.3
    )

    return response.choices[0].message.content

# Complete Retrieval and Summary Generation Pipeline

Define the pipeline function.

In [25]:
def complete_summarization(airline, year, period, batch_size=100):
    """Run the whole flow for one airline/year/period.

    Loads the saved filings, rebuilds their embeddings collection in a
    per-period directory on Drive, then returns the generated summary.
    """
    # Every per-period database uses the same collection name
    collection_name = "summaries"
    filings = load_filings(airline, year, period)
    # Directory where this period's vectorstore is persisted
    persist_directory = f"/content/drive/MyDrive/ColabData/Filings_Embeddings_Collections/{airline}{year}{period}_filings_collection"
    # Build the collection; later steps reopen it by directory and name
    embeddings_collection(filings, persist_directory, collection_name, batch_size)
    return summarize_sec_filings(airline, year, period, persist_directory, collection_name)

In [26]:
import json

def insights_pipeline(airlines, years, periods, batch_size=100, output_file="/content/drive/MyDrive/ColabData/Summary_Dictionary/airline_financials_summaries.json"):
    """
    Summarize every requested airline/year/period combination and persist the results.

    Summaries are kept in a nested dictionary ``{airline: {year: {period: summary}}}``
    (years stored as strings) that is written to ``output_file`` as JSON after each
    new summary. Combinations already present in the file are skipped.

    Args:
        airlines: Iterable of airline identifiers.
        years: Iterable of years.
        periods: Iterable of period labels ("FY", "Q1", ...).
        batch_size: Batch size forwarded to complete_summarization.
        output_file: Path of the JSON file that stores the summaries.

    Returns:
        The nested dictionary of summaries, including previously saved ones.
    """
    # Make sure the destination folder exists before reading or writing
    output_dir = os.path.dirname(output_file)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    airline_financials_summaries = {}
    if os.path.exists(output_file) and os.path.getsize(output_file) > 0:
        # Load existing summaries so new ones are appended to them
        with open(output_file, "r") as f:
            try:
                existing = json.load(f)
            except json.JSONDecodeError:
                existing = None
        if isinstance(existing, dict):
            airline_financials_summaries = existing
        elif existing != []:
            # An empty list is what earlier versions seeded the file with; anything
            # else that is not a dictionary is reported before it gets replaced
            print(f"Warning: {output_file} does not contain a summary dictionary; starting from an empty one.")
    else:
        # Seed a missing or empty file with an empty JSON object (the structure is a dict)
        with open(output_file, "w") as f:
            json.dump({}, f)

    for airline in airlines:
        airline_summaries = airline_financials_summaries.setdefault(airline, {})

        for year in years:
            year_summaries = airline_summaries.setdefault(str(year), {})

            for period in periods:
                if period in year_summaries:
                    print(f"Skipping {airline} {year}{period} (already summarized).")
                    continue  # Skip if already summarized

                print(f"Summarizing the {year}{period} filings for {airline}...")

                summary = complete_summarization(airline, year, period, batch_size)

                # Save into hierarchy
                year_summaries[period] = summary

                # Save to JSON after each summary so finished work survives a later failure
                with open(output_file, "w") as f:
                    json.dump(airline_financials_summaries, f, indent=2)

                print(f"Summary saved for the {year}{period} filings for {airline}.")

    return airline_financials_summaries


# Build Summary Dictionary

## Define Inputs

In [14]:
# Load the "airline_financials" sheet of the workbook on Drive; its Airline, Year and Quarter columns are used below to build the input lists
airline_financials = pd.read_excel("/content/drive/MyDrive/ColabData/airline_financial_data.xlsx", sheet_name="airline_financials") # primary financial data and metrics

In [15]:
# Sorted list of the distinct airline identifiers found in the financial data
airlines = sorted(airline_financials["Airline"].drop_duplicates().to_numpy())
airlines

['AAL', 'DAL', 'LUV', 'UAL']

In [16]:
# Ascending list of the distinct years present in the financial data
years = sorted(airline_financials["Year"].drop_duplicates().to_numpy())
years

[np.int64(2014),
 np.int64(2015),
 np.int64(2016),
 np.int64(2017),
 np.int64(2018),
 np.int64(2019),
 np.int64(2020),
 np.int64(2021),
 np.int64(2022),
 np.int64(2023),
 np.int64(2024),
 np.int64(2025)]

In [17]:
# Sorted period labels: "FY" is kept as-is, any other Quarter value n becomes "Qn"
periods = sorted({quarter if quarter == "FY" else f"Q{quarter}" for quarter in airline_financials["Quarter"]})
periods

['FY', 'Q1', 'Q2', 'Q3', 'Q4']

In [18]:
# Identify the airline(s), year(s), and period(s) to summarize.
# Each value is a list because insights_pipeline iterates over every airline/year/period combination.
airline = ["UAL"]
year = [2025]
period = ["Q2"]

In [19]:
# Set the batch size for embedding and adding documents (default size is 100)
#batch_size = 100

## Run the Pipeline

In [31]:
# Summarize the selected combinations; batch_size and output_file keep their defaults, and the full summary dictionary is returned
airline_financials_summaries = insights_pipeline(airline, year, period)

Summarizing the 2025Q2 filings for UAL...
Added batch 1 of 2 batches (100 documents).
Added batch 2 of 2 batches (76 documents).
Added 176 documents to the collection.
Summary saved for the 2025Q2 filings for UAL.


In [32]:
# Display the UAL 2025 Q2 summary from the returned dictionary (years are stored under string keys)
airline_financials_summaries["UAL"]["2025"]["Q2"]

"### Financial Insights\n\n1. **Second Quarter Earnings Performance**: United Airlines reported an adjusted diluted earnings per share (EPS) of $3.25 to $4.25 for Q2 2025, indicating a robust performance amid a challenging macroeconomic environment. This aligns with the company's guidance provided in the Investor Update, which reflects a stable demand scenario. The company anticipates adjusted diluted EPS of $11.50 to $13.50 for the full year under stable conditions, while a potential recession could lower this to $7.00 to $9.00.\n\n2. **Operating Cash Flow**: For the first half of 2025, United generated net cash from operating activities of $5.9 billion, slightly up from $5.7 billion in the same period in 2024. This demonstrates the airline's ability to maintain strong cash generation capabilities despite external pressures.\n\n3. **Capital Expenditures**: United's adjusted total capital expenditures for 2025 are projected to be less than $6.5 billion, reflecting a disciplined approac

In [None]:
import json
# Read the saved summary file back from Drive to confirm the results were persisted
output_file = "/content/drive/MyDrive/ColabData/Summary_Dictionary/airline_financials_summaries.json"
with open(output_file, "r") as f:
    test = json.loads(f.read())

In [None]:
# Show the UAL 2025 Q2 summary as read back from the JSON file
test["UAL"]["2025"]["Q2"]