## Import Libraries

In [39]:
!pip install langchain-groq



In [40]:
!pip install faiss-cpu



In [41]:
pip install -U langchain-community



## Import Libraries

In [85]:
import json
import pandas as pd
import numpy as np
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, split, posexplode, udf, explode
from pyspark.sql.types import ArrayType, DoubleType, StringType
from pyspark.sql import functions as F
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain
from langchain_groq import ChatGroq
from langchain.schema import Document
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer
from langchain.vectorstores import FAISS
from langchain.embeddings import HuggingFaceEmbeddings

In [96]:
# Set your Groq API key
import os
os.environ["GROQ_API_KEY"] =  "Pass your API Key"

## Read the dataset

In [87]:
# Create Spark session
spark = SparkSession.builder.appName("GENAI-Task").getOrCreate()

In [98]:
# Define path
path = '.\GenAI\Gen-AI-Data.csv'


df = spark.read \
    .option("header", True) \
    .option("inferSchema", True) \
    .option("multiLine", True) \
    .option("quote", '"') \
    .option("escape", '"') \
    .option("mode", "PERMISSIVE") \
    .csv(path)

# Show the first few rows
df.show()

+----------------+-------+----+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|        filename|    cik|year|           section_1|          section_1A|          section_1B|           section_2|           section_3|           section_4|           section_5|           section_6|           section_7|          section_7A|           section_8|           section_9|          section_9A|          section_9B|          section_10|          section_11|          section_12|          section_13|          section_14|          section_15|
+----------------+-------+----+--------------------+--------------------+--------------------+--

## Unpivoted version of Orginal DataFrame -  where each row represents one section of a 10-K document.

In [89]:
# Define section columns from section_1 to section_15 (including 9A, 9B)
section_cols = [f"section_{i}" if i not in ['9A', '9B'] else f"section_{i}" for i in list(range(1, 9)) + ['9A', '9B'] + list(range(10, 16))]

# Explode sections into separate rows with section name and text
df_melted = df.select(
    col("filename"),
    col("cik"),
    col("year"),
    F.explode(F.array([
        F.struct(F.lit(c).alias("section_name"), col(c).alias("text")) for c in section_cols
    ])).alias("exploded")
).select(
    "filename", "cik", "year", "exploded.section_name", "exploded.text"
)
df_melted.show()

+----------------+-------+----+------------+--------------------+
|        filename|    cik|year|section_name|                text|
+----------------+-------+----+------------+--------------------+
|1566373_2018.htm|1566373|2018|   section_1|Item 1. Business\...|
|1566373_2018.htm|1566373|2018|   section_2|Item 2. Propertie...|
|1566373_2018.htm|1566373|2018|   section_3|Item 3. Legal Pro...|
|1566373_2018.htm|1566373|2018|   section_4|Item 4. Mine Safe...|
|1566373_2018.htm|1566373|2018|   section_5|Item 5. Market fo...|
|1566373_2018.htm|1566373|2018|   section_6|Item 6. Selected ...|
|1566373_2018.htm|1566373|2018|   section_7|Item 7. Managemen...|
|1566373_2018.htm|1566373|2018|   section_8|Item 8. Financial...|
|1566373_2018.htm|1566373|2018|  section_9A|Item 9A. Controls...|
|1566373_2018.htm|1566373|2018|  section_9B|Item 9B. Other In...|
|1566373_2018.htm|1566373|2018|  section_10|Item 10. Director...|
|1566373_2018.htm|1566373|2018|  section_11|Item 11. Executiv...|
|1566373_2

## Check Shape Of DataFrame

In [90]:
# Number of rows
num_rows = df_melted.count()

# Number of columns
num_cols = len(df_melted.columns)

print(f"Shape: ({num_rows}, {num_cols})")

Shape: (48, 5)


## Implementing chunking on all sections data

In [91]:
#  Define UDF to chunk list of words into 500-word chunks
def chunk_words(words, chunk_size=1000):
    return [' '.join(words[i:i+chunk_size]) for i in range(0, len(words), chunk_size)]

chunk_words_udf = udf(chunk_words, ArrayType(StringType()))

#  Tokenize text
df_tokenized = df_melted.withColumn("words", split(col("text"), " "))

#  Create chunks from words
df_chunked = df_tokenized.withColumn("chunks", chunk_words_udf(col("words")))

df_final = df_chunked.select(
    "filename", "cik", "year", "section_name",
    posexplode("chunks").alias("chunk_index", "chunk_text")
)

df_final.show()

# Number of rows
num_rows = df_final.count()

# Number of columns
num_cols = len(df_final.columns)

print(f"Shape: ({num_rows}, {num_cols})")

+----------------+-------+----+------------+-----------+--------------------+
|        filename|    cik|year|section_name|chunk_index|          chunk_text|
+----------------+-------+----+------------+-----------+--------------------+
|1566373_2018.htm|1566373|2018|   section_1|          0|Item 1. Business\...|
|1566373_2018.htm|1566373|2018|   section_1|          1|inarigivir 400mg ...|
|1566373_2018.htm|1566373|2018|   section_1|          2|clinical trial in...|
|1566373_2018.htm|1566373|2018|   section_1|          3|interferon recept...|
|1566373_2018.htm|1566373|2018|   section_1|          4|serum HBV RNA, ar...|
|1566373_2018.htm|1566373|2018|   section_1|          5|will evaluate in ...|
|1566373_2018.htm|1566373|2018|   section_1|          6|modulating the ho...|
|1566373_2018.htm|1566373|2018|   section_1|          7|Europe, Hong Kong...|
|1566373_2018.htm|1566373|2018|   section_1|          8|any of our compou...|
|1566373_2018.htm|1566373|2018|   section_1|          9|other co

## LLM RAG Implementation to extract Attributes

In [92]:
# Embeddings
embedding_function = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

template = """
You are a financial analyst. Use the following 10-K report chunks to answer the question briefly and precisely.
Respond in a few words only. No sentences, no explanations.

Query: {query}

Chunk 1:
{chunk_1}

Chunk 2:
{chunk_2}

Chunk 3:
{chunk_3}

Answer (few words only):
"""


def run_qa_batch_with_chunks(year: int, questions: dict, template , embedding_function):
    """
    Runs a batch of RAG-based LLM queries using FAISS for chunk retrieval.

    Parameters:
    - year: int - 10-K filing year to filter on
    - questions: dict - key: label, value: actual question string
    - model: SentenceTransformer

    Returns:
    - dict - Each question label maps to {question, answer, top_chunks}
    """

    # Load data
    chunks_pd = df_final.select("filename", "cik", "year", "section_name", "chunk_index", "chunk_text").toPandas()
    chunks_pd = chunks_pd[chunks_pd["year"] == year]

    if chunks_pd.empty:
        return {"error": f"No data found for year {year}"}

    # Create augmented text
    chunks_pd["augmented_text"] = chunks_pd["cik"].astype(str) + "-" + chunks_pd["section_name"] + ": " + chunks_pd["chunk_text"]

    # Convert to LangChain documents

    documents = [
        Document(
            page_content=row["augmented_text"],
            metadata={
                "filename": row["filename"],
                "section_name": row["section_name"],
                "chunk_index": row["chunk_index"]
            }
        )
        for _, row in chunks_pd.iterrows()
    ]



    vectorstore = FAISS.from_documents(documents, embedding_function)

    # Initialize LLM

    llm = ChatGroq(model="llama3-70b-8192", temperature=0.0, max_tokens=400)

    prompt = PromptTemplate(
        input_variables=["query", "chunk_1", "chunk_2", "chunk_3"],
        template=template
    )

    chain = LLMChain(llm=llm, prompt=prompt)

    # Run all questions
    results = {}

    for label, query in questions.items():
        docs = vectorstore.similarity_search(query, k=3)

        if len(docs) < 3:
            results[label] = {
                "question": query,
                "answer": "Not enough relevant chunks retrieved.",
                "top_chunks": [doc.page_content for doc in docs]
            }
            continue

        answer = chain.run({
            "query": query,
            "chunk_1": docs[0].page_content,
            "chunk_2": docs[1].page_content,
            "chunk_3": docs[2].page_content
        })

        results[label] = {
            "question": query,
            "answer": answer.strip(),
            "top_chunks": [doc.page_content for doc in docs]
        }

    return results

## Extract Attributes from 2020 10K Filings

In [95]:
questions = {
    "Business of the company": "What is the business of the company as mentioned in the 10-K filing?",
    "Legal proceedings": "What legal proceedings has the company disclosed?",
    "Risk factors": "What are the principal risk factors for investors?",
    "Internal controls": "What did management conclude about the effectiveness of internal controls over financial reporting?",
    "Dividend status": "Does the company pay dividends on its common stock?"
}

# Run for 2020 filings
results = run_qa_batch_with_chunks(year=2020, questions=questions, template=template, embedding_function=embedding_function)

# Print
print(json.dumps(results, indent=2))

{
  "Business of the company": {
    "question": "What is the business of the company as mentioned in the 10-K filing?",
    "answer": "Banking and financial services",
    "top_chunks": [
      "718413-section_8: Item 8. Financial Statements and Supplementary Data\nThe audited consolidated financial statements and related notes of Community Bancorp. and Subsidiary and the report thereon of the independent registered accounting firm of Berry Dunn McNeil & Parker, LLC are incorporated herein by reference from the 2020 Annual Report, filed as Exhibit 13 to this report.\nIn accordance with the regulatory relief available to smaller reporting companies in SEC Release Nos. 33-10513 and 34-83550, the Company has elected to present audited statements of income, comprehensive income, cash flows and changes in shareholders\u2019 equity for each of the preceding two, rather than three, fiscal years.\nItem 9.",
      "718413-section_7: Item 7. Management\u2019s Discussion and Analysis of Financia

## Extract Attributes from 2019 10k Filings

In [94]:
questions = {
    "Business of the company": "What is the business of the company as mentioned in the 10-K filing?",
    "Legal proceedings": "What legal proceedings has the company disclosed?",
    "Risk factors": "What are the principal risk factors for investors?",
    "Internal controls": "What did management conclude about the effectiveness of internal controls over financial reporting?",
    "Dividend status": "Does the company pay dividends on its common stock?"
}

# Run for 2019 filings
results = run_qa_batch_with_chunks(year=2019, questions=questions, template=template, embedding_function=embedding_function)

# Print
print(json.dumps(results, indent=2))

{
  "Business of the company": {
    "question": "What is the business of the company as mentioned in the 10-K filing?",
    "answer": "Diverse global agribusiness and transportation company.",
    "top_chunks": [
      "88121-section_15: Item 15. Exhibits, Financial Statement Schedules\n(a)\u2002List the following documents filed as a part of the report:\n1.\u2003Financial statements.\nThe financial statements are included in Item 8 of this Form 10-K.\n2.\u2003Financial statement schedules.\nAll schedules are omitted as the required information is not applicable or the information is presented in the consolidated financial statements or related consolidated notes.\n3.\u2003Exhibits.\n10.14*\nSeaboard Corporation Executive Officers\u2019 Bonus Policy (effective for 2018 and supersedes all policies). Incorporated herein by reference to Exhibit 10.17 of Seaboard\u2019s Form 10-K for the fiscal year ended December 31, 2018.\n10.15*\nSeaboard Corporation Executive Incentive Plan (effective

## Extract Attributes from 2018 10k Filings

In [93]:
questions = {
    "Business of the company": "What is the business of the company as mentioned in the 10-K filing?",
    "Legal proceedings": "What legal proceedings has the company disclosed?",
    "Risk factors": "What are the principal risk factors for investors?",
    "Internal controls": "What did management conclude about the effectiveness of internal controls over financial reporting?",
    "Dividend status": "Does the company pay dividends on its common stock?"
}
# Run for 2018 filings
results = run_qa_batch_with_chunks(year=2018, questions=questions, template=template, embedding_function=embedding_function)

# Print
print(json.dumps(results, indent=2))

{
  "Business of the company": {
    "question": "What is the business of the company as mentioned in the 10-K filing?",
    "answer": "Not mentioned.",
    "top_chunks": [
      "1566373-section_6: Item 6. Selected Financial Data.\nThe following selected consolidated financial data should be read together with our consolidated financial statements and accompanying notes and \u201cManagement\u2019s Discussion and Analysis of Financial Condition and Results of Operations\u201d appended to this Annual Report on Form 10-K. The selected consolidated financial data in this section are not intended to replace our consolidated financial statements and the related notes. Our historical results are not necessarily indicative of the results that may be expected in the future.\nThe selected consolidated statement of operations and consolidated balance sheet data for the years ended December 31, 2018 and 2017 are derived from our audited consolidated financial statements appended to this Annual Re