In [6]:
import logging
import os
import pandas as pd
import sys
import re

from langchain_huggingface import HuggingFacePipeline
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline

# Configure the root logger at INFO level (intended to surface log output
# from transformers while the model is loaded and run).
logging.basicConfig(level=logging.INFO)

# Make the parent of the current working directory importable so that the
# project's `src` package resolves. Adjust this path if your project
# structure differs.
project_root = os.path.abspath(os.path.join(os.getcwd(), '..'))
# Guard the insert so that re-running this cell does not keep adding
# duplicate entries to sys.path.
if project_root not in sys.path:
    sys.path.insert(0, project_root)

from src.analysis.prospectus_analyzer import ProspectusAnalyzer


In [7]:
raw_file_path = '../data/prospectuses_data.csv'

# Read only the first 5 rows for the demonstration instead of parsing the
# whole CSV and then discarding everything after the head.
# NOTE(review): with `nrows`, pandas infers column dtypes from these rows
# only — confirm nothing downstream relies on dtypes inferred from the full file.
sample_raw = pd.read_csv(raw_file_path, nrows=5)

# Drop rows whose section could not be parsed. The filter runs after the
# 5-row cut, so fewer than 5 rows may remain.
# `.copy()` makes `df` an independent frame: a later cell assigns new
# columns to it, which would otherwise raise SettingWithCopyWarning.
df = sample_raw[sample_raw["Section ID"] != "failed parsing"].copy()
df

Unnamed: 0,Prospectus ID,Original Filename,Section ID,Section Title,Subsection ID,Subsection Title,Subsubsection ID,Subsubsection Title,Subsubsection Text,Parsing Error,From Folder,Prospectus Year
1,5_1,Offerings 2020.pdf,1,RISK FACTORS,1.1,Risks related to the ADLER Group’s Business Ac...,1.1.1,Our business is significantly dependent on our...,We rely significantly on rental income. In the...,,raw_manual,2020
2,5_1,Offerings 2020.pdf,1,RISK FACTORS,1.1,Risks related to the ADLER Group’s Business Ac...,1.1.2,Our ability to operate our business successful...,We face the risk that we may not be able to ge...,,raw_manual,2020
3,5_1,Offerings 2020.pdf,1,RISK FACTORS,1.1,Risks related to the ADLER Group’s Business Ac...,1.1.3,We rely on our ability to identify potential r...,"As part of our strategy, we evaluate real esta...",,raw_manual,2020
4,5_1,Offerings 2020.pdf,1,RISK FACTORS,1.1,Risks related to the ADLER Group’s Business Ac...,1.1.4,Our business is dependent on regional real est...,"As of June 30, 2020, our real estate portfolio...",,raw_manual,2020


In [8]:
# Yes/no screening questions for the "Market Dynamics" category, keyed by
# the label used to look each question up.
questions_market_dynamics = dict((
    (
        "Market Dynamics - a",
        "Does the text mention that the company is exposed to risks associated with cyclical products?",
    ),
    (
        "Market Dynamics - b",
        "Does the text mention risks related to demographic or structural trends affecting the market?",
    ),
    (
        "Market Dynamics - c",
        "Does the text mention risks due to seasonal volatility in the industry?",
    ),
))

# Load the tokenizer and causal LM for the chosen checkpoint.
model_id = "meta-llama/Llama-3.2-3B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model_hf = AutoModelForCausalLM.from_pretrained(model_id)

# Copying `tokenizer.pad_token_id` blindly is a no-op when the tokenizer
# defines no pad token (the value is None). The recorded run still printed
# "Setting `pad_token_id` to `eos_token_id`", which indicates that was the
# case here. Fall back to the EOS token so that padding is configured
# explicitly on both the tokenizer and the generation config.
if tokenizer.pad_token_id is None:
    tokenizer.pad_token = tokenizer.eos_token
model_hf.generation_config.pad_token_id = tokenizer.pad_token_id

# Generation settings to compare: one configuration per `max_new_tokens`
# budget, ordered from the shortest allowed response to the longest.
configurations = [
    {"max_new_tokens": token_budget}
    for token_budget in (128, 256, 512, 1024)
]

Loading checkpoint shards: 100%|██████████| 2/2 [00:22<00:00, 11.32s/it]


In [9]:
# Ask one question against every sampled row, once per generation
# configuration, and store each configuration's answers in its own column.
question = questions_market_dynamics["Market Dynamics - a"]

# Snapshot the input rows once, before any answer column is added to `df`.
# Building the records inside the loop would hand later configurations rows
# that already contain the answers of earlier configurations.
rows = df.to_dict('records')

for i, config in enumerate(configurations, start=1):
    print(f"=== Configuration {i} ===")
    print(config)

    # A fresh pipeline per configuration. The whole config dict is forwarded
    # so a configuration may carry further generation settings besides
    # `max_new_tokens`.
    pipe = pipeline(
        "text-generation",
        model=model_hf,
        tokenizer=tokenizer,
        **config,
    )

    llm_hf = HuggingFacePipeline(pipeline=pipe)
    analyzer_hf = ProspectusAnalyzer(llm_model=llm_hf)

    # NOTE: a commented-out per-row debugging variant used to live here. It
    # formatted `analyzer_hf.BASELINE_PROMPT` for each row, called
    # `llm_hf.generate([prompt])` and parsed the result with
    # `analyzer_hf.extract_fields`. It was removed as dead code; it also
    # relied on `time`, which the visible import cell does not import.

    # Batch call over all rows.
    # Presumably returns one answer per row, since the result is assigned
    # as a DataFrame column — confirm against ProspectusAnalyzer.
    combined_answers = analyzer_hf.analyze_rows_single_question_yes_no(rows, question)
    df[f"Answer_Config_{i}"] = combined_answers


Device set to use mps:0


=== Configuration 1 ===
{'max_new_tokens': 128}


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


KeyboardInterrupt: 