It could be interesting to compare the parsing errors produced by the following two prompt templates:
1. BINARY_PROMPT
2. YES_NO_PROMPT_TEMPLATE

In [1]:
import logging
import os
import pandas as pd
import sys
import re

from langchain_huggingface import HuggingFacePipeline
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline, BitsAndBytesConfig

# Configure the root logger to emit records at INFO level and above.
logging.basicConfig(level=logging.INFO)

# The project packages live one directory above the notebook's working
# directory. Resolve that parent to an absolute path and put it first on the
# import path (adjust if your project layout differs).
parent_of_cwd = os.path.join(os.getcwd(), '..')
project_root = os.path.abspath(parent_of_cwd)
sys.path.insert(0, project_root)

from src.analysis.prospectus_analyzer import ProspectusAnalyzer

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Location of the prospectus CSV, relative to the notebook's working directory.
raw_file_path = '../data/prospectuses_data.csv'

# Only a few rows are needed for this demonstration, so let the CSV reader
# stop after them instead of parsing the whole file and keeping just the head.
# (Column dtypes are then inferred from these rows only.)
N_DEMO_ROWS = 3
df_raw = pd.read_csv(raw_file_path, nrows=N_DEMO_ROWS)

# Drop rows whose section could not be parsed. The explicit copy makes `df`
# an independent frame, so adding answer columns to it later does not trigger
# pandas' SettingWithCopyWarning.
df = df_raw[df_raw["Section ID"] != "failed parsing"].copy()
df

Unnamed: 0,Prospectus ID,Original Filename,Section ID,Section Title,Subsection ID,Subsection Title,Subsubsection ID,Subsubsection Title,Subsubsection Text,Parsing Error,From Folder,Prospectus Year
1,5_1,Offerings 2020.pdf,1,RISK FACTORS,1.1,Risks related to the ADLER Group’s Business Ac...,1.1.1,Our business is significantly dependent on our...,We rely significantly on rental income. In the...,,raw_manual,2020
2,5_1,Offerings 2020.pdf,1,RISK FACTORS,1.1,Risks related to the ADLER Group’s Business Ac...,1.1.2,Our ability to operate our business successful...,We face the risk that we may not be able to ge...,,raw_manual,2020


In [3]:
# Questions for the "Market Dynamics" category, keyed by "<category> - <letter>".
# Each one asks whether the text mentions a particular kind of risk.
questions_market_dynamics = dict([
    (
        "Market Dynamics - a",
        "Does the text mention that the company is exposed to risks associated with cyclical products?",
    ),
    (
        "Market Dynamics - b",
        "Does the text mention risks related to demographic or structural trends affecting the market?",
    ),
    (
        "Market Dynamics - c",
        "Does the text mention risks due to seasonal volatility in the industry?",
    ),
])

# Checkpoint identifier passed to both the tokenizer and the model loader.
model_id = "meta-llama/Llama-3.2-3B-Instruct"
# Alternative checkpoint, kept for quick switching:
# model_id = "meta-llama/Llama-3.2-1B-Instruct"

tokenizer = AutoTokenizer.from_pretrained(model_id)
# Reuse the end-of-sequence token as the padding token.
# NOTE(review): presumably needed because this tokenizer ships without a pad token — confirm.
tokenizer.pad_token = tokenizer.eos_token

model_hf = AutoModelForCausalLM.from_pretrained(model_id) #, pad_token_id=tokenizer.eos_token_id)
# Give generation the same padding token id as the tokenizer; this must run
# after the pad token assignment above.
model_hf.generation_config.pad_token_id = tokenizer.pad_token_id
# Optional 8-bit quantization config (currently unused):
#quant_config = BitsAndBytesConfig(load_in_8bit=True)

# Generation settings to compare, ordered from the shortest to the longest
# response budget. Only "max_new_tokens" varies between entries.
configurations = [
    {"max_new_tokens": token_budget}
    for token_budget in (128, 256, 512, 1024)
]

Loading checkpoint shards: 100%|██████████| 2/2 [00:12<00:00,  6.31s/it]


In [4]:
# Print the loaded model's configuration for reference.
print(model_hf.config)

LlamaConfig {
  "_attn_implementation_autoset": true,
  "_name_or_path": "meta-llama/Llama-3.2-3B-Instruct",
  "architectures": [
    "LlamaForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "bos_token_id": 128000,
  "eos_token_id": [
    128001,
    128008,
    128009
  ],
  "head_dim": 128,
  "hidden_act": "silu",
  "hidden_size": 3072,
  "initializer_range": 0.02,
  "intermediate_size": 8192,
  "max_position_embeddings": 131072,
  "mlp_bias": false,
  "model_type": "llama",
  "num_attention_heads": 24,
  "num_hidden_layers": 28,
  "num_key_value_heads": 8,
  "pretraining_tp": 1,
  "rms_norm_eps": 1e-05,
  "rope_scaling": {
    "factor": 32.0,
    "high_freq_factor": 4.0,
    "low_freq_factor": 1.0,
    "original_max_position_embeddings": 8192,
    "rope_type": "llama3"
  },
  "rope_theta": 500000.0,
  "tie_word_embeddings": true,
  "torch_dtype": "bfloat16",
  "transformers_version": "4.47.0",
  "use_cache": true,
  "vocab_size": 128256
}



In [5]:
# Compare the analyzer's answers to a single question across generation budgets.
question = questions_market_dynamics["Market Dynamics - a"]

# Number of entries from `configurations` evaluated in this run; the remaining
# (larger) token budgets are skipped. Raise this to run more of them.
MAX_CONFIGS_TO_RUN = 2

# Columns handed to the analyzer. Answer columns are excluded so that every
# configuration sees identical input rows, even though `df` gains a new
# "Answer_Config_<i>" column per iteration (or still carries some from an
# earlier execution of this cell).
input_columns = [col for col in df.columns if not str(col).startswith("Answer_Config_")]

for i, config in enumerate(configurations[:MAX_CONFIGS_TO_RUN]):
    if i > 0:
        print("\n")
    print(f"=== Configuration {i} ===")
    print(config)

    # A new pipeline per configuration: only the token budget changes, the
    # already-loaded model and tokenizer are reused.
    pipe = pipeline(
        "text-generation",
        model=model_hf,
        tokenizer=tokenizer,
        max_new_tokens=config["max_new_tokens"],
    )

    llm_hf = HuggingFacePipeline(pipeline=pipe)
    analyzer_hf = ProspectusAnalyzer(llm_model=llm_hf)

    # Fresh record dicts for each run, built from the input columns only.
    rows = df[input_columns].to_dict('records')

    # Batch call; the result is stored as a new column, so it has to hold one
    # entry per row of `df`.
    combined_answers = analyzer_hf.analyze_rows_relevance(rows, question)
    df[f"Answer_Config_{i}"] = combined_answers


Device set to use mps:0


=== Configuration 0 ===
{'max_new_tokens': 128}


KeyboardInterrupt: 

In [None]:
# Display the frame, including any "Answer_Config_<i>" columns added by the configuration loop.
df

Unnamed: 0,Prospectus ID,Original Filename,Section ID,Section Title,Subsection ID,Subsection Title,Subsubsection ID,Subsubsection Title,Subsubsection Text,Parsing Error,From Folder,Prospectus Year,Answer_Config_0,Answer_Config_1
1,5_1,Offerings 2020.pdf,1,RISK FACTORS,1.1,Risks related to the ADLER Group’s Business Ac...,1.1.1,Our business is significantly dependent on our...,We rely significantly on rental income. In the...,,raw_manual,2020,Highly Relevant: The exact phrases or sentence...,Highly Relevant: The exact phrases or sentence...
2,5_1,Offerings 2020.pdf,1,RISK FACTORS,1.1,Risks related to the ADLER Group’s Business Ac...,1.1.2,Our ability to operate our business successful...,We face the risk that we may not be able to ge...,,raw_manual,2020,Highly Relevant: The exact phrases or sentence...,Highly Relevant: The exact phrases or sentence...


In [None]:
# Raw answers of the first configuration as an array, so the full,
# untruncated strings are visible.
df["Answer_Config_0"].values

array(['Highly Relevant: The exact phrases or sentences from the document that support your assessment; otherwise, leave blank.',
       'Highly Relevant: The exact phrases or sentences from the document that support your assessment; otherwise, leave blank.'],
      dtype=object)