In [1]:
from langchain.llms import HuggingFacePipeline
from pathlib import Path
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline

import sys
import os
import pandas as pd
import torch

# Put the parent of the notebook's working directory at the front of the
# import path so that project-local packages can be imported from this notebook.
notebook_dir = os.getcwd()
project_root = os.path.normpath(os.path.join(notebook_dir, os.pardir))
sys.path.insert(0, project_root)

from src.analysis.prospectus_analyzer import ProspectusAnalyzer

# Directory holding the input data files. The path is relative, so it resolves
# against the current working directory at the time it is used.
DATA_DIR = Path("../data")

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
from huggingface_hub import login
login(token="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")
!huggingface-cli whoami

pierrehogenhaug


In [None]:
# Initialize the Hugging Face LLM
model_id = "meta-llama/Llama-3.2-3B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(model_id)

# If the tokenizer defines no pad token, `tokenizer.pad_token_id` is None and
# copying it into the generation config has no effect. Reuse the EOS token as
# the pad token in that case so generation gets an explicit pad id.
if tokenizer.pad_token_id is None:
    tokenizer.pad_token = tokenizer.eos_token

# `model` is loaded with the default dtype; `model_2` is the same checkpoint
# loaded in half precision (float16).
model = AutoModelForCausalLM.from_pretrained(model_id)
model_2 = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.float16)

# Apply the pad token id to both models, not only the first one.
model.generation_config.pad_token_id = tokenizer.pad_token_id
model_2.generation_config.pad_token_id = tokenizer.pad_token_id


def build_text_generation_pipeline(causal_lm):
    """Create a text-generation pipeline around ``causal_lm``.

    Both pipelines in this notebook share the tokenizer and the same
    generation settings, so they are built in one place.

    Args:
        causal_lm: A loaded causal language model to wrap.

    Returns:
        The object returned by ``transformers.pipeline`` for the
        "text-generation" task.
    """
    return pipeline(
        "text-generation",
        model=causal_lm,
        tokenizer=tokenizer,
        max_length=2048,
        temperature=0.1,
    )


# Create the text-generation pipelines (default precision and float16)
pipe = build_text_generation_pipeline(model)
pipe_2 = build_text_generation_pipeline(model_2)

Loading checkpoint shards: 100%|██████████| 2/2 [00:21<00:00, 10.99s/it]
Loading checkpoint shards: 100%|██████████| 2/2 [00:30<00:00, 15.16s/it]


In [None]:
# Initialize the LLM with the pipeline
llm = HuggingFacePipeline(pipeline=pipe)

# Initialize the analyzer with the new LLM
analyzer = ProspectusAnalyzer(llm_model=llm)

# Load the data
raw_file_path = DATA_DIR / "prospectuses_data.csv"

# Fail loudly when the input is missing. Merely printing a message would leave
# `df_LLM` undefined (or stale from an earlier run), and the failure would only
# surface later as an unrelated error in a downstream cell.
if not raw_file_path.exists():
    raise FileNotFoundError(f"Raw data file not found: {raw_file_path}")

df_LLM = pd.read_csv(raw_file_path)

# Filter out rows that have "failed parsing" in the Section ID column.
# `.copy()` makes the result an independent frame so that adding or
# overwriting columns later does not operate on a slice of the raw frame.
df_LLM = df_LLM[df_LLM['Section ID'] != "failed parsing"].copy()

  llm = HuggingFacePipeline(pipeline=pipe)


In [6]:
# Report the floating-point precision each model was loaded with
print(f"Model data type (precision): {model.dtype}")
print(f"Model 2 data type (precision): {model_2.dtype}")

# First model: parameter count -> bytes -> MB.
# Bytes per element are taken from an empty tensor of the model's dtype.
param_size = sum(weights.numel() for weights in model.parameters())
param_memory = param_size * torch.empty(0, dtype=model.dtype).element_size()
param_memory_mb = param_memory / (1024 * 1024)

# Second model: same estimate
param_size2 = sum(weights.numel() for weights in model_2.parameters())
param_memory2 = param_size2 * torch.empty(0, dtype=model_2.dtype).element_size()
param_memory_mb2 = param_memory2 / (1024 * 1024)

print(f"Model parameter count: {param_size}")
print(f"Model parameter count 2: {param_size2}")

print(f"Estimated memory footprint: {param_memory_mb:.2f} MB")
print(f"Estimated memory footprint 2: {param_memory_mb2:.2f} MB")

Model data type (precision): torch.float32
Model 2 data type (precision): torch.float16
Model parameter count: 3212749824
Model parameter count 2: 3212749824
Estimated memory footprint: 12255.67 MB
Estimated memory footprint 2: 6127.83 MB


In [8]:
# Questions to ask, keyed by the DataFrame column that stores each answer
questions_market_dynamics = {
    "Market Dynamics - a": "Does the text mention that the company is exposed to risks associated with cyclical products?",
    "Market Dynamics - b": "Does the text mention risks related to demographic or structural trends affecting the market?",
    "Market Dynamics - c": "Does the text mention risks due to seasonal volatility in the industry?"
}

specified_columns = list(questions_market_dynamics)

# Create any missing answer column as empty strings, then store every
# answer column as str
for column_name in specified_columns:
    column_missing = column_name not in df_LLM.columns
    if column_missing:
        df_LLM[column_name] = ""
    df_LLM[column_name] = df_LLM[column_name].astype(str)

# For testing, only the first few rows are processed
num_rows_to_test = 3  # Adjust this number as needed

# Run every question against each test row, record the answer and echo it
rows_to_test = df_LLM.head(num_rows_to_test)
for index, row in rows_to_test.iterrows():
    print(f"\nProcessing row {index}...\n")
    for column_name in specified_columns:
        question = questions_market_dynamics[column_name]
        combined_answer = analyzer.analyze_row_single_question(row, question)
        df_LLM.at[index, column_name] = combined_answer
        print(f"Question: {question}")
        print(f"Answer: {combined_answer}\n")

# Display the DataFrame with the new columns
df_LLM.head(num_rows_to_test)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.



Processing row 0...



KeyboardInterrupt: 