<a href="https://colab.research.google.com/github/noodlesbug/minnat/blob/main/note2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Improved SEC MD&A Generator with Enhanced Evaluation
# This code generates Management's Discussion and Analysis (Section 7) based on Financial Statements (Section 8)
# with better evaluation metrics and interactive testing functionality

# Install necessary libraries
# These install commands run every time this cell executes; comment them out once the environment is set up
!pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
!pip install --no-deps "xformers<0.0.27" "trl<0.9.0" peft accelerate bitsandbytes datasets
!pip install -U "transformers>=4.30.0"
!pip install rouge-score nltk scikit-learn

# Improved imports
from unsloth import FastLanguageModel, is_bfloat16_supported
from unsloth.chat_templates import get_chat_template

import torch
import numpy as np
import pandas as pd
from datasets import load_dataset, Dataset, concatenate_datasets
from transformers import TrainingArguments, TextStreamer
from trl import SFTTrainer
import random
import re
from rouge_score import rouge_scorer
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk
from datetime import datetime

# NLTK tokenizer data setup.
# The per-user NLTK cache is wiped first so that a stale or partially
# extracted tokenizer package cannot shadow a fresh download.
import shutil
import os
nltk_data_path = os.path.join(os.path.expanduser("~"), "nltk_data")
if os.path.exists(nltk_data_path):
    shutil.rmtree(nltk_data_path)

# Fetch each tokenizer resource once, only if it cannot already be found.
# 'punkt' is the classic package; 'punkt_tab' is the variant that recent NLTK
# releases load for nltk.word_tokenize (used by evaluate_prediction), so both
# are requested to work across NLTK versions.
for _tokenizer_resource in ("punkt", "punkt_tab"):
    try:
        nltk.data.find(f"tokenizers/{_tokenizer_resource}")
    except LookupError:
        nltk.download(_tokenizer_resource)

# Report whether CUDA is usable and, when it is, which device and how much memory it has
cuda_ready = torch.cuda.is_available()
print("GPU available:", cuda_ready)
if cuda_ready:
    device_props = torch.cuda.get_device_properties(0)
    print("GPU name:", torch.cuda.get_device_name(0))
    print("GPU memory:", device_props.total_memory / 1e9, "GB")

# Function to load and preprocess datasets
def load_edgar_datasets(years=None):
    """
    Load several yearly configurations of the EDGAR corpus and merge them per split.

    Args:
        years: Iterable of dataset configuration names (e.g. "year_2006").
            Defaults to the five configurations year_2006 .. year_2010.

    Returns:
        dict mapping "train", "validation" and "test" to a single Dataset
        holding the rows of every year that loaded successfully. A split for
        which nothing could be loaded maps to an empty Dataset.

    A year that fails to load is reported and skipped (best effort) so one
    bad download does not abort the whole run.
    """
    if years is None:
        years = ["year_2006", "year_2007", "year_2008", "year_2009", "year_2010"]

    print("Loading datasets from years:", years)
    split_names = ("train", "validation", "test")
    per_split_parts = {split: [] for split in split_names}

    # COMBINE DATA FROM MULTIPLE YEARS
    for year in years:
        try:
            print(f"Loading {year}...")
            edgar_dataset = load_dataset("eloukas/edgar-corpus", year)
        except Exception as e:
            print(f"Error loading {year}: {e}")
            continue

        for split in split_names:
            if split in edgar_dataset:
                # Keep the Dataset object itself; copying every row into a
                # Python list would pull the whole corpus into RAM.
                per_split_parts[split].append(edgar_dataset[split])

    # Merge the yearly pieces without materialising individual rows
    combined_dataset = {}
    for split in split_names:
        parts = per_split_parts[split]
        if parts:
            combined_dataset[split] = concatenate_datasets(parts)
        else:
            combined_dataset[split] = Dataset.from_list([])
        print(f"Combined {split} dataset size: {len(combined_dataset[split])}")

    return combined_dataset

# Function to preprocess the dataset with more lenient filters
def preprocess_dataset(dataset_split, min_section7_length=100, min_section8_length=50, max_length=8000):
    """
    Keep only filings whose Section 7 and Section 8 are both usable.

    A record is kept when both sections are non-empty, their stripped lengths
    reach the respective minimum, and their raw lengths do not exceed
    ``max_length``. Tabs and carriage returns in kept sections are replaced
    by spaces. Each output row carries section_7, section_8, cik and year.
    """
    # One-pass replacement of tab / carriage return with a space
    whitespace_map = str.maketrans({'\t': ' ', '\r': ' '})

    kept_rows = []
    for record in dataset_split:
        mdna_text = record.get('section_7', '')
        statements_text = record.get('section_8', '')

        # Guard clauses: discard the record as soon as one criterion fails
        if not mdna_text or not statements_text:
            continue
        if len(mdna_text.strip()) < min_section7_length:
            continue
        if len(statements_text.strip()) < min_section8_length:
            continue
        if len(mdna_text) > max_length or len(statements_text) > max_length:
            continue

        kept_rows.append({
            'section_7': mdna_text.translate(whitespace_map),
            'section_8': statements_text.translate(whitespace_map),
            'cik': record['cik'],
            'year': record['year'],
        })

    return Dataset.from_list(kept_rows)

# IMPROVED PROMPT FORMATTING
def format_prompt(example):
    """Build the three-turn chat sample (system, user, assistant) for one filing.

    The user turn embeds the filing's Section 8 text; the assistant turn is
    the filing's real Section 7, which serves as the training target.
    Returns a dict with a single "messages" key.
    """
    system_text = (
        "You are a financial analyst who specializes in SEC filings. "
        "Your task is to convert financial statements to a comprehensive "
        "management's discussion and analysis that analyzes financial condition, "
        "changes in financial condition, and results of operations."
    )

    user_text = (
        "Financial Statements (Section 8):\n"
        f"{example['section_8']}\n"
        "\n"
        "Generate a detailed Management's Discussion and Analysis (Section 7) that "
        "thoroughly explains the financial results, trends, and business conditions "
        "reflected in these statements. Only output the Section 7 content, with no "
        "additional commentary."
    )

    turns = [
        ("system", system_text),
        ("user", user_text),
        ("assistant", example['section_7']),  # ground-truth answer
    ]
    return {"messages": [{"role": role, "content": content} for role, content in turns]}

# Model configuration and loading
def setup_model(gpu_memory=None):
    """Load a 4-bit base model sized to the available GPU memory and attach LoRA adapters.

    Args:
        gpu_memory: Available GPU memory in GB. When None it is read from
            CUDA device 0, or treated as 0.0 if CUDA is unavailable so that
            the smallest model is selected instead of failing on a None value.

    Returns:
        (model, tokenizer) with LoRA adapters applied to the model.

    If the selected model cannot be loaded, the error is printed and the
    TinyLlama chat model is loaded instead.
    """

    if gpu_memory is None:
        if torch.cuda.is_available():
            gpu_memory = torch.cuda.get_device_properties(0).total_memory / 1e9
        else:
            # No GPU detected: fall through to the smallest model below
            gpu_memory = 0.0

    print(f"Available GPU memory: {gpu_memory:.2f} GB")

    # CHOOSE MODEL BASED ON AVAILABLE MEMORY
    if gpu_memory > 35:  # For A100 40GB or larger
        model_name = "meta-llama/Llama-2-7b-chat-hf"
        print(f"Using larger model: {model_name}")
    elif gpu_memory > 15:  # For V100 16GB or similar
        model_name = "NousResearch/Nous-Hermes-llama-2-7b"
        print(f"Using medium-sized model: {model_name}")
    else:  # For smaller GPUs
        model_name = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
        print(f"Using small model due to memory constraints: {model_name}")

    try:
        model, tokenizer = FastLanguageModel.from_pretrained(
            model_name=model_name,
            max_seq_length=4096,
            load_in_4bit=True,    # Use 4-bit quantization to reduce memory usage
            device_map="auto",    # Let the library handle device mapping
            attn_implementation="flash_attention_2" if torch.cuda.is_available() else None,
        )
    except Exception as e:
        # Best effort: any load failure (gated repo, OOM, missing kernels)
        # falls back to the small model rather than aborting.
        print(f"Error loading model {model_name}: {e}")
        print("Falling back to TinyLlama model...")
        model, tokenizer = FastLanguageModel.from_pretrained(
            model_name="TinyLlama/TinyLlama-1.1B-Chat-v1.0",
            max_seq_length=4096,
            load_in_4bit=True,
            device_map="auto" if torch.cuda.is_available() else "cpu",
        )

    # IMPROVED LORA CONFIGURATION
    model = FastLanguageModel.get_peft_model(
        model,
        r=16,               # Rank of LoRA adapters
        target_modules=[    # TARGET ALL PROJECTION MODULES
            "q_proj", "k_proj", "v_proj", "o_proj",
            "gate_proj", "up_proj", "down_proj"
        ],
        lora_alpha=32,      # INCREASED ALPHA FOR BETTER ADAPTATION
        lora_dropout=0.05,  # SMALL DROPOUT FOR REGULARIZATION
        bias="none",
    )

    return model, tokenizer

# Function to format datasets with the chat template
def format_with_chat_template(example, tokenizer):
    """Render an example's chat messages into one training string.

    The tokenizer's chat template is applied without tokenizing and without
    a trailing generation prompt. Returns ``{"text": <rendered string>}``.
    """
    template_options = {"tokenize": False, "add_generation_prompt": False}
    rendered = tokenizer.apply_chat_template(example["messages"], **template_options)
    return dict(text=rendered)

# IMPROVED TRAINING ARGUMENTS
def get_training_args(val_dataset):
    """Build the TrainingArguments used for fine-tuning.

    Args:
        val_dataset: Validation dataset; only its length is inspected.

    Returns:
        TrainingArguments that evaluate once per epoch and reload the best
        checkpoint at the end when validation data exists. With an empty
        validation set, both evaluation and best-model reloading are
        disabled: reloading the best model requires evaluation, and the
        trainer rejects save/eval strategies that do not match.
    """
    has_validation = len(val_dataset) > 0

    training_args = TrainingArguments(
        output_dir="./results",
        num_train_epochs=5,                # SLIGHTLY MORE EPOCHS FOR BETTER LEARNING
        per_device_train_batch_size=1,     # Small batch size for memory constraints
        gradient_accumulation_steps=8,     # INCREASED ACCUMULATION STEPS FOR STABILITY
        warmup_ratio=0.1,                  # USE RATIO INSTEAD OF STEPS FOR BETTER SCALING
        learning_rate=1e-4,                # LOWER LEARNING RATE FOR STABILITY
        fp16=not is_bfloat16_supported(),  # Use fp16 if bfloat16 is not supported
        bf16=is_bfloat16_supported(),      # Use bfloat16 if supported
        logging_steps=5,
        optim="adamw_8bit",                # 8-bit optimizer for memory efficiency
        weight_decay=0.01,
        lr_scheduler_type="cosine",
        save_strategy="epoch",             # SAVE AT EACH EPOCH
        load_best_model_at_end=has_validation,  # Only meaningful when evaluation runs
        eval_strategy="epoch" if has_validation else "no",
        report_to="none",                  # Disable wandb, tensorboard, etc.
    )

    return training_args

# Train the model
def train_model(model, tokenizer, train_dataset, val_dataset):
    """Fine-tune ``model`` on the training split and persist the result.

    Both splits are first turned into chat messages and then rendered to text
    with the tokenizer's chat template. The trained model is written to
    ./edgar_llama_model and the tokenizer to ./edgar_llama_tokenizer.

    Returns:
        (model, tokenizer, training_duration) where training_duration is the
        wall-clock time spent in trainer.train().
    """

    def _render(row):
        # Bind the tokenizer so the function can be handed to Dataset.map
        return format_with_chat_template(row, tokenizer)

    # Stage 1: raw sections -> chat messages
    train_messages = train_dataset.map(format_prompt)
    val_messages = val_dataset.map(format_prompt)

    # Stage 2: chat messages -> single "text" field
    train_text = train_messages.map(_render)
    val_text = val_messages.map(_render)

    # Training configuration depends on whether validation data exists
    training_args = get_training_args(val_text)

    trainer = SFTTrainer(
        model=model,
        args=training_args,
        train_dataset=train_text,
        eval_dataset=None if len(val_text) == 0 else val_text,
        dataset_text_field="text",         # Column holding the rendered prompt
        max_seq_length=4096,
        tokenizer=tokenizer,
        packing=False,                     # One example per sequence
    )

    # Run training and time it
    print("Starting training...")
    started_at = datetime.now()
    trainer.train()
    training_duration = datetime.now() - started_at
    print(f"Training completed in {training_duration}")

    # Persist the fine-tuned weights and the tokenizer
    trainer.save_model("./edgar_llama_model")
    tokenizer.save_pretrained("./edgar_llama_tokenizer")

    return model, tokenizer, training_duration

# IMPROVED GENERATION FUNCTION
def generate_section7(model, tokenizer, example):
    """Generate Section 7 text from an example's Section 8.

    Args:
        model: Causal language model exposing ``generate`` and ``device``.
        tokenizer: Tokenizer providing ``apply_chat_template`` and ``decode``.
        example: Mapping with a 'section_8' entry holding the financial statements.

    Returns:
        The generated text, stripped of surrounding whitespace.

    Only the tokens produced after the prompt are decoded. Decoding the full
    sequence and stripping the prompt by pattern matching only works for
    templates that use [INST] markers; with other chat templates the prompt
    survived and the "Financial Statements (Section 8):" split then threw
    the whole answer away.
    """

    # Create the prompt with IMPROVED SYSTEM PROMPT
    system_prompt = "You are a financial analyst who specializes in SEC filings. Your task is to convert financial statements to a comprehensive management's discussion and analysis that analyzes financial condition, changes in financial condition, and results of operations."

    # BETTER USER PROMPT WITH CLEARER TASK SPECIFICATION
    user_prompt = f"""Financial Statements (Section 8):
{example['section_8']}

Generate a detailed Management's Discussion and Analysis (Section 7) that thoroughly explains the financial results, trends, and business conditions reflected in these statements. Only output the Section 7 content, with no additional commentary."""

    # Format messages for the model
    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_prompt}
    ]

    # Apply chat template
    prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

    # Tokenize the prompt
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    prompt_token_count = inputs["input_ids"].shape[1]

    # Generate with improved parameters
    output = model.generate(
        **inputs,
        max_new_tokens=2048,       # INCREASED TOKEN LIMIT FOR LONGER OUTPUTS
        top_p=0.85,                # SLIGHTLY MORE FOCUSED SAMPLING
        top_k=50,                  # ADDED TOP-K SAMPLING
        temperature=0.7,
        do_sample=True,
        repetition_penalty=1.1,    # PREVENT REPETITIVE TEXT
        use_cache=True
    )

    # The returned sequence starts with the prompt tokens; keep only what follows
    generated_tokens = output[0][prompt_token_count:]
    response = tokenizer.decode(generated_tokens, skip_special_tokens=True)

    # Defensive cleanup in case the model echoes template markers itself
    if '<assistant>' in response:
        response = response.split('<assistant>')[-1].strip()

    # Remove any [INST] tags and system prompts
    response = re.sub(r'\[INST\].*?\[/INST\]', '', response, flags=re.DOTALL)

    # Clean up any remaining template artifacts
    response = re.sub(r'<<SYS>>.*?<</SYS>>', '', response, flags=re.DOTALL)

    # If the model starts repeating the user prompt, keep only what came before it
    if "Financial Statements (Section 8):" in response:
        response = response.split("Financial Statements (Section 8):")[0]

    # Clean up any final formatting issues
    response = response.strip()

    return response

# ENHANCED EVALUATION METRICS
def evaluate_prediction(prediction, reference):
    """
    Score a generated Section 7 against the real one.

    Returns a dict with, in this order: word_overlap, bleu_score, rouge1_f,
    rouge2_f, rougeL_f, cosine_similarity (all scaled by 100) and
    length_ratio (character length of the prediction as a percentage of the
    reference length). BLEU and cosine similarity fall back to 0 and print
    the error if their computation raises.
    """
    scores = {}

    # Lower-cased word tokens shared by the overlap and BLEU computations
    hyp_tokens = nltk.word_tokenize(prediction.lower())
    ref_tokens = nltk.word_tokenize(reference.lower())

    # Share of distinct reference words that also occur in the prediction
    ref_vocab = set(ref_tokens)
    if ref_vocab:
        shared_vocab = set(hyp_tokens) & ref_vocab
        scores['word_overlap'] = len(shared_vocab) / len(ref_vocab) * 100
    else:
        scores['word_overlap'] = 0

    # Sentence-level BLEU, smoothed so short texts do not collapse to zero
    smoothing = SmoothingFunction().method1
    try:
        scores['bleu_score'] = 100 * sentence_bleu(
            [ref_tokens], hyp_tokens, smoothing_function=smoothing
        )
    except Exception as e:
        print(f"Error calculating BLEU score: {e}")
        scores['bleu_score'] = 0

    # ROUGE F-measures
    rouge_variants = ['rouge1', 'rouge2', 'rougeL']
    rouge_result = rouge_scorer.RougeScorer(rouge_variants, use_stemmer=True).score(reference, prediction)
    for variant in rouge_variants:
        scores[f'{variant}_f'] = rouge_result[variant].fmeasure * 100

    # Cosine similarity between TF-IDF vectors fitted on just these two texts
    try:
        tfidf = TfidfVectorizer().fit_transform([prediction, reference])
        cosine = cosine_similarity(tfidf[0:1], tfidf[1:2])[0][0]
        scores['cosine_similarity'] = cosine * 100
    except Exception as e:
        print(f"Error calculating cosine similarity: {e}")
        scores['cosine_similarity'] = 0

    # Prediction length relative to the reference (guarding against empty reference)
    reference_chars = max(1, len(reference))
    scores['length_ratio'] = (len(prediction) / reference_chars) * 100

    return scores

# Test the model on multiple examples
def test_model_on_examples(model, tokenizer, test_dataset, num_examples=3):
    """Test the model on multiple examples with enhanced metrics."""
    all_metrics = []

    if len(test_dataset) < num_examples:
        num_examples = len(test_dataset)
        print(f"Warning: Only {num_examples} examples available for testing.")

    examples = random.sample(list(range(len(test_dataset))), num_examples)

    for i, idx in enumerate(examples):
        print(f"\n\n========== TESTING MODEL ON EXAMPLE #{i+1} ==========")
        example = test_dataset[idx]

        print("Input (Section 8 excerpt):")
        print(example['section_8'][:500] + "...\n")

        print("Generating prediction...")
        prediction = generate_section7(model, tokenizer, example)

        print("Predicted Section 7 (excerpt):")
        print(prediction[:500] + "...\n")

        print("Actual Section 7 (excerpt):")
        print(example['section_7'][:500] + "...\n")

        # Evaluate with enhanced metrics
        metrics = evaluate_prediction(prediction, example['section_7'])
        all_metrics.append(metrics)

        print("Evaluation Metrics:")
        print(f"Word Overlap: {metrics['word_overlap']:.2f}%")
        print(f"BLEU Score: {metrics['bleu_score']:.2f}")
        print(f"ROUGE-1 F1: {metrics['rouge1_f']:.2f}")
        print(f"ROUGE-2 F1: {metrics['rouge2_f']:.2f}")
        print(f"ROUGE-L F1: {metrics['rougeL_f']:.2f}")
        print(f"Cosine Similarity: {metrics['cosine_similarity']:.2f}%")
        print(f"Length Ratio: {metrics['length_ratio']:.2f}%")

    # Calculate and report average metrics
    if all_metrics:
        print("\n========== AVERAGE METRICS ==========")
        avg_metrics = {key: np.mean([m[key] for m in all_metrics]) for key in all_metrics[0].keys()}
        for key, value in avg_metrics.items():
            print(f"Average {key}: {value:.2f}")

    return all_metrics

# ENHANCED INTERACTIVE TEST
def interactive_test_with_cik(model, tokenizer, dataset, cik=None, year=None):
    """
    Interactively look up filings by CIK and year, generate Section 7 and score it.

    Args:
        model, tokenizer: Passed through to generate_section7.
        dataset: Iterable of records with 'cik', 'year', 'section_7' and
            'section_8' entries.
        cik, year: Optional values for the first lookup; when None the user
            is prompted. Both are reset after each lookup.

    The loop ends when the user types 'exit' at a prompt or declines to try
    another company.

    CIK and year are matched numerically as well as by plain equality, so a
    record that stores them as strings (e.g. "2006") still matches the
    integer the user typed.
    """

    def _as_int(value):
        # Numeric interpretation of value, or None when it has none
        try:
            return int(str(value).strip())
        except (TypeError, ValueError):
            return None

    def _same_id(stored, wanted):
        # Plain equality first (keeps every match the == comparison found),
        # then a numeric comparison to bridge str/int differences.
        if stored == wanted:
            return True
        stored_number = _as_int(stored)
        return stored_number is not None and stored_number == _as_int(wanted)

    print("\n\n========== INTERACTIVE TESTING WITH CIK AND YEAR ==========")
    print("You can test the model with specific company CIK and year.")

    while True:
        # If not provided, ask for CIK
        if cik is None:
            cik_input = input("\nEnter company CIK (or 'exit' to quit): ")
            if cik_input.strip().lower() == 'exit':
                break
            try:
                cik = int(cik_input.strip())
            except ValueError:
                print("CIK should be a numeric value. Please try again.")
                continue

        # If not provided, ask for year
        if year is None:
            year_input = input("Enter year (2006-2010 or 'exit'): ")
            if year_input.strip().lower() == 'exit':
                break
            try:
                year = int(year_input.strip())
                if year < 2006 or year > 2010:
                    print("Year should be between 2006 and 2010. Please try again.")
                    year = None
                    continue
            except ValueError:
                print("Year should be a numeric value. Please try again.")
                continue

        # Search for matching records
        matching_records = [
            item for item in dataset
            if _same_id(item['cik'], cik) and _same_id(item['year'], year)
        ]

        if not matching_records:
            print(f"No data found for CIK {cik} in year {year}.")
        else:
            print(f"\nFound {len(matching_records)} record(s) for CIK {cik} in year {year}.")
            for i, record in enumerate(matching_records):
                print(f"\n--- Record {i+1} ---")

                # Get the Section 8 content
                section8 = record['section_8']
                print("Section 8 (excerpt):")
                print(section8[:300] + "...\n")

                # Generate Section 7
                example = {'section_8': section8}
                print("Generating Section 7...")
                prediction = generate_section7(model, tokenizer, example)

                print("Generated Section 7 (excerpt):")
                print(prediction[:300] + "...\n")

                # Compare with actual Section 7
                actual_section7 = record['section_7']
                print("Actual Section 7 (excerpt):")
                print(actual_section7[:300] + "...\n")

                # Calculate and display metrics
                metrics = evaluate_prediction(prediction, actual_section7)
                print("Evaluation Metrics:")
                print(f"Word Overlap: {metrics['word_overlap']:.2f}%")
                print(f"BLEU Score: {metrics['bleu_score']:.2f}")
                print(f"ROUGE-1 F1: {metrics['rouge1_f']:.2f}")
                print(f"ROUGE-2 F1: {metrics['rouge2_f']:.2f}")
                print(f"ROUGE-L F1: {metrics['rougeL_f']:.2f}")
                print(f"Cosine Similarity: {metrics['cosine_similarity']:.2f}%")
                print(f"Length Ratio: {metrics['length_ratio']:.2f}%")

                # Ask if user wants to see full text
                see_full = input("\nWould you like to see the full generated Section 7? (y/n): ")
                if see_full.lower() == 'y':
                    print("\n--- FULL GENERATED SECTION 7 ---")
                    print(prediction)

                see_actual = input("\nWould you like to see the full actual Section 7? (y/n): ")
                if see_actual.lower() == 'y':
                    print("\n--- FULL ACTUAL SECTION 7 ---")
                    print(actual_section7)

        # Reset for next iteration so the user is prompted again
        cik = None
        year = None

        # Ask if the user wants to try another company
        another = input("\nWould you like to try another company? (y/n): ")
        if another.lower() != 'y':
            break


Collecting unsloth@ git+https://github.com/unslothai/unsloth.git (from unsloth[colab-new]@ git+https://github.com/unslothai/unsloth.git)
  Cloning https://github.com/unslothai/unsloth.git to /tmp/pip-install-bsj9i7k5/unsloth_b17baf945e4b4505bef15920d1bbb362
  Running command git clone --filter=blob:none --quiet https://github.com/unslothai/unsloth.git /tmp/pip-install-bsj9i7k5/unsloth_b17baf945e4b4505bef15920d1bbb362
  Resolved https://github.com/unslothai/unsloth.git to commit 2ba60522600bb2cebcefd39d4516b54d99f4ad70
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting unsloth_zoo>=2025.4.4 (from unsloth@ git+https://github.com/unslothai/unsloth.git->unsloth[colab-new]@ git+https://github.com/unslothai/unsloth.git)
  Downloading unsloth_zoo-2025.4.4-py3-none-any.whl.metadata (8.0 kB)
Collecting tyro (from unsloth@ git+https://github.com/unslothai/unsloth.git

Collecting xformers<0.0.27
  Downloading xformers-0.0.26.post1-cp311-cp311-manylinux2014_x86_64.whl.metadata (1.0 kB)
Collecting trl<0.9.0
  Downloading trl-0.8.6-py3-none-any.whl.metadata (11 kB)
Downloading xformers-0.0.26.post1-cp311-cp311-manylinux2014_x86_64.whl (222.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m222.8/222.8 MB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading trl-0.8.6-py3-none-any.whl (245 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m245.2/245.2 kB[0m [31m22.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: xformers, trl
  Attempting uninstall: trl
    Found existing installation: trl 0.15.2
    Uninstalling trl-0.15.2:
      Successfully uninstalled trl-0.15.2
Successfully installed trl-0.8.6 xformers-0.0.26.post1
Collecting rouge-score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: rouge-score
  

    PyTorch 2.3.0+cu121 with CUDA 1201 (you have 2.6.0+cu124)
    Python  3.11.9 (you have 3.11.12)
  Please reinstall xformers (see https://github.com/facebookresearch/xformers#installing-xformers)
  Memory-efficient attention, SwiGLU, sparse and more won't be available.
  Set XFORMERS_MORE_DETAILS=1 for more details


🦥 Unsloth Zoo will now patch everything to make training faster!


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


GPU available: True
GPU name: NVIDIA A100-SXM4-40GB
GPU memory: 42.474471424 GB


In [None]:
# Main function to tie everything together
def main():
    """Run the full workflow: load data, set up the model, then train and/or test.

    The user is asked interactively whether to train, whether to test on
    random examples (and how many), and whether to start the CIK/year
    lookup. A non-numeric answer to the example count skips random testing
    instead of aborting the session.
    """
    # Load datasets
    combined_dataset = load_edgar_datasets()

    # Preprocess datasets with more lenient constraints
    min_section7_length = 150
    min_section8_length = 100
    max_text_length = 8000

    train_dataset = preprocess_dataset(
        combined_dataset['train'],
        min_section7_length=min_section7_length,
        min_section8_length=min_section8_length,
        max_length=max_text_length
    )
    val_dataset = preprocess_dataset(
        combined_dataset['validation'],
        min_section7_length=min_section7_length,
        min_section8_length=min_section8_length,
        max_length=max_text_length
    )
    test_dataset = preprocess_dataset(
        combined_dataset['test'],
        min_section7_length=min_section7_length,
        min_section8_length=min_section8_length,
        max_length=max_text_length
    )

    print(f"Processed train dataset size: {len(train_dataset)}")
    print(f"Processed validation dataset size: {len(val_dataset)}")
    print(f"Processed test dataset size: {len(test_dataset)}")

    # Check for sufficient data
    if len(train_dataset) < 50:
        print("WARNING: Training dataset is still very small. Consider further relaxing filters.")

    # Setup model
    model, tokenizer = setup_model()

    # OPTION 1: Train the model
    train_model_option = input("Do you want to train the model? (y/n): ")
    if train_model_option.lower() == 'y':
        # train_model prints the duration itself, so it is not kept here
        model, tokenizer, _ = train_model(model, tokenizer, train_dataset, val_dataset)
    else:
        print("Skipping training phase.")
        try:
            # Try to load the model if available
            print("Attempting to load saved model...")
            model, tokenizer = FastLanguageModel.from_pretrained(
                model_name="./edgar_llama_model",
                max_seq_length=4096,
                load_in_4bit=True,
                device_map="auto" if torch.cuda.is_available() else "cpu",
            )
            print("Model loaded successfully.")
        except Exception as e:
            # Best effort: keep the freshly set-up model when no saved one can be loaded
            print(f"Error loading saved model: {e}")
            print("Proceeding with untrained model. Results may not be optimal.")

    # OPTION 2: Test on random examples
    test_option = input("Do you want to test the model on random examples? (y/n): ")
    if test_option.lower() == 'y':
        count_input = input("How many examples to test? ")
        try:
            num_examples = int(count_input.strip())
        except ValueError:
            # A typo here must not throw away a session that may include hours of training
            print(f"'{count_input}' is not a whole number; skipping random example testing.")
            num_examples = 0
        if num_examples > 0:
            test_model_on_examples(model, tokenizer, test_dataset, num_examples=num_examples)

    # OPTION 3: Interactive testing with CIK and year
    interactive_option = input("Do you want to use interactive testing with specific CIK and year? (y/n): ")
    if interactive_option.lower() == 'y':
        # Create a merged dataset for searching
        all_data = []
        all_data.extend(train_dataset)
        all_data.extend(val_dataset)
        all_data.extend(test_dataset)
        interactive_test_with_cik(model, tokenizer, all_data)

    print("\nAll tests completed.")
    print("\nTraining and evaluation summary:")
    print(f"Training dataset size: {len(train_dataset)}")
    print(f"Validation dataset size: {len(val_dataset)}")
    print(f"Test dataset size: {len(test_dataset)}")

# Run the main function when the script is executed directly
if __name__ == "__main__":
    main()

Loading datasets from years: ['year_2006', 'year_2007', 'year_2008', 'year_2009', 'year_2010']
Loading year_2006...


README.md:   0%|          | 0.00/43.7k [00:00<?, ?B/s]

edgar-corpus.py:   0%|          | 0.00/4.64k [00:00<?, ?B/s]

The repository for eloukas/edgar-corpus contains custom code which must be executed to correctly load the dataset. You can inspect the repository content at https://hf.co/datasets/eloukas/edgar-corpus.
You can avoid this prompt in future by passing the argument `trust_remote_code=True`.

Do you wish to run the custom code? [y/N] y


2006/train.jsonl:   0%|          | 0.00/1.30G [00:00<?, ?B/s]

2006/test.jsonl:   0%|          | 0.00/165M [00:00<?, ?B/s]

2006/validate.jsonl:   0%|          | 0.00/163M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/7064 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/883 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/883 [00:00<?, ? examples/s]

Loading year_2007...


2007/train.jsonl:   0%|          | 0.00/1.31G [00:00<?, ?B/s]

2007/test.jsonl:   0%|          | 0.00/158M [00:00<?, ?B/s]

2007/validate.jsonl:   0%|          | 0.00/169M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/6683 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/836 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/835 [00:00<?, ? examples/s]

Loading year_2008...


2008/train.jsonl:   0%|          | 0.00/1.54G [00:00<?, ?B/s]

2008/test.jsonl:   0%|          | 0.00/190M [00:00<?, ?B/s]

2008/validate.jsonl:   0%|          | 0.00/192M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/7408 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/927 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/926 [00:00<?, ? examples/s]

Loading year_2009...


2009/train.jsonl:   0%|          | 0.00/1.56G [00:00<?, ?B/s]

2009/test.jsonl:   0%|          | 0.00/199M [00:00<?, ?B/s]

2009/validate.jsonl:   0%|          | 0.00/191M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/7336 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/917 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/917 [00:00<?, ? examples/s]

Loading year_2010...


2010/train.jsonl:   0%|          | 0.00/1.51G [00:00<?, ?B/s]

2010/test.jsonl:   0%|          | 0.00/194M [00:00<?, ?B/s]

2010/validate.jsonl:   0%|          | 0.00/195M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/7013 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/877 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/877 [00:00<?, ? examples/s]

Combined train dataset size: 35504
Combined validation dataset size: 4440
Combined test dataset size: 4438
Processed train dataset size: 2880
Processed validation dataset size: 362
Processed test dataset size: 368
Available GPU memory: 42.47 GB
Using larger model: meta-llama/Llama-2-7b-chat-hf
==((====))==  Unsloth 2025.4.5: Fast Llama patching. Transformers: 4.51.3.
   \\   /|    NVIDIA A100-SXM4-40GB. Num GPUs = 1. Max memory: 39.557 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 8.0. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = TRUE. FA [Xformers = None. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/3.87G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/183 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.79k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/438 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

Unsloth: Dropout = 0 is supported for fast patching. You are using dropout = 0.05.
Unsloth will patch all other layers, except LoRA matrices, causing a performance hit.
Unsloth 2025.4.5 patched 32 layers with 0 QKV layers, 0 O layers and 0 MLP layers.


Do you want to train the model? (y/n): y


Map:   0%|          | 0/2880 [00:00<?, ? examples/s]

Map:   0%|          | 0/362 [00:00<?, ? examples/s]

Map:   0%|          | 0/2880 [00:00<?, ? examples/s]

Map:   0%|          | 0/362 [00:00<?, ? examples/s]

Map:   0%|          | 0/2880 [00:00<?, ? examples/s]

Map:   0%|          | 0/362 [00:00<?, ? examples/s]

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 2,880 | Num Epochs = 5 | Total steps = 1,800
O^O/ \_/ \    Batch size per device = 1 | Gradient accumulation steps = 8
\        /    Data Parallel GPUs = 1 | Total batch size (1 x 8 x 1) = 8
 "-____-"     Trainable parameters = 39,976,960/7,000,000,000 (0.57% trained)


Starting training...
Unsloth: Will smartly offload gradients to save VRAM!


Epoch,Training Loss,Validation Loss
