<a href="https://colab.research.google.com/github/noodlesbug/minnat/blob/main/note1_finalWorkable.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [6]:
# Install necessary libraries
!pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
!pip install --no-deps "xformers<0.0.27" "trl<0.9.0" peft accelerate bitsandbytes datasets
# FIXED: FORCE UPGRADE TRANSFORMERS TO VERSION THAT SUPPORTS 'evaluation_strategy'
!pip install -U "transformers>=4.30.0"


# IMPROVED IMPORTS: MAKE SURE UNSLOTH IS IMPORTED FIRST
from unsloth import FastLanguageModel, is_bfloat16_supported
from unsloth.chat_templates import get_chat_template

import torch
import numpy as np
import pandas as pd
from datasets import load_dataset, Dataset, concatenate_datasets
from transformers import TrainingArguments, TextStreamer
from trl import SFTTrainer
import random  # ADDED FOR DATA SAMPLING

# Verify GPU availability
print("GPU available:", torch.cuda.is_available())
if torch.cuda.is_available():
    print("GPU name:", torch.cuda.get_device_name(0))
    print("GPU memory:", torch.cuda.get_device_properties(0).total_memory / 1e9, "GB")

# USING MULTIPLE YEARS INSTEAD OF JUST ONE YEAR
print("Loading datasets from multiple years...")
years = ["year_2006", "year_2007", "year_2008", "year_2009", "year_2010"]
combined_dataset = {"train": [], "validation": [], "test": []}

# COMBINE DATA FROM MULTIPLE YEARS TO INCREASE DATASET SIZE
for year in years:
    try:
        print(f"Loading {year}...")
        edgar_dataset = load_dataset("eloukas/edgar-corpus", year)
        for split in ["train", "validation", "test"]:
            if split in edgar_dataset:
                combined_dataset[split].extend(edgar_dataset[split])
    except Exception as e:
        print(f"Error loading {year}: {e}")

# Convert to Dataset objects
for split in combined_dataset:
    combined_dataset[split] = Dataset.from_list(combined_dataset[split])
    print(f"Combined {split} dataset size: {len(combined_dataset[split])}")

# IMPROVED DATA PREPROCESSING WITH MORE LENIENT FILTERS
def preprocess_dataset(dataset_split, min_section7_length=100, min_section8_length=50, max_length=8000):
    """
    Extract sections 7 and 8 and filter based on length constraints.
    MUCH MORE LENIENT FILTERING TO RETAIN MORE EXAMPLES
    """
    filtered_data = []

    for item in dataset_split:
        section7 = item.get('section_7', '')
        section8 = item.get('section_8', '')

        # LESS RESTRICTIVE FILTERING
        if (section7 and section8 and
            len(section7.strip()) >= min_section7_length and
            len(section8.strip()) >= min_section8_length and
            len(section7) <= max_length and
            len(section8) <= max_length):

            # CLEAN TEXT TO IMPROVE QUALITY
            section7 = section7.replace('\t', ' ').replace('\r', ' ')
            section8 = section8.replace('\t', ' ').replace('\r', ' ')

            filtered_data.append({
                'section_7': section7,
                'section_8': section8,
                'cik': item['cik'],
                'year': item['year']
            })

    return Dataset.from_list(filtered_data)

# MUCH MORE LENIENT FILTERS - LESS RESTRICTIVE MINIMUM LENGTHS
min_section7_length = 150  # LOWER MINIMUM LENGTH
min_section8_length = 100  # LOWER MINIMUM LENGTH
max_text_length = 8000     # INCREASED MAXIMUM LENGTH

# Process each split with more lenient constraints
train_dataset = preprocess_dataset(
    combined_dataset['train'],  # USE COMBINED DATASET
    min_section7_length=min_section7_length,
    min_section8_length=min_section8_length,
    max_length=max_text_length
)
val_dataset = preprocess_dataset(
    combined_dataset['validation'],  # USE COMBINED DATASET
    min_section7_length=min_section7_length,
    min_section8_length=min_section8_length,
    max_length=max_text_length
)
test_dataset = preprocess_dataset(
    combined_dataset['test'],  # USE COMBINED DATASET
    min_section7_length=min_section7_length,
    min_section8_length=min_section8_length,
    max_length=max_text_length
)

print(f"Processed train dataset size: {len(train_dataset)}")
print(f"Processed validation dataset size: {len(val_dataset)}")
print(f"Processed test dataset size: {len(test_dataset)}")

# CHECK IF WE HAVE ENOUGH DATA, IF NOT, PRINT WARNING
if len(train_dataset) < 50:
    print("WARNING: Training dataset is still very small. Consider further relaxing filters.")

# SAMPLE DATA TO CHECK QUALITY
random_sample = random.choice(train_dataset)
print("\nSAMPLE INPUT (truncated):")
print(random_sample['section_8'][:300] + "...\n")
print("\nSAMPLE OUTPUT (truncated):")
print(random_sample['section_7'][:300] + "...\n")

# Check some statistics about our data
train_input_lengths = [len(x['section_8']) for x in train_dataset]
train_target_lengths = [len(x['section_7']) for x in train_dataset]

print(f"Average input length: {np.mean(train_input_lengths):.2f} characters")
print(f"Average target length: {np.mean(train_target_lengths):.2f} characters")
print(f"Max input length: {np.max(train_input_lengths)} characters")
print(f"Max target length: {np.max(train_target_lengths)} characters")

# IMPROVED PROMPT FORMATTING WITH CLEARER INSTRUCTIONS
def format_prompt(example):
    """Format the example as a prompt with BETTER INSTRUCTIONS."""

    # IMPROVED SYSTEM PROMPT WITH MORE CONTEXT
    system_prompt = "You are a financial analyst who specializes in SEC filings. Your task is to convert financial statements to a comprehensive management's discussion and analysis that analyzes financial condition, changes in financial condition, and results of operations."

    # IMPROVED USER PROMPT WITH CLEARER TASK DESCRIPTION
    user_prompt = f"""Financial Statements (Section 8):
{example['section_8']}

Generate a detailed Management's Discussion and Analysis (Section 7) that thoroughly explains the financial results, trends, and business conditions reflected in these statements."""

    # The expected assistant response is the actual Section 7
    assistant_prompt = example['section_7']

    # Format based on Llama's chat template
    formatted_prompt = {
        "messages": [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt},
            {"role": "assistant", "content": assistant_prompt}
        ]
    }

    return formatted_prompt

# Apply formatting to datasets
train_dataset_formatted = train_dataset.map(format_prompt)
val_dataset_formatted = val_dataset.map(format_prompt)
test_dataset_formatted = test_dataset.map(format_prompt)

# USING A LARGER MODEL IF POSSIBLE, OTHERWISE STICK WITH TINYLLAMA
# CHECK AVAILABLE MEMORY TO DECIDE MODEL SIZE
gpu_memory = torch.cuda.get_device_properties(0).total_memory / 1e9 if torch.cuda.is_available() else 0
print(f"Available GPU memory: {gpu_memory:.2f} GB")

# CHOOSE MODEL BASED ON AVAILABLE MEMORY
if gpu_memory > 35:  # For A100 40GB or larger
    model_name = "meta-llama/Llama-2-7b-chat-hf"
    print(f"Using larger model: {model_name}")
elif gpu_memory > 15:  # For V100 16GB or similar
    model_name = "NousResearch/Nous-Hermes-llama-2-7b"
    print(f"Using medium-sized model: {model_name}")
else:  # For smaller GPUs
    model_name = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
    print(f"Using small model due to memory constraints: {model_name}")

# IMPROVED MODEL LOADING WITH BETTER CONFIGURATION
try:
    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name=model_name,
        max_seq_length=4096,
        load_in_4bit=True,    # Use 4-bit quantization to reduce memory usage
        # MORE OPTIMAL SETTINGS FOR STABILITY
        device_map="auto",    # Let the library handle device mapping
        attn_implementation="flash_attention_2" if torch.cuda.is_available() else None,
    )
except Exception as e:
    print(f"Error loading model {model_name}: {e}")
    print("Falling back to TinyLlama model...")
    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name="TinyLlama/TinyLlama-1.1B-Chat-v1.0",
        max_seq_length=4096,
        load_in_4bit=True,
        device_map="auto" if torch.cuda.is_available() else "cpu",
    )

# IMPROVED LORA CONFIGURATION FOR BETTER FINE-TUNING
model = FastLanguageModel.get_peft_model(
    model,
    r=16,               # Rank of LoRA adapters
    target_modules=[    # TARGET ALL PROJECTION MODULES
        "q_proj", "k_proj", "v_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj"
    ],
    lora_alpha=32,      # INCREASED ALPHA FOR BETTER ADAPTATION
    lora_dropout=0.05,  # SMALL DROPOUT FOR REGULARIZATION
    bias="none",
)

# Format datasets with the chat template
def format_with_chat_template(example):
    """Apply the chat template to format the prompt."""
    formatted = tokenizer.apply_chat_template(
        example["messages"],
        tokenize=False,
        add_generation_prompt=False
    )
    return {"text": formatted}

# Apply formatting to datasets
train_dataset_formatted = train_dataset_formatted.map(format_with_chat_template)
val_dataset_formatted = val_dataset_formatted.map(format_with_chat_template)

# IMPROVED TRAINING ARGUMENTS FOR BETTER RESULTS
# BETTER LEARNING RATE, WARMUP, AND SCHEDULING
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=5,                # SLIGHTLY MORE EPOCHS FOR BETTER LEARNING
    per_device_train_batch_size=1,     # Small batch size for memory constraints
    gradient_accumulation_steps=8,     # INCREASED ACCUMULATION STEPS FOR STABILITY
    warmup_ratio=0.1,                  # USE RATIO INSTEAD OF STEPS FOR BETTER SCALING
    learning_rate=1e-4,                # LOWER LEARNING RATE FOR STABILITY
    fp16=not is_bfloat16_supported(),  # Use fp16 if bfloat16 is not supported
    bf16=is_bfloat16_supported(),      # Use bfloat16 if supported
    logging_steps=5,
    optim="adamw_8bit",                # 8-bit optimizer for memory efficiency
    weight_decay=0.01,
    lr_scheduler_type="cosine",
    save_strategy="epoch",             # SAVE AT EACH EPOCH
    load_best_model_at_end=True,       # LOAD BEST MODEL AT END BASED ON EVAL
    eval_strategy="epoch" if len(val_dataset_formatted) > 0 else "no",
    report_to="none",                  # Disable wandb, tensorboard, etc.
)

# IMPROVED TRAINER WITH EVALUATION SUPPORT
trainer = SFTTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset_formatted,
    eval_dataset=val_dataset_formatted if len(val_dataset_formatted) > 0 else None,
    dataset_text_field="text",         # Field that contains the formatted prompt
    max_seq_length=4096,
    tokenizer=tokenizer,
    packing=False,                     # Don't pack sequences
)

# ADD CHECKPOINT CALLBACK TO SAVE INTERMEDIATE MODELS
# Train the model
print("Starting training...")
train_result = trainer.train()

# Save the fine-tuned model
trainer.save_model("./edgar_llama_model")

# IMPROVED GENERATION FUNCTION WITH BETTER PARAMETERS
def generate_prediction(example):
    """Generate a prediction with IMPROVED GENERATION PARAMETERS."""

    # Create the prompt with IMPROVED SYSTEM PROMPT
    system_prompt = "You are a financial analyst who specializes in SEC filings. Your task is to convert financial statements to a comprehensive management's discussion and analysis that analyzes financial condition, changes in financial condition, and results of operations."

    # BETTER USER PROMPT WITH CLEARER TASK SPECIFICATION
    user_prompt = f"""Financial Statements (Section 8):
{example['section_8']}

Generate a detailed Management's Discussion and Analysis (Section 7) that thoroughly explains the financial results, trends, and business conditions reflected in these statements."""

    # Format messages for the model
    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_prompt}
    ]

    # Apply chat template
    prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

    # Tokenize the prompt
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

    # IMPROVED GENERATION PARAMETERS
    #streamer = TextStreamer(tokenizer)
    output = model.generate(
        **inputs,
        max_new_tokens=2048,       # INCREASED TOKEN LIMIT FOR LONGER OUTPUTS
        #streamer=streamer,
        top_p=0.85,                # SLIGHTLY MORE FOCUSED SAMPLING
        top_k=50,                  # ADDED TOP-K SAMPLING
        temperature=0.7,
        do_sample=True,
        repetition_penalty=1.1,    # PREVENT REPETITIVE TEXT
        use_cache=True
    )

    # Decode and extract the response
    decoded_output = tokenizer.decode(output[0], skip_special_tokens=True)

    # IMPROVED RESPONSE EXTRACTION
    # Try to get text after assistant tag, fallback to full output if not found
    if '<assistant>' in decoded_output:
        response = decoded_output.split('<assistant>')[-1].strip()
    else:
        # Try alternative format or fall back to full output
        response = decoded_output

    return response

# IMPROVED TEST FUNCTION TO CHECK MULTIPLE EXAMPLES
# Test on more examples to get a better sense of model performance
def test_model_on_examples(test_dataset, num_examples=3):
    """Test the model on multiple examples."""
    if len(test_dataset) < num_examples:
        num_examples = len(test_dataset)
        print(f"Warning: Only {num_examples} examples available for testing.")

    examples = random.sample(list(range(len(test_dataset))), num_examples)

    for i, idx in enumerate(examples):
        print(f"\n\n========== TESTING MODEL ON EXAMPLE #{i+1} ==========")
        example = test_dataset[idx]

        print("Input (Section 8 excerpt):")
        print(example['section_8'][:500] + "...\n")

        print("Generating prediction...")
        prediction = generate_prediction(example)

        print("Predicted Section 7 (excerpt):")
        print(prediction[:500] + "...\n")

        print("Actual Section 7 (excerpt):")
        print(example['section_7'][:500] + "...\n")

        # ADD SIMPLE EVALUATION
        overlap = len(set(prediction.split()) & set(example['section_7'].split())) / len(set(example['section_7'].split()))
        print(f"Word overlap with actual text: {overlap:.2%}")

# Run the test function
if len(test_dataset) > 0:
    test_model_on_examples(test_dataset, num_examples=3)
else:
    print("No test examples available.")

# Save tokenizer and model configuration
tokenizer.save_pretrained("./edgar_llama_tokenizer")

print("Fine-tuning complete! The model adapters have been saved.")

# ADDED INTERACTIVE TEST FUNCTION
def interactive_test():
    """Allow the user to input their own Section 8 text for testing."""
    print("\n\n========== INTERACTIVE TESTING ==========")
    print("Enter your own Section 8 text to generate a Section 7 MD&A.")
    print("Type 'exit' to quit.")

    while True:
        user_input = input("\nEnter Section 8 text (or 'exit'): ")
        if user_input.lower() == 'exit':
            break

        example = {'section_8': user_input}
        print("\nGenerating prediction...")
        prediction = generate_prediction(example)

        print("\nPredicted Section 7:")
        print(prediction)

# Offer interactive testing
print("\nWould you like to test the model with your own input? (y/n)")
response = input()
if response.lower() == 'y':
    interactive_test()

print("\n\nTraining summary:")
print(f"Training dataset size: {len(train_dataset)}")
print(f"Validation dataset size: {len(val_dataset)}")
print(f"Test dataset size: {len(test_dataset)}")
print(f"Model used: {model_name}")
print(f"Number of epochs: {training_args.num_train_epochs}")
print("Fine-tuning complete!")

Collecting unsloth@ git+https://github.com/unslothai/unsloth.git (from unsloth[colab-new]@ git+https://github.com/unslothai/unsloth.git)
  Cloning https://github.com/unslothai/unsloth.git to /tmp/pip-install-6ssegblb/unsloth_85ea09c4b111472997af5d90c106d37b
  Running command git clone --filter=blob:none --quiet https://github.com/unslothai/unsloth.git /tmp/pip-install-6ssegblb/unsloth_85ea09c4b111472997af5d90c106d37b
  Resolved https://github.com/unslothai/unsloth.git to commit 0096445910418fe051d4b3eb0f866ee781344b76
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
GPU available: True
GPU name: NVIDIA A100-SXM4-40GB
GPU memory: 42.474471424 GB
Loading datasets from multiple years...
Loading year_2006...
Loading year_2007...
Loading year_2008...
Loading year_2009...
Loading year_2010...
Combined train dataset size: 35504
Combined validation dataset size: 4440
Combin

Map:   0%|          | 0/2880 [00:00<?, ? examples/s]

Map:   0%|          | 0/362 [00:00<?, ? examples/s]

Map:   0%|          | 0/368 [00:00<?, ? examples/s]

Available GPU memory: 42.47 GB
Using larger model: meta-llama/Llama-2-7b-chat-hf
==((====))==  Unsloth 2025.4.4: Fast Llama patching. Transformers: 4.51.3.
   \\   /|    NVIDIA A100-SXM4-40GB. Num GPUs = 1. Max memory: 39.557 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 8.0. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = TRUE. FA [Xformers = None. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


Map:   0%|          | 0/2880 [00:00<?, ? examples/s]

Map:   0%|          | 0/362 [00:00<?, ? examples/s]

Map:   0%|          | 0/2880 [00:00<?, ? examples/s]

Map:   0%|          | 0/362 [00:00<?, ? examples/s]

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 2,880 | Num Epochs = 5 | Total steps = 1,800
O^O/ \_/ \    Batch size per device = 1 | Gradient accumulation steps = 8
\        /    Data Parallel GPUs = 1 | Total batch size (1 x 8 x 1) = 8
 "-____-"     Trainable parameters = 39,976,960/7,000,000,000 (0.57% trained)


Starting training...
Unsloth: Will smartly offload gradients to save VRAM!


Epoch,Training Loss,Validation Loss
1,0.0006,2.050873
2,0.0007,2.070718
3,0.0005,2.076702
4,0.0007,2.08087
5,0.0005,2.079037


Unsloth: Not an error, but LlamaForCausalLM does not accept `num_items_in_batch`.
Using gradient accumulation will be very slightly less accurate.
Read more on gradient accumulation issues here: https://unsloth.ai/blog/gradient




Input (Section 8 excerpt):
ITEM 8. FINANCIAL STATEMENTS
AND SUPPLEMENTARY DATA
AEP, APCo, CSPCo, I&M, OPCo, PSO and SWEPCo. The information required by this item is incorporated herein by reference to the financial statements and financial statement schedules described under Item 15 herein.
ITEM 9....

Generating prediction...
Predicted Section 7 (excerpt):
[INST] <<SYS>>
You are a financial analyst who specializes in SEC filings. Your task is to convert financial statements to a comprehensive management's discussion and analysis that analyzes financial condition, changes in financial condition, and results of operations.
<</SYS>>

Financial Statements (Section 8):
ITEM 8. FINANCIAL STATEMENTS
AND SUPPLEMENTARY DATA
AEP, APCo, CSPCo, I&M, OPCo, PSO and SWEPCo. The information required by this item is incorporated herein by reference to the financia...

Actual Section 7 (excerpt):
ITEM 7. MANAGEMENT’S DISCUSSION AND ANALYSIS
OF FINANCIAL CONDITION
AND RESULTS OF OPERATION
CSPCo and I&