In [None]:
!pip install datasets
!pip install transformers
!pip install peft
!pip install ipywidgets
!jupyter labextension install @jupyter-widgets/jupyterlab-manager

In [None]:
import os
from getpass import getpass

from huggingface_hub import login

# SECURITY: never commit an access token to the notebook. The token is read
# from the HF_TOKEN environment variable, falling back to an interactive
# prompt that does not echo the input.
# A token that was previously embedded in this cell must be treated as leaked
# and revoked in the Hugging Face account settings.
login(os.environ.get("HF_TOKEN") or getpass("Hugging Face access token: "))

In [None]:
import json
import os
import pandas as pd
import torch
from datasets import Dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling
)
from peft import LoraConfig, get_peft_model
from typing import Dict, List

In [None]:
class SECDataProcessor:
    """Turn 10-K JSON files and a Q&A CSV into plain-text ``Dataset`` objects.

    Every example is a single string in which fields are introduced by the
    marker strings stored in ``self.special_tokens``.

    Args:
        data_dir: Directory that holds one ``.json`` file per filing.
        eval_file: Path to a CSV with ``Company``, ``Question`` and ``Answer``
            columns (the columns read by ``prepare_eval_data``).
    """

    def __init__(self, data_dir: str, eval_file: str):
        self.data_dir = data_dir
        self.eval_file = eval_file
        # Marker strings placed in front of each field of an example.
        self.special_tokens = {
            "company_start": "<|company|>",
            "ticker_start": "<|ticker|>",
            "filing_start": "<|filing|>",
            "item_start": "<|item|>",
            "question_start": "<|question|>",
            "answer_start": "<|answer|>",
            "sep": "<|sep|>"
        }

    def process_10k_file(self, file_path: str) -> str:
        """Flatten a single 10-K JSON file into one marked-up string.

        Args:
            file_path: Path to the JSON file.

        Returns:
            The company, cik and filing-type fields (empty string when a key
            is absent) followed by every ``item_1`` .. ``item_7`` entry that
            exists in the file, each prefixed by its marker.
        """
        # Explicit encoding so the result does not depend on the platform default.
        with open(file_path, 'r', encoding='utf-8') as f:
            data = json.load(f)

        tokens = self.special_tokens
        # NOTE(review): the ticker marker is followed by the `cik` field -
        # confirm that this pairing is intended.
        parts = [
            f"{tokens['company_start']}{data.get('company', '')}",
            f"{tokens['ticker_start']}{data.get('cik', '')}",
            f"{tokens['filing_start']}{data.get('filing_type', '')}",
        ]

        # Add items (you can add more items as needed). Only the keys
        # item_1 .. item_7 are looked up; any other key is ignored.
        for item_num in range(1, 8):
            item_key = f'item_{item_num}'
            if item_key in data:
                parts.append(f"{tokens['item_start']}{item_key}: {data[item_key]}")

        return "".join(parts)

    def prepare_training_data(self) -> Dataset:
        """Build the training dataset from every ``.json`` file in ``data_dir``.

        Returns:
            A ``Dataset`` with a single ``text`` column, one row per file,
            ordered by file name.
        """
        texts = []

        # os.listdir() returns entries in arbitrary order; sorting keeps the
        # row order reproducible from one run to the next.
        for filename in sorted(os.listdir(self.data_dir)):
            if filename.endswith('.json'):
                file_path = os.path.join(self.data_dir, filename)
                texts.append({"text": self.process_10k_file(file_path)})

        return Dataset.from_list(texts)

    def prepare_eval_data(self) -> Dataset:
        """Build the evaluation dataset from the CSV at ``eval_file``.

        Returns:
            A ``Dataset`` with a single ``text`` column holding the company,
            question and answer of each CSV row, each prefixed by its marker.

        Raises:
            KeyError: If the CSV lacks a ``Company``, ``Question`` or
                ``Answer`` column.
        """
        df = pd.read_csv(self.eval_file)
        tokens = self.special_tokens
        eval_texts = []

        for _, row in df.iterrows():
            text = f"{tokens['company_start']}{row['Company']}"
            text += f"{tokens['question_start']}{row['Question']}"
            text += f"{tokens['answer_start']}{row['Answer']}"
            eval_texts.append({"text": text})

        return Dataset.from_list(eval_texts)

In [None]:
class LLMFineTuner:
    """LoRA fine-tuning and answer generation for a Hugging Face causal LM.

    Call ``setup_tokenizer_and_model()`` before any other method.

    NOTE(review): another definition of ``LLMFineTuner`` appears in the next
    notebook cell; on a top-to-bottom run that later definition replaces this
    one. One of the two cells should be deleted.

    Args:
        model_name: Model identifier passed to ``from_pretrained``.
        special_tokens: Mapping of role name to marker string. The values are
            registered as additional special tokens; the ``company_start``,
            ``question_start`` and ``answer_start`` entries build prompts.
    """

    def __init__(self, model_name: str, special_tokens: Dict[str, str]):
        self.model_name = model_name
        self.special_tokens = special_tokens
        # Both are populated by setup_tokenizer_and_model().
        self.tokenizer = None
        self.model = None

    def preprocess_data(self, examples: Dict[str, List], max_length: int = 512) -> Dict[str, torch.Tensor]:
        """Tokenize a batch of examples for causal-LM training.

        Args:
            examples: Batch dict with a ``text`` list of strings.
            max_length: Length every sequence is truncated or padded to.

        Returns:
            ``input_ids``, ``attention_mask`` and ``labels`` tensors. Labels
            copy the input ids, with padded positions set to -100.

        Raises:
            RuntimeError: If the tokenizer has not been initialized.
        """
        if self.tokenizer is None:
            raise RuntimeError("Tokenizer must be initialized first")

        # Pad to a fixed length: Dataset.map() calls this once per batch, and
        # padding to the longest row of each batch could leave rows of
        # different lengths that cannot be stacked into one training batch.
        encodings = self.tokenizer(
            examples['text'],
            truncation=True,
            padding="max_length",
            max_length=max_length,
            return_tensors="pt"
        )

        # -100 is the label value the loss ignores. Mask via the attention
        # mask rather than the pad id, because the pad token may be EOS.
        labels = encodings["input_ids"].clone()
        labels[encodings["attention_mask"] == 0] = -100

        return {
            "input_ids": encodings["input_ids"],
            "attention_mask": encodings["attention_mask"],
            "labels": labels
        }

    def prepare_dataset(self, dataset: Dataset) -> Dataset:
        """Tokenize ``dataset``, replacing its columns with model inputs.

        Args:
            dataset: Dataset with a ``text`` column.

        Returns:
            The dataset produced by mapping ``preprocess_data`` over it in
            batches, with the original columns removed.
        """
        return dataset.map(
            self.preprocess_data,
            batched=True,
            remove_columns=dataset.column_names
        )

    def setup_tokenizer_and_model(self):
        """Load tokenizer and model, register the markers, and attach LoRA."""
        self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)

        # Add special tokens
        special_tokens_dict = {'additional_special_tokens': list(self.special_tokens.values())}
        self.tokenizer.add_special_tokens(special_tokens_dict)

        # preprocess_data() pads, which needs a pad token; some causal-LM
        # tokenizers ship without one, so fall back to EOS.
        if self.tokenizer.pad_token is None:
            self.tokenizer.pad_token = self.tokenizer.eos_token

        self.model = AutoModelForCausalLM.from_pretrained(
            self.model_name,
            torch_dtype=torch.float16,
            device_map="auto"
        )
        # The vocabulary grew, so the embedding matrix has to grow with it.
        self.model.resize_token_embeddings(len(self.tokenizer))

        # Setup LoRA
        # NOTE(review): only q_proj/v_proj are adapted, so the embedding rows
        # added above are presumably never trained - confirm whether the
        # embedding layers should be made trainable as well.
        lora_config = LoraConfig(
            r=16,
            lora_alpha=32,
            target_modules=["q_proj", "v_proj"],
            lora_dropout=0.05,
            bias="none",
            task_type="CAUSAL_LM"
        )
        self.model = get_peft_model(self.model, lora_config)

    def train(self, train_dataset: Dataset, output_dir: str):
        """Fine-tune on ``train_dataset`` and save the result to ``output_dir``.

        Args:
            train_dataset: Untokenized dataset with a ``text`` column.
            output_dir: Directory for checkpoints, the final model and the
                tokenizer.

        Raises:
            RuntimeError: If ``setup_tokenizer_and_model()`` was not called.
        """
        if self.model is None or self.tokenizer is None:
            raise RuntimeError("Call setup_tokenizer_and_model() before train()")

        training_args = TrainingArguments(
            output_dir=output_dir,
            num_train_epochs=3,
            per_device_train_batch_size=4,
            gradient_accumulation_steps=4,
            learning_rate=2e-4,
            fp16=True,
            save_strategy="epoch",
            logging_steps=100,
            optim="adamw_torch",
            remove_unused_columns=False
        )

        train_dataset = self.prepare_dataset(train_dataset)

        trainer = Trainer(
            model=self.model,
            args=training_args,
            train_dataset=train_dataset,
            data_collator=DataCollatorForLanguageModeling(self.tokenizer, mlm=False)
        )

        trainer.train()
        trainer.save_model(output_dir)
        # The tokenizer carries the added marker tokens; save it next to the
        # model so the pair can be reloaded together.
        self.tokenizer.save_pretrained(output_dir)

    def generate_answer(self, company: str, question: str) -> str:
        """Generate an answer to ``question`` about ``company``.

        Args:
            company: Company name placed after the company marker.
            question: Question text placed after the question marker.

        Returns:
            The decoded continuation only; the prompt tokens are removed.

        Raises:
            RuntimeError: If ``setup_tokenizer_and_model()`` was not called.
        """
        if self.model is None or self.tokenizer is None:
            raise RuntimeError("Call setup_tokenizer_and_model() before generate_answer()")

        prompt = (
            f"{self.special_tokens['company_start']}{company}"
            f"{self.special_tokens['question_start']}{question}"
            f"{self.special_tokens['answer_start']}"
        )

        inputs = self.tokenizer(prompt, return_tensors="pt").to(self.model.device)
        prompt_length = inputs["input_ids"].shape[-1]

        # Leave training mode so LoRA dropout is inactive during generation.
        self.model.eval()

        outputs = self.model.generate(
            **inputs,
            max_length=1024,
            do_sample=True,  # temperature only takes effect when sampling
            temperature=0.7,
            num_return_sequences=1,
            pad_token_id=self.tokenizer.eos_token_id
        )

        # generate() returns prompt + continuation; keep the continuation.
        return self.tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)

In [None]:
class LLMFineTuner:
    """LoRA fine-tuning and answer generation for a Hugging Face causal LM.

    Call ``setup_tokenizer_and_model()`` before any other method.

    NOTE(review): another definition of ``LLMFineTuner`` appears in the
    preceding notebook cell; on a top-to-bottom run this later definition is
    the one in effect. One of the two cells should be deleted.

    Args:
        model_name: Model identifier passed to ``from_pretrained``.
        special_tokens: Mapping of role name to marker string. The values are
            registered as additional special tokens; the ``company_start``,
            ``question_start`` and ``answer_start`` entries build prompts.
    """

    def __init__(self, model_name: str, special_tokens: Dict[str, str]):
        self.model_name = model_name
        self.special_tokens = special_tokens
        # Both are populated by setup_tokenizer_and_model().
        self.tokenizer = None
        self.model = None

    def preprocess_data(self, examples: Dict[str, List], max_length: int = 512) -> Dict[str, torch.Tensor]:
        """Tokenize a batch of examples for causal-LM training.

        Args:
            examples: Batch dict with a ``text`` list of strings.
            max_length: Length every sequence is truncated or padded to.

        Returns:
            ``input_ids``, ``attention_mask`` and ``labels`` tensors. Labels
            copy the input ids, with padded positions set to -100.

        Raises:
            RuntimeError: If the tokenizer has not been initialized.
        """
        if self.tokenizer is None:
            raise RuntimeError("Tokenizer must be initialized first")

        # Pad to a fixed length: Dataset.map() calls this once per batch, and
        # padding to the longest row of each batch could leave rows of
        # different lengths that cannot be stacked into one training batch.
        encodings = self.tokenizer(
            examples['text'],
            truncation=True,
            padding="max_length",
            max_length=max_length,
            return_tensors="pt"
        )

        # -100 is the label value the loss ignores. Mask via the attention
        # mask rather than the pad id, because the pad token may be EOS.
        labels = encodings["input_ids"].clone()
        labels[encodings["attention_mask"] == 0] = -100

        return {
            "input_ids": encodings["input_ids"],
            "attention_mask": encodings["attention_mask"],
            "labels": labels
        }

    def prepare_dataset(self, dataset: Dataset) -> Dataset:
        """Tokenize ``dataset``, replacing its columns with model inputs.

        Args:
            dataset: Dataset with a ``text`` column.

        Returns:
            The dataset produced by mapping ``preprocess_data`` over it in
            batches, with the original columns removed.
        """
        return dataset.map(
            self.preprocess_data,
            batched=True,
            remove_columns=dataset.column_names
        )

    def setup_tokenizer_and_model(self):
        """Load tokenizer and model, register the markers, and attach LoRA."""
        self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)

        # Add special tokens
        special_tokens_dict = {'additional_special_tokens': list(self.special_tokens.values())}
        self.tokenizer.add_special_tokens(special_tokens_dict)

        # preprocess_data() pads, which needs a pad token; some causal-LM
        # tokenizers ship without one, so fall back to EOS.
        if self.tokenizer.pad_token is None:
            self.tokenizer.pad_token = self.tokenizer.eos_token

        self.model = AutoModelForCausalLM.from_pretrained(
            self.model_name,
            torch_dtype=torch.float16,
            device_map="auto"
        )
        # The vocabulary grew, so the embedding matrix has to grow with it.
        self.model.resize_token_embeddings(len(self.tokenizer))

        # Setup LoRA
        # NOTE(review): only q_proj/v_proj are adapted, so the embedding rows
        # added above are presumably never trained - confirm whether the
        # embedding layers should be made trainable as well.
        lora_config = LoraConfig(
            r=16,
            lora_alpha=32,
            target_modules=["q_proj", "v_proj"],
            lora_dropout=0.05,
            bias="none",
            task_type="CAUSAL_LM"
        )
        self.model = get_peft_model(self.model, lora_config)

    def train(self, train_dataset: Dataset, output_dir: str):
        """Fine-tune on ``train_dataset`` and save the result to ``output_dir``.

        Args:
            train_dataset: Untokenized dataset with a ``text`` column.
            output_dir: Directory for checkpoints, the final model and the
                tokenizer.

        Raises:
            RuntimeError: If ``setup_tokenizer_and_model()`` was not called.
        """
        if self.model is None or self.tokenizer is None:
            raise RuntimeError("Call setup_tokenizer_and_model() before train()")

        training_args = TrainingArguments(
            output_dir=output_dir,
            num_train_epochs=3,
            per_device_train_batch_size=4,
            gradient_accumulation_steps=4,
            learning_rate=2e-4,
            fp16=True,
            save_strategy="epoch",
            logging_steps=100,
            optim="adamw_torch",
            remove_unused_columns=False
        )

        train_dataset = self.prepare_dataset(train_dataset)

        trainer = Trainer(
            model=self.model,
            args=training_args,
            train_dataset=train_dataset,
            data_collator=DataCollatorForLanguageModeling(self.tokenizer, mlm=False)
        )

        trainer.train()
        trainer.save_model(output_dir)
        # The tokenizer carries the added marker tokens; save it next to the
        # model so the pair can be reloaded together.
        self.tokenizer.save_pretrained(output_dir)

    def generate_answer(self, company: str, question: str) -> str:
        """Generate an answer to ``question`` about ``company``.

        Args:
            company: Company name placed after the company marker.
            question: Question text placed after the question marker.

        Returns:
            The decoded continuation only; the prompt tokens are removed.

        Raises:
            RuntimeError: If ``setup_tokenizer_and_model()`` was not called.
        """
        if self.model is None or self.tokenizer is None:
            raise RuntimeError("Call setup_tokenizer_and_model() before generate_answer()")

        prompt = (
            f"{self.special_tokens['company_start']}{company}"
            f"{self.special_tokens['question_start']}{question}"
            f"{self.special_tokens['answer_start']}"
        )

        inputs = self.tokenizer(prompt, return_tensors="pt").to(self.model.device)
        prompt_length = inputs["input_ids"].shape[-1]

        # Leave training mode so LoRA dropout is inactive during generation.
        self.model.eval()

        outputs = self.model.generate(
            **inputs,
            max_length=1024,
            do_sample=True,  # temperature only takes effect when sampling
            temperature=0.7,
            num_return_sequences=1,
            pad_token_id=self.tokenizer.eos_token_id
        )

        # generate() returns prompt + continuation; keep the continuation.
        return self.tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)

In [None]:
import json
import os

import pandas as pd
import torch
from tqdm.auto import tqdm
from transformers import AutoModelForCausalLM, AutoTokenizer, Trainer, TrainingArguments, TextDataset, DataCollatorForLanguageModeling

# Step 1: Load all JSON files from the directory
def load_data_from_directory(directory_path):
    """Read every ``.json`` file in a directory into one text per file.

    Args:
        directory_path: Directory to scan (not recursive).

    Returns:
        A list with one string per JSON file, in file-name order. Each string
        joins the file's top-level string values with newlines; values of any
        other type are skipped.
    """
    all_texts = []
    # os.listdir() returns entries in arbitrary order; sorting makes the
    # resulting corpus identical from one run to the next.
    for file_name in sorted(os.listdir(directory_path)):
        if not file_name.endswith(".json"):  # Only process JSON files
            continue
        file_path = os.path.join(directory_path, file_name)
        # Explicit encoding so parsing does not depend on the platform default.
        with open(file_path, 'r', encoding='utf-8') as file:
            data = json.load(file)
        # Combine the relevant fields into a single text entry
        all_texts.append("\n".join(value for value in data.values() if isinstance(value, str)))
    return all_texts

# Directory containing JSON files
directory_path = "data/10-K"
all_texts = load_data_from_directory(directory_path)

# Fail here rather than deep inside training when there is nothing to train on.
if not all_texts:
    raise ValueError(f"No .json files found in '{directory_path}'.")

# Save combined texts to a single file. The encoding is explicit so the corpus
# is written the same way on every platform.
fine_tune_text_file = "fine_tune_data.txt"
with open(fine_tune_text_file, 'w', encoding='utf-8') as f:
    f.write("\n\n".join(all_texts))  # Separate each file's content with a blank line

# Step 2: Load Tokenizer and Model
# Both are loaded from the same identifier so they cannot drift apart.
model_name = "meta-llama/Llama-3.2-1B"
model = AutoModelForCausalLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)
model.gradient_checkpointing_enable()


# Step 3: Create Dataset
def create_dataset(file_path, tokenizer, block_size=512):
    """Build a ``TextDataset`` over the text file at ``file_path``.

    Args:
        file_path: Path of the text file to load.
        tokenizer: Tokenizer handed to ``TextDataset``.
        block_size: Block size handed to ``TextDataset`` (default 512).

    Returns:
        The constructed ``TextDataset``.
    """
    return TextDataset(tokenizer=tokenizer, file_path=file_path, block_size=block_size)

# Training examples come from the combined corpus file.
dataset = create_dataset(fine_tune_text_file, tokenizer)

# Collator for causal language modelling (mlm=False: no masked-LM objective).
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# Step 4: Fine-tuning
training_args = TrainingArguments(
    # Output locations and reporting
    output_dir="./finetuned_model",
    overwrite_output_dir=True,
    logging_dir="./logs",
    logging_steps=500,
    report_to="none",
    # Optimisation: 2 samples per device x 16 accumulation steps per update
    num_train_epochs=1,
    per_device_train_batch_size=2,
    gradient_accumulation_steps=16,
    learning_rate=5e-5,
    fp16=True,  # Use mixed precision for large models
    # Checkpointing; no evaluation during training
    save_strategy="steps",
    save_steps=10_000,
    save_total_limit=2,
    eval_strategy="no",
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
    data_collator=data_collator,
)
trainer.train()

# Save the final weights and tokenizer. Note that this directory differs from
# the checkpoint directory given as output_dir above.
model.save_pretrained("scratch/finetuned_model")
tokenizer.save_pretrained("scratch/finetuned_model")

# Step 5: Evaluation
eval_data_file = "evaluation_dataset.csv"
eval_data = pd.read_csv(eval_data_file)

# The loop below reads the "Question" column, so fail early when it is missing.
if "Question" not in eval_data.columns:
    raise ValueError("The evaluation dataset must have a 'Question' column.")

# The tokenizer call below pads, which requires a pad token; reuse EOS for it.
tokenizer.pad_token = tokenizer.eos_token
pad_token_id = tokenizer.pad_token_id

complete_list = []
answers = []  # must exist before the loop appends to it
k = 0

# Leave the training mode set by trainer.train() before generating.
model.eval()

for question in tqdm(eval_data["Question"]):
    # Tokenize input with padding and return attention mask. Inputs go to the
    # device the model is on instead of a hardcoded "cuda".
    inputs = tokenizer(
        question,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=200  # Set as per your model's context size
    ).to(model.device)
    prompt_length = inputs["input_ids"].shape[1]

    # Generate outputs with attention mask. max_new_tokens bounds only the
    # generated part, so a long question still leaves room for an answer.
    outputs = model.generate(
        inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        max_new_tokens=200,
        pad_token_id=pad_token_id
    )
    # generate() returns prompt + continuation; the question is stored in its
    # own column, so keep only the continuation as the answer.
    answer = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)

    complete_list.append({
        "no": k,
        "question": question,
        "answer": answer
    })
    k += 1
    answers.append(answer)

# Save results to CSV
df = pd.DataFrame(complete_list)

os.makedirs("scratch", exist_ok=True)
df.to_csv("scratch/llama-answers.csv", index=False)

print("Evaluation complete. Results saved to 'scratch/llama-answers.csv'.")


In [None]:
# Save results to CSV (re-writes the answers collected by the evaluation loop)
df = pd.DataFrame(complete_list)

os.makedirs("scratch", exist_ok=True)
df.to_csv("scratch/llama-answers.csv", index=False)

# Report the path that was actually written.
print("Evaluation complete. Results saved to 'scratch/llama-answers.csv'.")