In [1]:
!pip install datasets
!pip install transformers
!pip install peft
!pip install ipywidgets
!jupyter labextension install @jupyter-widgets/jupyterlab-manager

Defaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packages is not writeable
Config option `kernel_spec_manager_class` not recognized by `InstallLabExtensionApp`.
[33m(Deprecated) Installing extensions with the jupyter labextension install command is now deprecated and will be removed in a future major version of JupyterLab.

Users should manage prebuilt extensions with package managers like pip and conda, and extension authors are encouraged to distribute their extensions as prebuilt packages [0m
[33m[W 2024-11-26 17:14:14.808 LabApp][m Config option `kernel_spec_manager_class` not recognized by `LabApp`.
An error occurred.
ValueError: Please install Node.js and npm before continuing installation. You may be able to install Node.js from your package 

In [2]:
from huggingface_hub import login
from dotenv import load_dotenv
import os
# Read variables from a local .env file into the process environment so the
# Hugging Face token stays out of the notebook itself.
load_dotenv()

# HUGGINGFACE_KEY may be unset, in which case None is handed to login().
huggingface_key = os.environ.get("HUGGINGFACE_KEY")
login(token=huggingface_key)

In [3]:
# Importing libraries

import os
import json
import torch
import pandas as pd
from transformers import AutoModelForCausalLM, AutoTokenizer, Trainer, TrainingArguments, TextDataset, DataCollatorForLanguageModeling
from tqdm import tqdm

In [4]:
# Step 1: Load all JSON files from the directory
def load_data_from_directory(directory_path):
    """Load every ``*.json`` file in a directory as one text entry per file.

    For each JSON file, the string-valued fields of its top-level object are
    joined with newlines; values of any other type are ignored.

    Args:
        directory_path: Directory to scan (sub-directories are not traversed).

    Returns:
        list[str]: One combined text per JSON file, ordered by file name.
    """
    all_texts = []
    # os.listdir() returns entries in arbitrary order; sorting keeps the
    # resulting corpus (and any positional split made from it) reproducible.
    for file_name in sorted(os.listdir(directory_path)):
        if not file_name.endswith(".json"):  # Only process JSON files
            continue
        file_path = os.path.join(directory_path, file_name)
        # Read as UTF-8 explicitly instead of relying on the platform default.
        with open(file_path, 'r', encoding='utf-8') as file:
            data = json.load(file)
        # Combine the string fields into a single text entry
        text = "\n".join(value for value in data.values() if isinstance(value, str))
        all_texts.append(text)
    return all_texts

# Folder holding the source JSON documents; each file becomes one text entry.
directory_path = "scratch/data/10-K"
all_texts = load_data_from_directory(directory_path=directory_path)

In [5]:
# Save combined texts to a single file.
# The file is written explicitly as UTF-8 because create_datasets() reads it
# back with encoding='utf-8'; the platform default encoding may differ.
fine_tune_text_file = "scratch/fine_tune_data.txt"
with open(fine_tune_text_file, 'w', encoding='utf-8') as f:
    f.write("\n\n".join(all_texts))  # Separate each file's content with a blank line

In [6]:
# Step 2: Load the tokenizer and the pretrained causal LM from the same checkpoint
model_name = "meta-llama/Llama-3.2-1B"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

# Gradient checkpointing is switched on before training starts.
model.gradient_checkpointing_enable()

In [7]:
def create_datasets(file_path, tokenizer, block_size=512, eval_split=0.1):
    """Split a text file into train/eval parts and wrap each in a ``TextDataset``.

    The split is positional and character-based: the first ``1 - eval_split``
    fraction of the characters is used for training, the remainder for
    evaluation.

    Args:
        file_path: Path to a UTF-8 text file holding the whole corpus.
        tokenizer: Tokenizer handed to ``TextDataset``.
        block_size: Passed through to ``TextDataset`` as ``block_size``.
        eval_split: Fraction of the characters (between 0 and 1) reserved for
            evaluation.

    Returns:
        tuple: ``(train_dataset, eval_dataset)``.

    Raises:
        ValueError: If ``eval_split`` is outside ``[0, 1]``.
    """
    import os
    import tempfile

    if not 0 <= eval_split <= 1:
        raise ValueError(f"eval_split must be between 0 and 1, got {eval_split!r}")

    # Read the text file
    with open(file_path, 'r', encoding='utf-8') as f:
        text = f.read()

    # Calculate split point
    split_point = int(len(text) * (1 - eval_split))
    train_text = text[:split_point]
    eval_text = text[split_point:]

    # TextDataset is given a file path, so each split is staged on disk.
    # A TemporaryDirectory is removed even when dataset construction raises,
    # and it takes along any side files created next to the staged inputs.
    with tempfile.TemporaryDirectory() as tmp_dir:
        train_path = os.path.join(tmp_dir, "train.txt")
        eval_path = os.path.join(tmp_dir, "eval.txt")

        # Written as UTF-8 to match how the source file was read above.
        with open(train_path, 'w', encoding='utf-8') as train_file:
            train_file.write(train_text)
        with open(eval_path, 'w', encoding='utf-8') as eval_file:
            eval_file.write(eval_text)

        # Create datasets while the staged files still exist
        train_dataset = TextDataset(
            tokenizer=tokenizer,
            file_path=train_path,
            block_size=block_size
        )
        eval_dataset = TextDataset(
            tokenizer=tokenizer,
            file_path=eval_path,
            block_size=block_size
        )

    return train_dataset, eval_dataset

# Build the training and evaluation datasets from the combined corpus file
train_dataset, eval_dataset = create_datasets(
    file_path=fine_tune_text_file,
    tokenizer=tokenizer,
)


Token indices sequence length is longer than the specified maximum sequence length for this model (25524228 > 131072). Running this sequence through the model will result in indexing errors


In [8]:
# Collator for language-model batches; mlm=False disables masked-LM mode
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# Step 4: Fine-tuning configuration
training_args = TrainingArguments(
    # Checkpoints and outputs
    output_dir="./scratch/finetuned_model",
    overwrite_output_dir=True,
    save_strategy="steps",
    save_steps=500,
    save_total_limit=2,
    # Optimisation
    num_train_epochs=1,
    learning_rate=5e-5,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=16,
    fp16=True,  # Use mixed precision for large models
    # Evaluation and logging
    eval_strategy="steps",
    eval_steps=100,
    logging_dir="./logs",
    logging_steps=100,
    report_to="none",
)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
)

# Kept as the last expression so the TrainOutput summary is displayed
trainer.train()

`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.


Step,Training Loss,Validation Loss
500,1.2112,1.180164


TrainOutput(global_step=780, training_loss=1.3379976908365885, metrics={'train_runtime': 7039.403, 'train_samples_per_second': 7.096, 'train_steps_per_second': 0.111, 'total_flos': 1.4894460569124864e+17, 'train_loss': 1.3379976908365885, 'epoch': 0.9993593850096092})

In [9]:
# Persist the fine-tuned weights and the tokenizer files in the same folder
model.save_pretrained(save_directory="./finetuned_model")
tokenizer.save_pretrained(save_directory="./finetuned_model")

('./finetuned_model/tokenizer_config.json',
 './finetuned_model/special_tokens_map.json',
 './finetuned_model/tokenizer.json')

In [10]:
# Prompt template asking for a benchmark question and ground-truth answer
# about a company; `{query}` is the placeholder for the user-supplied data.
# NOTE(review): none of the visible cells formats or uses `prompt` — confirm
# whether it is still needed.
# NOTE(review): nothing follows "Format your output in the following format:",
# and the closing line refers to a JSON object although no JSON schema is
# given — the format specification may have been lost; verify the template.
prompt = """
You are a financial advisor responsible for helping train an AI language model
to provide comprehensive, sound financial advice based on a company's financial 
history. You are tasked with writing questions and ground-truth answers for the 
task's benchmark dataset.

You will be provided a set of historical data on a given company. Given this data, 
you should come up with a question that would effectively test an LLM's ability to
give coherent and correct information about a company. The LLM may also be asked to
give some subjective advice about a company's financial outlook. In these cases, while
there isn't necessarily a "correct" answer, any LLM answer should be supported clearly
by the provided data. The questions you create should have these goals in mind, and the 
answers you generate should appropriately address the goals.

Format your output in the following format:

Do not include anything else in your response. 

Here is an example of what your output could look like:

<<Example>>

What do AAPL's earnings reports say about it's growth potential?

Investors can be confident about AAPL's long-term growth potential. It has showed 
consistent growth year-over-year, with revenue figures increasing by at least 2 percent
in every year. 

Here is the user input:

{query}

Don't wrap the JSON output in anything (markdown, etc). Just return the JSON object itself.
"""

In [11]:
# Step 5: Evaluation
# Generates a completion for every row of the evaluation CSV and stores the
# question/answer pairs in a results CSV.
eval_data_file = "evaluation_dataset.csv"
answers_file = "scratch/llama-answers.csv"
eval_data = pd.read_csv(eval_data_file)

# Ensure eval_data contains the "Question" column
if "Question" not in eval_data.columns:
    raise ValueError("The evaluation dataset must have a 'Question' column.")

# Reuse EOS as the padding token (presumably the tokenizer defines no pad
# token of its own — confirm for this checkpoint).
tokenizer.pad_token = tokenizer.eos_token
pad_token_id = tokenizer.pad_token_id

# Leave training mode before generating. The model was just trained with
# gradient checkpointing enabled, and the recorded training log shows that
# this disables the KV cache; eval mode is expected to avoid that slowdown.
model.eval()

complete_list = []

for k, question in enumerate(tqdm(eval_data["Question"])):
    # Tokenize the prompt (at most 400 tokens) and move it to the model's
    # device instead of assuming CUDA is available.
    inputs = tokenizer(
        question,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=400
    ).to(model.device)

    # max_new_tokens bounds only the generated continuation. The previous
    # max_length=200 also counted prompt tokens, so prompts of 200+ tokens
    # (allowed by the 400-token truncation above) left no room for an answer.
    outputs = model.generate(
        inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        max_new_tokens=200,
        pad_token_id=pad_token_id
    )
    # The decoded sequence holds the prompt followed by the continuation.
    answer = tokenizer.decode(outputs[0], skip_special_tokens=True)

    complete_list.append({
        "no": k,
        "question": question,
        "answer": answer
    })

# Save results to CSV
df = pd.DataFrame(complete_list)
df.to_csv(answers_file, index=False)

# Report the path that was actually written (the old message named a different file).
print(f"Evaluation complete. Results saved to '{answers_file}'.")

100%|██████████| 1210/1210 [1:37:02<00:00,  4.81s/it]

Evaluation complete. Results saved to 'evaluation_results.csv'.



