In [2]:
%%capture
!pip install unsloth "xformers==0.0.28.post2"
!pip uninstall unsloth -y && pip install --upgrade --no-cache-dir "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"

In [3]:
from unsloth import FastLanguageModel
import torch
max_seq_length = 2048
dtype = None
load_in_4bit = True

fourbit_models = [
    "unsloth/Meta-Llama-3.1-8B-bnb-4bit",
    "unsloth/Meta-Llama-3.1-8B-Instruct-bnb-4bit",
    "unsloth/Meta-Llama-3.1-70B-bnb-4bit",
    "unsloth/Meta-Llama-3.1-405B-bnb-4bit",
    "unsloth/Mistral-Small-Instruct-2409",
    "unsloth/mistral-7b-instruct-v0.3-bnb-4bit",
    "unsloth/Phi-3.5-mini-instruct",
    "unsloth/Phi-3-medium-4k-instruct",
    "unsloth/gemma-2-9b-bnb-4bit",
    "unsloth/gemma-2-27b-bnb-4bit",

    "unsloth/Llama-3.2-1B-bnb-4bit",
    "unsloth/Llama-3.2-1B-Instruct-bnb-4bit",
    "unsloth/Llama-3.2-3B-bnb-4bit",
    "unsloth/Llama-3.2-3B-Instruct-bnb-4bit",
]

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/Llama-3.2-3B",
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
)

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!
==((====))==  Unsloth 2024.11.10: Fast Llama patching. Transformers:4.46.2.
   \\   /|    GPU: Tesla T4. Max memory: 14.748 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.5.0+cu124. CUDA: 7.5. CUDA Toolkit: 12.4. Triton: 3.1.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.28.post2. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/2.24G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/121 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/50.6k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/459 [00:00<?, ?B/s]

In [4]:
model = FastLanguageModel.get_peft_model(
    model,
    r = 16,
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",],
    lora_alpha = 16,
    lora_dropout = 0,
    bias = "none",
    use_gradient_checkpointing = "unsloth",
    random_state = 3407,
    use_rslora = False,
    loftq_config = None,
)

Unsloth 2024.11.10 patched 28 layers with 28 QKV layers, 28 O layers and 28 MLP layers.


In [5]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [6]:
import pandas as pd
import json
import os

def ingest_train_data(directory: str):
    all_data = []
    for file in os.listdir(directory):
        path = os.path.join(directory, file)
        with open(path, 'r') as f:
            data = json.load(f)
            company_name = data['company']
            del_keys = [
                'state_location'
                'cik',
                'company',
                'filing_type',
                'filing_date',
                'period_of_report',
                'sic',
                'state_of_inc',
                'fiscal_year_end',
                'filing_html_index',
                'htm_filing_link',
                'complete_text_filing_link',
                'filename'
            ]
            all_data.extend([(data[text], text, company_name) for text in data.keys() if text not in del_keys])
    return all_data


def create_master_dataset(directory: str):
    df = pd.DataFrame(columns=["prompt", "completion"])
    data = ingest_train_data(directory=directory)
    for completion, item, company in data:
        prompt = f"What did {item} in {company}'s SEC 10-K filing say?"
        df.loc[len(df.index)] = [prompt, completion]
    return df

directory = '/content/drive/MyDrive/10-K'
master_set = create_master_dataset(directory=directory)

In [7]:
from transformers import AutoTokenizer
from datasets import Dataset


alpaca_prompt = """### Prompt:
{}

### Completion:
{}"""

EOS_TOKEN = tokenizer.eos_token or ""

def formatting_prompts_func(examples):
    prompts = examples["prompt"]
    completions = examples["completion"]
    texts = [
        alpaca_prompt.format(prompt, completion) + EOS_TOKEN
        for prompt, completion in zip(prompts, completions)
    ]
    return {"text": texts}

if isinstance(master_set, pd.DataFrame):
    master_set = master_set.to_dict(orient="records")  # Convert DataFrame to list of dicts

train_dataset = Dataset.from_list(master_set)

train_dataset = train_dataset.map(formatting_prompts_func, batched=True)

print(train_dataset[0])




Map:   0%|          | 0/11960 [00:00<?, ? examples/s]

{'prompt': "What did cik in MICROSOFT CORP's SEC 10-K filing say?", 'completion': '789019', 'text': "### Prompt:\nWhat did cik in MICROSOFT CORP's SEC 10-K filing say?\n\n### Completion:\n789019<|end_of_text|>"}


In [33]:
from transformers import AutoModelForCausalLM, Trainer, TrainingArguments
from trl import SFTTrainer
from unsloth import is_bfloat16_supported

trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=train_dataset,
    dataset_text_field="text",
    max_seq_length=max_seq_length,
    dataset_num_proc=2,
    args=TrainingArguments(
        learning_rate=2e-4,
        per_device_train_batch_size=2,
        gradient_accumulation_steps=4,
        num_train_epochs=4,
        weight_decay=0.1,
        warmup_steps=5,
        logging_steps=50,
        save_steps=500,
        max_steps=1000,
        lr_scheduler_type="linear",
        optim="adamw_torch",
        fp16=not is_bfloat16_supported(),
        bf16=is_bfloat16_supported(),
        report_to="none",
        seed=3407,
        output_dir="output",
    )
)
print("done")

Map (num_proc=2):   0%|          | 0/11960 [00:00<?, ? examples/s]

max_steps is given, it will override any value given in num_train_epochs


done


In [36]:
trainer.train()


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 11,960 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 2 | Gradient Accumulation steps = 4
\        /    Total batch size = 8 | Total steps = 1,000
 "-____-"     Number of trainable parameters = 24,313,856


Step,Training Loss
50,1.5302
100,1.394
150,1.2598
200,1.1926
250,1.2832
300,1.3039
350,1.3051
400,1.2769
450,1.2155
500,1.2059


TrainOutput(global_step=1000, training_loss=1.2199527015686036, metrics={'train_runtime': 1398.5478, 'train_samples_per_second': 5.72, 'train_steps_per_second': 0.715, 'total_flos': 1.1836784141555712e+17, 'train_loss': 1.2199527015686036, 'epoch': 0.6688963210702341})

In [8]:

model.save_pretrained("llama_3")
tokenizer.save_pretrained("llama_3")

('llama_3/tokenizer_config.json',
 'llama_3/special_tokens_map.json',
 'llama_3/tokenizer.json')

In [9]:
FastLanguageModel.for_inference(model)
inputs = tokenizer(
[
    alpaca_prompt.format(
        "the revenue of the apple company?", # question
        "",
    )
], return_tensors = "pt").to("cuda")

from transformers import TextStreamer
text_streamer = TextStreamer(tokenizer)
_ = model.generate(**inputs, streamer = text_streamer, max_new_tokens = 128)



<|begin_of_text|>### Prompt:
the revenue of the apple company?

### Completion:
```python
# 1. import the necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# 2. load the data
df = pd.read_csv('apple.csv')

# 3. plot the revenue
plt.plot(df['date'], df['revenue'])
plt.show()
```
<|end_of_text|>


In [10]:
import pandas as pd

eval_data_file = "/content/drive/MyDrive/evaluation_dataset.csv"
eval_data = pd.read_csv(eval_data_file)

if "Question" not in eval_data.columns:
    raise ValueError("The evaluation dataset must have a 'Question' column.")

complete_list = []
k = 0

class CustomStreamer(TextStreamer):
    def __init__(self, tokenizer):
        super().__init__(tokenizer)
        self.text = ""

    def on_new_token(self, token: str):
        self.text += token

text_streamer = CustomStreamer(tokenizer)

for question in eval_data["Question"]:
    inputs = tokenizer(
        [
            alpaca_prompt.format(
                question,
                "",
            )
        ], return_tensors="pt").to("cuda")

    outputs= model.generate(**inputs, max_new_tokens=128)
    generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    answer = generated_text.split("### Completion:")[-1].strip()

    complete_list.append({
        "no": k,
        "question": question,
        "answer": answer
    })
    print(f"Question: {question}\nAnswer: {answer}\n")
    print(k)


    k += 1

df = pd.DataFrame(complete_list)
df.to_csv("eval_result_llama", index=False)

print("Evaluation complete. Results saved to 'llama-answers.csv'.")


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
3. The company's decision to focus on its core business and reduce its investments in new technologies led to a decrease in revenue.

4. The company's decision to reduce its workforce and close some of its manufacturing

438
Question: Palo Alto Networks' financial statements show that it has a history of losses. What are some of the reasons for this, and what does the company say about its outlook for profitability?
Answer: Palo Alto Networks' financial statements show that it has a history of losses. What are some of the reasons for this, and what does the company say about its outlook for profitability?

### Explanation:
Palo Alto Networks' financial statements show that it has a history of losses. What are some of the reasons for this, and what does the company say about its outlook for profitability?

### Explanation:
Palo Alto Networks' financial statements show that it has a history of losses. What are some of the r

In [12]:
df = pd.DataFrame(complete_list)
df.to_csv("eval_result_llama.csv", index=False)

In [13]:
eval_file = "eval_result_llama.csv"
eval2_file = "/content/drive/MyDrive/evaluation_dataset.csv"
eval_data = pd.read_csv(eval_file)
eval2_data = pd.read_csv(eval2_file)
if "answer" not in eval_data.columns or "Answer" not in eval2_data.columns:
    raise ValueError("Both evaluation datasets must have a 'Question' column.")
references = eval_data["answer"].tolist()
hypothesis = eval2_data["Answer"].tolist()




In [14]:
!pip install sacrebleu
!pip install bert_score

Collecting sacrebleu
  Downloading sacrebleu-2.4.3-py3-none-any.whl.metadata (51 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/51.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.8/51.8 kB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting portalocker (from sacrebleu)
  Downloading portalocker-3.0.0-py3-none-any.whl.metadata (8.5 kB)
Collecting colorama (from sacrebleu)
  Downloading colorama-0.4.6-py2.py3-none-any.whl.metadata (17 kB)
Downloading sacrebleu-2.4.3-py3-none-any.whl (103 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/104.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m104.0/104.0 kB[0m [31m9.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading colorama-0.4.6-py2.py3-none-any.whl (25 kB)
Downloading portalocker-3.0.0-py3-none-any.whl (19 kB)
Installing collected packages: portalocker, colorama, sacreb

In [15]:
from sacrebleu import corpus_bleu

references_for_bleu = [[ref] for ref in references]
bleu = corpus_bleu(hypothesis, references_for_bleu)
print(f"BLEU Score: {bleu.score:.4f}")

BLEU Score: 43.1842


In [16]:
from bert_score import score as bert_score

P, R, F1 = bert_score(hypothesis, references, lang="en", rescale_with_baseline=True)
print(f"BERTScore - Precision: {P.mean().item():.4f}, Recall: {R.mean().item():.4f}, F1: {F1.mean().item():.4f}")

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BERTScore - Precision: 0.1047, Recall: 0.1989, F1: 0.1513
