## Part 1: Model Finetuning (HuggingFace, MLX)

In [None]:
%%capture
# Install Unsloth and the training stack; the extra packages depend on the GPU
# generation. `%%capture` (first line of the cell) hides the installer output.
import torch
# Compute capability of the current CUDA device; only the major part is used below.
major_version, minor_version = torch.cuda.get_device_capability()
# Must install separately since Colab has torch 2.2.1, which breaks packages
# NOTE(review): the recorded run in this notebook reports Torch 2.5.1+cu121, so
# the version quoted above may be stale — confirm before relying on it.
!pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
if major_version >= 8:
    # Use this for new GPUs like Ampere, Hopper GPUs (RTX 30xx, RTX 40xx, A100, H100, L40)
    !pip install --no-deps packaging ninja einops flash-attn xformers trl peft accelerate bitsandbytes
else:
    # Use this for older GPUs (V100, Tesla T4, RTX 20xx)
    # Same list as above minus packaging, ninja, einops and flash-attn.
    !pip install --no-deps xformers trl peft accelerate bitsandbytes
pass

In [None]:
from unsloth import FastLanguageModel
import torch

# Loading configuration for the base model.
max_seq_length = 2048  # reused later as SFTTrainer's max_seq_length
model_name = "unsloth/Phi-3-mini-4k-instruct"
dtype = None  # NOTE(review): presumably lets Unsloth choose a dtype for the GPU — confirm
load_in_4bit = True  # request 4-bit loading of the weights

# from_pretrained returns the model together with its tokenizer.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name,
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
)

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!
==((====))==  Unsloth 2024.11.10: Fast Mistral patching. Transformers:4.46.2.
   \\   /|    GPU: Tesla T4. Max memory: 14.748 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.5.1+cu121. CUDA: 7.5. CUDA Toolkit: 12.1. Triton: 3.1.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.28.post3. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/2.26G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/194 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/3.34k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/458 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

In [None]:
# Parameters for finetuning: wrap the loaded model with LoRA adapters on the
# attention and MLP projection modules listed in target_modules.
# NOTE(review): the recorded output of this cell warns that only dropout = 0
# gets Unsloth's fast patching; with lora_dropout=0.05 it reports
# "0 QKV layers, 0 O layers and 0 MLP layers" patched. Consider 0 if training
# speed matters more than the extra regularisation.

model = FastLanguageModel.get_peft_model(
    model,
    r=16,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
    lora_alpha=16,
    lora_dropout=0.05,
    bias="none",
    use_gradient_checkpointing='unsloth',
    random_state=3407,  # fixed seed
    use_rslora=False,
    loftq_config=None
)

Unsloth: Dropout = 0 is supported for fast patching. You are using dropout = 0.05.
Unsloth will patch all other layers, except LoRA matrices, causing a performance hit.
Unsloth 2024.11.10 patched 32 layers with 0 QKV layers, 0 O layers and 0 MLP layers.


In [None]:
# Now the dataset
import pandas as pd
import json
import os

def ingest_train_data(directory: str):
    """Load every JSON filing in ``directory`` into training tuples.

    Each file must contain a JSON object with a ``'company'`` key. Filing
    metadata keys are dropped; every remaining key/value pair is treated as
    one section of the filing.

    Args:
        directory: Folder containing the JSON filing files.

    Returns:
        A list of ``(section_text, section_key, company_name)`` tuples, one
        per non-metadata key of every file.

    Raises:
        KeyError: If a file has no ``'company'`` key.
        json.JSONDecodeError: If a file is not valid JSON.
    """
    # Metadata keys that carry no section text. Built once, outside the loop.
    # Every entry needs its own trailing comma: a missing comma after
    # 'state_location' used to merge it with 'cik' into 'state_locationcik',
    # so neither key was actually dropped.
    del_keys = {
        'state_location',
        'cik',
        'company',
        'filing_type',
        'filing_date',
        'period_of_report',
        'sic',
        'state_of_inc',
        'fiscal_year_end',
        'filing_html_index',
        'htm_filing_link',
        'complete_text_filing_link',
        'filename',
    }

    all_data = []
    # Sorted so the resulting dataset order does not depend on the filesystem.
    for file in sorted(os.listdir(directory)):
        path = os.path.join(directory, file)
        # Skip sub-directories (e.g. .ipynb_checkpoints) instead of crashing in open().
        if not os.path.isfile(path):
            continue
        with open(path, 'r', encoding='utf-8') as f:
            data = json.load(f)
        company_name = data['company']
        all_data.extend(
            (text, key, company_name)
            for key, text in data.items()
            if key not in del_keys
        )
    return all_data


def create_master_dataset(directory: str):
    """Build the prompt/completion table used for fine-tuning.

    Args:
        directory: Folder of JSON filings, forwarded to ``ingest_train_data``.

    Returns:
        A ``pandas.DataFrame`` with columns ``"prompt"`` and ``"completion"``,
        one row per tuple returned by ``ingest_train_data`` (empty, but with
        both columns, when there is no data).
    """
    # Collect all rows first and construct the frame once; appending with
    # df.loc[len(df.index)] inside the loop re-allocates on every row.
    records = [
        (f"What did {item} in {company}'s SEC 10-K filing say?", completion)
        for completion, item, company in ingest_train_data(directory=directory)
    ]
    return pd.DataFrame(records, columns=["prompt", "completion"])

# Build the prompt/completion table from the JSON filings under ./data/
# (a path relative to the notebook's working directory).
directory = './data/'
master_set = create_master_dataset(directory=directory)


In [None]:
# Provides `datasets.Dataset`, used in the next cell to wrap the DataFrame.
!pip install datasets



In [None]:
# format dataset how unsloth wants
from unsloth import to_sharegpt
from datasets import Dataset

# Wrap the pandas table so the Unsloth helpers can process it.
dataset = Dataset.from_pandas(master_set)

# The "prompt" column is rendered through the merged_prompt template and the
# "completion" column is used as the output.
dataset = to_sharegpt(
    dataset,
    merged_prompt = "Your input is: {prompt}",
    output_column_name='completion',
)

# Follow-up pass; the recorded output labels it "Standardizing format".
from unsloth import standardize_sharegpt
dataset = standardize_sharegpt(dataset)

Merging columns:   0%|          | 0/850 [00:00<?, ? examples/s]

Converting to ShareGPT:   0%|          | 0/850 [00:00<?, ? examples/s]

Standardizing format:   0%|          | 0/850 [00:00<?, ? examples/s]

In [None]:
# Now it's time to start training the model
from trl import SFTTrainer
from transformers import TrainingArguments
from unsloth import is_bfloat16_supported

# Supervised fine-tuning run over the formatted dataset.
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=dataset,
    # NOTE(review): confirm the dataset exposes a "text" column at this point.
    dataset_text_field="text",
    max_seq_length=max_seq_length,
    dataset_num_proc=2,
    packing=False,
    args=TrainingArguments(
        # 1 sample per device x 4 accumulation steps = total batch size 4
        # (matches the "Total batch size = 4" line in the recorded training log).
        per_device_train_batch_size=1,
        gradient_accumulation_steps=4,
        warmup_steps=5,
        # The recorded output notes that max_steps overrides num_train_epochs.
        max_steps=60,
        learning_rate=2e-4,
        # Exactly one of fp16 / bf16 is enabled, depending on bfloat16 support.
        fp16=not is_bfloat16_supported(),
        bf16=is_bfloat16_supported(),
        logging_steps=1,
        output_dir="outputs",
        optim="adamw_8bit",
        weight_decay=0.01,
        lr_scheduler_type="linear",
        report_to='none'
    )
)

Map (num_proc=2):   0%|          | 0/850 [00:00<?, ? examples/s]

max_steps is given, it will override any value given in num_train_epochs


In [None]:
# Train the model (60 steps, per max_steps above); keep whatever train()
# returns in trainer_stats for later inspection.
trainer_stats = trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 850 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 1 | Gradient Accumulation steps = 4
\        /    Total batch size = 4 | Total steps = 60
 "-____-"     Number of trainable parameters = 29,884,416


Step,Training Loss
1,1.8311
2,1.7126
3,1.4726
4,1.9622
5,1.7849
6,1.5153
7,1.7181
8,1.8671
9,1.7495
10,1.385


In [None]:
# Convergence looks acceptable (training loss below 0.8), so save the
# fine-tuned model and its tokenizer to ./bds_phi3_ft.
model.save_pretrained('bds_phi3_ft')
tokenizer.save_pretrained('bds_phi3_ft')

('bds_phi3_ft/tokenizer_config.json',
 'bds_phi3_ft/special_tokens_map.json',
 'bds_phi3_ft/tokenizer.model',
 'bds_phi3_ft/added_tokens.json',
 'bds_phi3_ft/tokenizer.json')

## Part 2: Saving to Ollama and Running Eval

In [None]:
# First we have to install ollama
%%capture
!curl -fsSL https://ollama.com/install.sh | sh

In [None]:
# Export the fine-tuned model to GGUF under ./ft_model. No quantization method
# is passed; in the recorded run this produced an 8-bit q8_0 file
# (ft_model/unsloth.Q8_0.gguf), which is faster to convert.
model.save_pretrained_gguf("ft_model", tokenizer)

Unsloth: You have 1 CPUs. Using `safe_serialization` is 10x slower.
We shall switch to Pytorch saving, which will take 3 minutes and not 30 minutes.
To force `safe_serialization`, set it to `None` instead.
Unsloth: Kaggle/Colab has limited disk space. We need to delete the downloaded
model which will save 4-16GB of disk space, allowing you to save on Kaggle/Colab.
Unsloth: Will remove a cached repo with size 2.3G


Unsloth: Merging 4bit and LoRA weights to 16bit...
Unsloth: Will use up to 5.79 out of 12.67 RAM for saving.


100%|██████████| 32/32 [00:01<00:00, 16.09it/s]


Unsloth: Saving tokenizer... Done.
Unsloth: Saving model... This might take 5 minutes for Llama-7b...
Unsloth: Saving ft_model/pytorch_model-00001-of-00002.bin...
Unsloth: Saving ft_model/pytorch_model-00002-of-00002.bin...
Done.


Unsloth: Converting mistral model. Can use fast conversion = True.


==((====))==  Unsloth: Conversion from QLoRA to GGUF information
   \\   /|    [0] Installing llama.cpp will take 3 minutes.
O^O/ \_/ \    [1] Converting HF to GGUF 16bits will take 3 minutes.
\        /    [2] Converting GGUF 16bits to ['q8_0'] will take 10 minutes each.
 "-____-"     In total, you will have to wait at least 16 minutes.

Unsloth: [0] Installing llama.cpp. This will take 3 minutes...
Unsloth: [1] Converting model at ft_model into q8_0 GGUF format.
The output location will be /content/ft_model/unsloth.Q8_0.gguf
This will take 3 minutes...


Unsloth: Extending ft_model/tokenizer.model with added_tokens.json.
Originally tokenizer.model is of size (32000).
But we need to extend to sentencepiece vocab size (32011).


INFO:hf-to-gguf:Loading model: ft_model
INFO:gguf.gguf_writer:gguf: This GGUF file is for Little Endian only
INFO:hf-to-gguf:Exporting model...
INFO:hf-to-gguf:gguf: loading model weight map from 'pytorch_model.bin.index.json'
INFO:hf-to-gguf:gguf: loading model part 'pytorch_model-00001-of-00002.bin'
INFO:hf-to-gguf:token_embd.weight,           torch.float16 --> Q8_0, shape = {3072, 32064}
INFO:hf-to-gguf:blk.0.attn_q.weight,         torch.float16 --> Q8_0, shape = {3072, 3072}
INFO:hf-to-gguf:blk.0.attn_k.weight,         torch.float16 --> Q8_0, shape = {3072, 3072}
INFO:hf-to-gguf:blk.0.attn_v.weight,         torch.float16 --> Q8_0, shape = {3072, 3072}
INFO:hf-to-gguf:blk.0.attn_output.weight,    torch.float16 --> Q8_0, shape = {3072, 3072}
INFO:hf-to-gguf:blk.0.ffn_gate.weight,       torch.float16 --> Q8_0, shape = {3072, 8192}
INFO:hf-to-gguf:blk.0.ffn_up.weight,         torch.float16 --> Q8_0, shape = {3072, 8192}
INFO:hf-to-gguf:blk.0.ffn_down.weight,       torch.float16 --> Q8_

In [None]:
# Alternative (disabled): launch the Ollama server from Python and wait 5 s.
# import subprocess
# subprocess.Popen(["ollama", "serve"])
# import time
# time.sleep(5)

# Start the Ollama server in the background (`nohup ... &`), sending its
# stdout and stderr to ollama.log.
!nohup ollama serve > ollama.log 2>&1 &

In [None]:
import shutil
from google.colab import files

# Zip the exported model directory and download it.
# root_dir must be the absolute '/content/ft_model'; the previous relative
# 'content/ft_model' resolves against the working directory and does not point
# at the export folder when the notebook runs from /content.
shutil.make_archive('/content/ft_model', 'zip', '/content/ft_model')
files.download('/content/ft_model.zip')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# Write a minimal Ollama Modelfile whose only instruction is a FROM line
# pointing at the exported Q8_0 GGUF file.
with open('/content/ft_model/Modelfile', 'w') as f:
    f.write("FROM /content/ft_model/unsloth.Q8_0.gguf")

In [None]:
# Register the model with Ollama under the name "ft_model_phi".
# NOTE(review): the relative ./ft_model path assumes the working directory is
# /content, where the Modelfile was written — confirm.
!ollama create ft_model_phi -f ./ft_model/Modelfile

[?25ltransferring model data ⠙ [?25h[?25l[2K[1Gtransferring model data ⠹ [?25h[?25l[2K[1Gtransferring model data ⠸ [?25h[?25l[2K[1Gtransferring model data ⠼ [?25h[?25l[2K[1Gtransferring model data ⠴ [?25h[?25l[2K[1Gtransferring model data ⠦ [?25h[?25l[2K[1Gtransferring model data ⠧ [?25h[?25l[2K[1Gtransferring model data ⠇ [?25h[?25l[2K[1Gtransferring model data ⠇ [?25h[?25l[2K[1Gtransferring model data ⠋ [?25h[?25l[2K[1Gtransferring model data ⠙ [?25h[?25l[2K[1Gtransferring model data ⠹ [?25h[?25l[2K[1Gtransferring model data ⠹ [?25h[?25l[2K[1Gtransferring model data ⠼ [?25h[?25l[2K[1Gtransferring model data ⠴ [?25h[?25l[2K[1Gtransferring model data ⠦ [?25h[?25l[2K[1Gtransferring model data ⠧ [?25h[?25l[2K[1Gtransferring model data ⠇ [?25h[?25l[2K[1Gtransferring model data ⠏ [?25h[?25l[2K[1Gtransferring model data ⠋ [?25h[?25l[2K[1Gtransferring model data ⠙ [?25h[?25l[2K[1Gtransferring model data ⠹ [

In [None]:
# Start running inference using eval dataset
%%capture
!pip install -U langchain-ollama

In [None]:
# Prompt template for the evaluation run. `{query}` is the only placeholder and
# is filled with one evaluation question per call; the rest is sent verbatim.
str_prompt = """
You are a financial advisor responsible for helping train an AI language model
to provide comprehensive, sound financial advice based on a company's financial
history.

You will be given a question about a company's financial state, investment opportunities, risk factors,
etc. Based on your expertise and ability, answer the question clearly, correctly and concisely. Do not
include any false or misleading information.

Below is an answer of an appropriate question-answer pair

<<Example>>

What is Meta's current dividend policy and how does it affect investors looking for dividends?

<<Your Example Answer:>> Meta Platforms, Inc. has never declared or paid any cash dividends on its common stock. The company intends to retain any future earnings to finance the operation and expansion of its business and fund its share repurchase program. As a result, investors looking for dividends will not receive any from Meta and will only receive a return on their investment if the trading price of their shares increases.

Please answer in plain text only. You're a conversational AI assistant, so be sure to speak
helpfully and naturally. Do not answer with just a question. Answer the query provided by the user

Here is your question: {query}
"""

In [None]:
from langchain_core.prompts import ChatPromptTemplate
from langchain_ollama.llms import OllamaLLM
from tqdm import tqdm
import pandas as pd

# Evaluation chain: the template fills {query}, then the model registered in
# Ollama as "ft_model_phi" generates the answer.
prompt = ChatPromptTemplate.from_template(str_prompt)

# Use a dedicated name instead of rebinding `model`, which still refers to the
# fine-tuned model from Part 1; overwriting it would break any re-run of the
# earlier save/export cells.
ollama_llm = OllamaLLM(model="ft_model_phi")

chain = prompt | ollama_llm

eval_dataset = pd.read_csv('./evaluation_dataset.csv')

# One record per evaluation question; "no" is the zero-based question index.
answers = []
for idno, question in enumerate(tqdm(eval_dataset["Question"])):
    output = chain.invoke({'query': question})
    answers.append({
        "no": idno,
        "question": question,
        "answer": output
    })

# Persist all answers in one CSV for later scoring.
final = pd.DataFrame(answers)
final.to_csv("phi-3-ft-output.csv", index=False)

100%|██████████| 1210/1210 [1:21:18<00:00,  4.03s/it]


In [None]:
# IMPORTANT: Download the files you made during the process
from google.colab import files
import shutil

files.download('phi-3-ft-output.csv')
# root_dir must be the absolute '/content/ft_model'; the previous relative
# 'content/ft_model' resolves against the working directory and does not point
# at the export folder when the notebook runs from /content.
shutil.make_archive('/content/ft_model', 'zip', '/content/ft_model')
files.download('/content/ft_model.zip')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>