<a href="https://colab.research.google.com/github/piggyatbaqaqi/skol/blob/main/IST691/mistral_transfer_learning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import os, sys
import json
from pathlib import Path
import random
from typing import Any, Dict, List, Optional
import glob
from google.colab import drive
# Work from the Colab content root.
%cd /content
content = Path('/content')
# Project data lives in a SKOL folder on the mounted Google Drive.
skol = content / 'drive/My Drive/SKOL'
# Directory on Drive that holds the git checkouts.
piggyatbaqaqi = skol / 'github.com/piggyatbaqaqi'
# Mount Google Drive under /content/drive; force_remount=True is passed so the
# mount is redone even when the cell is re-run.
drive.mount(str(content / "drive"), force_remount=True)
# /content/skol is a symlink to the skol checkout on Drive; it is only created
# when nothing exists at that path yet.
skol_client = content / 'skol'
if not os.path.exists(skol_client):
  skol_client.symlink_to(piggyatbaqaqi / 'skol')
ist691 = skol_client / 'IST691'

# Put the skol checkout first on the import path (as a string).
sys.path.insert(0, str(piggyatbaqaqi / 'skol'))

/content
Mounted at /content/drive


In [3]:
# You only need to run this once per machine
try:
  import bitsandbytes
except ImportError:
  !pip install bitsandbytes
try:
  import transformers
except ImportError:
  !pip install transformers
try:
  import peft
except ImportError:
  !pip install peft
try:
  import accelerate
except ImportError:
  !pip install accelerate
try:
  import datasets
  import ipywidgets
  import fsspec
except ImportError:
  !pip install datasets scipy ipywidgets
  !pip install fsspec==2023.9.2


Collecting bitsandbytes
  Downloading bitsandbytes-0.46.0-py3-none-manylinux_2_24_x86_64.whl.metadata (10 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch<3,>=2.2->bitsandbytes)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch<3,>=2.2->bitsandbytes)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch<3,>=2.2->bitsandbytes)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch<3,>=2.2->bitsandbytes)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch<3,>=2.2->bitsandbytes)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-c

### Set up git clients

In [4]:
if not os.path.exists(piggyatbaqaqi):
  %mkdir -p $piggyatbaqaqi
if not os.path.exists(piggyatbaqaqi / 'skol'):
  %cd $piggyatbaqaqi
  !git clone https://github.com/piggyatbaqaqi/skol.git
sys.path.insert(0, piggyatbaqaqi / 'skol')
if not os.path.exists(piggyatbaqaqi / 'dr-drafts-mycosearch'):
  %cd $piggyatbaqaqi
  !git clone https://github.com/piggyatbaqaqi/dr-drafts-mycosearch.git
workdir = skol / 'IST691'
%cd $workdir

/content/drive/My Drive/SKOL/IST691


In [5]:
prompt = '''Please extract features, subfeatures, optional subsubfeatures, and values from the following species description.
Format the output as JSON.
The top level of the JSON is feature names. The next level in is subfeature names . The optional next level in is subsubfeature names.
The innermost layer is lists of string-valued values.
Lists are only present at the innermost level of the JSON.
Feature values that are comma-separated strings should be broken down into separate values.
Translate Latin paragraphs to English.
'''
# prompt = '''Format the output as JSON.
# Please extract features, subfeatures, optional subsubfeatures, and values from the following species description.
# The innermost layer must be formated as lists of string-valued values..
# Translate. if any, Latin paragraphs to English.
# '''

eval_description = """Fungus anamorphicus. Coloniae in substrato naturali eﬀusae, nigrae. Mycelium
superﬁciale, ex hyphis ramosis, septatis, pallide brunneis vel brunneis, laevibus, 1.5–3
μm crassis compositum. Conidiophora nulla vel brevis, 1–3-septata, brunnea vel
atrobrunnea, 11–28 × 4.5–5 μm. Cellula conidiogena monoblastica, determinatae,
solitaria, simplicia, lageniformia vel ampulliformia, brunnea vel atrobrunnea, laevia,
4.5–6.5 × 3.5–5 μm, ad apicem 3–4.5 μm crassa et truncatae. Conidiorum secessio
schizolytica. Conidia holoblastica, solitaria, acrogena, recta vel curvata, obclavata vel
obclavata-rostrata, atrobrunnea vel brunnea, laevia, 13–19-distoseptata, 130–190 μm
longa, 7–9 μm crassa, apicem versus ad 2–3 μm attenuata; cellula apicalis rotundata;
cellula basalis cylindrica vel conico-truncata, ad basim 3.5–4.5 μm crassa; Appendicibus
lateralibus 0–2, brunneae, septata, cylindricae, surgentibus ex cellulla e apicem 2nd vel
3rd.

Anamorphic fungi. Colonies on natural substrate eﬀuse, black. Mycelium
superﬁcial, composed of branched, septate, pale brown to brown, smoothwalled hyphae, 1.5–3 μm thick. Conidiophores absent or short, 1–3-septate,
brown to dark brown, 11–28 × 4.5–5 μm. Conidiogenous cells monoblastic,
determinate, solitary, simple, lageniform or ampulliform, brown to dark brown,
smooth, 4.5–6.5 × 3.5–5 μm, 3–4.5 μm wide at the truncate apex. Conidial
secession schizolytic. Conidia holoblastic, solitary, acrogenous, straight or
curved, obclavate to obclavate-rostrate, dark brown to brown, smooth, 13–19distoseptate, 130–190 μm long, 7–9 μm thick in the broadest part, tapering
to 2–3 μm near the apex; apical cells rounded; basal cell cylindrical, truncate,
3.5–4.5 μm wide; lateral appendages 0–2, brown, septate, cylindrical, arising
from the 2nd or 3rd cells from the apex.
"""

### Set up SKOL-specific code

In [6]:
from finder import read_files, parse_annotated, target_classes
from label import Label
from taxon import Taxon, group_paragraphs

SEED=12345
default_label = Label('Misc-exposition')
keep_labels = [Label('Description'), Label('Nomenclature')]

In [7]:
raw_directory_path = skol / 'raw_2025_02_05/'
ann_directory_path = skol / 'annotated_2025_02_27/journals'

In [8]:
def make_prompt(prompt: str, description: str, result: Optional[str] = None) -> str:
  """Assemble the instruction text sent to the model for one description.

  The instruction and the description are wrapped in ``<s>[INST]`` ...
  ``[/INST]`` markers and followed by a ``Result:`` line.  When ``result`` is
  supplied (training examples) it is appended inside a fenced ``json`` block
  and the text is closed with ``</s>``.

  Args:
    prompt: Instruction text placed directly after ``[INST]``.
    description: Species description to be processed.
    result: Expected JSON text for a training example, or None for inference.

  Returns:
    The assembled prompt string.
  """
  pieces = [
      f"<s>[INST]{prompt}\n",
      "\n",
      "Here is the description:\n",
      f"{description}[/INST]\n",
      "\n",
      "Result:\n",
  ]
  if result is not None:
    # Training example: add the expected answer and the closing marker.
    pieces.extend(["\n", "```json\n", f"{result}\n", "```\n", "</s>\n"])
  return "".join(pieces)

In [9]:
# Function that reports all the txt files under a Google Drive folder path
def listFiles(folder: str) -> List[str]:
  """Return every ``*.txt*`` file found recursively under ``folder``.

  Paths containing the substring ``Sydowia`` are left out.

  ``glob.glob`` does not raise for a missing or unreadable directory (it simply
  yields nothing), so the folder is checked explicitly in order to report the
  problem instead of silently returning an empty list.

  Args:
    folder: Directory to search (a ``str`` or any path-like object).

  Returns:
    The matching file paths; an empty list when the folder is missing or not
    readable.
  """
  if not os.path.isdir(folder):
    print(f"Folder '{folder}' not found.")
    return []
  if not os.access(folder, os.R_OK | os.X_OK):
    print(f"Permission denied to access folder '{folder}'.")
    return []
  matches = glob.glob(f'{folder}/**/*.txt*', recursive=True)
  return [path for path in matches if 'Sydowia' not in path]

In [10]:
# check files in annotated directory
training_files = listFiles(ann_directory_path)
training_files[:10]

['/content/drive/My Drive/SKOL/annotated_2025_02_27/journals/Mycotaxon/Vol057/n1.txt.ann',
 '/content/drive/My Drive/SKOL/annotated_2025_02_27/journals/Mycotaxon/Vol054/n1.txt.ann',
 '/content/drive/My Drive/SKOL/annotated_2025_02_27/journals/Mycotaxon/Vol118/s17.txt.ann',
 '/content/drive/My Drive/SKOL/annotated_2025_02_27/journals/Mycotaxon/Vol118/s29.txt.ann',
 '/content/drive/My Drive/SKOL/annotated_2025_02_27/journals/Mycotaxon/Vol118/s30.txt.ann',
 '/content/drive/My Drive/SKOL/annotated_2025_02_27/journals/Mycotaxon/Vol118/s7.txt.ann',
 '/content/drive/My Drive/SKOL/annotated_2025_02_27/journals/Mycotaxon/Vol118/s21.txt.ann',
 '/content/drive/My Drive/SKOL/annotated_2025_02_27/journals/Mycotaxon/Vol118/s13.txt.ann',
 '/content/drive/My Drive/SKOL/annotated_2025_02_27/journals/Mycotaxon/Vol118/s1.txt.ann',
 '/content/drive/My Drive/SKOL/annotated_2025_02_27/journals/Mycotaxon/Vol118/s46.txt.ann']

In [11]:
# Draw a reproducible sample of annotated files.  A dedicated generator seeded
# with SEED is used, and the file list is sorted first, so the same files are
# picked on every run regardless of the order glob returned them in.  The
# sample size is capped so a corpus with fewer than 100 files does not make
# random.sample raise ValueError.
sample_size = min(100, len(training_files))
sampled_files = random.Random(SEED).sample(sorted(training_files), sample_size)
paragraphs = list(parse_annotated(read_files(sampled_files)))
# Collapse every label other than keep_labels into default_label.
# NOTE(review): description inferred from target_classes' arguments — confirm against finder.py.
relabeled = list(target_classes(default=default_label, keep=keep_labels, paragraphs=paragraphs))

In [12]:
def load_json_training(filename: str) -> List[Dict[str, Any]]:
  """Parse a transcript of LLM exchanges into description/result records.

  The file is scanned line by line.  A line starting with ``Send to LLM:``
  opens a description section and a line starting with ``Result:`` opens the
  JSON result that belongs to it; every other line is body text for the
  section currently open.

  Args:
    filename: Path of the UTF-8 transcript to read.

  Returns:
    One dict per record whose result parsed as JSON, with the keys
    ``'description'`` (the raw description text) and ``'result'`` (the JSON
    re-serialised with ``indent=4`` and ``ensure_ascii=False``).  A record
    whose result is not valid JSON is reported with ``print`` and skipped.
  """
  retval: List[Dict[str, Any]] = []

  def flush(description: str, lines: List[str]) -> None:
    """Parse the buffered result lines and store the record if they are valid JSON."""
    result = ''.join(lines)
    try:
      result_dict = json.loads(result)
    except json.JSONDecodeError as err:
      # Skip the broken record.  Falling through would either reuse the
      # previous record's parsed result or hit an unbound variable.
      print(f'Err: {err}\n{result}')
      return
    retval.append({'description': description,
                   'result': json.dumps(result_dict, indent=4, ensure_ascii=False)})

  state = 'START'  # 'description', 'result'
  lines: List[str] = []
  description = ''
  with open(filename, "r", encoding="utf-8") as file:
    for line in file:
      if line.startswith('Send to LLM:'):
        # A new record begins: emit the one that was being collected.
        if state == 'result':
          flush(description, lines)
        lines = []
        state = 'description'
      elif line.startswith('Result:'):
        if state == 'description':
          description = ''.join(lines)
          lines = []
        state = 'result'
      else:
        lines.append(line)
  # The last record has no following 'Send to LLM:' marker to trigger a flush.
  if state == 'result' and len(lines) > 0:
    flush(description, lines)
  return retval


In [13]:
json_training = load_json_training(workdir / 'json_training.txt')
print(json_training[0])

{'description': '\n\nSaprobic on stems. Ascomata, superﬁcial, thyriothecial; in section conical,\nrelatively small, opening with a minute ﬂat or slightly papillate ostiole. Asci 8spored, bitunicate, ﬁssitunicate, cylindrical or obclavate. Ascospores, 1-septate,\nlight brown.\nAnamorphs: None reported for the genus (Hyde et al. 2011).\n\n\n', 'result': '{\n    "habit": [\n        "saprobic"\n    ],\n    "habitat": [\n        "on stems"\n    ],\n    "ascomata": [\n        "superficial",\n        "thyriothecial",\n        "conical",\n        "relatively small",\n        "opening with a minute flat or slightly papillate ostiole"\n    ],\n    "asci": {\n        "number of spores": [\n            "8-spored"\n        ],\n        "shape": [\n            "bitunicate",\n            "fissitunicate",\n            "cylindrical or obclavate"\n        ]\n    },\n    "ascospores": {\n        "septation": [\n            "1-septate"\n        ],\n        "color": [\n            "light brown"\n        ]\n

In [14]:
import datasets

# Wrap the parsed description/result records in a Hugging Face Dataset.
dataset = datasets.Dataset.from_list(json_training)

# First split: set one record aside as the test set, without shuffling.
new_dataset = dataset.train_test_split(test_size=1, shuffle=False)
temp_dataset, test_dataset = new_dataset["train"], new_dataset["test"]
# Second split: set one more record aside for evaluation during training.
new_dataset2 = temp_dataset.train_test_split(test_size=1, shuffle=False)
train_dataset, eval_dataset = new_dataset2["train"], new_dataset2["test"]

print(train_dataset, eval_dataset, test_dataset)

Dataset({
    features: ['description', 'result'],
    num_rows: 14
}) Dataset({
    features: ['description', 'result'],
    num_rows: 1
}) Dataset({
    features: ['description', 'result'],
    num_rows: 1
})


In [15]:
from accelerate import FullyShardedDataParallelPlugin, Accelerator
from torch.distributed.fsdp.fully_sharded_data_parallel import FullOptimStateDictConfig, FullStateDictConfig
# Fully Sharded Data Parallel settings handed to Accelerate: both the model and
# the optimizer state-dict configs offload to CPU and are not limited to rank 0.
fsdp_plugin = FullyShardedDataParallelPlugin(
    state_dict_config=FullStateDictConfig(offload_to_cpu=True, rank0_only=False),
    optim_state_dict_config=FullOptimStateDictConfig(offload_to_cpu=True, rank0_only=False),
)
# The accelerator is used later to prepare the PEFT-wrapped model.
accelerator = Accelerator(fsdp_plugin=fsdp_plugin)


In [16]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from huggingface_hub import login
from google.colab import userdata
token = userdata.get('HF_TOKEN')

login(token=token)
base_model_id = "mistralai/Mistral-7B-Instruct-v0.3"
max_length = 2048

In [25]:
# bitsandbytes quantization settings: load weights in 4-bit NF4 with double
# quantization enabled, and use bfloat16 as the compute dtype.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)
# Download (on first use) and load the base model with that quantization config.
base_model = AutoModelForCausalLM.from_pretrained(base_model_id, quantization_config=bnb_config)



config.json:   0%|          | 0.00/601 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Fetching 3 files:   0%|          | 0/3 [00:00<?, ?it/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/4.55G [00:00<?, ?B/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/4.95G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

In [17]:
# Tokenizer for the base model: sequences are capped at max_length, padding is
# added on the left, and add_eos_token=True is requested.
# NOTE(review): make_prompt already writes literal <s> / </s> markers — confirm
# the tokenizer does not end up duplicating BOS/EOS tokens.
tokenizer = AutoTokenizer.from_pretrained(
    base_model_id,
    model_max_length=max_length,
    padding_side="left",
    add_eos_token=True)
# No dedicated pad token is configured here, so the EOS token doubles as padding.
tokenizer.pad_token = tokenizer.eos_token


tokenizer_config.json:   0%|          | 0.00/141k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/587k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

In [18]:

def tokenize(prompt):
    """Tokenize one prompt into a fixed-length training example.

    The text is truncated and padded to ``max_length`` using the notebook's
    ``tokenizer``; a copy of ``input_ids`` is stored under ``labels``.

    Args:
        prompt: Full prompt text to encode.

    Returns:
        The tokenizer output with an added ``"labels"`` entry.
    """
    encoded = tokenizer(prompt, padding="max_length", truncation=True, max_length=max_length)
    # Labels start as a copy of the inputs.
    encoded["labels"] = encoded["input_ids"].copy()
    return encoded

In [19]:
def generate_and_tokenize_prompt(data_point):
    """Turn one dataset row into a tokenized training example.

    Args:
        data_point: Mapping with ``"description"`` and ``"result"`` entries.

    Returns:
        The output of ``tokenize`` for the prompt built from the notebook's
        ``prompt`` instruction, the row's description and its expected result.
    """
    description, expected = data_point["description"], data_point["result"]
    return tokenize(make_prompt(prompt, description, expected))


In [20]:

tokenized_train_dataset = train_dataset.map(generate_and_tokenize_prompt)
tokenized_val_dataset = eval_dataset.map(generate_and_tokenize_prompt)
tokenized_test_dataset = test_dataset.map(generate_and_tokenize_prompt)

Map:   0%|          | 0/14 [00:00<?, ? examples/s]

Map:   0%|          | 0/1 [00:00<?, ? examples/s]

Map:   0%|          | 0/1 [00:00<?, ? examples/s]

In [21]:
print(tokenized_train_dataset[4]['input_ids'])
print(len(tokenized_train_dataset[4]['input_ids']))
print("\n" + test_dataset[0]['description'])
print("Result: \n```json\n" + test_dataset[0]['result'] + "\n```\n")


[2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 

In [22]:
eval_prompt = make_prompt(prompt, eval_description)


In [23]:
print(eval_prompt)

<s>[INST]Please extract features, subfeatures, optional subsubfeatures, and values from the following species description.
Format the output as JSON.
The top level of the JSON is feature names. The next level in is subfeature names . The optional next level in is subsubfeature names.
The innermost layer is lists of string-valued values.
Lists are only present at the innermost level of the JSON.
Feature values that are comma-separated strings should be broken down into separate values.
Translate Latin paragraphs to English.


Here is the description:
Fungus anamorphicus. Coloniae in substrato naturali eﬀusae, nigrae. Mycelium
superﬁciale, ex hyphis ramosis, septatis, pallide brunneis vel brunneis, laevibus, 1.5–3
μm crassis compositum. Conidiophora nulla vel brevis, 1–3-septata, brunnea vel
atrobrunnea, 11–28 × 4.5–5 μm. Cellula conidiogena monoblastica, determinatae,
solitaria, simplicia, lageniformia vel ampulliformia, brunnea vel atrobrunnea, laevia,
4.5–6.5 × 3.5–5 μm, ad apicem 3–4

In [26]:
# Encode the evaluation prompt as PyTorch tensors and move them to the GPU.
base_model_input = tokenizer(eval_prompt, return_tensors="pt").to("cuda")
# Switch to inference mode and generate without tracking gradients.
base_model.eval()
with torch.no_grad():
    # pad_token_id=2 is the id that appears as padding in the tokenized training
    # rows printed earlier. NOTE(review): presumably the EOS id — confirm.
    print(tokenizer.decode(base_model.generate(**base_model_input, max_new_tokens=max_length, pad_token_id=2)[0], skip_special_tokens=True))


Please extract features, subfeatures, optional subsubfeatures, and values from the following species description.
Format the output as JSON.
The top level of the JSON is feature names. The next level in is subfeature names . The optional next level in is subsubfeature names.
The innermost layer is lists of string-valued values.
Lists are only present at the innermost level of the JSON.
Feature values that are comma-separated strings should be broken down into separate values.
Translate Latin paragraphs to English.


Here is the description:
Fungus anamorphicus. Coloniae in substrato naturali eﬀusae, nigrae. Mycelium
superﬁciale, ex hyphis ramosis, septatis, pallide brunneis vel brunneis, laevibus, 1.5–3
μm crassis compositum. Conidiophora nulla vel brevis, 1–3-septata, brunnea vel
atrobrunnea, 11–28 × 4.5–5 μm. Cellula conidiogena monoblastica, determinatae,
solitaria, simplicia, lageniformia vel ampulliformia, brunnea vel atrobrunnea, laevia,
4.5–6.5 × 3.5–5 μm, ad apicem 3–4.5 μm cra

## Run the base model on an assortment of annotated files

grouped = group_paragraphs(relabeled)
with open(workdir / 'json_training3.txt', 'w', encoding='utf-8') as f:
  for i, tax in enumerate(grouped):
    eval_prompt = make_prompt(prompt, tax.as_row()["description"])
    base_model_input = tokenizer(eval_prompt, return_tensors="pt").to("cuda")
    base_model.eval()
    print(tax.as_row()["taxon"])
    with torch.no_grad():
        print(tokenizer.decode(base_model.generate(**base_model_input, max_new_tokens=max_length, pad_token_id=2)[0], skip_special_tokens=True), file=f)


In [None]:
from peft import prepare_model_for_kbit_training
base_model.gradient_checkpointing_enable()
base_model = prepare_model_for_kbit_training(base_model)
def print_trainable_parameters(model):
    """
    Report how many of the model's parameters require gradients.

    Walks ``model.named_parameters()`` and prints the trainable element count,
    the total element count, and the trainable share as a percentage.
    """
    total = 0
    trainable = 0
    for _name, tensor in model.named_parameters():
        count = tensor.numel()
        total += count
        if tensor.requires_grad:
            trainable += count
    percent = 100 * trainable / total
    print(f"trainable params: {trainable} || all params: {total} || trainable%: {percent}")




In [27]:
from peft import LoraConfig, get_peft_model
# LoRA adapter settings: rank 8, alpha 16, dropout 0.05, no bias training,
# configured for a causal language model.
config = LoraConfig(
    r=8,
    lora_alpha=16,
    # Names of the modules that receive LoRA adapters.
    target_modules=[
        "q_proj",
        "k_proj",
        "v_proj",
        "o_proj",
        "gate_proj",
        "up_proj",
        "down_proj",
        "lm_head",
    ],
    bias="none",
    lora_dropout=0.05,  # Conventional
    task_type="CAUSAL_LM",
)
# Wrap the base model with the adapters described above.
base_model = get_peft_model(base_model, config)
# print_trainable_parameters is defined in the preceding cell; that cell must
# be run first or this call raises NameError.
print_trainable_parameters(base_model)
# Hand the wrapped model to the Accelerate instance created earlier.
base_model = accelerator.prepare_model(base_model)



NameError: name 'print_trainable_parameters' is not defined

In [None]:
print(base_model)


PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): MistralForCausalLM(
      (model): MistralModel(
        (embed_tokens): Embedding(32768, 4096)
        (layers): ModuleList(
          (0-31): 32 x MistralDecoderLayer(
            (self_attn): MistralAttention(
              (q_proj): lora.Linear4bit(
                (base_layer): Linear4bit(in_features=4096, out_features=4096, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.05, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=4096, out_features=8, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=8, out_features=4096, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lora_magnitude_vector): ModuleDict()
              )
              (k_proj)

In [None]:
if torch.cuda.device_count() > 1: # If more than 1 GPU
    base_model.is_parallelizable = True
    base_model.model_parallel = True


In [None]:
!nvidia-smi

Thu Jun 19 14:12:24 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA A100-SXM4-40GB          Off |   00000000:00:04.0 Off |                    0 |
| N/A   35C    P0             57W /  400W |    7627MiB /  40960MiB |      1%      Default |
|                                         |                        |             Disabled |
+-----------------------------------------+------------------------+----------------------+
                                                

## Fine tune the model

import transformers
from datetime import datetime
import wandb
project = "skol-finetune"
num_steps = 250
base_model_name = "mistral"
run_name = base_model_name + "-" + project
output_dir = "./" + run_name
tokenizer.pad_token = tokenizer.eos_token
wandb_token = userdata.get('WANDB_TOKEN')
wandb.login(key=wandb_token)
trainer = transformers.Trainer(
    model=base_model,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_val_dataset,
    args=transformers.TrainingArguments(
        output_dir=output_dir,
        warmup_steps=5,
        per_device_train_batch_size=2,
        gradient_accumulation_steps=4,
        # max_steps=1000,
        max_steps=num_steps,
        learning_rate=2.5e-5, # Want about 10x smaller than the Mistral learning rate
        logging_steps=10,
        bf16=True,
        optim="paged_adamw_8bit",
        logging_dir="./logs",        # Directory for storing logs
        save_strategy="steps",       # Save checkpoints on a step schedule (see save_steps)
        save_steps=10,                # Save a checkpoint every 10 steps
        eval_strategy="steps", # Evaluate on a step schedule (see eval_steps)
        eval_steps=10,               # Evaluate every 10 steps
        do_eval=True,                # Enable evaluation on eval_dataset
        report_to="wandb",           # Comment this out if you don't want to use Weights & Biases
        run_name=f"{run_name}-{datetime.now().strftime('%Y-%m-%d-%H-%M')}"          # Name of the W&B run (optional)
    ),
    data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False),
)
base_model.config.use_cache = False  # silence the warnings. Please re-enable for inference!
trainer.train()



In [None]:
# Reload a fresh quantized copy of the base model; the fine-tuned LoRA adapter
# is attached to this copy in a later cell.
base_model = AutoModelForCausalLM.from_pretrained(
    base_model_id,  # Mistral, same as before
    quantization_config=bnb_config,  # Same quantization config as before
    device_map="auto",
    trust_remote_code=True,
    token=True,  # `use_auth_token` is deprecated; `token=True` reuses the stored login
)
base_tokenizer = AutoTokenizer.from_pretrained(base_model_id, trust_remote_code=True)
# Take the pad token from this tokenizer itself rather than from the separately
# configured training tokenizer.
base_tokenizer.pad_token = base_tokenizer.eos_token




Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [None]:
print(eval_prompt)
print(prompt)

<s>[IST]Please extract features, subfeatures, optional subsubfeatures, and values from the following species description.
Format the output as JSON.
The top level of the JSON is feature names. The next level in is subfeature names . The optional next level in is subsubfeature names.
The innermost layer is lists of string-valued values.
Lists are only present at the innermost level of the JSON.
Feature values that are comma-separated strings should be broken down into separate values.
Translate Latin paragraphs to English.


Here is the description:
Original description (from Sharma & Wright 1992 with our additions within
square brackets) — “Fruitbody annual, centrally stipitate, coriaceous soft
when fresh, hard and rigid on drying. Pileus infundibuliform or depressed at
centre, circular, 5–8 mm diam., 1–1.5 mm thick, ﬁnely depressed velutinate [to
glabrous when dry], deep golden brown to cinnamon brown, shiny to glossy,
narrowly concentrically zonate. Margin sharp, deﬂexed when dry, ci

In [None]:
eval_prompt = make_prompt(prompt, eval_description)

In [None]:
from peft import PeftModel
# Attach the LoRA adapter saved at the last training step to the base model.
ft_model = PeftModel.from_pretrained(base_model, f"mistral-skol-finetune/checkpoint-{num_steps}")
ft_model.eval()
# Tokenize eval_prompt here instead of reusing base_model_input: that variable
# holds whatever prompt was tokenized last in an earlier cell, so the
# fine-tuned model could otherwise be evaluated on a different description
# than the one eval_prompt was just rebuilt from.
ft_model_input = tokenizer(eval_prompt, return_tensors="pt").to("cuda")
with torch.no_grad():
    print(tokenizer.decode(ft_model.generate(**ft_model_input, max_new_tokens=max_length, pad_token_id=2)[0], skip_special_tokens=True))


[IST]Please extract features, subfeatures, optional subsubfeatures, and values from the following species description.
Format the output as JSON.
The top level of the JSON is feature names. The next level in is subfeature names . The optional next level in is subsubfeature names.
The innermost layer is lists of string-valued values.
Lists are only present at the innermost level of the JSON.
Feature values that are comma-separated strings should be broken down into separate values.
Translate Latin paragraphs to English.


Here is the description:
Original description (from Sharma & Wright 1992 with our additions within
square brackets) — “Fruitbody annual, centrally stipitate, coriaceous soft
when fresh, hard and rigid on drying. Pileus infundibuliform or depressed at
centre, circular, 5–8 mm diam., 1–1.5 mm thick, ﬁnely depressed velutinate [to
glabrous when dry], deep golden brown to cinnamon brown, shiny to glossy,
narrowly concentrically zonate. Margin sharp, deﬂexed when dry, cilia

## Save the finetuned model and tokenizer


In [None]:
ft_model.save_pretrained(str(ist691 / 'mistral-skol-finetune'))
tokenizer.save_pretrained(str(ist691 / 'mistral-skol-finetune'))



('/content/skol/IST691/mistral-skol-finetune/tokenizer_config.json',
 '/content/skol/IST691/mistral-skol-finetune/special_tokens_map.json',
 '/content/skol/IST691/mistral-skol-finetune/chat_template.jinja',
 '/content/skol/IST691/mistral-skol-finetune/tokenizer.model',
 '/content/skol/IST691/mistral-skol-finetune/added_tokens.json',
 '/content/skol/IST691/mistral-skol-finetune/tokenizer.json')

## Prompt engineering cell
Currently disabled.

manual_prompt = '''Please extract features, subfeatures, optional subsubfeatures, and values from the following species description.
Format the output as JSON.
The top level of the JSON is feature names. The next level in is subfeature names . The optional next level in is subsubfeature names.
The innermost layer is lists of string-valued values.
Lists are only present at the innermost level of the JSON.
Feature values that are comma-separated strings should be broken down into separate values.
Translate Latin paragraphs to English.
'''

manual_description = """Fungus anamorphicus. Coloniae in substrato naturali eﬀusae, nigrae. Mycelium
superﬁciale, ex hyphis ramosis, septatis, pallide brunneis vel brunneis, laevibus, 1.5–3
μm crassis compositum. Conidiophora nulla vel brevis, 1–3-septata, brunnea vel
atrobrunnea, 11–28 × 4.5–5 μm. Cellula conidiogena monoblastica, determinatae,
solitaria, simplicia, lageniformia vel ampulliformia, brunnea vel atrobrunnea, laevia,
4.5–6.5 × 3.5–5 μm, ad apicem 3–4.5 μm crassa et truncatae. Conidiorum secessio
schizolytica. Conidia holoblastica, solitaria, acrogena, recta vel curvata, obclavata vel
obclavata-rostrata, atrobrunnea vel brunnea, laevia, 13–19-distoseptata, 130–190 μm
longa, 7–9 μm crassa, apicem versus ad 2–3 μm attenuata; cellula apicalis rotundata;
cellula basalis cylindrica vel conico-truncata, ad basim 3.5–4.5 μm crassa; Appendicibus
lateralibus 0–2, brunneae, septata, cylindricae, surgentibus ex cellulla e apicem 2nd vel
3rd.

Anamorphic fungi. Colonies on natural substrate eﬀuse, black. Mycelium
superﬁcial, composed of branched, septate, pale brown to brown, smoothwalled hyphae, 1.5–3 μ
m thick. Conidiophores absent or short, 1–3-septate,
brown to dark brown, 11–28 × 4.5–5 μm. Conidiogenous cells monoblastic,
determinate, solitary, simple, lageniform or ampulliform, brown to dark brown,
smooth, 4.5–6.5 × 3.5–5 μm, 3–4.5 μm wide at the truncate apex. Conidial
secession schizolytic. Conidia holoblastic, solitary, acrogenous, straight or
curved, obclavate to obclavate-rostrate, dark brown to brown, smooth, 13–19distoseptate, 130–190 μm long, 7–9 μm thick in the broadest part, tapering
to 2–3 μm near the apex; apical cells rounded; basal cell cylindrical, truncate,
3.5–4.5 μm wide; lateral appendages 0–2, brown, septate, cylindrical, arising
from the 2nd or 3rd cells from the apex.
"""

manual_eval_prompt = make_prompt(manual_prompt, manual_description)


manual_base_model_input = tokenizer(manual_eval_prompt, return_tensors="pt").to("cuda")
base_model.eval()
with torch.no_grad():
    print(tokenizer.decode(base_model.generate(**manual_base_model_input, max_new_tokens=max_length, pad_token_id=2)[0], skip_special_tokens=True))


In [28]:
import json
import numpy as np
from numpy.linalg import norm

def key_value_sets(json_obj):
  """Collect every key and every leaf value found anywhere in a JSON value.

  Dicts contribute their keys and are searched recursively; lists are searched
  recursively; strings are leaf values as-is; any other leaf is recorded as
  ``str(leaf)``.

  Args:
    json_obj: A decoded JSON value (dict, list, str, or other scalar).

  Returns:
    A ``(keys, values)`` tuple of two sets.
  """
  keys, values = set(), set()
  if isinstance(json_obj, dict):
    keys.update(json_obj.keys())
    children = json_obj.values()
  elif isinstance(json_obj, list):
    children = json_obj
  else:
    # Leaf: keep strings untouched, stringify everything else.
    values.add(json_obj if isinstance(json_obj, str) else str(json_obj))
    return (keys, values)

  for child in children:
    child_keys, child_values = key_value_sets(child)
    keys |= child_keys
    values |= child_values
  return (keys, values)

# Credit to Google search of "jaccard distance set python" which generated this code.
def jaccard_distance(set1, set2):
    """Return the Jaccard distance ``1 - |A ∩ B| / |A ∪ B|`` between two sets.

    Args:
        set1: First set.
        set2: Second set.

    Returns:
        A float between 0 (identical sets) and 1 (disjoint sets).  Two empty
        sets are treated as identical and give 0.0.
    """
    union = set1.union(set2)
    if not union:
        # Both sets are empty: the ratio would divide by zero, and two empty
        # sets have nothing that differs.
        return 0.0
    intersection = set1.intersection(set2)
    return 1 - len(intersection) / len(union)

def jaccard_distance_json(json1, json2):
    """
    Measure how different two JSON objects are.

    The keys (at every nesting level) of both objects are compared as sets,
    and so are their leaf values; the result is the mean of those two Jaccard
    distances.

    Args:
        json1 (dict): The first JSON object.
        json2 (dict): The second JSON object.

    Returns:
        float: The averaged Jaccard distance between the two JSON objects.
    """
    # zip pairs the two key sets first, then the two value sets.
    distances = [
        jaccard_distance(left, right)
        for left, right in zip(key_value_sets(json1), key_value_sets(json2))
    ]
    return (distances[0] + distances[1]) / 2

# Example Usage
json_training_0 = json.loads(json_training[0]['result'])
json_training_1 = json.loads(json_training[1]['result'])
#print(json_training[0]['result'])
#print(json_training[1]["result"])
print(jaccard_distance_json(json_training_0, json_training_0))
print(jaccard_distance_json(json_training_0, json_training_1))


0.0
0.9131313131313131


In [29]:
import io

def extract_json(md: str) -> Dict[str, Any]:
  """
  Extracts the first JSON object from a string of Markdown.

  Recording starts on the line after one that begins with ```json or
  "result:" (the marker line itself is discarded). Recording stops at the
  first line that begins with "}" (kept, as the object's closing brace) or
  with ``` (discarded, as the closing fence). If the text ends while still
  recording, whatever was collected is parsed.

  Lines seen before a start marker are ignored, including stray "}" or ```
  lines, so they cannot end the scan before any JSON has been collected.

  Args:
    md: Text that contains a JSON object after a ```json or "result:" line.

  Returns:
    The parsed JSON value (a dict for the object-shaped output expected here).

  Raises:
    json.JSONDecodeError: If no start marker is found (nothing recorded) or
      the recorded text is not valid JSON.
  """
  recording = False
  lines = []
  with io.StringIO(md) as f:
    for line in f:
      if line.startswith('```json') or line.startswith("result:"):
        recording = True
      elif not recording:
        # Not inside a JSON block yet: skip prompt text, stray braces, and
        # unrelated fences instead of treating them as terminators.
        continue
      elif line.startswith('```'):
        # Closing fence: the JSON block is complete.
        break
      elif line.startswith("}"):
        # A brace in column 0 closes the top-level object; keep it and stop
        # so trailing chatter after the object is not parsed.
        lines.append(line)
        break
      else:
        lines.append(line)
  # Each recorded line already carries its own newline, so join without a
  # separator.
  return json.loads("".join(lines))



In [30]:
# Use the first test-set record as the single evaluation example.
first_test_example = test_dataset[0]
test_description = first_test_example['description']
# The reference JSON text is lower-cased before parsing; the model outputs it
# is later compared with are lower-cased as well.
ground_truth = json.loads(first_test_example['result'].lower())
# Wrap the description in the instruction prompt and show what will be sent.
test_prompt = make_prompt(prompt, test_description)
print(f'test_prompt:\n{test_prompt}')

test_prompt:
<s>[INST]Please extract features, subfeatures, optional subsubfeatures, and values from the following species description.
Format the output as JSON.
The top level of the JSON is feature names. The next level in is subfeature names . The optional next level in is subsubfeature names.
The innermost layer is lists of string-valued values.
Lists are only present at the innermost level of the JSON.
Feature values that are comma-separated strings should be broken down into separate values.
Translate Latin paragraphs to English.


Here is the description:

Synnemata 200-337.5 µm longa, ex stromatibus oriunda. Stromata bene evoluta, globosa,
subglobosa vel irregularia, atrobrunnea. 50-72 µm diam., ex cellulis oblongis, cylindraceis
composita. Conidiophora 130-263 × 5-6 µm, modice brunnea vel atrobrunnea,
crassitunicata, pluriseptata. Conidia solitaria, sicca, in multitudine atro-brunnea, 25-57.5
× 5.5-8 µm, laevia, raro verruculosa, crassitunicata, 2-7 septis crassis, basi trunca

In [31]:
# Tokenize the test prompt and move the tensors to the GPU for the base model.
base_model_input = tokenizer(test_prompt, return_tensors="pt").to("cuda")

# Load a fresh copy of the base model as the comparison baseline.
# The earlier `base_model.eval()` call that preceded this load was removed:
# it ran before `base_model` is assigned in this cell, so it depended on
# state left over from another cell and would fail on a fresh kernel.
base_model_id = "mistralai/Mistral-7B-Instruct-v0.3"
# 4-bit NF4 quantization with double quantization; compute in bfloat16.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)
base_model = AutoModelForCausalLM.from_pretrained(base_model_id, quantization_config=bnb_config)
# Switch to evaluation mode; as the last expression this also displays the
# module tree as the cell output.
base_model.eval()

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

MistralForCausalLM(
  (model): MistralModel(
    (embed_tokens): Embedding(32768, 4096)
    (layers): ModuleList(
      (0-31): 32 x MistralDecoderLayer(
        (self_attn): MistralAttention(
          (q_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear4bit(in_features=4096, out_features=1024, bias=False)
          (v_proj): Linear4bit(in_features=4096, out_features=1024, bias=False)
          (o_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
        )
        (mlp): MistralMLP(
          (gate_proj): Linear4bit(in_features=4096, out_features=14336, bias=False)
          (up_proj): Linear4bit(in_features=4096, out_features=14336, bias=False)
          (down_proj): Linear4bit(in_features=14336, out_features=4096, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): MistralRMSNorm((4096,), eps=1e-05)
        (post_attention_layernorm): MistralRMSNorm((4096,), eps=1e-05)
      )
    )
    (norm): Mist

In [32]:
# Location of the fine-tuned checkpoint: the 'mistral-skol-finetune' directory
# under the notebook's IST691 folder.
ft_path = str(ist691 / 'mistral-skol-finetune')
# NOTE(review): trust_remote_code=True allows custom code shipped with a
# checkpoint to run — presumably not needed for this local path; confirm.
ft_tokenizer = AutoTokenizer.from_pretrained(ft_path, trust_remote_code=True)
# Tokenize the same test prompt for the fine-tuned model and move it to the GPU.
ft_model_input = ft_tokenizer(test_prompt, return_tensors="pt").to("cuda")
# Load the fine-tuned weights with the quantization config (bnb_config) that
# was built when the base model was loaded.
ft_model = AutoModelForCausalLM.from_pretrained(ft_path, quantization_config=bnb_config)
# Switch to evaluation mode; as the last expression this also displays the
# module tree as the cell output.
ft_model.eval()

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

MistralForCausalLM(
  (model): MistralModel(
    (embed_tokens): Embedding(32768, 4096)
    (layers): ModuleList(
      (0-31): 32 x MistralDecoderLayer(
        (self_attn): MistralAttention(
          (q_proj): lora.Linear4bit(
            (base_layer): Linear4bit(in_features=4096, out_features=4096, bias=False)
            (lora_dropout): ModuleDict(
              (default): Dropout(p=0.05, inplace=False)
            )
            (lora_A): ModuleDict(
              (default): Linear(in_features=4096, out_features=8, bias=False)
            )
            (lora_B): ModuleDict(
              (default): Linear(in_features=8, out_features=4096, bias=False)
            )
            (lora_embedding_A): ParameterDict()
            (lora_embedding_B): ParameterDict()
            (lora_magnitude_vector): ModuleDict()
          )
          (k_proj): lora.Linear4bit(
            (base_layer): Linear4bit(in_features=4096, out_features=1024, bias=False)
            (lora_dropout): ModuleDict(
 

In [33]:
# Generate a completion for the same prompt with both models. Gradients are
# not needed for inference, so generation runs under torch.no_grad().
with torch.no_grad():
  # generate() is indexed with [0] to take the single sequence produced for
  # the single prompt. NOTE(review): pad_token_id=2 is presumably the
  # tokenizer's EOS id — confirm.
  base_token_ids = base_model.generate(**base_model_input, max_new_tokens=max_length, pad_token_id=2)[0]
  ft_token_ids = ft_model.generate(**ft_model_input, max_new_tokens=max_length, pad_token_id=2)[0]
  # Decode each sequence with the tokenizer that encoded its input: the
  # fine-tuned prompt was tokenized with ft_tokenizer, so its output is
  # decoded with ft_tokenizer rather than the base tokenizer.
  # Outputs are lower-cased for comparison with the lower-cased ground truth.
  base_model_output = tokenizer.decode(base_token_ids, skip_special_tokens=True).lower()
  ft_model_output = ft_tokenizer.decode(ft_token_ids, skip_special_tokens=True).lower()
  print(f'base_model_output:\n{base_model_output}')
  print(f'ft_model_output:\n{ft_model_output}')

base_model_output:
please extract features, subfeatures, optional subsubfeatures, and values from the following species description.
format the output as json.
the top level of the json is feature names. the next level in is subfeature names . the optional next level in is subsubfeature names.
the innermost layer is lists of string-valued values.
lists are only present at the innermost level of the json.
feature values that are comma-separated strings should be broken down into separate values.
translate latin paragraphs to english.


here is the description:

synnemata 200-337.5 µm longa, ex stromatibus oriunda. stromata bene evoluta, globosa,
subglobosa vel irregularia, atrobrunnea. 50-72 µm diam., ex cellulis oblongis, cylindraceis
composita. conidiophora 130-263 × 5-6 µm, modice brunnea vel atrobrunnea,
crassitunicata, pluriseptata. conidia solitaria, sicca, in multitudine atro-brunnea, 25-57.5
× 5.5-8 µm, laevia, raro verruculosa, crassitunicata, 2-7 septis crassis, basi truncate.

In [34]:
# Parse the JSON payload out of each model's decoded text. extract_json hands
# the recorded text to json.loads, so malformed model output raises
# json.JSONDecodeError here.
base_model_result = extract_json(base_model_output)
ft_model_result = extract_json(ft_model_output)

In [38]:
# Show each parsed structure side by side: a heading, the Python type, then
# the JSON pretty-printed with non-ASCII characters (e.g. "µ") left readable.
for heading, parsed in (
    ("Base Model Result", base_model_result),
    ("Fine-Tuned Model Result", ft_model_result),
    ("Ground Truth", ground_truth),
):
  print(heading)
  print(type(parsed))
  print(json.dumps(parsed, indent=4, ensure_ascii=False))

Base Model Result
<class 'dict'>
{
    "speciesdescription": [
        {
            "leafspots": [
                {
                    "location": "amphigenous",
                    "shape": [
                        "circular",
                        "angular",
                        "irregular"
                    ],
                    "pattern": [
                        "dark brown",
                        "pale brown"
                    ],
                    "size": [
                        "1-3 cm diam."
                    ]
                }
            ],
            "conidiomata": [
                {
                    "type": [
                        "synnematous",
                        "caespituli"
                    ],
                    "origin": [
                        "hypogenous",
                        "amphigenous"
                    ],
                    "color": [
                        "blackish brown"
                    ],
                 

In [36]:
# Score each model's extracted JSON against the ground truth
# (0.0 = identical key/value sets, 1.0 = nothing in common).
for heading, parsed in (
    ("Base Model Jaccard Distance", base_model_result),
    ("Fine-Tuned Model Jaccard Distance", ft_model_result),
):
  print(heading)
  print(jaccard_distance_json(parsed, ground_truth))

Base Model Jaccard Distance
0.7620967741935484
Fine-Tuned Model Jaccard Distance
0.7987341772151899
