<a href="https://colab.research.google.com/github/piggyatbaqaqi/skol/blob/main/IST691/mistral_transfer_learning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import os, sys
import json
from pathlib import Path
import random
from typing import Any, Dict, List
import glob
from google.colab import drive
# Colab environment setup: work from /content, mount Google Drive, and link
# the Drive-hosted checkout and caches into /content.
%cd /content
content = Path('/content')
skol = content / 'drive/My Drive/SKOL'
piggyatbaqaqi = skol / 'github.com/piggyatbaqaqi'
# The paths above are only names until Drive is mounted here;
# force_remount=True remounts even if Drive is already mounted.
drive.mount(str(content / "drive"), force_remount=True)
cache_path = content / 'cache'
ollama_cache_path = content / 'ollama_cache'
# Disabled: symlink for a Drive-hosted package directory.
# nb_path = content / 'packages_mistral'
# if not os.path.exists(nb_path):
#   nb_path.symlink_to(skol / 'packages_mistral')
skol_client = content / 'skol'
# Create each symlink only if nothing exists at the link path yet.
# NOTE(review): os.path.exists() is False for a dangling symlink, so a stale
# link left by an earlier session would make symlink_to() raise — confirm.
if not os.path.exists(skol_client):
  skol_client.symlink_to(piggyatbaqaqi / 'skol')
if not os.path.exists(cache_path):
  cache_path.symlink_to(skol / 'pip_cache')
if not os.path.exists(ollama_cache_path):
  ollama_cache_path.symlink_to(skol / 'ollama_cache')
# Point OLLAMA_MODELS at the symlinked cache (presumably so Ollama model
# files persist on Drive — TODO confirm; Ollama is not used in this notebook).
os.environ['OLLAMA_MODELS'] = str(ollama_cache_path)
ist691 = skol_client / 'IST691'

# sys.path.insert(0, str(nb_path))
# Make the modules in the skol checkout importable (finder, label, taxon below).
sys.path.insert(0, str(piggyatbaqaqi / 'skol'))

/content
Mounted at /content/drive


In [2]:
# You only need to run this once per machine
# Each package is installed only when importing it fails, so re-running this
# cell on a machine that already has the packages does not reinstall them.
# NOTE(review): none of the installs except fsspec is version-pinned, so a
# fresh machine may resolve different versions than the run recorded here.
try:
  import bitsandbytes
except ImportError:
  !pip install bitsandbytes
try:
  import transformers
except ImportError:
  !pip install transformers
try:
  import peft
except ImportError:
  !pip install peft
try:
  import accelerate
except ImportError:
  !pip install accelerate
# The three imports below share one guard: if any of them is missing, all of
# datasets/scipy/ipywidgets are installed and fsspec is pinned to 2023.9.2.
try:
  import datasets
  import ipywidgets
  import fsspec
except ImportError:
  !pip install datasets scipy ipywidgets
  !pip install fsspec==2023.9.2


Collecting bitsandbytes
  Downloading bitsandbytes-0.46.0-py3-none-manylinux_2_24_x86_64.whl.metadata (10 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch<3,>=2.2->bitsandbytes)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch<3,>=2.2->bitsandbytes)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch<3,>=2.2->bitsandbytes)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch<3,>=2.2->bitsandbytes)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch<3,>=2.2->bitsandbytes)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-c

### Set up git clients

In [3]:
if not os.path.exists(piggyatbaqaqi):
  %mkdir -p $piggyatbaqaqi
if not os.path.exists(piggyatbaqaqi / 'skol'):
  %cd $piggyatbaqaqi
  !git clone https://github.com/piggyatbaqaqi/skol.git
sys.path.insert(0, piggyatbaqaqi / 'skol')
if not os.path.exists(piggyatbaqaqi / 'dr-drafts-mycosearch'):
  %cd $piggyatbaqaqi
  !git clone https://github.com/piggyatbaqaqi/dr-drafts-mycosearch.git
workdir = skol / 'IST691'
%cd $workdir

/content/drive/My Drive/SKOL/IST691


### Set up SKOL-specific code

In [4]:
from finder import read_files, parse_annotated, target_classes
from label import Label
from taxon import Taxon, group_paragraphs

# Random seed constant for the notebook.
SEED=12345
# Label passed as `default` to target_classes() below.
default_label = Label('Misc-exposition')
# Labels passed as `keep` to target_classes() below.
keep_labels = [Label('Description'), Label('Nomenclature')]

In [5]:
# Data directories under the SKOL folder on Drive.
# NOTE(review): raw_directory_path is not referenced by any later cell visible
# in this notebook; only the annotated journals directory is read.
raw_directory_path = skol / 'raw_2025_02_05/'
ann_directory_path = skol / 'annotated_2025_02_27/journals'

In [None]:
def listFiles(folder: str) -> List[str]:
  """Return the text files found recursively under ``folder``.

  Matches every path of the form ``<folder>/**/*.txt*`` (so suffixed names
  such as ``foo.txt.ann`` are included) and drops any path containing the
  substring ``'Sydowia'``.

  Args:
    folder: Directory to search. A ``pathlib.Path`` works as well as a
      string, because the value is only interpolated into the glob pattern.

  Returns:
    The matching paths in sorted order, so results are stable between runs.
    An empty list is returned (after printing a message) when the folder
    does not exist or cannot be read, so callers always receive a list.
  """
  # glob.glob() silently returns [] for a missing or unreadable directory
  # instead of raising, so both conditions are checked explicitly.
  if not os.path.isdir(folder):
    print(f"Folder '{folder}' not found.")
    return []
  if not os.access(folder, os.R_OK | os.X_OK):
    print(f"Permission denied to access folder '{folder}'.")
    return []
  matches = glob.glob(f'{folder}/**/*.txt*', recursive=True)
  return sorted(path for path in matches if 'Sydowia' not in path)

In [None]:
# Collect the annotated text files and show the first ten paths as a
# sanity check that the annotated directory is reachable.
training_files = listFiles(ann_directory_path)
training_files[:10]

In [None]:
# Parse a random sample of up to 20 annotated files. The sample is drawn from
# a generator seeded with SEED so the same files are chosen on every run, and
# the sample size is clamped so a folder with fewer than 20 files does not
# make random.sample() raise ValueError.
sample_size = min(20, len(training_files))
sampled_files = random.Random(SEED).sample(training_files, sample_size)
paragraphs = list(parse_annotated(read_files(sampled_files)))
# Relabel the parsed paragraphs via target_classes(), passing keep_labels as
# `keep` and default_label as `default`.
relabeled = list(target_classes(default=default_label, keep=keep_labels, paragraphs=paragraphs))

In [None]:
# Instruction text placed ahead of each species description when building the
# training prompts (generate_and_tokenize_prompt) and the evaluation prompt.
prompt = '''Please extract features, subfeatures, optional subsubfeatures, and values from the following species description.
Format the output as JSON.
The top level of the JSON is feature names. The next level in is subfeature names . The optional next level in is subsubfeature names.
The innermost layer is lists of string-valued values.
Lists are only present at the innermost level of the JSON.
Feature values that are comma-separated strings should be broken down into separate values.
Translate Latin paragraphs to English.
'''
# Alternative, shorter instruction text, currently disabled:
# prompt = '''Format the output as JSON.
# Please extract features, subfeatures, optional subsubfeatures, and values from the following species description.
# The innermost layer must be formated as lists of string-valued values..
# Translate. if any, Latin paragraphs to English.
# '''


# Sample species description (a Latin paragraph followed by an English
# paragraph) that is interpolated into eval_prompt for the generation checks.
description = """Fungus anamorphicus. Coloniae in substrato naturali eﬀusae, nigrae. Mycelium
superﬁciale, ex hyphis ramosis, septatis, pallide brunneis vel brunneis, laevibus, 1.5–3
μm crassis compositum. Conidiophora nulla vel brevis, 1–3-septata, brunnea vel
atrobrunnea, 11–28 × 4.5–5 μm. Cellula conidiogena monoblastica, determinatae,
solitaria, simplicia, lageniformia vel ampulliformia, brunnea vel atrobrunnea, laevia,
4.5–6.5 × 3.5–5 μm, ad apicem 3–4.5 μm crassa et truncatae. Conidiorum secessio
schizolytica. Conidia holoblastica, solitaria, acrogena, recta vel curvata, obclavata vel
obclavata-rostrata, atrobrunnea vel brunnea, laevia, 13–19-distoseptata, 130–190 μm
longa, 7–9 μm crassa, apicem versus ad 2–3 μm attenuata; cellula apicalis rotundata;
cellula basalis cylindrica vel conico-truncata, ad basim 3.5–4.5 μm crassa; Appendicibus
lateralibus 0–2, brunneae, septata, cylindricae, surgentibus ex cellulla e apicem 2nd vel
3rd.

Anamorphic fungi. Colonies on natural substrate eﬀuse, black. Mycelium
superﬁcial, composed of branched, septate, pale brown to brown, smoothwalled hyphae, 1.5–3 μm thick. Conidiophores absent or short, 1–3-septate,
brown to dark brown, 11–28 × 4.5–5 μm. Conidiogenous cells monoblastic,
determinate, solitary, simple, lageniform or ampulliform, brown to dark brown,
smooth, 4.5–6.5 × 3.5–5 μm, 3–4.5 μm wide at the truncate apex. Conidial
secession schizolytic. Conidia holoblastic, solitary, acrogenous, straight or
curved, obclavate to obclavate-rostrate, dark brown to brown, smooth, 13–19distoseptate, 130–190 μm long, 7–9 μm thick in the broadest part, tapering
to 2–3 μm near the apex; apical cells rounded; basal cell cylindrical, truncate,
3.5–4.5 μm wide; lateral appendages 0–2, brown, septate, cylindrical, arising
from the 2nd or 3rd cells from the apex.
"""

In [None]:
def load_json_training(filename: str) -> List[Dict[str, Any]]:
  """Load (description, result) training pairs from a transcript file.

  The file is read line by line. A line starting with ``'Send to LLM:'``
  begins a description section and a line starting with ``'Result:'`` begins
  the JSON result section for that description. The marker lines themselves
  are not kept. A record is emitted when the next ``'Send to LLM:'`` marker
  is reached or, for the last record, at end of file if any result lines
  were collected.

  Args:
    filename: Path of the transcript file (opened as UTF-8).

  Returns:
    A list of ``{'description': str, 'result': str}`` dicts, where ``result``
    is the parsed JSON re-serialized with ``json.dumps``. A record whose
    result is not valid JSON is reported on stdout and skipped, so a
    description is never paired with another record's result.
  """
  retval: List[Dict[str, Any]] = []

  def flush(description: str, result_lines: List[str]) -> None:
    # Parse the collected result text and append the record if it is valid.
    result = ''.join(result_lines)
    try:
      result_dict = json.loads(result)
    except json.JSONDecodeError as err:
      print(f'Err: {err}\n{result}')
      return
    retval.append({'description': description, 'result': json.dumps(result_dict)})

  state = 'START'  # 'description', 'result'
  with open(filename, "r", encoding="utf-8") as file:
    lines: List[str] = []
    description = ''
    for line in file:
      if line.startswith('Send to LLM:'):
        # A new description starts; emit the record that just finished.
        if state == "result":
          flush(description, lines)
        lines = []
        state = 'description'
      elif line.startswith('Result:'):
        # The description ends; what follows is its JSON result.
        if state == "description":
          description = ''.join(lines)
          lines = []
        state = 'result'
      else:
        lines.append(line)
    # Emit the trailing record, which has no following marker.
    if state == 'result' and len(lines) > 0:
      flush(description, lines)
  return retval


In [None]:
# Load the description/result pairs from the working directory and print the
# first record as a spot check (raises IndexError if no record was loaded).
json_training = load_json_training(workdir / 'json_training.txt')
print(json_training[0])

In [None]:
import datasets

# Wrap the parsed records in a Hugging Face Dataset.
dataset = datasets.Dataset.from_list(json_training)

# Hold out one example for the test set, then one of the remaining examples
# for evaluation; everything else is used for training. An integer test_size
# is an absolute number of examples. seed=SEED makes both shuffled splits
# reproducible across runs.
new_dataset = dataset.train_test_split(test_size=1, seed=SEED)
temp_dataset = new_dataset["train"]
test_dataset = new_dataset["test"]
new_dataset2 = temp_dataset.train_test_split(test_size=1, seed=SEED)
train_dataset = new_dataset2["train"]
eval_dataset = new_dataset2["test"]

print(train_dataset, eval_dataset, test_dataset)

In [None]:
from accelerate import FullyShardedDataParallelPlugin, Accelerator
from torch.distributed.fsdp.fully_sharded_data_parallel import FullOptimStateDictConfig, FullStateDictConfig
# Configure Accelerate's FSDP plugin: both the model and optimizer state-dict
# configs use the "full" variants, offloaded to CPU, with rank0_only=False.
fsdp_plugin = FullyShardedDataParallelPlugin(
    state_dict_config=FullStateDictConfig(offload_to_cpu=True, rank0_only=False),
    optim_state_dict_config=FullOptimStateDictConfig(offload_to_cpu=True, rank0_only=False),
)
# The accelerator is used later to prepare the PEFT-wrapped model.
accelerator = Accelerator(fsdp_plugin=fsdp_plugin)


In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
# Hugging Face model id of the base model to fine-tune.
base_model_id = "mistralai/Mistral-7B-Instruct-v0.3"
# bitsandbytes quantization settings: 4-bit NF4 weights with double
# quantization, computing in bfloat16.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)
# Load the base model with the quantization config applied.
model = AutoModelForCausalLM.from_pretrained(base_model_id, quantization_config=bnb_config)



In [None]:
# Training tokenizer: sequences are capped at 1024 tokens, padded on the
# left, and an EOS token is appended to every encoded sequence.
tokenizer = AutoTokenizer.from_pretrained(
    base_model_id,
    model_max_length=1024,
    padding_side="left",
    add_eos_token=True)
# Reuse EOS as the padding token (presumably because the tokenizer ships
# without a dedicated pad token — TODO confirm).
tokenizer.pad_token = tokenizer.eos_token


In [None]:

def tokenize(prompt):
    """Encode ``prompt`` with the notebook tokenizer for causal-LM training.

    The text is truncated and padded to exactly 1024 tokens. The returned
    encoding gains a ``"labels"`` entry that is a copy of ``"input_ids"``.
    """
    encoding_options = dict(truncation=True, max_length=1024, padding="max_length")
    encoded = tokenizer(prompt, **encoding_options)
    # Labels start out identical to the inputs; copy so the two can diverge.
    encoded["labels"] = encoded["input_ids"].copy()
    return encoded

In [None]:
def generate_and_tokenize_prompt(data_point):
    """Build the full training text for one record and tokenize it.

    ``data_point`` must provide ``"description"`` and ``"result"`` entries.
    The instruction prompt and description are wrapped in ``[INST]...[/INST]``
    and followed by the expected result, then passed to ``tokenize``.
    """
    description_text = data_point["description"]
    result_text = data_point["result"]
    # NOTE(review): the text already contains literal <s> and </s> markers,
    # and the tokenizer is configured with add_eos_token=True — confirm the
    # special tokens are not duplicated in the encoded sequence.
    full_prompt = (
        f"<s>[INST]{prompt}\n"
        "Here is the description:\n"
        f"{description_text}[/INST]\n"
        "Result:\n"
        f"{result_text}</s>\n"
    )
    return tokenize(full_prompt)


In [None]:

# Apply the prompt builder/tokenizer to the train, eval and test splits, in
# that order, producing one tokenized dataset per split.
tokenized_train_dataset, tokenized_val_dataset, tokenized_test_dataset = (
    split.map(generate_and_tokenize_prompt)
    for split in (train_dataset, eval_dataset, test_dataset)
)

In [None]:
# Spot checks: show the token ids and the sequence length of the training
# example at index 4, then the raw description/result of the first test example.
print(tokenized_train_dataset[4]['input_ids'])
print(len(tokenized_train_dataset[4]['input_ids']))
print("\n" + test_dataset[0]['description'])
print("Result: " + test_dataset[0]['result'] + "\n")


In [None]:
# Evaluation prompt for the sample description. It ends at "Result:" so the
# model generates the result itself.
# NOTE(review): unlike the training template in generate_and_tokenize_prompt,
# there is a blank line between the instructions and "Here is the
# description:" — confirm the difference is intended.
eval_prompt = f"""<s>[INST]{prompt}

Here is the description:
{description}[/INST]
Result:
"""



In [None]:
# Show the fully rendered evaluation prompt.
print(eval_prompt)

In [None]:
# Generate from the evaluation prompt with the model as loaded above (this
# cell precedes the LoRA wrapping and training cells).
model_input = tokenizer(eval_prompt, return_tensors="pt").to("cuda")
model.eval()
# no_grad: inference only, so no gradients are tracked.
with torch.no_grad():
    # pad_token_id=2 is hard-coded — presumably the EOS token id, matching
    # tokenizer.pad_token = tokenizer.eos_token above; TODO confirm.
    print(tokenizer.decode(model.generate(**model_input, max_new_tokens=2048, pad_token_id=2)[0], skip_special_tokens=True))


In [None]:
from peft import prepare_model_for_kbit_training
# Enable gradient checkpointing on the quantized model, then run PEFT's
# k-bit training preparation on it before the LoRA adapters are attached.
model.gradient_checkpointing_enable()
model = prepare_model_for_kbit_training(model)
def print_trainable_parameters(model):
    """
    Report how many of the model's parameters are trainable.

    Prints the trainable count, the total count, and the trainable share as a
    percentage, based on each parameter's ``requires_grad`` flag.
    """
    # One (element count, trainable?) pair per named parameter.
    sizes = [(param.numel(), param.requires_grad) for _, param in model.named_parameters()]
    all_param = sum(count for count, _ in sizes)
    trainable_params = sum(count for count, trainable in sizes if trainable)
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}"
    )




In [None]:
from peft import LoraConfig, get_peft_model
# LoRA adapter configuration: rank 8 with alpha 16, no bias training, and
# 5% dropout on the adapter inputs.
config = LoraConfig(
    r=8,
    lora_alpha=16,
    # Names of the modules that receive LoRA adapters.
    target_modules=[
        "q_proj",
        "k_proj",
        "v_proj",
        "o_proj",
        "gate_proj",
        "up_proj",
        "down_proj",
        "lm_head",
    ],
    bias="none",
    lora_dropout=0.05,  # Conventional
    task_type="CAUSAL_LM",
)
# Wrap the prepared model with the adapters and report the trainable share.
model = get_peft_model(model, config)
print_trainable_parameters(model)
# Hand the wrapped model to the accelerator created earlier.
model = accelerator.prepare_model(model)



trainable params: 21260288 || all params: 3773331456 || trainable%: 0.5634354746703705


In [None]:
# Print the module tree to confirm where the LoRA layers were inserted.
print(model)


PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): MistralForCausalLM(
      (model): MistralModel(
        (embed_tokens): Embedding(32000, 4096)
        (layers): ModuleList(
          (0-31): 32 x MistralDecoderLayer(
            (self_attn): MistralAttention(
              (q_proj): lora.Linear4bit(
                (base_layer): Linear4bit(in_features=4096, out_features=4096, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.05, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=4096, out_features=8, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=8, out_features=4096, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lora_magnitude_vector): ModuleDict()
              )
              (k_proj)

In [None]:
# When more than one GPU is visible, flag the model as parallelizable and
# model-parallel (presumably so the Trainer does not wrap it in DataParallel
# — TODO confirm).
if torch.cuda.device_count() > 1: # If more than 1 GPU
    model.is_parallelizable = True
    model.model_parallel = True


In [None]:
import transformers
from datetime import datetime
# Names for this fine-tuning run; checkpoints are written to
# ./mistral-skol-finetune relative to the current working directory.
project = "skol-finetune"
base_model_name = "mistral"
run_name = base_model_name + "-" + project
output_dir = "./" + run_name
# Pad with the EOS token (same setting as when the tokenizer was created).
tokenizer.pad_token = tokenizer.eos_token
trainer = transformers.Trainer(
    model=model,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_val_dataset,
    args=transformers.TrainingArguments(
        output_dir=output_dir,
        warmup_steps=5,
        per_device_train_batch_size=2,   # 2 x 4 accumulation steps = 8 examples per optimizer step per device
        gradient_accumulation_steps=4,
        # max_steps=1000,
        max_steps=50,                # Short run; the longer setting is kept above for reference
        learning_rate=2.5e-5, # Want about 10x smaller than the Mistral learning rate
        logging_steps=5,             # Log every 5 steps
        bf16=True,
        optim="paged_adamw_8bit",
        logging_dir="./logs",        # Directory for storing logs
        save_strategy="steps",       # Save checkpoints on a step schedule
        save_steps=5,                # Save a checkpoint every 5 steps
        eval_strategy="steps", # Evaluate on a step schedule
        eval_steps=5,               # Evaluate every 5 steps
        do_eval=True,                # Enable evaluation on eval_dataset
        report_to="wandb",           # Comment this out if you don't want to use Weights & Biases
        run_name=f"{run_name}-{datetime.now().strftime('%Y-%m-%d-%H-%M')}"          # Timestamped name of the W&B run (optional)
    ),
    # mlm=False selects causal (next-token) language modeling rather than masked LM.
    data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False),
)
model.config.use_cache = False  # silence the warnings. Please re-enable for inference!
trainer.train()



No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mpiggy-yarroll[0m ([33mpiggy-yarroll-carnegie-mellon-university[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


  return fn(*args, **kwargs)


Step,Training Loss,Validation Loss
50,0.8603,0.845157




TrainOutput(global_step=50, training_loss=0.8603462219238281, metrics={'train_runtime': 7629.6929, 'train_samples_per_second': 0.052, 'train_steps_per_second': 0.007, 'total_flos': 1.53364815937536e+16, 'train_loss': 0.8603462219238281, 'epoch': 25.0})

In [None]:
# Reload a fresh copy of the quantized base model for inference, so the
# saved LoRA checkpoint can be attached to it.
base_model = AutoModelForCausalLM.from_pretrained(
    base_model_id,  # Mistral, same as before
    quantization_config=bnb_config,  # Same quantization config as before
    device_map="auto",
    # NOTE(review): trust_remote_code=True permits running code shipped in the
    # model repository; confirm it is actually needed for this model.
    trust_remote_code=True,
    # `token=True` uses the locally stored Hugging Face login token; it
    # replaces the deprecated `use_auth_token=True` argument.
    token=True,
)
# Inference tokenizer: loaded with default settings (no forced EOS, default
# padding side), unlike the training tokenizer.
tokenizer = AutoTokenizer.from_pretrained(base_model_id, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [None]:
from peft import PeftModel
# Attach the LoRA weights saved at step 50 to the freshly loaded base model.
# The checkpoint path is relative to the current working directory.
ft_model = PeftModel.from_pretrained(base_model, "mistral-skol-finetune/checkpoint-50")
ft_model.eval()
with torch.no_grad():
    # NOTE(review): model_input is not built in this cell; it is whatever an
    # earlier-executed cell last assigned, so the prompt used here depends on
    # execution order. max_new_tokens=100 caps the generation at 100 new tokens.
    print(tokenizer.decode(ft_model.generate(**model_input, max_new_tokens=100, pad_token_id=2)[0], skip_special_tokens=True))


Format the output as JSON.
Please extract features, subfeatures, optional subsubfeatures, and values from the following species description.
The innermost layer must be formated as lists of string-valued values..
Translate. if any, Latin paragraphs to English.


Fungus anamorphicus. Coloniae in substrato naturali eﬀusae, nigrae. Mycelium
superﬁciale, ex hyphis ramosis, septatis, pallide brunneis vel brunneis, laevibus, 1.5–3
μm crassis compositum. Conidiophora nulla vel brevis, 1–3-septata, brunnea vel
atrobrunnea, 11–28 × 4.5–5 μm. Cellula conidiogena monoblastica, determinatae,
solitaria, simplicia, lageniformia vel ampulliformia, brunnea vel atrobrunnea, laevia,
4.5–6.5 × 3.5–5 μm, ad apicem 3–4.5 μm crassa et truncatae. Conidiorum secessio
schizolytica. Conidia holoblastica, solitaria, acrogena, recta vel curvata, obclavata vel
obclavata-rostrata, atrobrunnea vel brunnea, laevia, 13–19-distoseptata, 130–190 μm
longa, 7–9 μm crassa, apicem versus ad 2–3 μm attenuata; cellula apicali

In [None]:
# Save the fine-tuned PEFT model and the tokenizer side by side under
# IST691/mistral-skol-finetune in the skol checkout.
ft_model.save_pretrained(str(ist691 / 'mistral-skol-finetune'))
tokenizer.save_pretrained(str(ist691 / 'mistral-skol-finetune'))



('/content/skol/IST691/mistral-skol-finetune/tokenizer_config.json',
 '/content/skol/IST691/mistral-skol-finetune/special_tokens_map.json',
 '/content/skol/IST691/mistral-skol-finetune/tokenizer.model',
 '/content/skol/IST691/mistral-skol-finetune/added_tokens.json',
 '/content/skol/IST691/mistral-skol-finetune/tokenizer.json')

In [None]:
# Two candidate instruction texts, kept under separate names so the choice of
# prompt is explicit instead of relying on one assignment overwriting another.
detailed_prompt = '''Please extract features, subfeatures, optional subsubfeatures, and values from the following species description.
Format the output as JSON.
The top level of the JSON is feature names. The next level in is subfeature names . The optional next level in is subsubfeature names.
The innermost layer is lists of string-valued values.
Lists are only present at the innermost level of the JSON.
Feature values that are comma-separated strings should be broken down into separate values.
Translate Latin paragraphs to English.
'''
terse_prompt = '''Format the output as JSON.
Please extract features, subfeatures, optional subsubfeatures, and values from the following species description.
The innermost layer must be formated as lists of string-valued values..
Translate. if any, Latin paragraphs to English.
'''
# This evaluation uses the terse prompt.
prompt = terse_prompt


# Sample species description (a Latin paragraph followed by an English one).
description = """Fungus anamorphicus. Coloniae in substrato naturali eﬀusae, nigrae. Mycelium
superﬁciale, ex hyphis ramosis, septatis, pallide brunneis vel brunneis, laevibus, 1.5–3
μm crassis compositum. Conidiophora nulla vel brevis, 1–3-septata, brunnea vel
atrobrunnea, 11–28 × 4.5–5 μm. Cellula conidiogena monoblastica, determinatae,
solitaria, simplicia, lageniformia vel ampulliformia, brunnea vel atrobrunnea, laevia,
4.5–6.5 × 3.5–5 μm, ad apicem 3–4.5 μm crassa et truncatae. Conidiorum secessio
schizolytica. Conidia holoblastica, solitaria, acrogena, recta vel curvata, obclavata vel
obclavata-rostrata, atrobrunnea vel brunnea, laevia, 13–19-distoseptata, 130–190 μm
longa, 7–9 μm crassa, apicem versus ad 2–3 μm attenuata; cellula apicalis rotundata;
cellula basalis cylindrica vel conico-truncata, ad basim 3.5–4.5 μm crassa; Appendicibus
lateralibus 0–2, brunneae, septata, cylindricae, surgentibus ex cellulla e apicem 2nd vel
3rd.

Anamorphic fungi. Colonies on natural substrate eﬀuse, black. Mycelium
superﬁcial, composed of branched, septate, pale brown to brown, smoothwalled hyphae, 1.5–3 μm thick. Conidiophores absent or short, 1–3-septate,
brown to dark brown, 11–28 × 4.5–5 μm. Conidiogenous cells monoblastic,
determinate, solitary, simple, lageniform or ampulliform, brown to dark brown,
smooth, 4.5–6.5 × 3.5–5 μm, 3–4.5 μm wide at the truncate apex. Conidial
secession schizolytic. Conidia holoblastic, solitary, acrogenous, straight or
curved, obclavate to obclavate-rostrate, dark brown to brown, smooth, 13–19distoseptate, 130–190 μm long, 7–9 μm thick in the broadest part, tapering
to 2–3 μm near the apex; apical cells rounded; basal cell cylindrical, truncate,
3.5–4.5 μm wide; lateral appendages 0–2, brown, septate, cylindrical, arising
from the 2nd or 3rd cells from the apex.
"""

# Plain prompt ending at "Result:" so the model generates the result itself.
# NOTE(review): this omits the <s>[INST]...[/INST] wrapper used by
# generate_and_tokenize_prompt during training — confirm that is intended.
eval_prompt = f"""{prompt}

{description}
Result:
"""

model_input = tokenizer(eval_prompt, return_tensors="pt").to("cuda")
# NOTE(review): this evaluates `model` (the in-memory training model), not
# `ft_model` (the checkpoint reloaded above) — confirm which one is intended.
model.eval()
with torch.no_grad():
    # Pad with the tokenizer's EOS id rather than a hard-coded token id.
    print(tokenizer.decode(model.generate(**model_input, max_new_tokens=2048, pad_token_id=tokenizer.eos_token_id)[0], skip_special_tokens=True))
