<a href="https://colab.research.google.com/github/piggyatbaqaqi/skol/blob/main/IST691/workspace.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# SKOL III: Feature extraction

created by:
* La Monte Yarroll
* Padmaja Kurmaddali
* Patrick Le

# **Mapping Data Files: Google Drive**

In [1]:
import os, sys
from pathlib import Path
from google.colab import drive
%cd /content
content = Path('/content')
skol = content / 'drive/My Drive/SKOL'
piggyatbaqaqi = skol / 'github.com/piggyatbaqaqi'
drive.mount(str(content / "drive"), force_remount=True)
cache_path = content / 'cache'
ollama_cache_path = content / 'ollama_cache'
nb_path = content / 'packages'
if not os.path.exists(nb_path):
  nb_path.symlink_to(skol / 'packages')
skol_client = content / 'skol'
if not os.path.exists(skol_client):
  skol_client.symlink_to(piggyatbaqaqi / 'skol')
if not os.path.exists(cache_path):
  cache_path.symlink_to(skol / 'pip_cache')
if not os.path.exists(ollama_cache_path):
  ollama_cache_path.symlink_to(skol / 'ollama_cache')
os.environ['OLLAMA_MODELS'] = str(ollama_cache_path)
sys.path.insert(0, str(nb_path))
sys.path.insert(0, str(piggyatbaqaqi / 'skol'))pip

/content
Mounted at /content/drive


In [2]:
!ls -l /content/
!file /content
print(sys.path)

total 8
lrwxrwxrwx 1 root root   38 Jun 12 14:09 cache -> '/content/drive/My Drive/SKOL/pip_cache'
drwx------ 7 root root 4096 Jun 12 14:09 drive
lrwxrwxrwx 1 root root   41 Jun 12 14:09 ollama_cache -> '/content/drive/My Drive/SKOL/ollama_cache'
lrwxrwxrwx 1 root root   37 Jun 12 14:09 packages -> '/content/drive/My Drive/SKOL/packages'
drwxr-xr-x 1 root root 4096 Jun 10 13:39 sample_data
lrwxrwxrwx 1 root root   58 Jun 12 14:09 skol -> '/content/drive/My Drive/SKOL/github.com/piggyatbaqaqi/skol'
/content: directory
['/content/drive/My Drive/SKOL/github.com/piggyatbaqaqi/skol', '/content/packages', '/content', '/env/python', '/usr/lib/python311.zip', '/usr/lib/python3.11', '/usr/lib/python3.11/lib-dynload', '', '/usr/local/lib/python3.11/dist-packages', '/usr/lib/python3/dist-packages', '/usr/local/lib/python3.11/dist-packages/IPython/extensions', '/usr/local/lib/python3.11/dist-packages/setuptools/_vendor', '/root/.ipython']


## Set up git clients

In [3]:
if not os.path.exists(piggyatbaqaqi):
  %mkdir -p $piggyatbaqaqi
if not os.path.exists(piggyatbaqaqi / 'skol'):
  %cd $piggyatbaqaqi
  !git clone https://github.com/piggyatbaqaqi/skol.git
sys.path.insert(0, piggyatbaqaqi / 'skol')
if not os.path.exists(piggyatbaqaqi / 'dr-drafts-mycosearch'):
  %cd $piggyatbaqaqi
  !git clone https://github.com/piggyatbaqaqi/dr-drafts-mycosearch.git
workdir = skol / 'IST691'
%cd $workdir

/content/drive/My Drive/SKOL/IST691


In [4]:
# Install PySpark, then sparknlp and the ollama client, into $nb_path (the
# /content/packages link, which is on sys.path), using $cache_path as the pip
# download cache.
# NOTE(review): versions are not pinned, so the installed releases can change
# between runs — consider pinning once a working combination is known.
! pip install --cache-dir=$cache_path --target=$nb_path pyspark
! pip install --cache-dir=$cache_path --target=$nb_path sparknlp ollama

Collecting pyspark
  Using cached pyspark-4.0.0-py2.py3-none-any.whl
Collecting py4j==0.10.9.9 (from pyspark)
  Using cached py4j-0.10.9.9-py2.py3-none-any.whl.metadata (1.3 kB)
Using cached py4j-0.10.9.9-py2.py3-none-any.whl (203 kB)
Installing collected packages: py4j, pyspark
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
dataproc-spark-connect 0.7.5 requires pyspark[connect]~=3.5.1, but you have pyspark 4.0.0 which is incompatible.[0m[31m
[0mSuccessfully installed py4j-0.10.9.9 pyspark-4.0.0
[0mCollecting sparknlp
  Using cached sparknlp-1.0.0-py3-none-any.whl.metadata (1.2 kB)
Collecting ollama
  Using cached ollama-0.5.1-py3-none-any.whl.metadata (4.3 kB)
Collecting spark-nlp (from sparknlp)
  Downloading spark_nlp-6.0.3-py2.py3-none-any.whl.metadata (19 kB)
Collecting numpy (from sparknlp)
  Using cached numpy-2.3.0-cp311-cp311-manylinux_2_28_x

In [None]:
#import needed modules
import os
import glob
import json
from typing import Any, Dict, List
from pathlib import Path
import random

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import sparknlp
from sparknlp.pretrained import PretrainedPipeline
from pyspark.sql.functions import (
    input_file_name, collect_list, concat_ws, col, udf,
    explode, collect_list, regexp_extract, regexp_replace,
    split, flatten, transform, concat)
from pyspark.sql.types import ArrayType, StringType

from pyspark.ml import Pipeline, Transformer
from pyspark.ml.feature import (
    Tokenizer, HashingTF, IDF, StringIndexer, CountVectorizer,
    PCA, VectorAssembler)
from pyspark.ml.classification import LogisticRegression, RandomForestClassifier, GBTClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.sql.functions import split, row_number, min, expr, struct
from pyspark.ml.linalg import Vectors, VectorUDT, DenseVector
from pyspark.sql.types import DoubleType, StructField, StructType

## SKOL data manipulation library

In [None]:
from finder import read_files, parse_annotated, target_classes
from label import Label
from taxon import Taxon, group_paragraphs

SEED=12345
default_label = Label('Misc-exposition')
keep_labels = [Label('Description'), Label('Nomenclature')]

# **Locating the raw and annotated data directories**

In [None]:
raw_directory_path = skol / 'raw_2025_02_05/'
ann_directory_path = skol / 'annotated_2025_02_27/journals'

## Checking the file counts in the directories

In [None]:
# Function that reports all the txt files under a Google Drive folder path
def listFiles(folder: str) -> List[str]:
  """Return the sorted paths of all `*.txt*` files found under `folder`.

  The search is recursive. Any path containing the substring 'Sydowia'
  (anywhere in the path, including the folder part) is excluded.

  Args:
    folder: Directory to search; a str or any os.PathLike.

  Returns:
    Sorted list of matching paths. An empty list is returned, after printing
    a message, when the folder is missing or cannot be read.
  """
  folder = os.fspath(folder)
  # glob.glob() silently returns [] for a missing or unreadable folder, so the
  # checks have to be explicit for the messages to ever be shown.
  if not os.path.isdir(folder):
    print(f"Folder '{folder}' not found.")
    return []
  if not os.access(folder, os.R_OK | os.X_OK):
    print(f"Permission denied to access folder '{folder}'.")
    return []
  matches = glob.glob(f'{folder}/**/*.txt*', recursive=True)
  # Sorted so that downstream sampling sees the files in a stable order.
  return sorted(path for path in matches if 'Sydowia' not in path)

In [None]:
# check files in raw directory
listFiles(raw_directory_path)[:10]

In [None]:
# check files in annotated directory
training_files = listFiles(ann_directory_path)
training_files[:10]

In [None]:
len(training_files)

In [None]:
# Parse a reproducible sample of at most 20 annotated files. A dedicated
# Random(SEED) instance makes the sample repeatable across kernel restarts.
# The conditional (rather than min()) is deliberate: the name `min` is rebound
# to pyspark.sql.functions.min by an earlier import in this notebook.
sample_size = 20 if len(training_files) >= 20 else len(training_files)
paragraphs = list(parse_annotated(read_files(random.Random(SEED).sample(training_files, sample_size))))

In [None]:
relabeled = list(target_classes(default=default_label, keep=keep_labels, paragraphs=paragraphs))

In [None]:
print(f'len(relabeled): {len(relabeled)}')

In [None]:
# Build one row per relabeled paragraph. Columns are assembled first so the
# DataFrame keeps its column set even when `relabeled` is empty.
paragraph_columns = {
    'filename': [para.filename for para in relabeled],
    'label': [para.top_label().label if para.top_label() else None for para in relabeled],
    'paragraph_number': [para.paragraph_number for para in relabeled],
    'page_number': [para.page_number for para in relabeled],
    'empirical_page_number': [para.empirical_page_number for para in relabeled],
    'line_number': [para.first_line.line_number if para.first_line else None for para in relabeled],
    'body': [str(para) for para in relabeled],
}
df = pd.DataFrame(paragraph_columns)
# Store the label as a categorical and expose its integer category codes.
df['label'] = pd.Categorical(df['label'])
df['label_code'] = df['label'].cat.codes

In [None]:
df.groupby('label', observed=True).nunique()

In [None]:
#load in library to open terminal inside google colab
# !pip install  --cache-dir=$cache_path --target=$nb_path colab-xterm
# %load_ext colabxterm


In [None]:
!ls /usr/local/lib/ollama

In [None]:
import subprocess
import time

import ollama
#open up terminal
# %xterm
#once open, first time do:
if not os.path.exists("/usr/local/lib/ollama"):
  !curl https://ollama.ai/install.sh | sh
ollama_server = subprocess.Popen(["ollama", "serve"])
time.sleep(5)  # Let the server come all the way up.
#then start the server with ollama serve &
#first time also will need to pull in a ollama version using ollama pull mistral
# preferred_model = 'gemma3:12b'
preferred_model = 'mistral'
found = False
for _, l in ollama.list():
  for m in l:
    if m.model.startswith(preferred_model):
      found = True
      break
if not found:
  print(f"Pulling model {preferred_model}")
  !ollama pull $preferred_model


In [None]:
#check here if ollama has a version and can be used, will say model you pulled
!ollama list
# !ollama pull mistral
# !ollama pull gemma3:12b

In [None]:
    import ollama

    response = ollama.generate(
        model=preferred_model,
        prompt="What is the capital of France?"
    )
    print(response["response"])

In [None]:
#test if ollama call works
import ollama
response = ollama.generate(model=preferred_model, prompt='Why is the sky blue?')
print(response['response'])

In [None]:
grouped = group_paragraphs(relabeled)
prompt = '''Please extract features, subfeatures, optional subsubfeatures, and values from the following species description.
Translate Latin paragraphs to English before any other processing.
Format the output as JSON.
The top level of the JSON is feature names. The next level in is subfeature names . The optional next level in is subsubfeature names.
The innermost layer is lists of string-valued values.
Lists are only present at the innermost level of the JSON.
Feature values that are comma-separated strings should be broken down into separate values.
'''
for i, tax in enumerate(grouped):
  # Uncomment to limit the run while experimenting:
  # if i > 10:
  #   break
  description_text = tax.as_row()["description"]
  print(f'Send to LLM:\n\n{description_text}')
  # Single user turn: the extraction instructions followed by the description.
  user_message = {
     'role': 'user',
     'content': f'{prompt}\n\n{description_text}',
  }
  response = ollama.chat(model=preferred_model, messages=[user_message])
  print('Result:')
  print(response['message']['content'])

In [None]:
# Initial implementation came from a Google search of
# "cosine similarity of two JSON objects python".
# We need to adjust this to handle a 3 level structure of
# (feature, subfeature, value), where the values may be list based, and
# may be categorical. We need to build a dictionary of known features,
# subfeatures, and values to be used to assign numerical values.
import json
import numpy as np
from numpy.linalg import norm

def cosine_similarity_json(json1, json2):
    """
    Compute the cosine similarity of two flat JSON objects.

    Each object is treated as a sparse vector indexed by its keys; a key
    missing from one object contributes 0 for that object.

    Args:
        json1 (dict): The first JSON object.
        json2 (dict): The second JSON object.

    Returns:
        float: The cosine similarity, or 0 when either vector is all zeros.
    """
    # Fix one key order so both vectors share the same axes.
    shared_axes = list(set(json1) | set(json2))

    first = np.array([json1.get(axis, 0) for axis in shared_axes])
    second = np.array([json2.get(axis, 0) for axis in shared_axes])

    # A zero vector has no direction; avoid dividing by a zero norm.
    if not (np.any(first) and np.any(second)):
      return 0

    magnitude = norm(first) * norm(second)
    return np.dot(first, second) / magnitude

# Example Usage
# Two small JSON documents that overlap on the keys "b" and "c".
json_string1 = '{"a": 1, "b": 2, "c": 3}'
json_string2 = '{"b": 4, "c": 5, "d": 6}'

json_object1, json_object2 = (
    json.loads(text) for text in (json_string1, json_string2))

similarity = cosine_similarity_json(json_object1, json_object2)
print(f"Cosine similarity: {similarity}")

In [None]:
prompt = '''Please extract features, subfeatures, optional subsubfeatures, and values from the following species description.
Format the output as JSON.
The top level of the JSON is feature names. The next level in is subfeature names . The optional next level in is subsubfeature names.
The innermost layer is lists of string-valued values.
Lists are only present at the innermost level of the JSON.
Feature values that are comma-separated strings should be broken down into separate values.
Translate Latin paragraphs to English.
'''

### The JSON keys are by feature, further broken down by subfeature (make sure to distinguish Type from Shape) and further broken down by optional subsubfeature, with lists of string values at the innermost layer.

description = """Fungus anamorphicus. Coloniae in substrato naturali eﬀusae, nigrae. Mycelium
superﬁciale, ex hyphis ramosis, septatis, pallide brunneis vel brunneis, laevibus, 1.5–3
μm crassis compositum. Conidiophora nulla vel brevis, 1–3-septata, brunnea vel
atrobrunnea, 11–28 × 4.5–5 μm. Cellula conidiogena monoblastica, determinatae,
solitaria, simplicia, lageniformia vel ampulliformia, brunnea vel atrobrunnea, laevia,
4.5–6.5 × 3.5–5 μm, ad apicem 3–4.5 μm crassa et truncatae. Conidiorum secessio
schizolytica. Conidia holoblastica, solitaria, acrogena, recta vel curvata, obclavata vel
obclavata-rostrata, atrobrunnea vel brunnea, laevia, 13–19-distoseptata, 130–190 μm
longa, 7–9 μm crassa, apicem versus ad 2–3 μm attenuata; cellula apicalis rotundata;
cellula basalis cylindrica vel conico-truncata, ad basim 3.5–4.5 μm crassa; Appendicibus
lateralibus 0–2, brunneae, septata, cylindricae, surgentibus ex cellulla e apicem 2nd vel
3rd.

Anamorphic fungi. Colonies on natural substrate eﬀuse, black. Mycelium
superﬁcial, composed of branched, septate, pale brown to brown, smoothwalled hyphae, 1.5–3 μm thick. Conidiophores absent or short, 1–3-septate,
brown to dark brown, 11–28 × 4.5–5 μm. Conidiogenous cells monoblastic,
determinate, solitary, simple, lageniform or ampulliform, brown to dark brown,
smooth, 4.5–6.5 × 3.5–5 μm, 3–4.5 μm wide at the truncate apex. Conidial
secession schizolytic. Conidia holoblastic, solitary, acrogenous, straight or
curved, obclavate to obclavate-rostrate, dark brown to brown, smooth, 13–19distoseptate, 130–190 μm long, 7–9 μm thick in the broadest part, tapering
to 2–3 μm near the apex; apical cells rounded; basal cell cylindrical, truncate,
3.5–4.5 μm wide; lateral appendages 0–2, brown, septate, cylindrical, arising
from the 2nd or 3rd cells from the apex.
"""

In [None]:
# Send the extraction instructions plus the sample description as one user turn.
user_message = {
    'role': 'user',
    'content': f'{prompt}:\n\n{description}',
}
response = ollama.chat(model=preferred_model, messages=[user_message])
print('Result:')
print(response['message']['content'])

In [None]:
def _append_training_record(
    retval: List[Dict[str, Any]], description: str, lines: List[str]) -> None:
  """Parse the buffered result lines as JSON and append a record on success."""
  result = ''.join(lines)
  try:
    result_dict = json.loads(result)
  except json.JSONDecodeError as err:
    # Report and skip: appending here would either raise NameError (first
    # record) or silently reuse the previous record's parsed result.
    print(f'Err: {err}\n{result}')
    return
  retval.append({'description': description, 'result': json.dumps(result_dict)})


def load_json_training(filename: str) -> List[Dict[str, Any]]:
  """Load (description, result) training pairs from a transcript file.

  The file is a sequence of records. Each record starts with a line beginning
  'Send to LLM:', followed by the description lines, then a line beginning
  'Result:', followed by the lines of a JSON document. The marker lines
  themselves are discarded.

  Args:
    filename: Path of the transcript file (UTF-8).

  Returns:
    One dict per record whose result parsed as JSON, with keys 'description'
    (the raw description text) and 'result' (the JSON re-serialized with
    json.dumps). Records whose result is not valid JSON are reported on
    stdout and skipped.
  """
  retval: List[Dict[str, Any]] = []
  state = 'START'  # then 'description' or 'result'
  lines: List[str] = []
  description = ''
  with open(filename, "r", encoding="utf-8") as file:
    for line in file:
      if line.startswith('Send to LLM:'):
        # A new record begins; flush the result of the previous one, if any.
        if state == 'result':
          _append_training_record(retval, description, lines)
        lines = []
        state = 'description'
      elif line.startswith('Result:'):
        if state == 'description':
          description = ''.join(lines)
          lines = []
        state = 'result'
      else:
        lines.append(line)
  # The last record has no following 'Send to LLM:' marker to trigger a flush.
  if state == 'result' and lines:
    _append_training_record(retval, description, lines)
  return retval


In [None]:
json_training = load_json_training(workdir / 'json_training.txt')
print(json_training[0])

In [None]:
# Install/upgrade the fine-tuning stack: bitsandbytes, then transformers, peft
# and accelerate from their GitHub repositories, then datasets/scipy/ipywidgets.
# NOTE(review): the git installs track each repository's default branch and
# nothing is pinned, so results may differ between runs — consider pinning.
!pip install -q -U bitsandbytes
!pip install -q -U git+https://github.com/huggingface/transformers.git
!pip install -q -U git+https://github.com/huggingface/peft.git
!pip install -q -U git+https://github.com/huggingface/accelerate.git
!pip install -q -U datasets scipy ipywidgets

In [None]:
#log in to hugging face client to access model
#!huggingface-cli login

In [None]:
import torch
from transformers import AutoTokenizer
from transformers import AutoModelForCausalLM
from transformers import BitsAndBytesConfig
base_model_id = "mistralai/Mistral-7B-v0.1"

# Quantization settings: 4-bit loading, "nf4" quant type, double quantization
# enabled, bfloat16 as the compute dtype.
quantization_options = dict(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
)
bnb_config = BitsAndBytesConfig(**quantization_options)

model = AutoModelForCausalLM.from_pretrained(
    base_model_id,
    quantization_config=bnb_config,
)

In [None]:
import datasets

dataset = datasets.Dataset.from_list(json_training)

# Hold out one example for test, then one of the remaining examples for
# validation. An integer test_size is an absolute number of examples; the
# seed makes both shuffles repeatable across kernel restarts.
# NOTE(review): a single-example test/validation set is very small — confirm
# this is intended rather than a fractional split.
new_dataset = dataset.train_test_split(test_size=1, seed=SEED)
temp_dataset = new_dataset["train"]
test_dataset = new_dataset["test"]
new_dataset2 = temp_dataset.train_test_split(test_size=1, seed=SEED)
train_dataset = new_dataset2["train"]
val_dataset = new_dataset2["test"]

print(train_dataset,val_dataset,test_dataset)

In [None]:
# Training tokenizer: 512-token limit, left padding, EOS appended to each text.
tokenizer_options = {
    "model_max_length": 512,
    "padding_side": "left",
    "add_eos_token": True,
}
tokenizer = AutoTokenizer.from_pretrained(base_model_id, **tokenizer_options)
# Reuse the EOS token as the padding token.
tokenizer.pad_token = tokenizer.eos_token

In [None]:
def tokenize(prompt):
    """Tokenize `prompt` with the notebook-level `tokenizer` and add labels.

    The text is truncated and padded to 512 tokens. The returned encoding
    gains a "labels" entry that is a copy of its "input_ids".
    """
    encoding_options = {
        "truncation": True,
        "max_length": 512,
        "padding": "max_length",
    }
    encoded = tokenizer(prompt, **encoding_options)
    # Copy so that later changes to the labels leave input_ids untouched.
    encoded["labels"] = encoded["input_ids"].copy()
    return encoded

In [None]:
def generate_and_tokenize_prompt(data_point):
    """Render one training record as a full prompt and tokenize it.

    The prompt is the notebook-level `prompt` text followed by the record's
    "description" and "result" fields, each under an indented heading.
    """
    description_text = data_point["description"]
    result_text = data_point["result"]
    full_prompt = (
        f"{prompt}:\n"
        "    description:\n"
        f"    {description_text}\n"
        "    result:\n"
        f"    {result_text}\n"
    )
    return tokenize(full_prompt)

In [None]:
# Tokenize the three splits in the same order as before: train, val, test.
tokenized_train_dataset, tokenized_val_dataset, tokenized_test_dataset = [
    part.map(generate_and_tokenize_prompt)
    for part in (train_dataset, val_dataset, test_dataset)
]
# Same layout as the training prompt, but stopping after the "result:" heading
# so the model is left to produce the result itself.
eval_description = train_dataset[0]["description"]
eval_prompt = (
    f"{prompt}:\n"
    "    description:\n"
    f"    {eval_description}\n"
    "    result:\n"
)

In [None]:
# Use a separate tokenizer for inference. The training tokenizer was built with
# add_eos_token=True, so it would end the prompt with an end-of-sequence token
# and the model would be asked to continue after EOS.
eval_tokenizer = AutoTokenizer.from_pretrained(base_model_id, add_bos_token=True)
# Follow the model's own device instead of assuming a literal "cuda".
model_input = eval_tokenizer(eval_prompt, return_tensors="pt").to(model.device)
model.eval()
with torch.no_grad():
    generated = model.generate(
        **model_input,
        max_new_tokens=256,
        # Pad with EOS, taken from the tokenizer rather than a hard-coded id.
        pad_token_id=eval_tokenizer.eos_token_id,
    )
    print(eval_tokenizer.decode(generated[0], skip_special_tokens=True))
