<a href="https://colab.research.google.com/github/piggyatbaqaqi/skol/blob/master/IST691/workspace.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# SKOL III: Feature extraction

created by:
* La Monte Yarroll
* Padmaja Kurmaddali
* Patrick Le

# **Mapping Data Files: Google Drive**

In [1]:
import os, sys
from pathlib import Path
from google.colab import drive
%cd /content
content = Path('/content')
skol = content / 'drive/My Drive/SKOL'
piggyatbaqaqi = skol / 'github.com/piggyatbaqaqi'
drive.mount(str(content / "drive"), force_remount=True)
cache_path = content / 'cache'
ollama_cache_path = content / 'ollama_cache'
nb_path = content / 'packages'
if not os.path.exists(nb_path):
  nb_path.symlink_to(skol / 'packages')
skol_client = content / 'skol'
if not os.path.exists(skol_client):
  skol_client.symlink_to(piggyatbaqaqi / 'skol')
if not os.path.exists(cache_path):
  cache_path.symlink_to(skol / 'pip_cache')
if not os.path.exists(ollama_cache_path):
  ollama_cache_path.symlink_to(skol / 'ollama_cache')
os.environ['OLLAMA_MODELS'] = str(ollama_cache_path)
sys.path.insert(0, str(nb_path))
sys.path.insert(0, str(piggyatbaqaqi / 'skol'))

/content
Mounted at /content/drive


In [2]:
!ls -l /content/
!file /content
print(sys.path)

total 8
lrwxrwxrwx 1 root root   38 May 19 20:18 cache -> '/content/drive/My Drive/SKOL/pip_cache'
drwx------ 7 root root 4096 May 19 20:18 drive
lrwxrwxrwx 1 root root   41 May 19 20:18 ollama_cache -> '/content/drive/My Drive/SKOL/ollama_cache'
lrwxrwxrwx 1 root root   37 May 19 20:18 packages -> '/content/drive/My Drive/SKOL/packages'
drwxr-xr-x 1 root root 4096 May 14 13:38 sample_data
lrwxrwxrwx 1 root root   58 May 19 20:18 skol -> '/content/drive/My Drive/SKOL/github.com/piggyatbaqaqi/skol'
/content: directory
['/content/drive/My Drive/SKOL/github.com/piggyatbaqaqi/skol', '/content/packages', '/content', '/env/python', '/usr/lib/python311.zip', '/usr/lib/python3.11', '/usr/lib/python3.11/lib-dynload', '', '/usr/local/lib/python3.11/dist-packages', '/usr/lib/python3/dist-packages', '/usr/local/lib/python3.11/dist-packages/IPython/extensions', '/usr/local/lib/python3.11/dist-packages/setuptools/_vendor', '/root/.ipython']


## Set up git clients

In [3]:
if not os.path.exists(piggyatbaqaqi):
  %mkdir -p $piggyatbaqaqi
if not os.path.exists(piggyatbaqaqi / 'skol'):
  %cd $piggyatbaqaqi
  !git clone https://github.com/piggyatbaqaqi/skol.git
sys.path.insert(0, piggyatbaqaqi / 'skol')
if not os.path.exists(piggyatbaqaqi / 'dr-drafts-mycosearch'):
  %cd $piggyatbaqaqi
  !git clone https://github.com/piggyatbaqaqi/dr-drafts-mycosearch.git
workdir = skol / 'IST691'
%cd $workdir

/content/drive/My Drive/SKOL/IST691


In [4]:
# install PySpark
! pip install --cache-dir=$cache_path --target=$nb_path pyspark
! pip install --cache-dir=$cache_path --target=$nb_path sparknlp ollama

Collecting pyspark
  Using cached pyspark-3.5.5-py2.py3-none-any.whl
Collecting py4j==0.10.9.7 (from pyspark)
  Using cached py4j-0.10.9.7-py2.py3-none-any.whl.metadata (1.5 kB)
Using cached py4j-0.10.9.7-py2.py3-none-any.whl (200 kB)
Installing collected packages: py4j, pyspark
Successfully installed py4j-0.10.9.7 pyspark-3.5.5
[0mCollecting sparknlp
  Using cached sparknlp-1.0.0-py3-none-any.whl.metadata (1.2 kB)
Collecting ollama
  Using cached ollama-0.4.8-py3-none-any.whl.metadata (4.7 kB)
Collecting spark-nlp (from sparknlp)
  Using cached spark_nlp-6.0.1-py2.py3-none-any.whl.metadata (19 kB)
Collecting numpy (from sparknlp)
  Using cached numpy-2.2.6-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (62 kB)
Collecting httpx<0.29,>=0.27 (from ollama)
  Using cached httpx-0.28.1-py3-none-any.whl.metadata (7.1 kB)
Collecting pydantic<3.0.0,>=2.9.0 (from ollama)
  Using cached pydantic-2.11.4-py3-none-any.whl.metadata (66 kB)
Collecting anyio (from httpx<0.29,>=0.

In [5]:
#import needed modules
import os
import glob
from typing import List
from pathlib import Path
import random

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import sparknlp
from sparknlp.pretrained import PretrainedPipeline
from pyspark.sql.functions import (
    input_file_name, collect_list, concat_ws, col, udf,
    explode, collect_list, regexp_extract, regexp_replace,
    split, flatten, transform, concat)
from pyspark.sql.types import ArrayType, StringType

from pyspark.ml import Pipeline, Transformer
from pyspark.ml.feature import (
    Tokenizer, HashingTF, IDF, StringIndexer, CountVectorizer,
    PCA, VectorAssembler)
from pyspark.ml.classification import LogisticRegression, RandomForestClassifier, GBTClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.sql.functions import split, row_number, min, expr, struct
from pyspark.ml.linalg import Vectors, VectorUDT, DenseVector
from pyspark.sql.types import DoubleType, StructField, StructType

SKOL Data manipulation library

In [6]:
from finder import read_files, parse_annotated, target_classes
from label import Label
from taxon import Taxon, group_paragraphs

SEED=12345
default_label = Label('Misc-exposition')
keep_labels = [Label('Description'), Label('Nomenclature')]

# **Checking the file counts in the directories**

In [7]:
raw_directory_path = skol / 'raw_2025_02_05/'
ann_directory_path = skol / 'annotated_2025_02_27/journals'

## Checking the file counts in the directories

In [8]:
# Function that reports all the txt files under a Google Drive folder path
def listFiles(folder: str) -> List[str]:
  # List all files in the folder
  try:
      files = [file for file in glob.glob(f'{folder}/**/*.txt*', recursive=True) if 'Sydowia' not in file]
      return files
  except FileNotFoundError:
      print(f"Folder '{folder}' not found.")
  except PermissionError:
      print(f"Permission denied to access folder '{folder}'.")

In [9]:
# check files in raw directory
listFiles(raw_directory_path)[:10]

['/content/drive/My Drive/SKOL/raw_2025_02_05/authors/FungalNameAuthors.txt',
 '/content/drive/My Drive/SKOL/raw_2025_02_05/Mycologia/3753420.txt',
 '/content/drive/My Drive/SKOL/raw_2025_02_05/Mycologia/3753372.txt',
 '/content/drive/My Drive/SKOL/raw_2025_02_05/Mycologia/3753466.txt',
 '/content/drive/My Drive/SKOL/raw_2025_02_05/Mycologia/3753104.txt',
 '/content/drive/My Drive/SKOL/raw_2025_02_05/Mycologia/3753322.txt',
 '/content/drive/My Drive/SKOL/raw_2025_02_05/Mycologia/mycologiaeuropa00persgoog.txt',
 '/content/drive/My Drive/SKOL/raw_2025_02_05/Mycologia/3753038.txt',
 '/content/drive/My Drive/SKOL/raw_2025_02_05/Mycologia/3753160.txt',
 '/content/drive/My Drive/SKOL/raw_2025_02_05/Mycologia/3753583.txt']

In [10]:
# check files in annotated directory
training_files = listFiles(ann_directory_path)
training_files[:10]

['/content/drive/My Drive/SKOL/annotated_2025_02_27/journals/Mycotaxon/Vol057/n1.txt.ann',
 '/content/drive/My Drive/SKOL/annotated_2025_02_27/journals/Mycotaxon/Vol054/n1.txt.ann',
 '/content/drive/My Drive/SKOL/annotated_2025_02_27/journals/Mycotaxon/Vol118/s17.txt.ann',
 '/content/drive/My Drive/SKOL/annotated_2025_02_27/journals/Mycotaxon/Vol118/s29.txt.ann',
 '/content/drive/My Drive/SKOL/annotated_2025_02_27/journals/Mycotaxon/Vol118/s30.txt.ann',
 '/content/drive/My Drive/SKOL/annotated_2025_02_27/journals/Mycotaxon/Vol118/s7.txt.ann',
 '/content/drive/My Drive/SKOL/annotated_2025_02_27/journals/Mycotaxon/Vol118/s21.txt.ann',
 '/content/drive/My Drive/SKOL/annotated_2025_02_27/journals/Mycotaxon/Vol118/s13.txt.ann',
 '/content/drive/My Drive/SKOL/annotated_2025_02_27/journals/Mycotaxon/Vol118/s1.txt.ann',
 '/content/drive/My Drive/SKOL/annotated_2025_02_27/journals/Mycotaxon/Vol118/s46.txt.ann']

In [11]:
len(training_files)

190

In [12]:
paragraphs = list(parse_annotated(read_files(random.sample(training_files, 5))))

In [13]:
relabeled = list(target_classes(default=default_label, keep=keep_labels, paragraphs=paragraphs))

In [14]:
print(f'len(relabeled): {len(relabeled)}')

len(relabeled): 324


In [15]:
df = pd.DataFrame({
    'filename': [pp.filename for pp in relabeled],
    'label': [pp.top_label().label if pp.top_label() else None for pp in relabeled],
    'paragraph_number': [pp.paragraph_number for pp in relabeled],
    'page_number': [pp.page_number for pp in relabeled],
    'empirical_page_number': [pp.empirical_page_number for pp in relabeled],
    'line_number': [pp.first_line.line_number if pp.first_line else None for pp in relabeled],
    'body': [str(pp) for pp in relabeled]
})
df.label = pd.Categorical(df.label)
df['label_code'] = df.label.cat.codes

In [16]:
df.groupby('label', observed=True).nunique()

Unnamed: 0_level_0,filename,paragraph_number,page_number,empirical_page_number,line_number,body,label_code
label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Description,5,17,4,13,11,12,1
Misc-exposition,5,301,9,27,64,216,1
Nomenclature,5,6,4,5,6,6,1


In [17]:
#load in library to open terminal inside google colab
# !pip install  --cache-dir=$cache_path --target=$nb_path colab-xterm
# %load_ext colabxterm


In [31]:
!ls /usr/local/lib/ollama

cuda_v11		  libggml-cpu-haswell.so      libggml-cpu-sse42.so
cuda_v12		  libggml-cpu-icelake.so      libggml-cpu-x64.so
libggml-base.so		  libggml-cpu-sandybridge.so
libggml-cpu-alderlake.so  libggml-cpu-skylakex.so


In [37]:
import subprocess
import time

import ollama
#open up terminal
# %xterm
#once open, first time do:
if not os.path.exists("/usr/local/lib/ollama"):
  !curl https://ollama.ai/install.sh | sh
ollama_server = subprocess.Popen(["ollama", "serve"])
time.sleep(5)  # Let the server come all the way up.
#then start the server with ollama serve &
#first time also will need to pull in a ollama version using ollama pull mistral
# preferred_model = 'gemma3:12b'
preferred_model = 'mistral'
found = False
for _, l in ollama.list():
  for m in l:
    if m.model.startswith(preferred_model):
      found = True
      break
if not found:
  print(f"Pulling model {preferred_model}")
  !ollama pull $preferred_model


In [38]:
#check here if ollama has a version and can be used, will say model you pulled
!ollama list
# !ollama pull mistral
# !ollama pull gemma3:12b

NAME              ID              SIZE      MODIFIED       
mistral:latest    f974a74358d6    4.1 GB    36 seconds ago    
gemma3:12b        f4031aab637d    8.1 GB    19 hours ago      


In [39]:
    import ollama

    response = ollama.generate(
        model="gemma3:12b",
        prompt="What is the capital of France?"
    )
    print(response["response"])

ResponseError: "gemma3:12b" does not support generate (status code: 400)

In [40]:
#test if ollama call works
import ollama
response = ollama.generate(model=preferred_model, prompt='Why is the sky blue?')
print(response['response'])

 The sky appears blue due to a process called Rayleigh scattering. As sunlight reaches Earth, it is made up of different colors, each of which are different wavelengths of light. Shorter wavelengths, such as blue and violet, scatter more easily than longer ones, like red and yellow, because they have smaller wavelengths and more energetic photons.

The molecules in the Earth's atmosphere mainly consist of nitrogen and oxygen, which are mostly diatomic (two atoms bonded together). These molecules scatter shorter-wavelength light much more than longer-wavelength light. When we look up at the sky, we see a blend of scattered blue and violet light that our eyes perceive as blue because humans are more sensitive to blue light and violet light has already been largely absorbed by the ozone layer before it reaches us.

However, it's important to note that the sky may appear different colors at sunrise or sunset due to a separate phenomenon called scattering by dust, clouds, or molecules in th

In [42]:
grouped = group_paragraphs(relabeled)
for i, tax in enumerate(grouped):
  if i > 10:
    break
  print(f'Send to LLM:\n\n{tax.as_row()["description"]}')
  #sample message to turn data into json format
  response = ollama.chat(model=preferred_model, messages=[{
     'role': 'user',
     'content': f'Please extract features, subfeatures, and values from the following species description. Format the response as json:\n{tax.as_row()["description"]}',},
  ])
  print('Result:')
  print(response['message']['content'])

Send to LLM:

Basidiomata medium to large, ﬂeshy. Pileus 80 to 110 mm in diameter,
subglobose or hemispherical when young, and becoming convex to applanate
at maturity; pellicle brown (6E8) to coﬀee (6E5-6) or chocolate brown (6E5,
6F5), thin to thick when young and somewhat brown to grayish brown (7E35) at maturity; the remaining surface covered with grayish brown (7E3-5)
to vinaceous brown (6D8, 8E8) squamules, together with numerous, small,
revolute and loosely ﬂoccose, brown squamules; context up to 9 mm thick
in the center of the pileus, white, instantly turning reddish with exposure.
Lamellae 47 × 10 mm, free and remote from stipe, white to dirty white when
young, olive brown when mature, becoming red brown after bruising, crowded
with lamellulae, margin entire, concolorous. Stipe 80−121 × (28−)31−47(−61)
mm, central, subcylindrical, solid but ﬁstulose in aged specimens; surface dirty
white to white at the apex, light brown to brown toward the base, glabrous
above the annulus, lo