<a href="https://colab.research.google.com/github/piggyatbaqaqi/skol/blob/master/IST691/workspace.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# SKOL III: Feature extraction

Created by:
* La Monte Yarroll
* Padmaja Kurmaddali
* Patrick Le

# **Mapping Data Files: Google Drive**

In [6]:
import os, sys
from pathlib import Path
from google.colab import drive
%cd /content
skol = Path('/content/drive/My Drive/SKOL')
piggyatbaqaqi = skol / 'github.com/piggyatbaqaqi'
drive.mount("/content/drive", force_remount=True)
cache_path = '/content/cache'
nb_path = '/content/packages'
if not os.path.exists(nb_path):
  os.symlink(skol / 'packages', nb_path)
skol_path = '/content/skol'
if not os.path.exists(skol_path):
  os.symlink(piggyatbaqaqi / 'skol', skol_path)
if not os.path.exists(cache_path):
  os.symlink(piggyatbaqaqi / 'pip_cache', cache_path)
sys.path.insert(0, nb_path)
sys.path.insert(0, str(piggyatbaqaqi / 'skol'))

/content
Mounted at /content/drive


In [7]:
# List the top level of the shared SKOL folder on the mounted Drive.
# NOTE(review): this path spells the Drive root `MyDrive`, while the setup cell
# uses `My Drive`; presumably both resolve to the same folder in Colab — confirm.
!ls /content/drive/MyDrive/SKOL/

annotated_2025_02_27  github.com  IST718    raw_2025_02_05
Annotated_Raw	      IST691	  packages  Test_Data


## Set up git checkouts

In [2]:
if not os.path.exists(piggyatbaqaqi):
  %mkdir -p $piggyatbaqaqi
if not os.path.exists(piggyatbaqaqi / 'skol'):
  %cd $piggyatbaqaqi
  !git clone https://github.com/piggyatbaqaqi/skol.git
sys.path.insert(0, piggyatbaqaqi / 'skol')
if not os.path.exists(piggyatbaqaqi / 'dr-drfts-mycosearch'):
  %cd $piggyatbaqaqi
  !git clone https://github.com/piggyatbaqaqi/dr-drafts-mycosearch.git
workdir = skol / 'IST691'
%cd $workdir

/content/drive/.shortcut-targets-by-id/1oRBBGLhTLr9k0-7Qy5rCh8TSd-zhH2mz/SKOL/IST691


In [8]:
# Install PySpark and Spark NLP.
# --target=$nb_path installs into /content/packages, the directory the setup
# cell symlinks to Drive and puts on sys.path; --cache-dir=$cache_path points
# pip's download cache at the symlinked Drive cache folder.
# NOTE(review): versions are unpinned and --upgrade is set, so each run may
# install different releases — consider pinning.
# NOTE(review): the install log shows `sparknlp` pulling in `spark-nlp` as a
# dependency; presumably `spark-nlp` is the package actually needed — confirm.
! pip install --cache-dir=$cache_path --target=$nb_path --upgrade pyspark sparknlp

Collecting pyspark
  Downloading pyspark-3.5.5.tar.gz (317.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m317.2/317.2 MB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting sparknlp
  Downloading sparknlp-1.0.0-py3-none-any.whl.metadata (1.2 kB)
Collecting py4j==0.10.9.7 (from pyspark)
  Downloading py4j-0.10.9.7-py2.py3-none-any.whl.metadata (1.5 kB)
Collecting spark-nlp (from sparknlp)
  Downloading spark_nlp-6.0.0-py2.py3-none-any.whl.metadata (19 kB)
Collecting numpy (from sparknlp)
  Downloading numpy-2.2.5-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (62 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.0/62.0 kB[0m [31m1.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading py4j-0.10.9.7-py2.py3-none-any.whl (200 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m200.5/200.5 kB[0m [31m6.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading spa

In [34]:
#import needed modules
import os
import glob
from typing import List
from pathlib import Path
import random

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import sparknlp
from sparknlp.pretrained import PretrainedPipeline
from pyspark.sql.functions import (
    input_file_name, collect_list, concat_ws, col, udf,
    explode, collect_list, regexp_extract, regexp_replace,
    split, flatten, transform, concat)
from pyspark.sql.types import ArrayType, StringType

from pyspark.ml import Pipeline, Transformer
from pyspark.ml.feature import (
    Tokenizer, HashingTF, IDF, StringIndexer, CountVectorizer,
    PCA, VectorAssembler)
from pyspark.ml.classification import LogisticRegression, RandomForestClassifier, GBTClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.sql.functions import split, row_number, min, expr, struct
from pyspark.ml.linalg import Vectors, VectorUDT, DenseVector
from pyspark.sql.types import DoubleType, StructField, StructType

## SKOL data manipulation library

In [45]:
from finder import read_files, parse_annotated, target_classes
from label import Label
from taxon import Taxon, group_paragraphs

# Fixed seed, presumably intended for the notebook's random operations.
SEED = 12345

# Labels passed to target_classes() as `keep`.
keep_labels = [Label(name) for name in ('Description', 'Nomenclature')]
# Label passed to target_classes() as `default`; presumably applied to every
# paragraph whose label is not in `keep_labels` — confirm against finder.py.
default_label = Label('Misc-exposition')

# **Locating the raw and annotated data directories**

In [13]:
# Data folders inside the shared SKOL Drive directory.
ann_directory_path = skol / 'annotated_2025_02_27' / 'journals'
raw_directory_path = skol / 'raw_2025_02_05'

## Checking the file counts in the directories

In [14]:
# Function that reports all the txt files under a Google Drive folder path
def listFiles(folder: str) -> List[str]:
  """Recursively list the text files under `folder`.

  Matches every file at any depth whose name contains ``.txt`` (the pattern is
  ``*.txt*``), so both ``n1.txt`` and ``n1.txt.ann`` are returned. Paths that
  contain the substring ``Sydowia`` are excluded.

  Args:
    folder: Directory to search; a string or any path-like object.

  Returns:
    The matching paths as strings, in the order `glob` yields them. An empty
    list (never `None`) when the folder is missing or unreadable, so callers
    can always slice the result or take its length.
  """
  # glob.glob() returns [] for a missing folder instead of raising
  # FileNotFoundError, so the missing-folder case has to be detected explicitly.
  if not os.path.isdir(folder):
      print(f"Folder '{folder}' not found.")
      return []
  try:
      pattern = f'{folder}/**/*.txt*'
      return [file for file in glob.glob(pattern, recursive=True) if 'Sydowia' not in file]
  except PermissionError:
      print(f"Permission denied to access folder '{folder}'.")
      return []

In [15]:
# Preview the first ten text files found under the raw directory.
raw_files = listFiles(raw_directory_path)
raw_files[:10]

['/content/drive/My Drive/SKOL/raw_2025_02_05/authors/FungalNameAuthors.txt',
 '/content/drive/My Drive/SKOL/raw_2025_02_05/Mycologia/3753420.txt',
 '/content/drive/My Drive/SKOL/raw_2025_02_05/Mycologia/3753372.txt',
 '/content/drive/My Drive/SKOL/raw_2025_02_05/Mycologia/3753466.txt',
 '/content/drive/My Drive/SKOL/raw_2025_02_05/Mycologia/3753104.txt',
 '/content/drive/My Drive/SKOL/raw_2025_02_05/Mycologia/3753322.txt',
 '/content/drive/My Drive/SKOL/raw_2025_02_05/Mycologia/mycologiaeuropa00persgoog.txt',
 '/content/drive/My Drive/SKOL/raw_2025_02_05/Mycologia/3753038.txt',
 '/content/drive/My Drive/SKOL/raw_2025_02_05/Mycologia/3753160.txt',
 '/content/drive/My Drive/SKOL/raw_2025_02_05/Mycologia/3753583.txt']

In [30]:
# Collect the annotated files under the annotated directory into
# `training_files` (used by later cells) and preview the first ten paths.
training_files = listFiles(ann_directory_path)
training_files[:10]

['/content/drive/My Drive/SKOL/annotated_2025_02_27/journals/Mycotaxon/Vol057/n1.txt.ann',
 '/content/drive/My Drive/SKOL/annotated_2025_02_27/journals/Mycotaxon/Vol054/n1.txt.ann',
 '/content/drive/My Drive/SKOL/annotated_2025_02_27/journals/Mycotaxon/Vol118/s17.txt.ann',
 '/content/drive/My Drive/SKOL/annotated_2025_02_27/journals/Mycotaxon/Vol118/s29.txt.ann',
 '/content/drive/My Drive/SKOL/annotated_2025_02_27/journals/Mycotaxon/Vol118/s30.txt.ann',
 '/content/drive/My Drive/SKOL/annotated_2025_02_27/journals/Mycotaxon/Vol118/s7.txt.ann',
 '/content/drive/My Drive/SKOL/annotated_2025_02_27/journals/Mycotaxon/Vol118/s21.txt.ann',
 '/content/drive/My Drive/SKOL/annotated_2025_02_27/journals/Mycotaxon/Vol118/s13.txt.ann',
 '/content/drive/My Drive/SKOL/annotated_2025_02_27/journals/Mycotaxon/Vol118/s1.txt.ann',
 '/content/drive/My Drive/SKOL/annotated_2025_02_27/journals/Mycotaxon/Vol118/s46.txt.ann']

In [31]:
# Number of annotated files found by listFiles().
len(training_files)

190

In [35]:
# Parse a small random subset of the annotated files to keep this step fast.
# The sample is drawn with a generator seeded from SEED, and from a sorted
# copy of the file list, because glob order depends on the filesystem; together
# these make the chosen files the same on every run.
N_SAMPLE_FILES = 5
sampled_files = random.Random(SEED).sample(sorted(training_files), N_SAMPLE_FILES)
paragraphs = list(parse_annotated(read_files(sampled_files)))

In [36]:
# Relabel the parsed paragraphs and materialize the result as a list so it can
# be traversed more than once by the cells below.
relabeled = list(
    target_classes(
        paragraphs=paragraphs,
        keep=keep_labels,
        default=default_label,
    )
)

In [41]:
# Report how many paragraphs came out of the relabeling step.
print(f'len(relabeled): {len(relabeled)}')

len(relabeled): 13333


In [42]:
# Flatten the relabeled paragraphs into a table with one row per paragraph.
paragraph_columns = {}
paragraph_columns['filename'] = [para.filename for para in relabeled]
# Paragraphs whose top_label() is falsy get a missing label.
paragraph_columns['label'] = [
    para.top_label().label if para.top_label() else None for para in relabeled
]
paragraph_columns['paragraph_number'] = [para.paragraph_number for para in relabeled]
paragraph_columns['page_number'] = [para.page_number for para in relabeled]
paragraph_columns['empirical_page_number'] = [
    para.empirical_page_number for para in relabeled
]
# Paragraphs without a first line get a missing line number.
paragraph_columns['line_number'] = [
    para.first_line.line_number if para.first_line else None for para in relabeled
]
paragraph_columns['body'] = [str(para) for para in relabeled]

df = pd.DataFrame(paragraph_columns)
# Store the label as a categorical and expose its integer code as a column.
df['label'] = pd.Categorical(df['label'])
df['label_code'] = df['label'].cat.codes

In [44]:
# For each label, count the distinct values in every other column.
# observed=True limits the groups to label categories that actually occur.
df.groupby('label', observed=True).nunique()

Unnamed: 0_level_0,filename,paragraph_number,page_number,empirical_page_number,line_number,body,label_code
label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Description,5,550,194,172,67,550,1
Misc-exposition,5,12351,572,446,163,6900,1
Nomenclature,5,432,167,154,64,431,1


In [55]:
# Group the relabeled paragraphs into taxa and print the description text of
# the first eleven (indices 0 through 10).
grouped = group_paragraphs(relabeled)
for index, taxon in enumerate(grouped):
  if index >= 11:
    break
  description = taxon.as_row()["description"]
  print(f'Send to LLM:\n\n{description}')

Send to LLM:

Maculae hypogenae, atro-brunneae vel nigrae. Coloniae hypophyllae, eﬀusae. Mycelium
internum. Stromata parva. Conidiophora macronematosa, ex hyphis oriunda singulata vel
2–6-fasciculata, simplicia, erecta vel procumbentia, recta vel ﬂexuosa, geniculata, laevia,
crassitunicata, pallide brunnea vel atro-brunnea, 2–7-septata, 22–105 × 3–6 µm. Cellulae
conidiogenae integratae, terminales vel intercalares, polyblasticae, cicatricatae, cicatrices
incrassatae. Conidia simplicia, sicca, acropleurogena, solitaria, cylindricata, 3–11-septata,
recta vel leniter curvata, olivacea vel olivaceo-brunnea, laevia, tenui-tunicata, apices
obtusae, ad basim obconicotruncata, hila incrassata, 35–82 × 2–6 µm, hila 1.5–2.5 µm
lata.

Infection spots hypogenous, minute, dark brown to black. Colonies
hyphophyllous, eﬀuse. Mycelium internal. Stromata absent or poorly developed.
Conidiophores macronematous, arising singly from internal hyphae or in a
fascicle of 2–6 from poorly developed stromata, s