# SKOL II: Preprocessing

created by:
* Christopher Murphy
* La Monte Yarroll
* David Caspers

In [1]:
# install PySpark
! pip install pyspark >& /dev/null
! pip install sparknlp >& /dev/null

In [43]:
#import needed modules
import os
import glob
from typing import List
from pathlib import Path

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import sparknlp
from sparknlp.pretrained import PretrainedPipeline
from pyspark.sql.functions import (
    input_file_name, collect_list, concat_ws, col, udf,
    explode, collect_list, regexp_extract, regexp_replace,
    split, flatten, transform)
from pyspark.sql.types import ArrayType, StringType

from pyspark.ml import Pipeline, Transformer
from pyspark.ml.feature import Tokenizer, HashingTF, IDF, StringIndexer, CountVectorizer, PCA
from pyspark.ml.classification import LogisticRegression, RandomForestClassifier, GBTClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.sql.functions import split, row_number, min
from pyspark.ml.linalg import Vectors, VectorUDT, DenseVector
from pyspark.sql.types import DoubleType, StructField, StructType

# **Mapping Data Files: Google Drive**

In [3]:
from google.colab import drive
drive.mount("/content/drive")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# **Checking the file counts in the directories**

In [4]:
raw_directory_path = '/content/drive/MyDrive/SKOL/raw_2025_02_05/'
ann_directory_path = '/content/drive/MyDrive/SKOL/annotated_2025_02_27/journals'

## Checking the file counts in the directories

In [5]:
# Function that reports file counts for a Google Drive folder path
def listFiles(folder: str) -> List[str]:
  # List all files in the folder
  try:
      files = [file for file in glob.glob(f'{folder}/**/*.txt*', recursive=True) if 'Sydowia' not in file]
      return files
  except FileNotFoundError:
      print(f"Folder '{folder}' not found.")
  except PermissionError:
      print(f"Permission denied to access folder '{folder}'.")

In [6]:
# check file counts in raw directory
listFiles(raw_directory_path)[:10]

['/content/drive/MyDrive/SKOL/raw_2025_02_05/authors/FungalNameAuthors.txt',
 '/content/drive/MyDrive/SKOL/raw_2025_02_05/Mycologia/3753420.txt',
 '/content/drive/MyDrive/SKOL/raw_2025_02_05/Mycologia/3753372.txt',
 '/content/drive/MyDrive/SKOL/raw_2025_02_05/Mycologia/3753466.txt',
 '/content/drive/MyDrive/SKOL/raw_2025_02_05/Mycologia/3753104.txt',
 '/content/drive/MyDrive/SKOL/raw_2025_02_05/Mycologia/3753322.txt',
 '/content/drive/MyDrive/SKOL/raw_2025_02_05/Mycologia/mycologiaeuropa00persgoog.txt',
 '/content/drive/MyDrive/SKOL/raw_2025_02_05/Mycologia/3753038.txt',
 '/content/drive/MyDrive/SKOL/raw_2025_02_05/Mycologia/3753160.txt',
 '/content/drive/MyDrive/SKOL/raw_2025_02_05/Mycologia/3753583.txt']

In [7]:
# check file counts in annotated directory
listFiles(ann_directory_path)[:10]

['/content/drive/MyDrive/SKOL/annotated_2025_02_27/journals/Mycotaxon/Vol057/n1.txt.ann',
 '/content/drive/MyDrive/SKOL/annotated_2025_02_27/journals/Mycotaxon/Vol054/n1.txt.ann',
 '/content/drive/MyDrive/SKOL/annotated_2025_02_27/journals/Mycotaxon/Vol118/s17.txt.ann',
 '/content/drive/MyDrive/SKOL/annotated_2025_02_27/journals/Mycotaxon/Vol118/s29.txt.ann',
 '/content/drive/MyDrive/SKOL/annotated_2025_02_27/journals/Mycotaxon/Vol118/s30.txt.ann',
 '/content/drive/MyDrive/SKOL/annotated_2025_02_27/journals/Mycotaxon/Vol118/s7.txt.ann',
 '/content/drive/MyDrive/SKOL/annotated_2025_02_27/journals/Mycotaxon/Vol118/s21.txt.ann',
 '/content/drive/MyDrive/SKOL/annotated_2025_02_27/journals/Mycotaxon/Vol118/s13.txt.ann',
 '/content/drive/MyDrive/SKOL/annotated_2025_02_27/journals/Mycotaxon/Vol118/s1.txt.ann',
 '/content/drive/MyDrive/SKOL/annotated_2025_02_27/journals/Mycotaxon/Vol118/s46.txt.ann']

In [8]:
#start spark nlp session
spark = sparknlp.start()

In [9]:
# read in the content into spark dataframes
raw_df = spark.read.text(listFiles(raw_directory_path)).withColumn("filename", input_file_name())

In [10]:
# Read the annotated dataframe
ann_df_agg = spark.read.text(listFiles(ann_directory_path)).withColumn("filename", input_file_name())

# UDF to extract and aggregate text between '[' and ']'
def extract_paragraphs(lines):
    aggregated = []
    temp = []
    inside_brackets = False

    for line in lines:
        stripped_line = line.strip()
        if stripped_line.startswith("["):
            inside_brackets = True
            temp = [stripped_line]  # Start a new paragraph
        elif stripped_line.endswith("]") and inside_brackets:
            temp.append(stripped_line)
            aggregated.append(" ".join(temp))  # Store the completed paragraph
            temp = []
            inside_brackets = False
        elif inside_brackets:
            temp.append(stripped_line)

    return aggregated

extract_udf = udf(extract_paragraphs, ArrayType(StringType()))

def collapse_labels(label):
  if label not in ['Nomenclature', 'Description']:
    return 'Misc-exposition'
  return label

collapse_udf = udf(collapse_labels, StringType())

grouped_ann_df = (
    ann_df_agg.groupBy("filename")
    .agg(collect_list("value").alias("lines"))
    .withColumn("value", explode(extract_udf(col("lines"))))
    .drop("lines")
)

# Define regex pattern to extract label
label_pattern = r"#(\S+?)(?:\*)?]"  # Captures text after '#' and removes trailing '*', if present
lead_pattern = r"^\[@"              # start of line with "[@" character
trail_pattern = label_pattern+r"$"  # end of line
clean_pattern = lead_pattern+r"(.*)"+trail_pattern  #trim lead and trail patterns

grouped_ann_df = grouped_ann_df\
  .withColumn("label", regexp_extract(col("value"), label_pattern, 1))\
  .withColumn("value", regexp_extract(col("value"), clean_pattern, 1))\
  .withColumn("label", collapse_udf(col("label")))

# *** Remove this code when move to github application code ***
grouped_ann_df.show(10, truncate=False)

+--------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+---------------+
|filename                                                                                    |value                                                                                                                  

# Pipeline Code


In [49]:
# Tokenizing documents using Tokenizer
tokenizer = Tokenizer(inputCol="value", outputCol="words")

# Using CountVectorizer for term frequency
count_vectorizer = CountVectorizer(inputCol="words", outputCol="tf")

# IDF for term frequency transformation
idf = IDF(inputCol="tf", outputCol="idf", minDocFreq=10)

# Converting text labels to index numbers
indexer = StringIndexer(inputCol="label", outputCol="label_indexed")

# Creating a Pipeline
pipeline_feature_extraction = Pipeline(stages=[tokenizer, count_vectorizer, idf, indexer])

#fit the model to the full dataset and capture the dataset created by the pipeline
pipeline_model = pipeline_feature_extraction.fit(grouped_ann_df)
features = pipeline_model.transform(grouped_ann_df)

In [12]:
# split the dataset between train (80% and test (20%)
train_data, test_data = features.randomSplit([0.8, 0.2], seed=42)

# Logistic Regression

In [13]:
# Logistic Regression Model
logistic_regression = LogisticRegression(family="multinomial", maxIter=10, regParam=0.01,
                                         featuresCol="idf", labelCol="label_indexed")

# Creating a Pipeline for the regression step
pipeline_lr = Pipeline(stages=[logistic_regression])

# Fitting the pipeline to the dataset
log_model = pipeline_lr.fit(train_data)

# Making predictions on the test dataset
predictions_lr = log_model.transform(test_data)

In [15]:
# Show sample predictions
predictions_lr.select("label_indexed", "prediction", "idf").show(10)

+-------------+----------+--------------------+
|label_indexed|prediction|                 idf|
+-------------+----------+--------------------+
|          0.0|       0.0|(201112,[0,1,2,5,...|
|          1.0|       1.0|(201112,[3,58,81,...|
|          0.0|       0.0|(201112,[145,224,...|
|          0.0|       0.0|(201112,[1,7,43,4...|
|          1.0|       1.0|(201112,[2,4,5,7,...|
|          0.0|       0.0|(201112,[0,1,2,4,...|
|          0.0|       2.0|(201112,[0,1,2,3,...|
|          1.0|       1.0|(201112,[0,1,2,3,...|
|          0.0|       0.0|(201112,[1,5,12,2...|
|          0.0|       0.0|(201112,[7,8,12,2...|
+-------------+----------+--------------------+
only showing top 10 rows



In [16]:
# Create evaluators for each Metric
# MulticlassClassificationEvaluator is used for classification models
# label_indexed specifies the labels from the dataset
accuracy_evaluator = MulticlassClassificationEvaluator(
    labelCol="label_indexed", predictionCol="prediction", metricName="accuracy")

precision_evaluator = MulticlassClassificationEvaluator(
    labelCol="label_indexed", predictionCol="prediction", metricName="precisionByLabel")

recall_evaluator = MulticlassClassificationEvaluator(
    labelCol="label_indexed", predictionCol="prediction", metricName="recallByLabel")

f1_evaluator = MulticlassClassificationEvaluator(
    labelCol="label_indexed", predictionCol="prediction", metricName="f1")

def stats(prediction):
  '''apply the evaluators to the prediction dataframe to calculate metric scores'''
  accuracy = accuracy_evaluator.evaluate(predictions)
  precision = precision_evaluator.evaluate(predictions)
  recall = recall_evaluator.evaluate(predictions)
  f1_score = f1_evaluator.evaluate(predictions)
  # report performance measures
  print(f"Test Accuracy: {accuracy:.4f}")
  print(f"Test Precision: {precision:.4f}")
  print(f"Test Recall: {recall:.4f}")
  print(f"Test F1 Score: {f1_score:.4f}")

In [18]:
stats(predictions_lr)

Test Accuracy: 0.9418
Test Precision: 0.9603
Test Recall: 0.9666
Test F1 Score: 0.9414


# Random Forest Classifier

In [19]:
#define the random forrest model
random_forest = RandomForestClassifier(seed=42, labelCol="label_indexed", featuresCol="idf", numTrees=100)

# Creating a Pipeline for the regression step
pipeline_rf = Pipeline(stages=[random_forest])

# Fitting the pipeline to the dataset
rf_model = pipeline_rf.fit(train_data)

# Making predictions on the test dataset
predictions_rf = rf_model.transform(test_data)

In [20]:
# Show sample predictions
predictions_rf.select("label_indexed", "prediction", "idf").show(10)

+-------------+----------+--------------------+
|label_indexed|prediction|                 idf|
+-------------+----------+--------------------+
|          0.0|       0.0|(201112,[0,1,2,5,...|
|          1.0|       0.0|(201112,[3,58,81,...|
|          0.0|       0.0|(201112,[145,224,...|
|          0.0|       0.0|(201112,[1,7,43,4...|
|          1.0|       0.0|(201112,[2,4,5,7,...|
|          0.0|       0.0|(201112,[0,1,2,4,...|
|          0.0|       0.0|(201112,[0,1,2,3,...|
|          1.0|       0.0|(201112,[0,1,2,3,...|
|          0.0|       0.0|(201112,[1,5,12,2...|
|          0.0|       0.0|(201112,[7,8,12,2...|
+-------------+----------+--------------------+
only showing top 10 rows



In [21]:
# report performance measures
stats(predictions_rf)

Test Accuracy: 0.7878
Test Precision: 0.7878
Test Recall: 1.0000
Test F1 Score: 0.6943


# Principle Component Analysis

In [22]:
# Define a UDF to convert sparse vectors to dense vectors
def sparse_to_dense_idf(sparse_idf):
    #return (sparse_idf[0], Vectors.dense(sparse_idf[1].toArray()))
    #return (sparse_idf[0], sparse_idf[1].toArray())
    return sparse_idf.toArray()

#sparse_to_dense_udf = udf(sparse_to_dense_idf, StructType(
#    [StructField("index", DoubleType()),
#     StructField("denseidf", VectorUDT())]))

sparse_to_dense_udf = udf(sparse_to_dense_idf, VectorUDT())

In [23]:
# Define a UDF to convert sparse vectors to dense vectors
def sparse_to_dense(sparse_vector):
    return Vectors.dense(sparse_vector.toArray())

# Register the UDF with the correct return type
sparse_to_dense_udf = udf(sparse_to_dense, VectorUDT())

# Apply the UDF to convert sparse features to dense
dense_features = features.withColumn("denseidf", sparse_to_dense_udf(col("idf")))

### This is not computationally feasible on colab. It ran for more than a day.
```
### execute PCA to reduce dimensions
pca = PCA(k=10, inputCol="denseidf", outputCol="pca_features")

### fit the model on the reduced PCA
model = pca.fit(dense_features)

### run the model
result = model.transform(dense_features)
```


# Suffixes

In [24]:
def suffixes(words):
  retval = []
  for word in words:
    retval.extend([word[-2:], word[-3:], word[-4:]])
  return retval

suffixes_udf = udf(suffixes, ArrayType(StringType()))

class SuffixTransformer(Transformer):
    def __init__(self, inputCol, outputCol):
        super().__init__()
        self.input_col = inputCol
        self.output_col = outputCol

    def _transform(self, df):
      if isinstance(df.schema[self.input_col].dataType, ArrayType):
        return df.withColumn(
            self.output_col,
            suffixes_udf(df[self.input_col]))
      else:
        raise ValueError(f"Column '{self.input_col}' is not an array type but a {type(self.input_col)}.")


suffixer = SuffixTransformer(inputCol="words", outputCol="suffixes")

# Using CountVectorizer for term frequency
count_vectorizer = CountVectorizer(inputCol="suffixes", outputCol="tf")

# Creating a Pipeline
pipeline_feature_extraction = Pipeline(stages=[tokenizer, suffixer, count_vectorizer, idf, indexer])
# pipeline_feature_extraction = Pipeline(stages=[tokenizer, suffixer])

#fit the model to the full dataset and capture the dataset created by the pipeline
pipeline_suffixes_model = pipeline_feature_extraction.fit(grouped_ann_df)
features_suffixes = pipeline_suffixes_model.transform(grouped_ann_df)
features_suffixes.show(10)

+--------------------+--------------------+---------------+--------------------+--------------------+--------------------+--------------------+-------------+
|            filename|               value|          label|               words|            suffixes|                  tf|                 idf|label_indexed|
+--------------------+--------------------+---------------+--------------------+--------------------+--------------------+--------------------+-------------+
|file:///content/d...|Published by Rijk...|Misc-exposition|[published, by, r...|[ed, hed, shed, b...|(75108,[9,17,20,2...|(75108,[9,17,20,2...|          0.0|
|file:///content/d...|The genus Crepido...|Misc-exposition|[the, genus, crep...|[he, the, the, us...|(75108,[0,1,2,4,5...|(75108,[0,1,2,4,5...|          0.0|
|file:///content/d...|Morphological, ec...|Misc-exposition|[morphological,, ...|[l,, al,, cal,, a...|(75108,[1,2,4,5,6...|(75108,[1,2,4,5,6...|          0.0|
|file:///content/d...|Fries (1821: 272)...|Misc-expo

In [25]:
# split the dataset between train (80% and test (20%)
train_data_suffix, test_data_suffix = features_suffixes.randomSplit([0.8, 0.2], seed=42)

In [26]:
# Logistic Regression Model
logistic_regression = LogisticRegression(family="multinomial", maxIter=10, regParam=0.01,
                                         featuresCol="idf", labelCol="label_indexed")

# Creating a Pipeline for the regression step
pipeline = Pipeline(stages=[logistic_regression])

# Fitting the pipeline to the dataset
log_model = pipeline.fit(train_data_suffix)

# Making predictions on the test dataset
predictions_suffix = log_model.transform(test_data_suffix)

In [27]:
stats(predictions_suffix)

Test Accuracy: 0.9396
Test Precision: 0.9593
Test Recall: 0.9649
Test F1 Score: 0.9392


# Process Raw Texts



In [28]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, udf, lit, sum, collect_list, explode, trim, monotonically_increasing_id, input_file_name
from pyspark.sql.window import Window
from pyspark.sql.types import ArrayType, StringType
import regex as re
from google.colab import drive

## Heuristically ID Paragraphs
Steps:
1. Import files (with unique id)
2. group by file name
3. Apply paragraph extractor udf
4. Explode each paragraph to separate row
5. Remove interstitial paragraphs


In [30]:
# data_path = "/content/drive/MyDrive/IST718_Final_Project/SKOL/Test_Data"
data_path = ann_directory_path
# Recursively read all text files in the folder
df = spark.read.option("recursiveFileLookup", "true").text(data_path).withColumn("filename", input_file_name()).withColumn("line_id", monotonically_increasing_id())

In [31]:
df.show(10, truncate = False)

+----------------------------------------------------+-------------------------------------------------------------------------------------------------------------------------------+-------+
|value                                               |filename                                                                                                                       |line_id|
+----------------------------------------------------+-------------------------------------------------------------------------------------------------------------------------------+-------+
|[@IMPERIAL INSTITUTE                                |file:///content/drive/MyDrive/SKOL/annotated_2025_02_27/journals/Mycologia/Vol41/2015.28594.Mycologia-Vol-xli-1949_text.txt.ann|0      |
|                                                    |file:///content/drive/MyDrive/SKOL/annotated_2025_02_27/journals/Mycologia/Vol41/2015.28594.Mycologia-Vol-xli-1949_text.txt.ann|1      |
| OF                                         

In [32]:
#Define regex pattern to detect nomenclature
NOMENCLATURE_RE = re.compile(
    r'^([-\w≡=.*|:]*\s+)?'
    r'('
        r'([A-Z]\w*(aceae|idae|iformes|ales|ineae|inae)?)'
        r'|(\w+\b)'
    r')'
    r'\s'
    r'('
        r'(\w+(aceae|idae|iformes|ales|ineae|inae)?)'
        r'|(\w+\b)'
    r')'
    r'.*'
    r'('
        r'nov\.|'
        r'nov\.\s?(comb\.|sp\.)|'
        r'\(?in\.?\s?ed\.?\)?|'
        r'\(?nom\.\s?(prov\.|sanct\.)\)?|'
        r'emend\..*|'
        r'\(?\b[12]\d{3}\b(?:\s+.[12]\d{3}\b.)?\)?[^\n]{0,9}'
    r')'
    r'[-\s—]*'
    r'(\(?Fig|Plate\)?)?$'
)

TABLE_KEYWORDS = ['table', 'tab.', 'tab', 'tbl.', 'tbl']
FIGURE_KEYWORDS = ['fig', 'fig.', 'figg.', 'figs', 'figs.', 'figure', 'photo', 'plate', 'plates']
taxon_pattern = (
    r'(nov\.|nov\.\s?(comb\.|sp\.)|\(?in\.?\s?ed\.?\)?|\(?nom\.\s?sanct\.?\)?|emend\..*)'
    r'|\b[12]\d{3}\b.{0,3}'
    r'\s*(\(?Fig[^\)]*\)?)?$'
  )

In [33]:
test_cases = [
    "Agaricus bisporus (Lange) 1946",
    "Psalliota campestris (Fr.) 1821",
    "Boletus edulis nov. sp.",
    "Fungus Xaceae (nom. prov.)",
    "Agaricus campestris (L.) Fig. 2",
    "Boletus edulis 2001, 2003"
]

for text in test_cases:
    match = NOMENCLATURE_RE.match(text)
    print(f"✅ Match: {text}" if match else f"❌ No match: {text}")

✅ Match: Agaricus bisporus (Lange) 1946
✅ Match: Psalliota campestris (Fr.) 1821
✅ Match: Boletus edulis nov. sp.
✅ Match: Fungus Xaceae (nom. prov.)
❌ No match: Agaricus campestris (L.) Fig. 2
✅ Match: Boletus edulis 2001, 2003


In [34]:
def heuristically_id_paragraphs(lines):
  aggregated = []
  temp = []
  inside_table = False
  inside_figure = False

  for line in lines:

    #if Table flag true, handle lines until table has ended
    if inside_table:
      #if line is short add it to the table
      if len(line.strip())<45:
        temp.append(line)
        continue
      else: # alternatively check if table lines are all long, if so the table continues and that line should be appended
        if all(len(sentence.strip())>45 for sentence in temp):
          temp.append(line)
          continue
      #if it doesn't fit either of these criteria, then table has ended
      inside_table = False
      #table is considered interstitial so commenting out adding table to paragraphs because we don't actually want those
      #aggregated.append(" ".join(temp))
      temp = [line]
      continue

    #if Figure flag is true, handle lines until figure has ended
    if inside_figure:
      #continue appending until period, colon, or blank line is detected
      if not (line.strip().endswith(".") or line.strip().endswith(":") or line.strip() == ""):
        temp.append(line)
      else:
        inside_figure = False
        #if temp:
          #same as above with table.  We don't actually want the information from tables so not adding it to aggregated
          #aggregated.append(" ".join(temp))
        temp = [line]
        continue

    #Detect Nomenclature
    if NOMENCLATURE_RE.match(line):
      #store the previous paragraph in aggregated (if it exists)
      if temp:
        aggregated.append(" ".join(temp))
      #start a new paragraph
      temp = [line]
      continue
    #page break
    if line.startswith('\f'):
      if temp:
        aggregated.append(" ".join(temp))
      temp = [line]
      continue

    #Leading Tab
    if line.startswith('\t'):
      if temp:
        aggregated.append(" ".join(temp))
      temp = [line]
      continue

    #Detect Table
    if any(line.lower().startswith(keyword) for keyword in TABLE_KEYWORDS):
      #flag table
      inside_table = True
      if temp:
        aggregated.append(" ".join(temp))
      temp = [line]
      continue

    # Blank Line ends a paragraph
    if line.strip() == "":
      #check if previous line was blank, if so skip this line without appending it
      if temp and temp[-1] == "":
        continue
      else: #start a new paragraph
        if temp:
          aggregated.append(" ".join(temp))
        temp = [line]
        continue

    #Detect a Figure
    if any(line.lower().startswith(keyword) for keyword in FIGURE_KEYWORDS):
      if temp:
        aggregated.append(" ".join(temp))
      temp = [line]
      inside_figure = True
      continue

    # Detect a hypthen
    if line.startswith('-'):
      if temp:
        aggregated.append(" ".join(temp))
      temp = [line]
      continue

    #Detect Synonym (ends the paragraph so this line should end up in the current paragraph and a new empty paragraph will start)
    if re.search(r'\([Ss]yn.*\)$', line):
      temp.append(line)
      if temp:
        aggregated.append(" ".join(temp))
      temp = []
      continue

    #Detect Taxon (same as above, taxon ends the paragraph)
    if re.search(taxon_pattern, line):
      temp.append(line)
      if temp:
        aggregated.append(" ".join(temp))
      temp = []
      continue

    #short line ends a paragraph
    if len(line.strip()) < 45:
      temp.append(line)
      if temp:
        aggregated.append(" ".join(temp))
      temp = []
      continue

    #if none of the above triggered a new paragraph, then line belongs to this paragraph and should be appended
    temp.append(line)
  return aggregated




In [36]:
#Define UDF
heuristic_paragraph_udf = udf(heuristically_id_paragraphs, ArrayType(StringType()))

In [45]:
window_spec = Window.partitionBy("filename").orderBy("start_line_id")

grouped_df = (
    df
    .orderBy("filename","line_id")
    .groupBy("filename") #process files together
    .agg(collect_list("value").alias("lines"), min("line_id").alias("start_line_id")) #makes each file one row in spark with the field 'lines' containing a list of all the individual lines
    .withColumn("value", explode(heuristic_paragraph_udf(col("lines")))) #applies udf, creates new line for each paragraph identified by udf
    .drop("lines")
    .filter(trim(col("value")) != "") #drop paragraphs that are only white space
    .withColumn("row_number", row_number().over(window_spec))  # TODO: confirm that this works.
    )

In [46]:
grouped_df.count()

210439

#Pipeline for Trained Models Against Raw Processed Texts

In [47]:
#Use already fit pipeline to tokenize raw paragraphs
raw_features = pipeline_model.transform(grouped_df)

# Making predictions on the raw dataset using logistic model
raw_predictions = log_model.transform(raw_features)


In [50]:
pipeline_model.stages[3].labels

['Misc-exposition', 'Description', 'Nomenclature']

In [51]:
raw_predictions.select("prediction", "value").show(50, truncate=False)


+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------