- Apply rule-based extraction to identify machine learning methods in abstracts
- Test two complementary approaches:
  - **Regex-based matching** for known ML terms
  - **SciSpaCy + EntityRuler** for robust phrase detection

In [1]:
# Imports

from pathlib import Path
import json
import re
import collections
from typing import List
import ast

import pandas as pd
from tqdm import tqdm

import spacy

In [2]:
# Paths

data_path = Path("../../data/short-raw-refs-abs")
processed_abstracts_path = Path("../../data/processed/abstracts")
save_path = processed_abstracts_path / "regex_scispacy"
ml_methods_path = Path("../../ml_methods/ml_methods_dict.json")


# Ensure directories exist
for p in [data_path, processed_abstracts_path, save_path]:
    p.mkdir(parents=True, exist_ok=True)

print("All directories verified/created.")

All directories verified/created.


In [3]:
# Load abstracts dataset

abstracts_path = processed_abstracts_path / "abstracts.csv"
df = pd.read_csv(abstracts_path)

print(f"Loaded {len(df)} abstracts from {df['query_id'].nunique()} queries.")
df.head()

Loaded 52290 abstracts from 24 queries.


Unnamed: 0,query_id,eid,doi,title,abstract,clean_abs
0,ml_anomaly_detection_production,2-s2.0-105018574505,10.1016/j.measurement.2025.119261,Distillation anomaly and fault detection based...,© 2025 The Author(s)The detection of anomalies...,"Indeed, highly efficient systems do not always..."
1,ml_anomaly_detection_production,2-s2.0-105019192533,10.1007/978-3-032-06118-8_30,From Lab to Factory: Pitfalls and Guidelines f...,"© The Author(s), under exclusive license to Sp...",The detection and localization of quality-rela...
2,ml_anomaly_detection_production,2-s2.0-105016669957,10.1007/978-3-032-04200-2_5,Intelligent Defect Detection for Manufacturing...,"© The Author(s), under exclusive license to Sp...","In modern Industry, I4.0, artificial intellige..."
3,ml_anomaly_detection_production,2-s2.0-85218693791,10.1038/s41598-025-90810-w,Hybrid machine learning framework for predicti...,© The Author(s) 2025.The critical necessity fo...,The critical necessity for sophisticated predi...
4,ml_anomaly_detection_production,2-s2.0-105018301117,10.1016/j.comnet.2025.111753,BGP anomaly detection using the raw internet t...,© 2025 The AuthorsThe Border Gateway Protocol ...,"Hence, detecting any anomaly concerning BGP an..."


In [4]:
# Count how many rows each query_id has
query_counts = df["query_id"].value_counts().to_dict()

# Create a copy and map the counts to each row
df = df.copy()
df["query_size"] = df["query_id"].map(query_counts)

# Sort so that query groups with fewer rows are prioritized
df_sorted = df.sort_values(by="query_size", ascending=True)

# Remove duplicate DOIs, keeping the one in the smallest query group
df_dedup = df_sorted.drop_duplicates(subset="doi", keep="first").drop(columns=["query_size"])

# Print results
print("Original dataset size:", len(df))
print("After removing duplicates:", len(df_dedup))
print("Remaining duplicate DOIs:", df_dedup["doi"].duplicated().sum())

df = df_dedup

Original dataset size: 52290
After removing duplicates: 33130
Remaining duplicate DOIs: 0


In [5]:
with open(ml_methods_path, "r", encoding="utf-8") as f:
    ml_methods_dict = json.load(f)


## Regex matching

In [7]:
def extract_methods(text, term_dict):
    # Return empty list if input is not a string
    if not isinstance(text, str):
        return []
    
    found = []
    
    for method, phrases in term_dict.items():
        for phrase in phrases:
            # Escape special characters and enforce word boundaries
            pattern = r"\b" + re.escape(phrase.lower()) + r"\b"
            
            # Perform case-insensitive boundary-safe regex search
            if re.search(pattern, text.lower()):
                found.append(method)
                break  # Stop checking more variants for this method
    
    # Remove duplicates while preserving order
    return list(dict.fromkeys(found))

# Apply extraction with pandas progress bar
tqdm.pandas()
df["ml_methods_regex"] = df["clean_abs"].fillna("").progress_apply(
    lambda x: extract_methods(x, ml_methods_dict)
)


100%|██████████| 33130/33130 [00:48<00:00, 688.75it/s]


In [8]:
# Count how many ML methods were detected per abstract using the regex-based approach
df["method_count"] = df["ml_methods_regex"].str.len()
print(df["method_count"].value_counts().sort_index())

# Show the top 20 most frequently detected ML methods across all abstracts
df.explode("ml_methods_regex")["ml_methods_regex"].value_counts().head(20)

method_count
0     18932
1      7046
2      3763
3      1799
4       889
5       400
6       178
7        78
8        33
9         6
10        3
11        3
Name: count, dtype: int64


ml_methods_regex
Neural Network                  6747
Random Forest                   3103
Support Vector Machine          2176
Decision Tree                   1652
Convolutional Neural Network    1365
Gradient Boosting               1185
LSTM                            1107
XGBoost                         1091
Linear Regression                906
Bayesian Method                  893
Genetic Algorithm                843
K-Nearest Neighbors              813
Support Vector Regression        716
Particle Swarm Optimization      597
Principal Component Analysis     587
Deep Neural Network              483
Logistic Regression              473
SHAP                             460
Multi-Layer Perceptron           420
Gaussian Process Regression      396
Name: count, dtype: int64

In [9]:
# Save updated dataframe with extracted ML methods
output_path = save_path / "abstracts_with_ml_methods_regex.csv"
df.to_csv(output_path, index=False)

print(f"Saved processed abstracts with ML methods to: {output_path}")

Saved processed abstracts with ML methods to: ../../data/processed/abstracts/regex_scispacy/abstracts_with_ml_methods_regex.csv


## Scispacy

In [10]:
# Load the large SciSpaCy scientific language model and list all enabled NLP pipeline components
nlp = spacy.load("en_core_sci_lg")
print(nlp.pipe_names)

  deserializers["tokenizer"] = lambda p: self.tokenizer.from_disk(  # type: ignore[union-attr]


['tok2vec', 'tagger', 'attribute_ruler', 'lemmatizer', 'parser', 'ner']


In [11]:
# Add an EntityRuler to the NLP pipeline and register patterns for ML method detection
ruler = nlp.add_pipe("entity_ruler", before="ner")

patterns = []

# Create token-based matching patterns for each ML method phrase
for label, phrases in ml_methods_dict.items():
    for phrase in phrases:
        # Create a case-insensitive match pattern using LOWER tokens
        pattern = [{"LOWER": token.lower()} for token in phrase.split()]
        patterns.append({
            "label": "ML_METHOD",  # Custom entity label for ML methods
            "pattern": pattern,
            "id": label  # Canonical method name stored in ent_id_
        })

# Register all patterns in the EntityRuler
ruler.add_patterns(patterns)
print(f"Added {len(patterns)} ML_METHOD patterns to EntityRuler.")


Added 64 ML_METHOD patterns to EntityRuler.


In [12]:
# Extract ML method entities from a single abstract using the SciSpaCy EntityRuler
def extract_ml_methods_scispacy(text: str) -> List[str]:
    # Return empty list if input is not valid text
    if not isinstance(text, str) or not text.strip():
        return []
    
    doc = nlp(text)
    methods = []
    
    # Collect all detected ML_METHOD entities with their canonical IDs
    for ent in doc.ents:
        if ent.label_ == "ML_METHOD":
            canonical = ent.ent_id_ if ent.ent_id_ else ent.text
            methods.append(canonical)
    
    # Remove duplicates while preserving order
    unique_methods = list(dict.fromkeys(methods))
    return unique_methods


In [13]:
# Apply the SciSpaCy EntityRuler to extract ML methods for all abstracts efficiently using nlp.pipe
texts = df["clean_abs"].fillna("").tolist()

ml_methods_all = []

# Process abstracts in batches
for doc in tqdm(nlp.pipe(texts, batch_size=32, n_process=8), total=len(texts)):
    methods = []
    
    # Collect ML_METHOD entities found in each processed document
    for ent in doc.ents:
        if ent.label_ == "ML_METHOD":
            canonical = ent.ent_id_ if ent.ent_id_ else ent.text
            methods.append(canonical)
            
    # Store unique detected methods for this abstract
    ml_methods_all.append(list(dict.fromkeys(methods)))

# Save results
df["ml_methods_scispacy"] = ml_methods_all


  0%|          | 0/33130 [00:00<?, ?it/s]

100%|██████████| 33130/33130 [02:01<00:00, 273.04it/s]


In [14]:
# Save updated dataframe with extracted ML methods
output_path = save_path / "abstracts_with_ml_methods.csv"
df.to_csv(output_path, index=False)

print(f"Saved processed abstracts with ML methods to: {output_path}")

Saved processed abstracts with ML methods to: ../../data/processed/abstracts/regex_scispacy/abstracts_with_ml_methods.csv


### SciSpacy: Analyze Abstracts with ML methods 

In [15]:
df = pd.read_csv(save_path / "abstracts_with_ml_methods.csv")

# Convert string lists to lists
df["ml_methods_scispacy"] = df["ml_methods_scispacy"].apply(ast.literal_eval)


In [16]:
# Display the first 10 papers with their extracted ML methods from SciSpaCy
df[["title", "doi", "ml_methods_scispacy"]].head(10)


Unnamed: 0,title,doi,ml_methods_scispacy
0,Blockchain-enabled decision system for reliabl...,10.1016/B978-0-443-33740-6.00012-8,[]
1,Systematic review of data modelling methods fo...,10.1080/19397038.2025.2563271,[]
2,Clustering Locations of Collection Centers in ...,10.1109/TEMSCON-ASPAC62480.2024.11025082,[]
3,"Artificial Intelligence: Basics, Impact, and H...",10.1188/23.CJON.595-601,[]
4,Intersections between materials science and ma...,10.1039/d3va00106g,[]
5,Measuring Carbon in Cities and Their Buildings...,10.3390/asi6050076,[Linear Regression]
6,Predictive modeling for the quantity of recycl...,10.1016/j.resconrec.2023.107073,"[Support Vector Regression, Gradient Boosting,..."
7,Organizational Maturity and Its Influence on P...,10.1007/978-3-031-94484-0_27,[]
8,Toward Sustainable Manufacturing: A Review on ...,10.1109/ACCESS.2025.3576441,[]
9,Integrating Digital Twins and Robotics,,[]


In [17]:
# Count how many ML methods SciSpaCy detected per abstract and summarize the distribution
df["method_count_scispacy"] = df["ml_methods_scispacy"].str.len()
df["method_count_scispacy"].value_counts().sort_index()


method_count_scispacy
0     19301
1      7646
2      3195
3      1593
4       774
5       364
6       152
7        64
8        29
9         6
10        5
11        1
Name: count, dtype: int64

In [18]:
# Show the top 20 most frequently detected ML methods using the SciSpaCy extraction
df.explode("ml_methods_scispacy")["ml_methods_scispacy"].value_counts().head(20)


ml_methods_scispacy
Neural Network                  5580
Random Forest                   3060
Support Vector Machine          2127
Decision Tree                   1614
Convolutional Neural Network    1195
Gradient Boosting               1173
XGBoost                         1053
LSTM                             964
Bayesian Method                  882
Linear Regression                862
K-Nearest Neighbors              798
Genetic Algorithm                748
Support Vector Regression        693
Principal Component Analysis     562
Particle Swarm Optimization      542
Logistic Regression              469
Deep Neural Network              458
SHAP                             445
Multi-Layer Perceptron           391
Gaussian Process Regression      385
Name: count, dtype: int64

In [19]:
# Randomly inspect 5 abstracts where SciSpaCy successfully detected ML methods
df[df["method_count_scispacy"] > 0][["title", "doi", "ml_methods_scispacy"]].sample(5)


Unnamed: 0,title,doi,ml_methods_scispacy
16110,Machine learning-driven process of alumina cer...,10.1088/1402-4896/aca3da,[Neural Network]
21455,Reservoir production optimization based on sur...,10.1016/j.petrol.2021.108879,[Gradient Boosting]
17312,POSSIBILITIES OF APPLYING MACHINE LEARNING TEC...,10.1115/GT2024-122477,"[Neural Network, Deep Neural Network]"
21542,An Optimized Convolution Neural Network Archit...,10.32604/cmc.2022.022215,"[Neural Network, Convolutional Neural Network,..."
22222,Asynchronous Decentralized Bayesian Optimizati...,10.1109/e-Science58273.2023.10254839,"[Bayesian Method, Neural Network]"


## Compare Regex and SciSpacy

In [20]:
# Ensure we are working with list -> set for easier membership checks
df["regex_set"] = df["ml_methods_regex"].apply(lambda x: set(x) if isinstance(x, list) else set())
df["scispacy_set"] = df["ml_methods_scispacy"].apply(lambda x: set(x) if isinstance(x, list) else set())

# Collect the universe of methods seen by either approach
all_methods = sorted(
    set().union(*df["regex_set"]).union(*df["scispacy_set"])
)

rows = []

# For each method, compute overlap stats between regex and SciSpaCy
for method in all_methods:
    regex_mask = df["regex_set"].apply(lambda s: method in s)
    scis_mask = df["scispacy_set"].apply(lambda s: method in s)
    
    regex_count = regex_mask.sum()
    scispacy_count = scis_mask.sum()
    both_count = (regex_mask & scis_mask).sum()
    regex_only = (regex_mask & ~scis_mask).sum()
    scispacy_only = (~regex_mask & scis_mask).sum()
    union_count = (regex_mask | scis_mask).sum()
    
    jaccard = both_count / union_count if union_count > 0 else 0.0
    
    rows.append({
        "method": method,
        "regex_count": regex_count,
        "scispacy_count": scispacy_count,
        "both_count": both_count,
        "regex_only": regex_only,
        "scispacy_only": scispacy_only,
        "union_count": union_count,
        "jaccard_overlap": jaccard,
    })

overlap_df = pd.DataFrame(rows)

# Sort by popularity (or by overlap) as you like
overlap_df_sorted = overlap_df.sort_values("union_count", ascending=False)

# For a quick view: top 20 methods by total detections, with overlap stats
overlap_df_sorted.head(30)

Unnamed: 0,method,regex_count,scispacy_count,both_count,regex_only,scispacy_only,union_count,jaccard_overlap
19,Neural Network,0,5580,0,0,5580,5580,0.0
22,Random Forest,0,3060,0,0,3060,3060,0.0
26,Support Vector Machine,0,2127,0,0,2127,2127,0.0
3,Decision Tree,0,1614,0,0,1614,1614,0.0
2,Convolutional Neural Network,0,1195,0,0,1195,1195,0.0
9,Gradient Boosting,0,1173,0,0,1173,1173,0.0
28,XGBoost,0,1053,0,0,1053,1053,0.0
14,LSTM,0,964,0,0,964,964,0.0
1,Bayesian Method,0,882,0,0,882,882,0.0
15,Linear Regression,0,862,0,0,862,862,0.0


In [21]:
# Extract rows where SciSpaCy found at least one method
# and regex found none
scispacy_only_df = df[
    (df["scispacy_set"].apply(len) > 0) &
    (df["regex_set"].apply(len) == 0)
]

scispacy_only_df.head()

Unnamed: 0,query_id,eid,doi,title,abstract,clean_abs,ml_methods_regex,method_count,ml_methods_scispacy,method_count_scispacy,regex_set,scispacy_set
5,ml_end_of_life,2-s2.0-85174142475,10.3390/asi6050076,Measuring Carbon in Cities and Their Buildings...,© 2023 by the authors.According to the Europea...,"According to the European Green Deal, excessiv...",['Linear Regression'],1,[Linear Regression],1,{},{Linear Regression}
6,ml_end_of_life,2-s2.0-85161673110,10.1016/j.resconrec.2023.107073,Predictive modeling for the quantity of recycl...,© 2023 The Author(s)The rapid development of m...,"However, the Stacking ensemble model is less w...","['Support Vector Regression', 'Linear Regressi...",3,"[Support Vector Regression, Gradient Boosting,...",3,{},"{Gradient Boosting, Support Vector Regression,..."
11,ml_end_of_life,2-s2.0-85171544836,10.30638/eemj.2023.018,END-OF-LIFE VEHICLES ASSESSMENT OF THE AUTOMOB...,© 2023 Gheorghe Asachi Technical University of...,All rights reserved.Automotive industry is hig...,['Neural Network'],1,[Neural Network],1,{},{Neural Network}
12,ml_end_of_life,2-s2.0-85179507482,10.1115/DETC2023-114718,PREDICTING THE QUANTITY OF RECYCLED END-OF-LIF...,© 2023 American Society of Mechanical Engineer...,All rights reserved.End-of-life product recycl...,"['Support Vector Regression', 'Particle Swarm ...",2,"[Particle Swarm Optimization, Support Vector R...",2,{},"{Support Vector Regression, Particle Swarm Opt..."
27,ml_end_of_life,2-s2.0-85218456652,10.1007/978-3-031-69626-8_78,Machine Learning Integration in LCA: Addressin...,© The Author(s) 2025.Life Cycle Assessment (LC...,Life Cycle Assessment (LCA) is an essential to...,['Random Forest'],1,[Random Forest],1,{},{Random Forest}


In [22]:
# Extract rows where Regex found at least one method
# and SciSpacy found none
scispacy_only_df = df[
    (df["regex_set"].apply(len) > 0) &
    (df["scispacy_set"].apply(len) == 0)
]

scispacy_only_df.head()

Unnamed: 0,query_id,eid,doi,title,abstract,clean_abs,ml_methods_regex,method_count,ml_methods_scispacy,method_count_scispacy,regex_set,scispacy_set


## Extract ML-Methods

In [23]:
# Define common "method head" words and extract ML-related candidate phrases from parsed abstracts
METHOD_HEADS = {
    "network", "networks",
    "model", "models",
    "algorithm", "algorithms",
    "classifier", "classifiers",
    "regression",
    "forest", "forests",
    "clustering", "clusterer", "clusterers",
    "encoder", "encoders",
    "autoencoder", "autoencoders",
    "transformer", "transformers",
    "estimators", "estimator",
    "approach", "approaches",
    "architecture", "architectures",
}

def extract_candidate_phrases_from_doc(doc):
    """Extract candidate ML-method phrases from a spaCy Doc."""
    candidates = []

    # Noun chunks that end with a "method head" word
    for chunk in doc.noun_chunks:
        tokens = [t for t in chunk if not t.is_punct]
        if not tokens:
            continue

        head = tokens[-1].lemma_.lower()
        if head in METHOD_HEADS:
            if 1 <= len(tokens) <= 7:
                phrase = chunk.text.strip()
                candidates.append(phrase)

    # All-caps acronyms (e.g., SVM, CNN, LSTM)
    for token in doc:
        if (
            token.is_alpha
            and token.text.isupper()
            and 2 <= len(token.text) <= 6
        ):
            candidates.append(token.text)

    # Remove duplicates, preserve order
    return list(dict.fromkeys(candidates))


In [24]:
# Extract ML-related candidate phrases from all abstracts efficiently using spaCy's nlp.pipe
texts = df["clean_abs"].fillna("").tolist()

candidate_counter = collections.Counter()

# Batch processing with multiple processes for faster NLP execution
for doc in tqdm(
    nlp.pipe(texts, batch_size=64, n_process=8),
    total=len(texts)
):
    for cand in extract_candidate_phrases_from_doc(doc):
        candidate_counter[cand] += 1

print(f"Extracted {len(candidate_counter)} unique candidate phrases.")


  0%|          | 0/33130 [00:00<?, ?it/s]

100%|██████████| 33130/33130 [02:03<00:00, 267.88it/s]

Extracted 40432 unique candidate phrases.





In [25]:
# Show the top candidate phrases by frequency
for phrase, count in candidate_counter.most_common(100):
    print(f"{count:4d}  {phrase}")


5519  ML
3704  AI
1268  SVM
1243  RMSE
1182  the model
1139  ANN
 967  RF
 964  LSTM
 955  RUL
 892  AM
 860  CNN
 842  The model
 795  This approach
 765  MAE
 747  machine learning algorithms
 547  MDPI
 541  machine learning models
 532  DL
 530  DT
 511  Random Forest
 497  models
 470  KNN
 461  MSE
 460  SVR
 443  SHAP
 430  The approach
 425  a novel approach
 402  an approach
 402  a model
 389  MLP
 374  this approach
 368  The proposed approach
 356  MAPE
 352  PCA
 346  our approach
 335  the models
 334  PSO
 322  CC
 320  random forest
 307  GA
 296  the proposed approach
 293  Our approach
 287  the proposed model
 286  The models
 285  algorithms
 274  The proposed model
 268  This model
 265  the approach
 263  RL
 259  These models
 255  these models
 246  a machine learning model
 243  DNN
 241  neural networks
 233  PV
 233  IEEE
 231  ML models
 225  GPR
 220  predictive models
 216  NASA
 213  LR
 213  artificial neural networks
 211  CNC
 209  CPS
 206  PHM
 202  