- Apply rule-based extraction to identify machine learning methods in abstracts
- Test two complementary approaches:
  - **Regex-based matching** for known ML terms
  - **SciSpaCy + EntityRuler** for robust phrase detection

In [1]:
# Imports

import re
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import json
from tqdm import tqdm
import collections
import spacy

In [2]:
# Paths

data_path = Path("../../data/short-raw-refs-abs")
processed_abstracts_path = Path("../../data/processed/abstracts")
save_path = processed_abstracts_path / "scispacy"


# Ensure directories exist
for p in [data_path, processed_abstracts_path, save_path]:
    p.mkdir(parents=True, exist_ok=True)

print("All directories verified/created.")

All directories verified/created.


In [3]:
# Load abstracts dataset

abstracts_path = processed_abstracts_path / "abstracts.csv"
df = pd.read_csv(abstracts_path)

print(f"Loaded {len(df)} abstracts from {df['query_id'].nunique()} queries.")
df.head()

Loaded 52290 abstracts from 24 queries.


Unnamed: 0,query_id,eid,doi,title,abstract,clean_abs
0,ml_anomaly_detection_production,2-s2.0-105018574505,10.1016/j.measurement.2025.119261,Distillation anomaly and fault detection based...,© 2025 The Author(s)The detection of anomalies...,"Indeed, highly efficient systems do not always..."
1,ml_anomaly_detection_production,2-s2.0-105019192533,10.1007/978-3-032-06118-8_30,From Lab to Factory: Pitfalls and Guidelines f...,"© The Author(s), under exclusive license to Sp...",The detection and localization of quality-rela...
2,ml_anomaly_detection_production,2-s2.0-105016669957,10.1007/978-3-032-04200-2_5,Intelligent Defect Detection for Manufacturing...,"© The Author(s), under exclusive license to Sp...","In modern Industry, I4.0, artificial intellige..."
3,ml_anomaly_detection_production,2-s2.0-85218693791,10.1038/s41598-025-90810-w,Hybrid machine learning framework for predicti...,© The Author(s) 2025.The critical necessity fo...,The critical necessity for sophisticated predi...
4,ml_anomaly_detection_production,2-s2.0-105018301117,10.1016/j.comnet.2025.111753,BGP anomaly detection using the raw internet t...,© 2025 The AuthorsThe Border Gateway Protocol ...,"Hence, detecting any anomaly concerning BGP an..."


In [4]:
# Count how many rows each query_id has
query_counts = df["query_id"].value_counts().to_dict()

# Create a copy and map the counts to each row
df = df.copy()
df["query_size"] = df["query_id"].map(query_counts)

# Sort so that query groups with fewer rows are prioritized
df_sorted = df.sort_values(by="query_size", ascending=True)

# Remove duplicate DOIs, keeping the one in the smallest query group
df_dedup = df_sorted.drop_duplicates(subset="doi", keep="first").drop(columns=["query_size"])

# Print results
print("Original dataset size:", len(df))
print("After removing duplicates:", len(df_dedup))
print("Remaining duplicate DOIs:", df_dedup["doi"].duplicated().sum())

df = df_dedup

Original dataset size: 52290
After removing duplicates: 33130
Remaining duplicate DOIs: 0


In [9]:
ml_methods_dict = {
    "Principal Component Analysis": ["principal component analysis", "pca"],
    "Support Vector Machine": ["support vector machine", "support vector machines", "svm"],
    "Random Forest": ["random forest", "random forests"],
    "Neural Network": ["neural network", "neural networks", "nn"],
    "Convolutional Neural Network": ["convolutional neural network", "cnn", "convolutional neural networks"],
    "Recurrent Neural Network": ["recurrent neural network", "rnn", "recurrent neural networks"],
    "LSTM": ["lstm"],
    "Autoencoder": ["autoencoder", "auto-encoder", "auto encoder"],
    "K-Means": ["k-means", "k means", "kmeans"],
    "Decision Tree": ["decision tree", "decision trees"],
    "Bayesian Method": ["bayesian", "bayesian inference", "bayesian network", "bayesian networks"],
    "Reinforcement Learning": ["reinforcement learning", "rl"],
    "Gaussian Process": ["gaussian process", "gaussian processes"],
    "Logistic Regression": ["logistic regression"],
    "Linear Regression": ["linear regression"],
    "Isolation Forest": ["isolation forest"],
    "Local Outlier Factor": ["local outlier factor", "lof"],
    "Hotelling T2": ["hotelling's t2", "hotelling t2"],
    "Squared Prediction Error": ["squared prediction error", "spe"],
}


## Regex matching

In [None]:
def extract_methods(text, term_dict):
    # Ensure input is a string, return empty list otherwise
    if not isinstance(text, str):
        return []
    
    found = []
    # Convert text to lowercase for case-insensitive matching
    text = text.lower()
    
    for method, phrases in term_dict.items():
        for phrase in phrases:
            # Case-insensitive substring match for each phrase variant
            if phrase.lower() in text:
                found.append(method)
                break  # Stop checking additional phrases once one is matched
    
    # Remove duplicates while preserving original order
    return list(dict.fromkeys(found))


# Apply method extraction to each abstract using pandas progress bar
tqdm.pandas()
df["ml_methods"] = df["clean_abs"].fillna("").progress_apply(
    lambda x: extract_methods(x, ml_methods_dict)
)


100%|██████████| 33130/33130 [00:01<00:00, 18277.31it/s]


In [12]:
df["method_count"] = df["ml_methods"].str.len()
print(df["method_count"].value_counts().sort_index())

df.explode("ml_methods")["ml_methods"].value_counts().head(20)


method_count
0      5050
1     10833
2      9800
3      4811
4      1717
5       649
6       193
7        56
8        14
9         2
10        5
Name: count, dtype: int64


ml_methods
Neural Network                  16234
Squared Prediction Error        15753
Reinforcement Learning          10295
Random Forest                    2968
Support Vector Machine           2201
Convolutional Neural Network     1729
Decision Tree                    1357
LSTM                             1196
Bayesian Method                   894
Linear Regression                 819
Gaussian Process                  627
Principal Component Analysis      600
Autoencoder                       546
Recurrent Neural Network          491
Logistic Regression               477
K-Means                           347
Isolation Forest                  109
Local Outlier Factor               52
Hotelling T2                       14
Name: count, dtype: int64

## Scispacy

In [8]:
nlp = spacy.load("en_core_sci_lg")
print(nlp.pipe_names)

['tok2vec', 'tagger', 'attribute_ruler', 'lemmatizer', 'parser', 'ner']


  deserializers["tokenizer"] = lambda p: self.tokenizer.from_disk(  # type: ignore[union-attr]


In [None]:
from spacy.pipeline import EntityRuler

ruler = nlp.add_pipe("entity_ruler", before="ner")

patterns = []

for label, phrases in ml_methods_dict.items():
    for phrase in phrases:
        pattern = [{"LOWER": token.lower()} for token in phrase.split()]
        patterns.append({
            "label": "ML_METHOD",          
            "pattern": pattern,
            "id": label                     
        })

ruler.add_patterns(patterns)
print(f"Added {len(patterns)} ML_METHOD patterns to EntityRuler.")


Added 42 ML_METHOD patterns to EntityRuler.


In [None]:
from typing import List, Dict
import numpy as np

def extract_ml_methods_scispacy(text: str) -> List[str]:
    if not isinstance(text, str) or not text.strip():
        return []
    
    doc = nlp(text)
    methods = []
    
    for ent in doc.ents:
        if ent.label_ == "ML_METHOD":
            canonical = ent.ent_id_ if ent.ent_id_ else ent.text
            methods.append(canonical)
    
    unique_methods = list(dict.fromkeys(methods))
    return unique_methods


In [15]:
sample_text = df["clean_abs"].dropna().iloc[0]
extract_ml_methods_scispacy(sample_text)


[]

In [16]:
from tqdm import tqdm

texts = df["clean_abs"].fillna("").tolist()

ml_methods_all = []

for doc in tqdm(nlp.pipe(texts, batch_size=32, n_process=1), total=len(texts)):
    methods = []
    for ent in doc.ents:
        if ent.label_ == "ML_METHOD":
            canonical = ent.ent_id_ if ent.ent_id_ else ent.text
            methods.append(canonical)
    ml_methods_all.append(list(dict.fromkeys(methods)))

df["ml_methods_scispacy"] = ml_methods_all


100%|██████████| 33130/33130 [11:37<00:00, 47.49it/s]


In [17]:
df[["title", "ml_methods_scispacy"]].head(10)


Unnamed: 0,title,ml_methods_scispacy
3373,Blockchain-enabled decision system for reliabl...,[]
3374,Systematic review of data modelling methods fo...,[]
3387,Clustering Locations of Collection Centers in ...,[Reinforcement Learning]
3388,"Artificial Intelligence: Basics, Impact, and H...",[]
3389,Intersections between materials science and ma...,[]
3390,Measuring Carbon in Cities and Their Buildings...,[Linear Regression]
3391,Predictive modeling for the quantity of recycl...,[Linear Regression]
3375,Organizational Maturity and Its Influence on P...,[]
3376,Toward Sustainable Manufacturing: A Review on ...,[]
3377,Integrating Digital Twins and Robotics,[]


In [18]:
df["method_count_scispacy"] = df["ml_methods_scispacy"].str.len()
df["method_count_scispacy"].value_counts().sort_index()


method_count_scispacy
0    20218
1     8495
2     2825
3     1102
4      376
5       89
6       16
7        6
8        2
9        1
Name: count, dtype: int64

In [19]:
df.explode("ml_methods_scispacy")["ml_methods_scispacy"].value_counts().head(20)


ml_methods_scispacy
Neural Network                  5151
Random Forest                   2932
Support Vector Machine          2127
Convolutional Neural Network    1528
Decision Tree                   1330
LSTM                             964
Reinforcement Learning           947
Bayesian Method                  882
Linear Regression                750
Gaussian Process                 614
Principal Component Analysis     562
Logistic Regression              469
Recurrent Neural Network         429
Autoencoder                      378
K-Means                          333
Isolation Forest                 103
Local Outlier Factor              42
Squared Prediction Error          16
Hotelling T2                       6
Name: count, dtype: int64