In [36]:
# Imports

import pandas as pd
from pathlib import Path
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
from sentence_transformers import SentenceTransformer
from tqdm import tqdm
from collections import Counter
import ast


In [2]:
# Paths

processed_abstracts_path = Path("../../data/processed/abstracts")

# Dataset with cluster + ml-paradigm + ml-methods
cluster_path = processed_abstracts_path / "total_results" / "kmeans_scibert.csv"

# Dataset with PLC + ml-paradigm + ml-methods 
plc_path = processed_abstracts_path / "total_results" / "plc_scibert.csv"

save_path = processed_abstracts_path / "roadmap"

# Ensure directories exist
for p in [processed_abstracts_path, save_path]:
    p.mkdir(parents=True, exist_ok=True)

print("All directories verified/created.")


All directories verified/created.


In [3]:
# Load Datasets

df_cluster = pd.read_csv(cluster_path)
df_plc = pd.read_csv(plc_path)
df_abstracts = pd.read_csv(processed_abstracts_path / "abstracts.csv")

In [4]:
# Remove abstract duplicates

# Count how many rows each query_id has
query_counts = df_abstracts["query_id"].value_counts().to_dict()

# Create a copy and map the counts to each row
df_abstracts = df_abstracts.copy()
df_abstracts["query_size"] = df_abstracts["query_id"].map(query_counts)

# Sort so that query groups with fewer rows are prioritized
df_abstracts_sorted = df_abstracts.sort_values(by="query_size", ascending=True)

# Remove duplicate DOIs, keeping the one in the smallest query group
df_abstracts_dedup = df_abstracts_sorted.drop_duplicates(subset="doi", keep="first").drop(columns=["query_size"])

# Print results
print("Original dataset size:", len(df_abstracts))
print("After removing duplicates:", len(df_abstracts_dedup))
print("Remaining duplicate DOIs:", df_abstracts_dedup["doi"].duplicated().sum())

df_abstracts = df_abstracts_dedup

Original dataset size: 52290
After removing duplicates: 33130
Remaining duplicate DOIs: 0


In [5]:
df_cluster.info()
df_cluster.head(2)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 35612 entries, 0 to 35611
Data columns (total 5 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   Unnamed: 0     35612 non-null  int64 
 1   doi            35612 non-null  object
 2   ml_method      35612 non-null  object
 3   ml_category    35612 non-null  object
 4   prod_category  35612 non-null  int64 
dtypes: int64(2), object(3)
memory usage: 1.4+ MB


Unnamed: 0.1,Unnamed: 0,doi,ml_method,ml_category,prod_category
0,0,10.3390/asi6050076,Linear Regression,supervised,3
1,1,10.1016/j.resconrec.2023.107073,Linear Regression,supervised,7


In [6]:
df_plc.info()
df_plc.head(2)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 35689 entries, 0 to 35688
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Unnamed: 0   35689 non-null  int64 
 1   doi          35689 non-null  object
 2   ml_method    35689 non-null  object
 3   ml_category  35689 non-null  object
 4   phase        35689 non-null  int64 
dtypes: int64(2), object(3)
memory usage: 1.4+ MB


Unnamed: 0.1,Unnamed: 0,doi,ml_method,ml_category,phase
0,0,10.3390/asi6050076,Linear Regression,supervised,1
1,1,10.1016/j.resconrec.2023.107073,Linear Regression,supervised,4


In [7]:
df_abstracts.info()
df_abstracts.head(2)

<class 'pandas.core.frame.DataFrame'>
Index: 33130 entries, 3373 to 4329
Data columns (total 6 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   query_id   33130 non-null  object
 1   eid        33130 non-null  object
 2   doi        33129 non-null  object
 3   title      33130 non-null  object
 4   abstract   33130 non-null  object
 5   clean_abs  33129 non-null  object
dtypes: object(6)
memory usage: 1.8+ MB


Unnamed: 0,query_id,eid,doi,title,abstract,clean_abs
3373,ml_end_of_life,2-s2.0-105019728098,10.1016/B978-0-443-33740-6.00012-8,Blockchain-enabled decision system for reliabl...,© 2026 Elsevier Inc. All rights reserved.As th...,All rights reserved.As the production and cons...
3374,ml_end_of_life,2-s2.0-105018918299,10.1080/19397038.2025.2563271,Systematic review of data modelling methods fo...,© 2025 The Author(s). Published by Informa UK ...,"Published by Informa UK Limited, trading as Ta..."


In [21]:
model = SentenceTransformer("sentence-transformers/all-mpnet-base-v2")

In [22]:
tokenizer = model.tokenizer  # tokenizer associated with MPNet model
max_len = model.get_max_seq_length()  # typically 384 for MPNet

def embed_with_chunking(text):
    tokens = tokenizer.tokenize(text)

    # If below limit -> embed normally
    if len(tokens) <= max_len:
        return model.encode(
            text,
            convert_to_numpy=True,
            normalize_embeddings=True,
        )

    # Split into chunks that fit model's input size
    chunks = [
        tokenizer.convert_tokens_to_string(tokens[i:i+max_len])
        for i in range(0, len(tokens), max_len)
    ]

    # Embed each chunk and average
    chunk_embeddings = model.encode(
        chunks,
        convert_to_numpy=True,
        normalize_embeddings=True,
    )
    return np.mean(chunk_embeddings, axis=0)




## Embedding of abstracts - no need to run this multiple times

In [None]:
# # Batch embedding loop with chunking 

# texts = df_abstracts["clean_abs"].astype(str).tolist()
# batch_size = 512

# embeddings = []

# for start in tqdm(range(0, len(texts), batch_size), desc="Embedding abstracts"):
#     batch = texts[start:start+batch_size]

#     batch_emb = [embed_with_chunking(text) for text in batch]
#     embeddings.append(np.vstack(batch_emb))

# embeddings = np.vstack(embeddings)

# print("Embedding matrix shape:", embeddings.shape)


In [None]:
# # Save embeddings to .npy file
# emb_path = save_path / "abstract_embeddings.npy"
# np.save(emb_path, embeddings)
# print("Saved embeddings to:", emb_path)

Saved embeddings to: ../../data/processed/abstracts/roadmap/abstract_embeddings.npy


## Embedding of Titles - no need to run this multiple times

In [24]:
# Embed titles
title_texts = df_abstracts["title"].astype(str).tolist()
batch_size = 512

title_embeddings = []
for start in tqdm(range(0, len(title_texts), batch_size), desc="Embedding titles"):
    batch = title_texts[start:start+batch_size]
    batch_emb = model.encode(
        batch,
        convert_to_numpy=True,
        normalize_embeddings=True,
    )
    title_embeddings.append(batch_emb)

title_embeddings = np.vstack(title_embeddings)
np.save(save_path / "title_embeddings.npy", title_embeddings)


Embedding titles: 100%|██████████| 65/65 [00:16<00:00,  3.94it/s]


## Build roadmap dataframe - no need to run this multiple times

In [None]:
# # Build Roadmap DataFrame

# # Load embeddings and attach to abstracts (same row order as df_abstracts)
# abstract_embeddings = np.load(save_path / "abstract_embeddings.npy")
# df_abs = df_abstracts[["doi", "title", "clean_abs"]].copy()
# df_abs["emb_idx"] = np.arange(len(df_abs))

# # Build per-doi attributes from your labeled outputs
# plc_agg = (df_plc.groupby("doi")
#            .agg(phase=("phase", lambda s: s.mode().iloc[0] if not s.mode().empty else np.nan),
#                 ml_category=("ml_category", lambda s: s.mode().iloc[0] if not s.mode().empty else np.nan),
#                 ml_methods_plc=("ml_method", lambda s: sorted(set(map(str, s.dropna())))))
#            .reset_index())

# clu_agg = (df_cluster.groupby("doi")
#            .agg(prod_category=("prod_category", lambda s: s.mode().iloc[0] if not s.mode().empty else np.nan),
#                 ml_methods_cluster=("ml_method", lambda s: sorted(set(map(str, s.dropna())))))
#            .reset_index())

# # Merge into one roadmap dataframe (one row per doi)
# df_rm = df_abs.merge(plc_agg, on="doi", how="left").merge(clu_agg, on="doi", how="left")

# def _as_list(x):
#     # Convert NaN/None to []
#     if x is None:
#         return []
#     if isinstance(x, float) and np.isnan(x):
#         return []
#     return x if isinstance(x, list) else [str(x)]

# # Use only one source for ml_methods (pick plc here)
# df_rm["ml_methods"] = df_rm["ml_methods_plc"].apply(_as_list)

# # Drop the extra columns
# df_rm = df_rm.drop(columns=["ml_methods_plc", "ml_methods_cluster"], errors="ignore")

# # Keep only abstracts that have labels in BOTH df_plc and df_cluster
# valid_dois = set(df_plc["doi"].dropna().unique()) & set(df_cluster["doi"].dropna().unique())

# df_rm = df_rm[df_rm["doi"].isin(valid_dois)].reset_index(drop=True)

# df_rm["phase"] = df_rm["phase"].astype("Int64")
# df_rm["prod_category"] = df_rm["prod_category"].astype("Int64")

# # Keep embeddings aligned with filtered df_rm
# E_abs = abstract_embeddings[df_rm["emb_idx"].to_numpy()]

# print("Filtered df_rm rows:", len(df_rm), "| unique dois:", df_rm["doi"].nunique())

# print("df_rm rows:", len(df_rm), "| unique dois:", df_rm["doi"].nunique())
# df_rm.head(2)


Filtered df_rm rows: 15205 | unique dois: 15205
df_rm rows: 15205 | unique dois: 15205


Unnamed: 0,doi,title,clean_abs,emb_idx,phase,ml_category,prod_category,ml_methods
0,10.3390/asi6050076,Measuring Carbon in Cities and Their Buildings...,"According to the European Green Deal, excessiv...",5,1,supervised,3,[Linear Regression]
1,10.1016/j.resconrec.2023.107073,Predictive modeling for the quantity of recycl...,"However, the Stacking ensemble model is less w...",6,4,supervised,7,"[Gradient Boosting, Linear Regression, Support..."


In [None]:
# # Save Roadmap DataFrame and aligned embeddings
# rm_path = save_path / "roadmap_dataset.csv"
# df_rm.to_csv(rm_path, index=False)
# print("Saved roadmap dataset to:", rm_path)

Saved roadmap dataset to: ../../data/processed/abstracts/roadmap/roadmap_dataset.csv


In [27]:
# Load embeddings and roadmap dataframe
abstract_embeddings = np.load(save_path / "abstract_embeddings.npy")
title_embeddings = np.load(save_path / "title_embeddings.npy")

df_rm = pd.read_csv(save_path / "roadmap_dataset.csv")
E_abs = abstract_embeddings[df_rm["emb_idx"].to_numpy()]
E_title = title_embeddings[df_rm["emb_idx"].to_numpy()]

In [29]:
# Check that embeddings align with df_rm
print("Abstract embeddings shape:", E_abs.shape)
print("Title embeddings shape:", E_title.shape)


Abstract embeddings shape: (15205, 768)
Title embeddings shape: (15205, 768)


In [30]:
# Build a discrete location key
df_rm["loc_key"] = list(zip(df_rm["phase"], df_rm["prod_category"], df_rm["ml_category"].astype(str)))

# Fast lookup: loc_key -> row indices
loc_index = df_rm.groupby("loc_key").indices

print("Unique locations:", len(loc_index))


Unique locations: 151


In [37]:
# If df_rm is loaded from CSV, parse the ml_methods column to make it lists again, not strings

def parse_list_cell(x):
    # Convert "['a','b']" -> ['a','b']
    if isinstance(x, list):
        return x
    if x is None or (isinstance(x, float) and np.isnan(x)):
        return []
    if isinstance(x, str):
        s = x.strip()
        if s == "" or s.lower() in ["nan", "none"]:
            return []
        try:
            v = ast.literal_eval(s)
            return v if isinstance(v, list) else [str(v)]
        except Exception:
            # Fallback: treat as single method string
            return [s]
    return [str(x)]

df_rm["ml_methods"] = df_rm["ml_methods"].apply(parse_list_cell)

In [55]:
def recommend_same_location(
    query_text,
    query_phase,
    query_cluster,
    query_ml_category,
    w_title=0.25,
    topk=5,
):
    key = (query_phase, query_cluster, str(query_ml_category))
    idxs = loc_index.get(key, [])

    if len(idxs) == 0:
        return df_rm.iloc[0:0].copy()

    q = embed_with_chunking(str(query_text))

    idxs = np.array(list(idxs), dtype=int)

    # Similarity to title and abstract
    sim_title = E_title[idxs] @ q
    sim_abs = E_abs[idxs] @ q

    sim = w_title * sim_title + (1 - w_title) * sim_abs

    order = np.argsort(sim)[::-1][:topk]
    out = df_rm.iloc[idxs[order]].copy()
    out["sim_title"] = sim_title[order]
    out["sim_abstract"] = sim_abs[order]
    out["sim_total"] = sim[order]

    return out[
        ["doi", "title", "phase", "prod_category", "ml_category",
         "ml_methods", "sim_title", "sim_abstract", "sim_total"]
    ]


In [51]:
def summarize_methods(recs, topn=8):
    # Count ml methods in recommended papers
    methods = []
    for xs in recs["ml_methods"].tolist():
        if isinstance(xs, list):
            methods.extend(xs)
    return Counter(methods).most_common(topn)


In [52]:
def roadmap_same_location(
    query_text,
    query_phase,
    query_cluster,
    query_ml_category,
    topk=5,
):
    recs = recommend_same_location(
        query_text=query_text,
        query_phase=query_phase,
        query_cluster=query_cluster,
        query_ml_category=query_ml_category,
        topk=topk,
    )
    methods = summarize_methods(recs) if len(recs) > 0 else []
    return recs, methods


In [56]:
query_text = "Predictive maintenance for car engine"

recs, top_methods = roadmap_same_location(
    query_text=query_text,
    query_phase=3,
    query_cluster=8,
    query_ml_category="supervised",
    topk=5,
)

display(recs)
print("Recommended ML methods:", top_methods)


Unnamed: 0,doi,title,phase,prod_category,ml_category,ml_methods,sim_title,sim_abstract,sim_total
3468,10.1016/j.engappai.2025.111199,Optimizing aircraft engine longevity: A compar...,3,8,supervised,[Gaussian Process Regression],0.682577,0.753321,0.735635
8483,10.1063/5.0173800,Short Term Predictive Maintenance Using Machin...,3,8,supervised,"[Decision Tree, Linear Regression]",0.738961,0.676061,0.691786
8907,10.1109/DeSE.2019.00181,A jet engine prognostic and diagnostic system ...,3,8,supervised,[Bayesian Method],0.626479,0.713551,0.691783
8718,10.1007/978-981-16-2183-3_65,Predictive Maintenance for a Turbofan Engine U...,3,8,supervised,[Support Vector Machine],0.775021,0.663278,0.691214
8344,10.1007/978-3-031-65522-7_9,Prediction of Automotive Vehicles Engine Healt...,3,8,supervised,"[Logistic Regression, Multi-Layer Perceptron]",0.670471,0.68869,0.684135


Recommended ML methods: [('Gaussian Process Regression', 1), ('Decision Tree', 1), ('Linear Regression', 1), ('Bayesian Method', 1), ('Support Vector Machine', 1), ('Logistic Regression', 1), ('Multi-Layer Perceptron', 1)]
