In [3]:
# Imports

import pandas as pd

from pathlib import Path

In [10]:
# Paths
#
# Everything lives below the processed-abstracts directory: one CSV per
# upstream analysis step, plus the folder that receives the combined results.

processed_abstracts_path = Path("../../data/processed/abstracts")
save_path = processed_abstracts_path.joinpath("total_results")

# Input CSVs produced by the earlier pipeline stages
ml_methods_path = processed_abstracts_path.joinpath(
    "regex_scispacy/abstracts_with_ml_methods.csv"
)
ml_category_path = processed_abstracts_path.joinpath(
    "scibert_2/abstracts_with_predictions.csv"
)
production_category_path = processed_abstracts_path.joinpath(
    "umap_kmean/abstracts_with_clusters.csv"
)

# Create the base and output directories when they are missing
# (no error if they are already there)
for directory in (processed_abstracts_path, save_path):
    directory.mkdir(parents=True, exist_ok=True)

print("All directories verified/created.")

All directories verified/created.


In [11]:
# Load Data
#
# Read the three result tables in one pass, in this order:
#   1. ML-methods
#   2. ML-category
#   3. Production category
df_ml_methods, df_ml_category, df_prod_category = (
    pd.read_csv(csv_path)
    for csv_path in (ml_methods_path, ml_category_path, production_category_path)
)

In [15]:
# Check columns
# Print the column index of each loaded table (methods, category, production)
for frame in (df_ml_methods, df_ml_category, df_prod_category):
    print(frame.columns)

Index(['query_id', 'eid', 'doi', 'title', 'abstract', 'clean_abs',
       'ml_methods_regex', 'method_count', 'ml_methods_scispacy'],
      dtype='object')
Index(['query_id', 'eid', 'doi', 'title', 'abstract', 'clean_abs',
       'is_supervised', 'is_unsupervised', 'is_reinforcement',
       'pred_is_supervised', 'pred_is_unsupervised', 'pred_is_reinforcement'],
      dtype='object')
Index(['query_id', 'eid', 'doi', 'title', 'abstract', 'clean_abs', 'umap_x',
       'umap_y', 'kmeans_cluster'],
      dtype='object')


In [16]:
# Check head of each dataset
# First two rows of the ML-methods table
df_ml_methods.head(n=2)

Unnamed: 0,query_id,eid,doi,title,abstract,clean_abs,ml_methods_regex,method_count,ml_methods_scispacy
0,ml_end_of_life,2-s2.0-105019728098,10.1016/B978-0-443-33740-6.00012-8,Blockchain-enabled decision system for reliabl...,© 2026 Elsevier Inc. All rights reserved.As th...,All rights reserved.As the production and cons...,[],0,[]
1,ml_end_of_life,2-s2.0-105018918299,10.1080/19397038.2025.2563271,Systematic review of data modelling methods fo...,© 2025 The Author(s). Published by Informa UK ...,"Published by Informa UK Limited, trading as Ta...",[],0,[]


In [17]:
# First two rows of the ML-category table
df_ml_category.head(n=2)

Unnamed: 0,query_id,eid,doi,title,abstract,clean_abs,is_supervised,is_unsupervised,is_reinforcement,pred_is_supervised,pred_is_unsupervised,pred_is_reinforcement
0,ml_anomaly_detection_production,2-s2.0-105018574505,10.1016/j.measurement.2025.119261,Distillation anomaly and fault detection based...,© 2025 The Author(s)The detection of anomalies...,The detection of anomalies in production proce...,True,True,False,1,1,0
1,ml_anomaly_detection_production,2-s2.0-105019192533,10.1007/978-3-032-06118-8_30,From Lab to Factory: Pitfalls and Guidelines f...,"© The Author(s), under exclusive license to Sp...",", under exclusive license to Springer Nature S...",True,True,False,1,1,0


In [19]:
# First two rows of the production-category table
df_prod_category.head(n=2)

Unnamed: 0,query_id,eid,doi,title,abstract,clean_abs,umap_x,umap_y,kmeans_cluster
0,ml_end_of_life,2-s2.0-105019728098,10.1016/B978-0-443-33740-6.00012-8,Blockchain-enabled decision system for reliabl...,© 2026 Elsevier Inc. All rights reserved.As th...,All rights reserved.As the production and cons...,6.057757,2.316342,7
1,ml_end_of_life,2-s2.0-105018918299,10.1080/19397038.2025.2563271,Systematic review of data modelling methods fo...,© 2025 The Author(s). Published by Informa UK ...,"Published by Informa UK Limited, trading as Ta...",5.117168,4.450449,7


In [23]:
# Print the dtype / non-null summary of every table
# (category first, then methods, then production)
for frame in (df_ml_category, df_ml_methods, df_prod_category):
    frame.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32714 entries, 0 to 32713
Data columns (total 12 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   query_id               32714 non-null  object
 1   eid                    32714 non-null  object
 2   doi                    30136 non-null  object
 3   title                  32714 non-null  object
 4   abstract               32714 non-null  object
 5   clean_abs              32714 non-null  object
 6   is_supervised          32714 non-null  bool  
 7   is_unsupervised        32714 non-null  bool  
 8   is_reinforcement       32714 non-null  bool  
 9   pred_is_supervised     32714 non-null  int64 
 10  pred_is_unsupervised   32714 non-null  int64 
 11  pred_is_reinforcement  32714 non-null  int64 
dtypes: bool(3), int64(3), object(6)
memory usage: 2.3+ MB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 33130 entries, 0 to 33129
Data columns (total 9 columns):
 #   Colum

In [None]:
# Merge datasets
#
# Combine the three per-abstract result tables into a single frame with one
# row per abstract and the columns ID_COL, ML_METHOD_COL, ML_CAT_COL and
# PROD_CAT_COL.
#
# None of the loaded frames has a column called "abstract_id", "ml_methods",
# "ml_category" or "prod_cluster" (see the column indexes printed above), so
# selecting those names directly raises KeyError. The columns that really
# exist are selected and renamed to the output names below instead.

# Column names of the merged result
ID_COL = "abstract_id"
ML_METHOD_COL = "ml_methods"
ML_CAT_COL = "ml_category"
PROD_CAT_COL = "prod_cluster"

# Column names as they exist in the loaded CSVs
SRC_ID_COL = "eid"
# NOTE(review): "ml_methods_scispacy" is the other candidate -- confirm which
# extraction should feed the combined table.
SRC_ML_METHOD_COL = "ml_methods_regex"
# Predicted 0/1 flag column -> short label used in the combined category string
SRC_ML_CAT_FLAGS = {
    "pred_is_supervised": "supervised",
    "pred_is_unsupervised": "unsupervised",
    "pred_is_reinforcement": "reinforcement",
}
SRC_PROD_CAT_COL = "kmeans_cluster"

# The category predictions are multi-label (several flags can be set for one
# row), so collapse them into one string such as "supervised+unsupervised";
# rows with no flag set get "none".
# NOTE(review): the label format is a choice made here -- adjust if downstream
# code expects something else.
_flag_names = list(SRC_ML_CAT_FLAGS.values())
_flag_rows = (
    df_ml_category[list(SRC_ML_CAT_FLAGS)]
    .astype(bool)
    .itertuples(index=False, name=None)
)
ml_category_labels = [
    "+".join(name for is_set, name in zip(row, _flag_names) if is_set) or "none"
    for row in _flag_rows
]

# Keep only the needed columns, renamed to the output names.
# Duplicate keys are dropped first: if the same eid occurs more than once
# (presumably when one paper is returned by several query_id searches -- verify),
# an un-deduplicated merge would multiply rows.
df_ml_methods_sub = (
    df_ml_methods[[SRC_ID_COL, SRC_ML_METHOD_COL]]
    .rename(columns={SRC_ID_COL: ID_COL, SRC_ML_METHOD_COL: ML_METHOD_COL})
    .drop_duplicates(subset=ID_COL)
)
df_ml_cat_sub = (
    df_ml_category[[SRC_ID_COL]]
    .rename(columns={SRC_ID_COL: ID_COL})
    .assign(**{ML_CAT_COL: ml_category_labels})
    .drop_duplicates(subset=ID_COL)
)
df_prod_cat_sub = (
    df_prod_category[[SRC_ID_COL, SRC_PROD_CAT_COL]]
    .rename(columns={SRC_ID_COL: ID_COL, SRC_PROD_CAT_COL: PROD_CAT_COL})
    .drop_duplicates(subset=ID_COL)
)

# Merge all on abstract id. The inner join keeps only abstracts present in all
# three tables; validate= makes pandas raise if a key is unexpectedly repeated.
df_all = (
    df_ml_methods_sub
    .merge(df_ml_cat_sub, on=ID_COL, how="inner", validate="one_to_one")
    .merge(df_prod_cat_sub, on=ID_COL, how="inner", validate="one_to_one")
)

print(df_all.shape)
df_all.head()
