## Compare KMeans (applications) and PLC clustering results

In [13]:
from pathlib import Path
from collections import Counter
import json

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.metrics import adjusted_rand_score, normalized_mutual_info_score

In [3]:
# Input file locations, all resolved relative to the processed-abstracts directory

processed_abstracts_path = Path("../../data/processed/abstracts")

# Sub-directories that hold the PLC and the k-means outputs respectively
_plc_dir = processed_abstracts_path / "semantic_analysis_ext_desc"
_kmeans_dir = processed_abstracts_path / "umap_kmean_tuned"

# CSV tables with the per-abstract labels
plc_path = _plc_dir / "abstracts_min_phases.csv"
kmeans_path = _kmeans_dir / "abstracts_with_clusters.csv"

# JSON files with the keywords of each labeling scheme
keywords_plc_path = _plc_dir / "keywords.json"
keywords_kmeans_path = _kmeans_dir / "cluster_keywords.json"


In [4]:
# Read the two label tables and their keyword files

# Tabular inputs: PLC table and k-means table
df_plc = pd.read_csv(plc_path)
df_kmeans = pd.read_csv(kmeans_path)

# Keyword files are UTF-8 encoded JSON; parse the PLC one first, then the k-means one
plc_keywords = json.loads(keywords_plc_path.read_text(encoding="utf-8"))
kmeans_keywords = json.loads(keywords_kmeans_path.read_text(encoding="utf-8"))

In [5]:
# Print column names, non-null counts and dtypes of the PLC table
df_plc.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 33130 entries, 0 to 33129
Data columns (total 7 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   query_id   33130 non-null  object
 1   eid        33130 non-null  object
 2   doi        33129 non-null  object
 3   title      33130 non-null  object
 4   abstract   33130 non-null  object
 5   clean_abs  33129 non-null  object
 6   phase      33130 non-null  int64 
dtypes: int64(1), object(6)
memory usage: 1.8+ MB


In [6]:
# Print column names, non-null counts and dtypes of the k-means table
df_kmeans.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 33130 entries, 0 to 33129
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   query_id        33130 non-null  object 
 1   eid             33130 non-null  object 
 2   doi             33129 non-null  object 
 3   title           33130 non-null  object 
 4   abstract        33130 non-null  object 
 5   clean_abs       33129 non-null  object 
 6   umap_x          33130 non-null  float64
 7   umap_y          33130 non-null  float64
 8   kmeans_cluster  33130 non-null  int64  
dtypes: float64(2), int64(1), object(6)
memory usage: 2.3+ MB


In [7]:
# Merge PLC phases and k-means clusters into one DataFrame, joined on eid

cols_plc = ["eid", "query_id", "doi", "title", "abstract", "clean_abs", "phase"]
cols_km = ["eid", "kmeans_cluster"]

# validate="one_to_one" makes pandas raise a MergeError if eid is duplicated
# in either table, so every merged row pairs exactly one row from each side.
df = (
    df_plc[cols_plc]
    .merge(df_kmeans[cols_km], on="eid", how="inner", validate="one_to_one")
)

# An inner join silently discards eids that exist in only one table. Because the
# merge is one-to-one, the row-count difference is exactly the number of dropped
# rows per side; report it so the comparison is not run on a subset unnoticed.
unmatched_plc = len(df_plc) - len(df)
unmatched_km = len(df_kmeans) - len(df)
if unmatched_plc or unmatched_km:
    print(
        f"WARNING: inner merge dropped {unmatched_plc} PLC rows and "
        f"{unmatched_km} k-means rows without a matching eid"
    )

print("Merged df shape:", df.shape)
df.head()

Merged df shape: (33130, 8)


Unnamed: 0,eid,query_id,doi,title,abstract,clean_abs,phase,kmeans_cluster
0,2-s2.0-105019728098,ml_end_of_life,10.1016/B978-0-443-33740-6.00012-8,Blockchain-enabled decision system for reliabl...,© 2026 Elsevier Inc. All rights reserved.As th...,All rights reserved.As the production and cons...,4,3
1,2-s2.0-105018918299,ml_end_of_life,10.1080/19397038.2025.2563271,Systematic review of data modelling methods fo...,© 2025 The Author(s). Published by Informa UK ...,"Published by Informa UK Limited, trading as Ta...",4,11
2,2-s2.0-105009033696,ml_end_of_life,10.1109/TEMSCON-ASPAC62480.2024.11025082,Clustering Locations of Collection Centers in ...,© 2024 IEEE.Waste generation is a significant ...,Waste generation is a significant issue in sus...,4,10
3,2-s2.0-85178021268,ml_end_of_life,10.1188/23.CJON.595-601,"Artificial Intelligence: Basics, Impact, and H...","© 2023, Oncology Nursing Society. All rights r...",All rights reserved.Applying artificial intell...,1,3
4,2-s2.0-85176777527,ml_end_of_life,10.1039/d3va00106g,Intersections between materials science and ma...,© 2023 RSCPlastics are an integral part of the...,"However, their widespread contamination in the...",4,1


In [9]:
# Show how many abstracts fall into each PLC phase and each k-means cluster

for heading, column in (
    ("Phase counts:", "phase"),
    ("\nK-means cluster counts:", "kmeans_cluster"),
):
    print(heading)
    # Count per label value, ordered by label rather than by frequency
    label_counts = df[column].value_counts().sort_index()
    display(label_counts.to_frame("count"))

Phase counts:


Unnamed: 0_level_0,count
phase,Unnamed: 1_level_1
1,2672
2,8892
3,15580
4,5986



K-means cluster counts:


Unnamed: 0_level_0,count
kmeans_cluster,Unnamed: 1_level_1
0,4340
1,3045
2,2328
3,2857
4,1045
5,2056
6,2280
7,1566
8,2057
9,2204


In [None]:
# Cross-tabulate PLC phase (rows) against k-means cluster (columns)

ct = pd.crosstab(index=df["phase"], columns=df["kmeans_cluster"])
print("Counts (phase x cluster):")
display(ct)

# Divide every row by its own total: share of each cluster inside a phase
phase_totals = ct.sum(axis="columns")
print("\nRow-normalized (within phase):")
display(ct.div(phase_totals, axis="index").round(3))

# Divide every column by its own total: share of each phase inside a cluster
cluster_totals = ct.sum(axis="index")
print("\nColumn-normalized (within cluster):")
display(ct.div(cluster_totals, axis="columns").round(3))


Counts (phase x cluster):


kmeans_cluster,0,1,2,3,4,5,6,7,8,9,10,11,12
phase,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
1,4,483,7,549,139,185,275,288,66,48,458,117,53
2,28,535,690,584,95,803,248,148,789,443,176,1740,2613
3,2847,1699,1434,1065,259,929,1202,924,791,1383,1113,1262,672
4,1461,328,197,659,552,139,555,206,411,330,576,348,224



Row-normalized (within phase):


kmeans_cluster,0,1,2,3,4,5,6,7,8,9,10,11,12
phase,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
1,0.001,0.181,0.003,0.205,0.052,0.069,0.103,0.108,0.025,0.018,0.171,0.044,0.02
2,0.003,0.06,0.078,0.066,0.011,0.09,0.028,0.017,0.089,0.05,0.02,0.196,0.294
3,0.183,0.109,0.092,0.068,0.017,0.06,0.077,0.059,0.051,0.089,0.071,0.081,0.043
4,0.244,0.055,0.033,0.11,0.092,0.023,0.093,0.034,0.069,0.055,0.096,0.058,0.037



Column-normalized (within cluster):


kmeans_cluster,0,1,2,3,4,5,6,7,8,9,10,11,12
phase,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
1,0.001,0.159,0.003,0.192,0.133,0.09,0.121,0.184,0.032,0.022,0.197,0.034,0.015
2,0.006,0.176,0.296,0.204,0.091,0.391,0.109,0.095,0.384,0.201,0.076,0.502,0.734
3,0.656,0.558,0.616,0.373,0.248,0.452,0.527,0.59,0.385,0.627,0.479,0.364,0.189
4,0.337,0.108,0.085,0.231,0.528,0.068,0.243,0.132,0.2,0.15,0.248,0.1,0.063


In [None]:
# Per cluster: which phase contributes the most abstracts, and what share it holds

# Long format of the contingency table: one row per (phase, kmeans_cluster) pair
cluster_summary = ct.stack().reset_index(name="count")

# Row label of the largest count within each cluster
idx = cluster_summary.groupby("kmeans_cluster")["count"].idxmax()

# Cluster sizes (column sums of the contingency table)
total_per_cluster = ct.sum(axis=0)

dominant = cluster_summary.loc[idx].assign(
    total_cluster=lambda frame: frame["kmeans_cluster"].map(total_per_cluster),
    share_in_cluster=lambda frame: frame["count"] / frame["total_cluster"],
)

dominant.sort_values("kmeans_cluster").reset_index(drop=True)


Unnamed: 0,phase,kmeans_cluster,count,total_cluster,share_in_cluster
0,3,0,2847,4340,0.655991
1,3,1,1699,3045,0.557964
2,3,2,1434,2328,0.615979
3,3,3,1065,2857,0.372769
4,4,4,552,1045,0.52823
5,3,5,929,2056,0.451848
6,3,6,1202,2280,0.527193
7,3,7,924,1566,0.590038
8,3,8,791,2057,0.384541
9,3,9,1383,2204,0.627495


In [None]:
# Agreement between the PLC phase labels and the k-means cluster labels

# Pull both label columns out of the merged frame as arrays
phases, clusters = df["phase"].values, df["kmeans_cluster"].values

ari = adjusted_rand_score(labels_true=phases, labels_pred=clusters)
nmi = normalized_mutual_info_score(labels_true=phases, labels_pred=clusters)

# Report both scores rounded to three decimals
print(f"Adjusted Rand Index (ARI): {ari:.3f}")
print(f"Normalized Mutual Information (NMI): {nmi:.3f}")


Adjusted Rand Index (ARI): 0.049
Normalized Mutual Information (NMI): 0.101
