In [None]:
from collections import Counter

import matplotlib.cm as cm
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import sklearn.cluster as cluster
import umap.umap_ as umap
from kneed import KneeLocator
from matplotlib.lines import Line2D
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_samples, silhouette_score

In [None]:
# DOIs from the 'doi' column of the Saeki papers CSV; rows with a missing DOI are dropped.
saeki_DOI = (
    pd.read_csv('../data/Saeki_papers_doi.csv', encoding="ISO-8859-1")['doi']
    .dropna()
    .to_list()
)

def compare_with_saeki(doi):
    """Return the group code for a DOI: 2 if it is listed in ``saeki_DOI``, otherwise 5."""
    return 2 if doi in saeki_DOI else 5
    
def read_df_with_embedding(file, n_dims=768):
    """Load a CSV whose embedding components are stored one per column.

    The file is read with ISO-8859-1 encoding. Columns whose header is the
    string form of an integer in ``range(n_dims)`` ("0", "1", ...) are renamed
    to the matching ``int`` label and cast to ``float``.

    Parameters
    ----------
    file : str or file-like
        Anything accepted by ``pandas.read_csv``.
    n_dims : int, default 768
        Number of embedding columns expected in the file.

    Returns
    -------
    pandas.DataFrame
        The loaded frame with integer-labelled float embedding columns; all
        other columns are left as read.

    Raises
    ------
    KeyError
        If any of the ``n_dims`` embedding columns is missing.
    """
    df = pd.read_csv(file, encoding="ISO-8859-1")
    df = df.rename(columns={str(i): i for i in range(n_dims)})
    # Cast every embedding column in one call instead of assigning them one by one.
    return df.astype({i: float for i in range(n_dims)})

In [None]:
query = '(photovoltaic polymer efficiency) | (polymer solar cell efficiency)'
# Path stem shared by the combined CSV and the two halves written below.
search_stem = f"../data/search_results_{query.replace(' ', '_').replace('|', 'or')}_with_embedding"
search = read_df_with_embedding(f"{search_stem}.csv")

# Split the search results at row 3500 into two frames.
search1, search2 = search.iloc[:3500], search.iloc[3500:]

# NOTE(review): the halves are written as "..._with_embedding1.csv" / "...2.csv", while the
# assembling cell reads "..._with_embedding_bulk1.csv" / "..._bulk2.csv" - confirm which is intended.
for part, suffix in ((search1, "1"), (search2, "2")):
    part.to_csv(f"{search_stem}{suffix}.csv")

# Assembling dataset

In [None]:
# Assemble the dataframe to be clustered, plus one integer group code per row:
#   1 = lithium-battery search results
#   2 = seeds, and search results whose DOI is listed in saeki_DOI
#   3 = organic-photovoltaic-stability search results
#   4 = electrolyte search results
#   5 = remaining polymer-solar-cell search results
query = '(photovoltaic polymer efficiency) | (polymer solar cell efficiency)'
bulk_stem = f"../data/search_results_{query.replace(' ', '_').replace('|', 'or')}_with_embedding_bulk"

seeds = read_df_with_embedding("../data/seeds_Saeki_fullerene_OPV_with_abstract_and_embedding.csv")
#saeki = read_df_with_embedding("../data/fullerene_OPV_with_abstract_and_embedding.csv")
# The search CSV is too big to upload as one file, so it is stored in two parts.
search1 = read_df_with_embedding(f"{bulk_stem}1.csv")
search2 = read_df_with_embedding(f"{bulk_stem}2.csv")
search = pd.concat([search1, search2], ignore_index=True)
battery = read_df_with_embedding("../data/search_results_lithium_batteries_with_embedding_bulk.csv")
# NOTE: despite its name, `oled` holds the "organic photovoltaic stability" search results.
oled = read_df_with_embedding("../data/search_results_organic_photovoltaic_stability_with_embedding_bulk.csv")
electrolyte = read_df_with_embedding("../data/search_results_electrolyte_with_embedding_bulk.csv")

seeds['group'] = 2
#saeki['group'] = 1
# Vectorised DOI membership test (same 2 / 5 codes as compare_with_saeki) instead of
# assigning one row at a time through .loc.
search['group'] = np.where(search['DOI'].isin(saeki_DOI), 2, 5)
battery['group'] = 1
oled['group'] = 3
electrolyte['group'] = 4

#saeki_fullerene_opv = pd.concat([seeds,saeki,search,battery,oled,electrolyte])
# The seeds must stay first: later cells address them by row position.
# ignore_index=True gives the combined frame (and the popped target Series) unique row labels.
saeki_fullerene_opv = pd.concat([seeds, electrolyte, oled, battery, search], ignore_index=True)
# Drop the first seven columns by position.
# NOTE(review): presumably these are the non-embedding metadata columns - confirm that every
# input file shares the same leading columns.
saeki_fullerene_opv = saeki_fullerene_opv.drop(saeki_fullerene_opv.columns[:7], axis=1)
saeki_fullerene_opv_targets = saeki_fullerene_opv.pop('group')

In [None]:
# Display the combined search-results frame for a quick visual check.
search

# UMAP dimensionality reduction

In [None]:
# Project the embeddings to 2-D with UMAP (fixed random_state) and colour each paper by group code.
standard_embedding = umap.UMAP(random_state=42).fit_transform(saeki_fullerene_opv)
plt.figure(figsize=(16,9))
cmap_name = 'jet'

# vmin/vmax pin the colour scale to the group codes 1..5 so that the legend colours
# below stay correct even if one of the groups is empty.
plt.scatter(standard_embedding[:, 0], standard_embedding[:, 1], c=saeki_fullerene_opv_targets.astype(int), s=5, cmap=cmap_name, vmin=1, vmax=5)
cmap = plt.get_cmap(cmap_name)
colors = [cmap(i/4) for i in range(5)]  # colors[g - 1] is the colour drawn for group code g
#colors = [cmap(i/3) for i in range(4)]

# Highlight the first five rows with red stars.
# NOTE(review): assumes the seeds file has exactly five rows - confirm.
plt.scatter(standard_embedding[:5, 0], standard_embedding[:5, 1], c='r', s=50, marker='*')

legend_handles = [
    Line2D([0], [0], marker='o', color='w', label='Saekis dataset', markersize=10, markerfacecolor=colors[1]),
    Line2D([0], [0], marker='*', color='w', label='Seeds', markersize=10, markerfacecolor='r'),
    Line2D([0], [0], marker='o', color='w', label='Search Results', markersize=10, markerfacecolor=colors[4]),
    Line2D([0], [0], marker='o', color='w', label='Lithium Batteries', markersize=10, markerfacecolor=colors[0]),
    Line2D([0], [0], marker='o', color='w', label='OPV Stability and Degradation', markersize=10, markerfacecolor=colors[2]),
    Line2D([0], [0], marker='o', color='w', label='Electrolytes', markersize=10, markerfacecolor=colors[3])
]

# Add the custom legend to the plot
plt.legend(handles=legend_handles, title='Groups', loc='best')
plt.xlabel('Arbitrary Dimension x')
plt.ylabel('Arbitrary Dimension y')
plt.show()

2D UMAP visualization of the paper embeddings: papers from Saeki's dataset compared with papers from the other search results.

# Clustering

In [None]:
# Keyword arguments passed to the KMeans models fitted in the cells below.
kmeans_kwargs = dict(
    init="random",
    n_init=10,
    max_iter=300,
    random_state=8,
)

In [None]:
# Fit k-means for k = 1..30 and record each model's inertia_ (sum of squared
# distances to the nearest centroid) for the elbow method.
k_values = range(1, 31)
sse = []
for k in k_values:
    clusterer = KMeans(n_clusters=k, **kmeans_kwargs)
    clusterer.fit(saeki_fullerene_opv)
    sse.append(clusterer.inertia_)

## Find the elbow point
kl = KneeLocator(k_values, sse, curve="convex", direction="decreasing")

# KneeLocator leaves .elbow as None when it finds no knee. Fail here with a clear
# message rather than with an obscure error in a later cell that uses kl.elbow.
if kl.elbow is None:
    raise ValueError("KneeLocator found no elbow for k in 1..30; cannot choose the number of clusters")

print("Optimal number of clusters based on elbow method: ", kl.elbow)

In [None]:
# Cluster the full-dimensional embeddings with the elbow-selected k, then colour the
# 2-D UMAP projection by cluster label.
k_best = kl.elbow
kmeans_labels = KMeans(n_clusters=k_best, **kmeans_kwargs).fit_predict(saeki_fullerene_opv)
plt.figure(figsize=(16,9))

cmap = plt.get_cmap(cmap_name)
# max(..., 1) avoids a ZeroDivisionError when only one cluster is selected.
color_span = max(k_best - 1, 1)
colors = [cmap(i / color_span) for i in range(k_best)]

# One legend entry per cluster (numbered from 1), plus one for the seeds.
legend_handles = [
    Line2D([0], [0], marker='o', color='w', label=f'Cluster {i+1}', markersize=10, markerfacecolor=colors[i])
    for i in range(k_best)
]
legend_handles.append(Line2D([0], [0], marker='*', color='w', label='Seeds', markersize=10, markerfacecolor='r'))

# vmin/vmax pin label i to colors[i] so the scatter always matches the legend.
plt.scatter(standard_embedding[:, 0], standard_embedding[:, 1], c=kmeans_labels, s=5, cmap=cmap_name, vmin=0, vmax=color_span)

# Highlight the first five rows (the seeds) with red stars.
plt.scatter(standard_embedding[:5, 0], standard_embedding[:5, 1], c='r', s=100, marker='*')

plt.legend(handles=legend_handles, title='Clusters', loc='best')
plt.xlabel('Arbitrary Dimension x')
plt.ylabel('Arbitrary Dimension y')
plt.show()

2D UMAP visualization of the k-means clustering performed on our dataset.

# Analyzing results

In [None]:
# Report the cluster (numbered from 1) assigned to each of the first five rows, i.e. the seeds.
for seed_idx in range(5):
    cluster_number = kmeans_labels[seed_idx] + 1
    print(f"seed {seed_idx} is in cluster {cluster_number}")

Based on this output, we determined our best cluster. Note: in the following graphics the clusters are numbered starting from 1 (the first cluster is "cluster 1"). To switch to zero-based numbering, change `index_offset` in the next code cell.

In [None]:
# Change to 0 for zero based indexing
index_offset = 1

# 8 is the cluster number as displayed (i.e. including index_offset); best_cluster
# holds the zero-based label used to index the per-cluster lists.
best_cluster = 8 - index_offset


Finding the breakdown of clusters for each group of papers

In [None]:
# Tally, for each group of papers, how many of its rows fall into each k-means cluster.
targets = saeki_fullerene_opv_targets.to_list()

# One zero-initialised counter list per group, indexed by zero-based cluster label.
(saeki_clusters,
 search_clusters,
 lithium_clusters,
 opv_stability_clusters,
 electrolyte_clusters) = ([0] * kl.elbow for _ in range(5))

# Group code -> counter list that receives the rows of that group.
counts_for_group = {
    2: saeki_clusters,
    5: search_clusters,
    1: lithium_clusters,
    3: opv_stability_clusters,
    4: electrolyte_clusters,
}

# Rows 0-4 (the seeds, which come first in the concatenated frame) are skipped.
for row in range(5, len(targets)):
    tally = counts_for_group.get(targets[row])
    if tally is not None:
        tally[kmeans_labels[row]] += 1

In [None]:
# Number of group-2 (Saeki) rows in each cluster, indexed by zero-based cluster label.
saeki_clusters

In [None]:
# Total number of rows assigned to each k-means label.
Counter(kmeans_labels.tolist())

In [None]:
# Palette for the per-cluster pie charts: one colour per cluster, sampled from 'tab20'.
cmap = plt.get_cmap('tab20')
# max(..., 1) avoids a ZeroDivisionError when only one cluster is selected.
colors2 = [cmap(i / max(kl.elbow - 1, 1)) for i in range(kl.elbow)]

In [None]:
# Pie chart: how the Saeki-group papers are distributed over the k-means clusters.
labels = list(range(index_offset, kl.elbow + index_offset))  # displayed cluster numbers, one per wedge
explode = [0] * kl.elbow
explode[best_cluster] = .05  # pull the best cluster's wedge slightly out of the pie

plt.figure(figsize=(10, 8))  # Optional: specify the size of the figure
# autopct hides the percentage of any wedge whose value rounds to less than 1 %,
# so tiny slices do not produce overlapping labels.
wedges, texts, autotexts = plt.pie(
    saeki_clusters,
    explode=explode,
    autopct=lambda pct: '' if float(f'{pct:.1f}') < 1 else f'{pct:.1f}%',
    startangle=140,
    colors=colors2,
    pctdistance=1.1,
)
plt.legend(labels, title="Categories", loc="center left", bbox_to_anchor=(1, 0, 0.5, 1))
# Equal aspect ratio ensures that pie chart is a circle
plt.axis('equal')

# Show the plot
plt.title('Clusters of Saekis dataset', pad=50)
plt.show()

In [None]:

# Pie chart: which groups of papers make up the best cluster.
data = [
    saeki_clusters[best_cluster],
    search_clusters[best_cluster],
    lithium_clusters[best_cluster],
    opv_stability_clusters[best_cluster],
    electrolyte_clusters[best_cluster],
]
labels = ['Saeki','Polymer Based Solar Cell', 'Lithium Batteries', 'OPV Stability and Degradation', 'Electrolytes']  # Labels for each segment

plt.figure(figsize=(10, 8))  # Optional: specify the size of the figure
# autopct hides the percentage of any wedge whose value rounds to less than 0.1 %
# (i.e. is displayed as 0.0).
wedges, texts, autotexts = plt.pie(
    data,
    autopct=lambda pct: '' if float(f'{pct:.1f}') < .1 else f'{pct:.1f}%',
    pctdistance=1.1,
    startangle=140,
)

plt.legend(labels, title="Categories", loc="center left", bbox_to_anchor=(1, 0, 0.5, 1))
# Equal aspect ratio ensures that pie chart is a circle
plt.axis('equal')

# Show the plot
plt.title(f'Cluster {best_cluster+index_offset}', pad=50)
plt.show()

In [None]:
# Silhouette analysis of the k-means clustering: one horizontal band per cluster
# showing the sorted silhouette coefficient of every sample in that cluster.
X = saeki_fullerene_opv
n_clusters = kl.elbow
cluster_labels = kmeans_labels

# The silhouette_score gives the average value for all the samples.
# This gives a perspective into the density and separation of the formed
# clusters. It is computed once and exposed under both names used in this notebook.
silhouette_avg = silhouette_score(X, cluster_labels)
score = silhouette_avg
print("For n_clusters =", n_clusters,
    "The average silhouette_score is :", silhouette_avg)

# Compute the silhouette scores for each sample
sample_silhouette_values = silhouette_samples(X, cluster_labels)

# A single axes holds the whole silhouette plot.
fig, ax1 = plt.subplots(1, 1)
fig.set_size_inches(12, 7)

# Silhouette coefficients can range over [-1, 1]; the x-axis is limited to the
# window [-0.1, 0.5].
ax1.set_xlim(-0.1, .5)
# The (n_clusters+1)*10 is for inserting blank space between silhouette
# plots of individual clusters, to demarcate them clearly.
ax1.set_ylim([0, len(X) + (n_clusters + 1) * 10])

y_lower = 10
for i in range(n_clusters):
    # Aggregate the silhouette scores for samples belonging to
    # cluster i, and sort them
    ith_cluster_silhouette_values = np.sort(sample_silhouette_values[cluster_labels == i])

    size_cluster_i = ith_cluster_silhouette_values.shape[0]
    y_upper = y_lower + size_cluster_i
    # Use the per-cluster palette of the pie charts so a cluster keeps the same colour
    # across figures, instead of depending on whatever `cmap` is currently bound to.
    color = colors2[i]
    ax1.fill_betweenx(np.arange(y_lower, y_upper),
                      0, ith_cluster_silhouette_values,
                      facecolor=color, edgecolor=color, alpha=1)

    # Label the silhouette plots with their cluster numbers at the middle
    ax1.text(-0.05, y_lower + 0.5 * size_cluster_i, str(i+index_offset))

    # Compute the new y_lower for next plot
    y_lower = y_upper + 10  # 10 for the 0 samples

ax1.set_xlabel("Silhouette coefficient value")
ax1.set_ylabel("Cluster label")

# The vertical line for average silhouette score of all the values
ax1.axvline(x=silhouette_avg, color="red", linestyle="--")

ax1.set_yticks([])  # Clear the yaxis labels / ticks
ax1.set_xticks([-0.1,0, 0.1, 0.2, 0.3, 0.4, 0.5])
plt.show()


In [None]:
# Pie chart: how the lithium-battery search results are distributed over the k-means clusters.
labels = list(range(index_offset, kl.elbow + index_offset))  # displayed cluster numbers, one per wedge
explode = [0] * kl.elbow  # no wedge is pulled out in this chart
#explode[2] = .05

plt.figure(figsize=(10, 8))  # Optional: specify the size of the figure
# autopct hides the percentage of any wedge whose value rounds to less than 1 %,
# so tiny slices do not produce overlapping labels.
wedges, texts, autotexts = plt.pie(
    lithium_clusters,
    explode=explode,
    autopct=lambda pct: '' if float(f'{pct:.1f}') < 1 else f'{pct:.1f}%',
    startangle=140,
    colors=colors2,
    pctdistance=1.1,
)
plt.legend(labels, title="Categories", loc="center left", bbox_to_anchor=(1, 0, 0.5, 1))
# Equal aspect ratio ensures that pie chart is a circle
plt.axis('equal')

# Show the plot
plt.title('Clusters of Lithium Battery search results', pad=50)
plt.show()

In [None]:
# Pie chart: which groups of papers make up one chosen cluster.
# 9 is the cluster number as displayed; subtracting index_offset gives the
# zero-based label used to index the per-cluster lists.
clust = 9 - index_offset
data = [
    saeki_clusters[clust],
    search_clusters[clust],
    lithium_clusters[clust],
    opv_stability_clusters[clust],
    electrolyte_clusters[clust],
]
labels = ['Saeki','Polymer Based Solar Cell', 'Lithium Batteries', 'OPV Stability and Degradation', 'Electrolytes']  # Labels for each segment

plt.figure(figsize=(10, 8))  # Optional: specify the size of the figure
# autopct hides the percentage of any wedge whose value rounds to less than 0.1 %
# (i.e. is displayed as 0.0).
wedges, texts, autotexts = plt.pie(
    data,
    autopct=lambda pct: '' if float(f'{pct:.1f}') < .1 else f'{pct:.1f}%',
    pctdistance=1.1,
    startangle=140,
)

plt.legend(labels, title="Categories", loc="center left", bbox_to_anchor=(1, 0, 0.5, 1))
# Equal aspect ratio ensures that pie chart is a circle
plt.axis('equal')

# Show the plot
plt.title(f'Cluster {clust+index_offset}', pad = 50)
plt.show()