In [None]:
import pandas as pd
import numpy as np
import umap.umap_ as umap
import matplotlib.pyplot as plt
from matplotlib.lines import Line2D
import sklearn.cluster as cluster
from kneed import KneeLocator
from sklearn.cluster import KMeans
from collections import Counter
from sklearn.metrics import silhouette_score

# Get the plots for the embedding with all seeds, search, battery and OPV stability papers

In [None]:
# Load the pre-computed abstract embeddings and the matching paper table;
# the `group` column is used as the per-paper target label in the cells below.
saeki_fullerene_opv_embeddings = np.load("../data/all_paper_with_abstracts_embeddings.npy")
saeki_fullerene_opv = pd.read_csv(
    "../data/all_paper_with_abstracts.csv",
    encoding="ISO-8859-1",
)
saeki_fullerene_opv_targets = saeki_fullerene_opv.loc[:, 'group']

In [None]:
# Project every paper embedding with UMAP (seed fixed at 42) and wrap the
# two resulting coordinates in a labelled DataFrame for boolean-mask selection.
standard_embedding = umap.UMAP(random_state=42).fit_transform(
    saeki_fullerene_opv_embeddings
)
standard_embedding_df = pd.DataFrame(
    data=standard_embedding,
    columns=['Arbitrary Dimension x', 'Arbitrary Dimension y'],
)

In [None]:
# Overview scatter of the UMAP projection, one colour per paper group.
plt.figure(figsize=(16, 9))

# Colour for group ids 0..3; groups 0 and 3 are both drawn with 'C0'.
color_map = ['C0', 'C1', 'C2', 'C0']

for group_id, group_color in enumerate(color_map):
    group_mask = saeki_fullerene_opv_targets == group_id
    group_points = standard_embedding_df[group_mask].to_numpy()
    plt.scatter(group_points[:, 0], group_points[:, 1], c=group_color, label=group_id, s=5, alpha=1)

# Hand-built legend: (text, marker face colour) for each search keyword set.
legend_entries = [
    ('(photovoltaic polymer) | (polymer solar cell)', 'C0'),
    ('OPV Stability and Degradation', 'C1'),
    ('Lithium Batteries', 'C2'),
]
legend_handles = [
    Line2D([0], [0], marker='o', color='w', label=entry_text, markersize=10, markerfacecolor=entry_color)
    for entry_text, entry_color in legend_entries
]

plt.legend(handles=legend_handles, title='Search Key Words', title_fontsize=17, fontsize=17)
plt.xlabel('Arbitrary Dimension x', fontsize=20)
plt.ylabel('Arbitrary Dimension y', fontsize=20)
plt.tick_params(axis='both', labelsize=20)
# Fixed axis window for this figure.
plt.xlim([-5.5, 12.5])
plt.ylim([-5, 5.125])

plt.savefig('../plots/Umap_all_plots_randomstate_42_azure_embedding.png', bbox_inches='tight')

# Assembling clustering dataset

In [None]:
# Number of rows per `group` value; the bare name below displays it as the cell output.
counts = saeki_fullerene_opv['group'].value_counts()
counts

In [None]:
# Keep only the first N rows, where N is the combined size of groups 0, 3 and 5.
# NOTE(review): this positional cut assumes those three groups occupy the
# leading rows of both the table and the embedding array — confirm against the data.
len_search_and_seeds = sum(counts[group_id] for group_id in (0, 3, 5))
saeki_fullerene_opv = saeki_fullerene_opv.iloc[:len_search_and_seeds]
saeki_fullerene_opv_embeddings = saeki_fullerene_opv_embeddings[:len_search_and_seeds]
saeki_fullerene_opv_targets = saeki_fullerene_opv.loc[:, 'group']

In [None]:
# Sanity check after the cut: remaining group counts and the embedding array shape.
saeki_fullerene_opv['group'].value_counts(), saeki_fullerene_opv_embeddings.shape

In [None]:
# Re-run UMAP (seed fixed at 40) on the reduced set of embeddings and keep
# the coordinates both as an array and as a labelled DataFrame.
standard_embedding = umap.UMAP(random_state=40).fit_transform(
    saeki_fullerene_opv_embeddings
)
standard_embedding_df = pd.DataFrame(
    data=standard_embedding,
    columns=['Arbitrary Dimension x', 'Arbitrary Dimension y'],
)

In [None]:
# Scatter of the reduced projection: groups 0 and 3 as dots, first five rows as stars.
plt.figure(figsize=(16, 9))

# Group id doubles as the matplotlib colour-cycle index ('C0', 'C3').
for group_id in (0, 3):
    group_mask = saeki_fullerene_opv_targets == group_id
    group_points = standard_embedding_df[group_mask].to_numpy()
    plt.scatter(group_points[:, 0], group_points[:, 1], c=f"C{group_id}", label=group_id, s=5, alpha=1)

# Rows 0-4 are highlighted one by one as the seed papers.
for seed_row in range(5):
    seed_x, seed_y = standard_embedding[seed_row, 0], standard_embedding[seed_row, 1]
    plt.scatter(seed_x, seed_y, c='k', s=100, marker='*')

# Hand-built legend: (marker, text, marker size, face colour).
legend_specs = [
    ('*', 'Seeds', 15, 'k'),
    ('o', 'Saeki papers in search results', 10, 'C3'),
    ('o', 'non-Saeki papers in search results', 10, 'C0'),
]
legend_handles = [
    Line2D([0], [0], marker=spec_marker, color='w', label=spec_text, markersize=spec_size, markerfacecolor=spec_color)
    for spec_marker, spec_text, spec_size, spec_color in legend_specs
]

plt.legend(handles=legend_handles, loc='best', fontsize=17)
plt.xlabel('Arbitrary Dimension x', fontsize=20)
plt.ylabel('Arbitrary Dimension y', fontsize=20)
plt.tick_params(axis='both', labelsize=20)

# NOTE(review): the file name says "randomstate_93" while the projection plotted
# here was fitted with random_state=40 — confirm which one is intended.
plt.savefig('../plots/Umap_plots_randomstate_93_azure_embedding.png', bbox_inches='tight')

In [None]:
# Shared keyword arguments for every KMeans fit in the main analysis.
kmeans_kwargs = dict(
    init="random",
    n_init=10,
    max_iter=300,
    random_state=40,
)

In [None]:
# Elbow method: fit KMeans for k = 1..40 on the raw embeddings and record the
# within-cluster sum of squared errors (inertia) for each k.
candidate_ks = range(1, 41)
sse = []
for k in candidate_ks:
    print(k)
    fitted = KMeans(n_clusters=k, **kmeans_kwargs).fit(saeki_fullerene_opv_embeddings)
    sse.append(fitted.inertia_)

## Find the elbow point of the (convex, decreasing) SSE curve
kl = KneeLocator(
    candidate_ks,
    sse,
    curve="convex",
    direction="decreasing",
)

print("Optimal number of clusters based on elbow method: ", kl.elbow)

In [None]:
# Cluster the raw embeddings with the elbow-selected k and colour the UMAP
# projection by cluster label.
kmeans_labels = KMeans(n_clusters=kl.elbow, **kmeans_kwargs).fit_predict(saeki_fullerene_opv_embeddings)
plt.figure(figsize=(16,9))
cmap_name = 'tab20'
cmap = plt.get_cmap(cmap_name)
# Denominator of the label -> colour scale; clamped so a single-cluster result
# does not divide by zero.
color_span = max(kl.elbow - 1, 1)
colors = [cmap(i / color_span) for i in range(kl.elbow)]

# One legend entry per cluster (numbered from 1), plus one for the seed markers.
legend_handles = [
    Line2D([0], [0], marker='o', color='w', label=f'Cluster {i+1}', markersize=10, markerfacecolor=color)
    for i, color in enumerate(colors)
]
legend_handles.append(Line2D([0], [0], marker='*', color='w', label='Seeds', markersize=15, markerfacecolor='k'))

# vmin/vmax pin the scatter's colour normalisation to the same 0..color_span
# scale used for the legend colours above, so the two always agree.
plt.scatter(standard_embedding[:, 0], standard_embedding[:, 1], c=kmeans_labels, s=5, cmap=cmap_name, vmin=0, vmax=color_span, alpha=0.75);

# Rows 0-4 are drawn again as black stars to mark the seed papers.
for seed in range(0,5):
    plt.scatter(standard_embedding[seed, 0], standard_embedding[seed, 1], c='k', s=100, marker = '*')

plt.legend(handles=legend_handles, loc='best', fontsize=17)
plt.xlabel('Arbitrary Dimension x', fontsize=20)
plt.ylabel('Arbitrary Dimension y', fontsize=20)
plt.tick_params(axis='both', labelsize=20)
plt.savefig('../plots/clustering_plots_randomstate_40_azure_embedding.png', bbox_inches='tight')

In [None]:
# Report the (one-based) cluster number of each of the five seed rows.
for seed_row in range(5):
    seed_cluster_number = kmeans_labels[seed_row] + 1
    print(f"seed {seed_row} is in cluster {seed_cluster_number}")

Based on this we have determined our best cluster. Note: When referring to the clusters in the following graphics, I called the first one cluster 1. If we want to switch to zero based numbering we can change index_offset in the next block of code.

In [None]:
# Change to 0 for zero based indexing
index_offset = 1

# Cluster number as it is displayed in the plots and printouts.
best_cluster = 9

# Convert the displayed number to the zero-based label used by kmeans_labels.
best_cluster -= index_offset


Finding the breakdown of clusters for each group of papers

In [None]:
# Count, per k-means cluster, how many group-3 and group-0 papers it contains.
targets = saeki_fullerene_opv_targets.to_list()
saeki_clusters = [0 for _ in range(kl.elbow)]
search_clusters = [0 for _ in range(kl.elbow)]

# Group id -> tally list to increment; any other group is ignored.
tally_for_group = {3: saeki_clusters, 0: search_clusters}

# Start at row 5 so the five seed rows are not counted.
for i in range(5, len(targets)):
    group = kmeans_labels[i]
    tally = tally_for_group.get(targets[i])
    if tally is not None:
        tally[group] += 1

In [None]:
# Per-cluster tally of group-3 papers, and the tally for the chosen cluster.
saeki_clusters, saeki_clusters[best_cluster]

In [None]:
# Size of every k-means cluster (all rows, seeds included), and the size of the chosen one.
kmeans_labels_dict = Counter(kmeans_labels.tolist())
kmeans_labels_dict, kmeans_labels_dict[best_cluster]

In [None]:
# One 'tab20' colour per cluster, sampled at i/(k-1) — the same scale the
# cluster scatter plot uses for its legend colours.
cmap = plt.get_cmap('tab20')
colors2 = [cmap(i/(kl.elbow - 1)) for i in range(kl.elbow)]

In [None]:
# Pie chart of how the group-3 (Saeki) papers are spread over the clusters.
labels = list(range(index_offset, kl.elbow + index_offset))  # Displayed cluster numbers
explode = [0] * kl.elbow
explode[best_cluster] = .05  # Pull the chosen cluster's wedge slightly out of the pie

plt.figure(figsize=(10, 8))
wedges, texts, autotexts = plt.pie(saeki_clusters, explode=explode, autopct='%1.1f', startangle=140, colors = colors2, pctdistance=1.1)
plt.legend(labels, title="Clusters", loc="center left", bbox_to_anchor=(1, 0, 0.5, 1))

# autopct renders bare numbers; hide the label on slices below 1 % and append
# the percent sign to all the others.
for autotext in autotexts:
    percentage_text = autotext.get_text()
    if float(percentage_text) < 1:
        autotext.set_text('')
    else:
        autotext.set_text(f'{percentage_text}%')

# Equal aspect ratio ensures that pie chart is a circle
plt.axis('equal')

plt.title('The Cluster Distribution of Saeki\'s Papers')
plt.savefig('../plots/clustering_plots_randomstate_40_cluster_of_saeki_azure_embedding.png', bbox_inches='tight')
plt.show()

In [None]:
# Silhouette analysis of the k-means clustering computed above.
from sklearn.metrics import silhouette_samples, silhouette_score
import matplotlib.cm as cm

X = saeki_fullerene_opv_embeddings
n_clusters = kl.elbow
cluster_labels = kmeans_labels
# Use the categorical palette explicitly rather than whatever the notebook-level
# `cmap` variable currently holds, so the bar colours do not depend on cell order.
silhouette_cmap = plt.get_cmap('tab20')

# Per-sample silhouette coefficients, computed once and reused for both the
# bars and the average.
sample_silhouette_values = silhouette_samples(X, cluster_labels)

# The average silhouette score is the mean of the per-sample coefficients, so
# it is derived from the array above instead of calling silhouette_score again.
silhouette_avg = np.mean(sample_silhouette_values)
score = silhouette_avg
print("For n_clusters =", n_clusters,
    "The average silhouette_score is :", silhouette_avg)

# Single axes holding the silhouette plot
fig, (ax1) = plt.subplots(1, 1)
fig.set_size_inches(12, 7)

# Silhouette coefficients can range over [-1, 1]; the x axis is restricted to
# the window [-0.1, 0.5] for this figure.
ax1.set_xlim(-0.1, .5)
# The (n_clusters+1)*10 is for inserting blank space between silhouette
# plots of individual clusters, to demarcate them clearly.
ax1.set_ylim([0, len(X) + (n_clusters + 1) * 10])

y_lower = 10

for i in range(n_clusters):
    # Aggregate the silhouette scores for samples belonging to
    # cluster i, and sort them
    ith_cluster_silhouette_values = \
        sample_silhouette_values[cluster_labels == i]

    ith_cluster_silhouette_values.sort()

    size_cluster_i = ith_cluster_silhouette_values.shape[0]
    y_upper = y_lower + size_cluster_i
    color = silhouette_cmap(float(i) / (n_clusters - 1))
    ax1.fill_betweenx(np.arange(y_lower, y_upper),
                      0, ith_cluster_silhouette_values,
                      facecolor=color, edgecolor=color, alpha=1)

    # Label the silhouette plots with their cluster numbers at the middle
    ax1.text(-0.05, y_lower + 0.5 * size_cluster_i, str(i+index_offset))

    # Compute the new y_lower for next plot
    y_lower = y_upper + 10  # 10 rows of blank space between clusters

ax1.set_xlabel("Silhouette coefficient value")
ax1.set_ylabel("Cluster label")

# The vertical line for average silhouette score of all the values
ax1.axvline(x=silhouette_avg, color="red", linestyle="--")

ax1.set_yticks([])  # Clear the yaxis labels / ticks
ax1.set_xticks([-0.1,0, 0.1, 0.2, 0.3, 0.4, 0.5])
fig.savefig('../plots/clustering_plots_randomstate_40_silhouette_score_azure_embedding.png', bbox_inches='tight')

# Export Data

In [None]:
# Assemble the export table: the paper table without its first column, the
# two UMAP coordinates, and the k-means label of every row, side by side.
saeki_fullerene_opv_save_df = pd.concat(
    [
        saeki_fullerene_opv.drop(columns=saeki_fullerene_opv.columns[:1]),
        standard_embedding_df,
        pd.DataFrame(kmeans_labels, columns=['kmean_label']),
    ],
    axis=1,
)
saeki_fullerene_opv_save_df.to_csv(
    "../data/search_clustering_results_randomstate_40_azure_embedding.csv",
    encoding="ISO-8859-1",
)

# Random State Experiments

In [None]:
# Per-experiment accumulators filled by the random-state loop below.
opt_number_clusters, chosen_saeki, chosen_search = [], [], []

In [None]:
# Repeat the elbow search + k-means for 100 random states and record, per run,
# the selected k and how many group-3 / group-0 papers share a cluster with a seed.
#
# Every intermediate of the loop uses an `exp_` name so that the main analysis
# state read by later cells (standard_embedding, kmeans_kwargs, sse, kl,
# kmeans_labels, saeki_clusters, search_clusters, ...) is NOT overwritten.

# Start from empty accumulators so re-running this cell does not append a
# second batch of results.
opt_number_clusters.clear()
chosen_saeki.clear()
chosen_search.clear()

# Loop-invariant inputs, computed once.
exp_targets = saeki_fullerene_opv_targets.to_list()
exp_k_values = range(1, 41)

for randS in range(100):
    print("Expriment with random state = ", randS)
    # NOTE(review): this projection is not used by the clustering below (which
    # runs on the raw embeddings). It is kept so the cell's runtime output is
    # unchanged — confirm whether it can be dropped.
    exp_embedding = umap.UMAP(random_state=randS).fit_transform(saeki_fullerene_opv_embeddings)
    exp_kmeans_kwargs = {
        "init": "random",
        "n_init": 10,
        "max_iter": 300,
        "random_state": randS,
    }
    exp_sse = []
    for k in exp_k_values:
        exp_clusterer = KMeans(n_clusters=k, **exp_kmeans_kwargs)
        exp_clusterer.fit(saeki_fullerene_opv_embeddings)
        exp_sse.append(exp_clusterer.inertia_)

    ## Find the elbow point
    exp_kl = KneeLocator(
        exp_k_values, exp_sse, curve="convex", direction="decreasing"
    )
    print("  Optimal number of clusters based on elbow method: ", exp_kl.elbow)
    opt_number_clusters.append(exp_kl.elbow)

    exp_labels = KMeans(n_clusters=exp_kl.elbow, **exp_kmeans_kwargs).fit_predict(saeki_fullerene_opv_embeddings)

    # Clusters that contain at least one of the five seed rows.
    chosen_clusters = set()
    for seed_row in range(0,5):
        chosen_clusters.add(exp_labels[seed_row])
        print(f"  seed {seed_row} is in cluster {exp_labels[seed_row]+1}")

    exp_saeki_clusters = [0] * exp_kl.elbow
    exp_search_clusters = [0] * exp_kl.elbow

    # Tally non-seed rows (row 5 onwards) per cluster for groups 3 and 0.
    for row in range(5, len(exp_targets)):
        row_cluster = exp_labels[row]
        if exp_targets[row] == 3:
            exp_saeki_clusters[row_cluster] += 1
        elif exp_targets[row] == 0:
            exp_search_clusters[row_cluster] += 1

    chosen_saeki.append(sum(exp_saeki_clusters[c] for c in chosen_clusters))
    chosen_search.append(sum(exp_search_clusters[c] for c in chosen_clusters))
    print(f"  chosed {chosen_saeki[-1]} saeki, {chosen_search[-1]} non saeki in search")

In [None]:
# Parse a text log of the random-state experiments into one dict per experiment.
#
# NOTE(review): `Random_Test_results` is not defined in any visible cell — it
# presumably holds the pasted text output of the experiment loop; confirm and
# define it in the notebook, otherwise this cell raises NameError on a fresh kernel.
#
# Layout the indices below rely on: records are 10 lines long and start at
# line 1; within a record, line +3 ends with the optimal cluster count, lines
# +4..+8 end with the cluster of each of the five seeds, and line +9 carries
# the two chosen-paper counts at space-separated positions 3 and 5.
parsed_results = []

splitted_result = Random_Test_results.split('\n')
i = 1
while i + 9 < len(splitted_result):
    choosed_paper = splitted_result[i+9].split(' ')
    parsed_results.append({
        "optimal number of clusters": int(splitted_result[i+3].split(' ')[-1]),
        # Number of distinct clusters the five seeds fall into.
        "seed number of clusters": len(set([splitted_result[j].split(' ')[-1] for j in range(i+4, i+9)])),
        "chosed saeki papers": int(choosed_paper[3]),
        "chosed non saeki papers": int(choosed_paper[5]),
    })
    i+=10

In [None]:
# Number of experiment records that were parsed.
len(parsed_results)

In [None]:
# Split the parsed experiments by how many distinct clusters the seeds landed in
# (1 or 2); any other count is only printed.
optimal_1, optimal_2 = [], []
chosed_saeki_1, chosed_saeki_2 = [], []
chosed_all_1, chosed_all_2 = [], []

# seed-cluster count -> (optimal k list, Saeki count list, total count list)
buckets_by_seed_clusters = {
    1: (optimal_1, chosed_saeki_1, chosed_all_1),
    2: (optimal_2, chosed_saeki_2, chosed_all_2),
}

for result in parsed_results:
    seed_cluster_count = result['seed number of clusters']
    bucket = buckets_by_seed_clusters.get(seed_cluster_count)
    if bucket is None:
        print(seed_cluster_count)
        continue
    optimal_list, saeki_list, all_list = bucket
    optimal_list.append(result['optimal number of clusters'])
    saeki_list.append(result['chosed saeki papers'])
    all_list.append(result['chosed saeki papers'] + result['chosed non saeki papers'])

In [None]:
# Seeds in one cluster: number of runs, mean and std of the optimal cluster count.
len(optimal_1), np.average(optimal_1), np.std(optimal_1)

In [None]:
# Seeds in one cluster: mean and std of the chosen Saeki paper count.
np.average(chosed_saeki_1), np.std(chosed_saeki_1)

In [None]:
# Seeds in one cluster: mean and std of all chosen papers (Saeki + non-Saeki).
np.average(chosed_all_1), np.std(chosed_all_1)

In [None]:
# Seeds in two clusters: number of runs, mean and std of the optimal cluster count.
len(optimal_2), np.average(optimal_2), np.std(optimal_2)

In [None]:
# Seeds in two clusters: mean and std of the chosen Saeki paper count.
np.average(chosed_saeki_2), np.std(chosed_saeki_2)

In [None]:
# Seeds in two clusters: mean and std of all chosen papers (Saeki + non-Saeki).
np.average(chosed_all_2), np.std(chosed_all_2)

In [None]:
# Both buckets pooled (list concatenation): mean and std of the chosen Saeki paper count.
np.average(chosed_saeki_1 + chosed_saeki_2), np.std(chosed_saeki_1 + chosed_saeki_2)

In [None]:
# Both buckets pooled (list concatenation): mean and std of all chosen papers.
np.average(chosed_all_1 + chosed_all_2), np.std(chosed_all_1 + chosed_all_2)

# plot open ai cluster

In [None]:
# Load the clustering results that also carry the OpenAI screening columns.
# NOTE(review): this reads the "randomstate_42" file while the rest of this
# section plots and saves "randomstate_40" artefacts — confirm the input file.
df = pd.read_csv("../data/search_clustering_results_randomstate_42_azure_embedding_with_openai_screening.csv", encoding = "ISO-8859-1")

In [None]:
# Column `r3` is used below as the per-paper relevance score
# (it colours the scatter and is thresholded at 7).
# Note: this rebinds `labels`, which an earlier cell used for the pie-chart legend.
labels = df['r3']

In [None]:
# Scatter of the seed-40 UMAP projection coloured by the relevance score.
plt.figure(figsize=(16,9))
cmap_name = 'RdBu_r'
cmap = plt.get_cmap(cmap_name)

# Coordinates are read from standard_embedding_df (the random_state=40
# projection) rather than from `standard_embedding`, which the random-state
# experiment loop reassigns on every iteration.
embedding_x = standard_embedding_df['Arbitrary Dimension x'].to_numpy()
embedding_y = standard_embedding_df['Arbitrary Dimension y'].to_numpy()

# Only the seed markers get a legend entry; the scores are explained by the colour bar.
legend_handles = [
    Line2D([0], [0], marker='*', color='w', label='Seeds', markersize=15, markerfacecolor='k'),
]
plt.scatter(embedding_x, embedding_y, c=labels, s=5, cmap=cmap_name, alpha=0.75);
cbar = plt.colorbar(location='right', pad=-0.08, shrink=0.8)
# Fix the tick positions explicitly so the 0, 2, ..., 10 labels always sit at
# those values instead of being attached to whatever ticks were auto-selected.
cbar.set_ticks(list(range(0, 11, 2)))
cbar.ax.tick_params(labelsize=17)
cbar.set_label('relevance score', fontsize=17)

# Rows 0-4 are drawn again as black stars to mark the seed papers.
for seed in range(0,5):
    plt.scatter(embedding_x[seed], embedding_y[seed], c='k', s=100, marker = '*')

plt.legend(handles=legend_handles, loc='best', fontsize=17)
plt.xlabel('Arbitrary Dimension x', fontsize=20)
plt.ylabel('Arbitrary Dimension y', fontsize=20)
plt.tick_params(axis='both', labelsize=20)
plt.savefig('../plots/clustering_plots_randomstate_40_azure_embedding_openai.png', bbox_inches='tight')

# Plot comparison of both methods

In [None]:
# Scratch inspection of a single cluster label.
# NOTE(review): `i` is whatever value an earlier loop left behind — this cell
# has no defined meaning on its own; consider removing it.
kmeans_labels[i]

In [None]:
# Scratch inspection of a single relevance score, using the leftover loop variable `i`.
# NOTE(review): this uses a strict `> 7`, whereas the comparison cell below
# uses `>= 7` — confirm which threshold is intended.
labels[i] > 7

In [None]:
# Encode, per paper, which refined list(s) it belongs to:
#   0 = neither, 1 = only the chosen k-means cluster,
#   2 = only relevance score >= 7, 3 = both.
# The chosen cluster comes from `best_cluster` (the zero-based label set in the
# "best cluster" cell) instead of a second hard-coded copy of its value.
in_best_cluster = np.asarray(kmeans_labels) == best_cluster
# Only the first kmeans_labels.size scores are compared, matching the row range
# that has a cluster label.
is_relevant = np.asarray(labels)[:kmeans_labels.size] >= 7
comparison_labels = in_best_cluster.astype(int) + 2 * is_relevant.astype(int)

In [None]:
# Scatter of the projection coloured by the four comparison categories (0..3).
plt.figure(figsize=(16, 12))

# Colour per comparison label, indexed by the label value.
colormap = ['C0', 'C8', 'C2', 'C3']

for category, category_color in enumerate(colormap):
    category_points = standard_embedding_df[comparison_labels == category].to_numpy()
    plt.scatter(category_points[:, 0], category_points[:, 1], c=category_color, label=category, s=5, alpha=1)

# Rows labelled 0..4 (label-based slice, end inclusive) are the seed papers.
seed_points = standard_embedding_df.loc[:4].to_numpy()
plt.scatter(seed_points[:, 0], seed_points[:, 1], c="k", s=100, marker='*')

# Hand-built legend: (marker, text, marker size, face colour).
legend_specs = [
    ('*', 'Seeds', 15, 'k'),
    ('o', 'In both refined lists of LLM-UMap-kNN and direct LLM', 10, colormap[3]),
    ('o', 'Only in the refined list of LLM-UMap-kNN', 10, colormap[1]),
    ('o', 'Only in the refined list of direct LLM', 10, colormap[2]),
    ('o', 'In neither refined lists of LLM-UMap-kNN and direct LLM', 10, colormap[0]),
]
legend_handles = [
    Line2D([0], [0], marker=spec_marker, color='w', label=spec_text, markersize=spec_size, markerfacecolor=spec_color)
    for spec_marker, spec_text, spec_size, spec_color in legend_specs
]

plt.legend(handles=legend_handles, loc='lower center', fontsize=17)
plt.xlabel('Arbitrary Dimension x', fontsize=20)
plt.ylabel('Arbitrary Dimension y', fontsize=20)
plt.tick_params(axis='both', labelsize=20)
# Fixed y window for this figure.
plt.ylim([-5, 12])

plt.savefig('../plots/LLM_UMap_kNN_LLM_comprison.png', bbox_inches='tight')

In [None]:
# Number of papers in each comparison category.
pd.DataFrame(comparison_labels).value_counts()

In [None]:
# Store (or overwrite) the cluster label of every row in the screening table.
df['kmean_label'] = kmeans_labels

In [None]:
# Store the 0..3 comparison category of every row in the screening table.
df['comparison_label'] = comparison_labels

In [None]:
# Copy BOTH projection coordinates into the screening table. Writing only the
# y column would leave df's x column (if the loaded file already has one) from
# a different projection, or missing altogether, although the reload cell
# below reads both columns back from the saved file.
for embedding_column in ('Arbitrary Dimension x', 'Arbitrary Dimension y'):
    df[embedding_column] = standard_embedding_df[embedding_column]

In [None]:
# Persist the augmented screening table.
# NOTE(review): the index is written too (no index=False), so reading the file
# back adds an extra unnamed column — confirm downstream readers expect that.
df.to_csv("../data/search_clustering_results_randomstate_40_azure_embedding_with_openai_screening.csv", encoding = "ISO-8859-1")

In [None]:
# Reload the table just written, replacing the in-memory `df`.
df = pd.read_csv("../data/search_clustering_results_randomstate_40_azure_embedding_with_openai_screening.csv", encoding = "ISO-8859-1")

In [None]:
# Rebuild the projection DataFrame from the saved columns
# (this rebinds the notebook-level `standard_embedding_df`).
standard_embedding_df = df[['Arbitrary Dimension x', 'Arbitrary Dimension y']]

In [None]:
# Take the comparison categories from the saved table; `comparison_labels`
# becomes a pandas Series here (it was a NumPy array before).
comparison_labels = df['comparison_label']

In [None]:
# Preview the first ten rows of the reloaded table.
df.head(10)

In [None]:
# Projection coordinates of rows labelled 0..4 (label-based slice, end inclusive) — the seed papers.
standard_embedding_df.loc[:4, :]