In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import MiniBatchKMeans, KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.metrics.pairwise import pairwise_distances_argmin
from tqdm import tqdm
from joblib import dump, load
import pickle
import csv
import itertools

tqdm.pandas()

In [2]:
def convert_str_to_array(cell):
    """Parse the printed form of a 1-D NumPy array back into a float array.

    Parameters
    ----------
    cell : str
        Text such as ``"[ 0.1 -0.2\\n  0.3]"``: whitespace-separated numbers,
        optionally wrapped in square brackets and broken across lines.

    Returns
    -------
    numpy.ndarray
        1-D float array holding the parsed values.
    """
    # Strip outer whitespace first so the brackets are really at the ends,
    # then remove the brackets themselves.
    body = cell.strip().strip("[]")
    # Turn line breaks into spaces rather than deleting them: deleting would
    # glue the last number of one line to the first number of the next when
    # the continuation line does not start with a space.
    return np.fromstring(body.replace("\n", " "), sep=" ")

In [3]:
# Load the '@'-separated prompt/embedding table; the first column (Prompt)
# becomes the index and the second holds the embedding as text.
reader = pd.read_csv('/kaggle/input/prompt-embeddings-v1/prompt_embeddings.csv',
                 index_col = 0, names = ['Prompt','Vector'],engine='python',sep='@')

# Drop the first row (presumably the file's own header line, since explicit
# column names are supplied above -- TODO confirm against the CSV).
reader=reader.iloc[1:,:]

# Parse every embedding string into a 1-D float array ...
df_prompt = reader['Vector'].apply(convert_str_to_array).to_numpy()
# ... and stack them into one (n_prompts, embedding_dim) matrix. np.stack
# raises if any row has a different length, so a malformed embedding is
# caught here instead of later in the pipeline.
temp = np.stack(list(df_prompt)).astype(float, copy=False)
assert temp.shape[0] == reader.shape[0], "one embedding row expected per prompt"

print(temp[0:4])
print(reader.head(5))

100%|██████████| 125731/125731 [00:00<00:00, 130345.21it/s]

[[-0.01440415 -0.01699878  0.02690694 ...  0.01901934 -0.07778067
   0.01167112]
 [ 0.02593896 -0.03736676 -0.00101076 ... -0.02097149 -0.04021124
   0.00421061]
 [-0.01078814  0.00612156 -0.01214769 ... -0.01073529 -0.03337857
  -0.01658971]
 [-0.02829935 -0.02144036  0.01140211 ... -0.04480722 -0.02880124
   0.01849946]]
                                                                           Vector
Prompt                                                                           
Compose this as a sonnet:       [-1.44041451e-02 -1.69987809e-02  2.69069374e-...
Convert this into a limerick:   [ 0.02593896 -0.03736676 -0.00101076  0.051690...
Present this as a news report:  [-0.01078814  0.00612156 -0.01214769  0.049632...
Rewrite this as a fairy tale:   [-2.82993522e-02 -2.14403607e-02  1.14021106e-...
Frame this as a diary entry:    [-1.01082930e-02 -1.66949276e-02 -1.01815648e-...





In [4]:
# Mini-batch size used by the clustering step; lower it if memory is tight.
batch_size = 10000

# Standardise each embedding dimension (zero mean, unit variance).
scaled_df = StandardScaler().fit(temp).transform(temp)

# Separate float copy of the standardised matrix.
X = scaled_df.astype(float)

In [5]:
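# # Optional (run once): finds the smallest number of PCA components whose cumulative explained variance reaches 95%; the result is hard-coded as num_components in the next cell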
# pca = PCA()
# pca.fit(scaled_df)
# cumsum = np.cumsum(pca.explained_variance_ratio_)
# d = np.argmax(cumsum >= 0.95) + 1
# print(d)

In [6]:
# Number of principal components to keep.
num_components = 264
# random_state pins the randomized SVD solver (when scikit-learn selects it
# for this shape), so the projection is identical across kernel restarts.
pca = PCA(n_components=num_components, random_state=42)
# Fit on the standardised embeddings and project them in one pass.
X_pca = pca.fit_transform(scaled_df)

In [7]:
# explained_variance = pca.explained_variance_ratio_
# print(f"Explained variance by each component: {explained_variance}")

In [8]:
# plt.bar(range(1, num_components + 1), explained_variance, alpha=0.7)
# plt.xlabel('Component Number')
# plt.ylabel('Explained Variance')
# plt.title('Scree Plot')
# plt.show()


In [9]:
# component_loadings = pd.DataFrame(pca.components_)
# print("Component Loadings:")
# print(component_loadings)

# dump(pca, 'pca.lib')

In [10]:
# # Optional: elbow-method sweep over k = 2..29 to help choose the number of clusters; no need to run this every time
# sum_squared_dist = []

# for k in tqdm(range(2, 30)):  # Try different values of k
#     mbk = MiniBatchKMeans(init = 'k-means++', n_clusters = k,
#                      batch_size=batch_size, n_init=10,
#                      max_no_improvement=10, verbose=0)

#     mbk.fit(X_pca)
#     #kmeans = KMeans(n_clusters=k, **kmeans_kwargs)
#     #kmeans.fit(scaled_df)
#     sum_squared_dist.append(mbk.inertia_)  # Inertia is the SSE
# # Initialize k-means parameters

# # Plot the elbow curve
# plt.plot(range(2, 30), sum_squared_dist, 'bx-')
# plt.xlabel('Number of Clusters (K)')
# plt.ylabel('Sum of Squared Distances')
# plt.title('Elbow Method for Optimal K')
# plt.show()

In [11]:
# Convert the projected matrix to one plain Python list per prompt so each
# DataFrame cell can hold a whole vector.
X_list = X_pca.tolist()
# Place the projections right after the raw 'Vector' column.
reader.insert(loc=1, column="Projection", value=X_list, allow_duplicates=True)
print(reader.head())

                                                                           Vector  \
Prompt                                                                              
Compose this as a sonnet:       [-1.44041451e-02 -1.69987809e-02  2.69069374e-...   
Convert this into a limerick:   [ 0.02593896 -0.03736676 -0.00101076  0.051690...   
Present this as a news report:  [-0.01078814  0.00612156 -0.01214769  0.049632...   
Rewrite this as a fairy tale:   [-2.82993522e-02 -2.14403607e-02  1.14021106e-...   
Frame this as a diary entry:    [-1.01082930e-02 -1.66949276e-02 -1.01815648e-...   

                                                                       Projection  
Prompt                                                                             
Compose this as a sonnet:       [9.332539705538476, -10.442916153970282, -7.09...  
Convert this into a limerick:   [12.673430075322667, -9.036025733807564, -5.57...  
Present this as a news report:  [1.0391959090676062, 4.3632952266432

In [12]:
# Cluster the PCA-projected embeddings with mini-batch K-means (K=64).
# random_state pins the k-means++ seeding and the mini-batch sampling so the
# cluster ids are reproducible across kernel restarts.
mbk = MiniBatchKMeans(init = 'k-means++', n_clusters = 64,
                     batch_size=batch_size, n_init=10,
                     max_no_improvement=10, verbose=0,
                     random_state=42)

mbk.fit(X_pca)

# Get cluster assignments for each data point
reader['cluster'] = mbk.labels_

# Persist the fitted model so it can be reloaded without re-fitting.
filename = 'clusters.pkl'
with open(filename, 'wb') as f:
    pickle.dump(mbk, f)

In [13]:
# cluster_counts = reader['cluster'].value_counts()

# cluster_counts.plot(kind='bar')
# plt.xlabel('Cluster')
# plt.ylabel('Frequency')
# plt.title('Frequency of Clusters')
# plt.show()

In [14]:

# Write one CSV per cluster (c0.csv, c1.csv, ...) listing the prompts assigned
# to it, and print a small preview of each.
# The embedding columns are dropped once up front instead of once per cluster.
prompt_clusters = reader.drop(columns=['Vector', 'Projection'])
# Take the cluster count from the fitted model so this loop cannot drift out
# of sync with the clustering configuration.
for i in range(mbk.n_clusters):
    df_i = prompt_clusters[prompt_clusters['cluster']==i]
    df_i.to_csv(f"c{i}.csv",index=True)
    print(df_i.head())

                                                 cluster
Prompt                                                  
Adapt this into a comic strip scenario:                0
Transmute this into an absurdist play dialogue:        0
Translate this into modern slang:                      0
Forge this into a courtroom drama:                     0
Render this as a political satire:                     0
                                                    cluster
Prompt                                                     
Present this as an avant-garde poetry piece:              1
Present this text as an avant-garde poetry piec...        1
Rewrite this text with the poetic and introspec...        1
Detach yourself from the prescribed method and ...        1
Discard the suggested approach and let your poe...        1
                                                    cluster
Prompt                                                     
Alter this into a sci-fi narrative:                       2
N