In [38]:
import pandas as pd
import torch
import torch.nn as nn
import numpy as np
import json
import os

from transformers import BertTokenizer, BertModel
from sentence_transformers import SentenceTransformer

from sklearn.cluster import KMeans

In [37]:
! pip install scikit-learn

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)




In [None]:
! pip install sentence-transformers

In [23]:
# Some sample code
# 1. Load a pretrained Sentence Transformer model
model = SentenceTransformer("all-MiniLM-L6-v2")

# The sentences to encode
sentences = [
    "The weather is lovely today.",
    "It's so sunny outside!",
    "He drove to the stadium.",
]

# 2. Calculate embeddings by calling model.encode()
embeddings = model.encode(sentences)
print(embeddings.shape)
# [3, 384]

# 3. Calculate the embedding similarities
similarities = model.similarity(embeddings, embeddings)
print(similarities)
# tensor([[1.0000, 0.6660, 0.1046],
#         [0.6660, 1.0000, 0.1411],
#         [0.1046, 0.1411, 1.0000]])

(3, 384)
tensor([[1.0000, 0.6660, 0.1046],
        [0.6660, 1.0000, 0.1411],
        [0.1046, 0.1411, 1.0000]])


In [10]:
dataframes = []
for filename in os.listdir('data/datasets'):
    if '.json' in filename:
        print(filename)
        with open('data/datasets/' + filename, 'r') as f:
            dataframes.append(pd.read_json(f))

tasks_1727996576.json
tasks_1727996587.json
tasks_1727996564.json
tasks_1727996676.json


In [30]:
embeddings = []
for df in dataframes:
    embed = []
    for i, row in df.iterrows():
        embed.append(e:=model.encode(row['tasks']))
        embeddings.append(e)
    df['embeddings'] = embed

In [31]:
dataframes[0].iloc[0]

tasks                                      Handle HTTP requests
length                                                      200
embeddings    [-0.054039054, 0.0792421, -0.009309811, -0.025...
Name: 0, dtype: object

In [94]:
sim_mat = model.similarity(np.stack(list(df['embeddings']), axis=0), np.stack(list(df['embeddings']), axis=0))

In [111]:
sim_mat

tensor([[ 1.0000,  0.0220,  0.0607,  ...,  0.1256,  0.0592,  0.1402],
        [ 0.0220,  1.0000, -0.0605,  ...,  0.2119,  0.1009,  0.0472],
        [ 0.0607, -0.0605,  1.0000,  ...,  0.0644, -0.0418,  0.0435],
        ...,
        [ 0.1256,  0.2119,  0.0644,  ...,  1.0000,  0.3888,  0.1936],
        [ 0.0592,  0.1009, -0.0418,  ...,  0.3888,  1.0000,  0.1581],
        [ 0.1402,  0.0472,  0.0435,  ...,  0.1936,  0.1581,  1.0000]])

In [115]:
sim_mat[0,1]

tensor(0.0220)

In [119]:
match_indices = np.where(
    (sim_mat > 0.75) & (sim_mat < 1)
    )
match_coords = list(zip(match_indices[0], match_indices[1]))
match_coords = [(int(i), int(j)) for i, j in match_coords if i != j]

final_coords = []
for i, j in match_coords:
    if (j, i, sim_mat[i,j].item()) not in final_coords:
        final_coords.append((i, j, sim_mat[i, j].item()))

In [121]:
len(final_coords)

288

In [100]:
df.to_csv('large_dataset.csv')

In [122]:
with open('data/coords_and_scores.json', 'w') as f:
    json.dump(final_coords, f)