In [1]:
import pandas as pd
import torch
import torch.nn as nn
import numpy as np
import json
import os

In [2]:
! pip install sentence-transformers



In [3]:
from sentence_transformers import SentenceTransformer

  from tqdm.autonotebook import tqdm, trange


In [10]:
# Some sample code
# 1. Load a pretrained Sentence Transformer model
model = SentenceTransformer("all-MiniLM-L6-v2")

# The sentences to encode
sentences = [
    "The weather is lovely today.",
    "It's so sunny outside!",
    "He drove to the stadium.",
]

# 2. Calculate embeddings by calling model.encode()
embeddings = model.encode(sentences)
print(embeddings.shape)
# [3, 384]

# 3. Calculate the embedding similarities
similarities = model.similarity(embeddings, embeddings)
print(similarities)
# tensor([[1.0000, 0.6660, 0.1046],
#         [0.6660, 1.0000, 0.1411],
#         [0.1046, 0.1411, 1.0000]])

(3, 384)
tensor([[1.0000, 0.6660, 0.1046],
        [0.6660, 1.0000, 0.1411],
        [0.1046, 0.1411, 1.0000]])


In [10]:
dataframes = []
for filename in os.listdir('data/datasets'):
    if '.json' in filename:
        print(filename)
        with open('data/datasets/' + filename, 'r') as f:
            dataframes.append(pd.read_json(f))

tasks_1727996576.json
tasks_1727996587.json
tasks_1727996564.json
tasks_1727996676.json


In [30]:
embeddings = []
for df in dataframes:
    embed = []
    for i, row in df.iterrows():
        embed.append(e:=model.encode(row['tasks']))
        embeddings.append(e)
    df['embeddings'] = embed

In [31]:
dataframes[0].iloc[0]

tasks                                      Handle HTTP requests
length                                                      200
embeddings    [-0.054039054, 0.0792421, -0.009309811, -0.025...
Name: 0, dtype: object

In [15]:
def get_sim_mat(df):    
    return model.similarity(np.stack(list(df['embeddings']), axis=0), np.stack(list(df['embeddings']), axis=0))

In [111]:
sim_mat

tensor([[ 1.0000,  0.0220,  0.0607,  ...,  0.1256,  0.0592,  0.1402],
        [ 0.0220,  1.0000, -0.0605,  ...,  0.2119,  0.1009,  0.0472],
        [ 0.0607, -0.0605,  1.0000,  ...,  0.0644, -0.0418,  0.0435],
        ...,
        [ 0.1256,  0.2119,  0.0644,  ...,  1.0000,  0.3888,  0.1936],
        [ 0.0592,  0.1009, -0.0418,  ...,  0.3888,  1.0000,  0.1581],
        [ 0.1402,  0.0472,  0.0435,  ...,  0.1936,  0.1581,  1.0000]])

In [115]:
sim_mat[0,1]

tensor(0.0220)

In [14]:
def get_coords(sim_mat):
    match_indices = np.where(
        (sim_mat > 0.75) & (sim_mat < 1)
        )
    match_coords = list(zip(match_indices[0], match_indices[1]))
    match_coords = [(int(i), int(j)) for i, j in match_coords if i != j]

    final_coords = []
    for i, j in match_coords:
        if (j, i, sim_mat[i,j].item()) not in final_coords:
            final_coords.append((i, j, sim_mat[i, j].item()))

    return final_coords

In [121]:
len(final_coords)

288

In [100]:
df.to_csv('large_dataset.csv')

In [122]:
with open('data/coords_and_scores.json', 'w') as f:
    json.dump(final_coords, f)

In [5]:
with open('data/datasets/merged.json', 'r') as f:
    large_data = json.load(f)

In [8]:
df = pd.DataFrame(large_data, columns=['tasks'])

In [9]:
df.head()

Unnamed: 0,tasks
0,Handle HTTP requests
1,Encrypt and decrypt data
2,Calculate the derivative of a function
3,Parse JSON data
4,Read and write CSV files


In [12]:
embed = []
for i, row in df.iterrows():
    embed.append(e:=model.encode(row['tasks']))
df['embeddings'] = embed

In [13]:
df

Unnamed: 0,tasks,embeddings
0,Handle HTTP requests,"[-0.054039054, 0.0792421, -0.009309811, -0.025..."
1,Encrypt and decrypt data,"[0.0055927653, 0.07697791, -0.037826594, -5.61..."
2,Calculate the derivative of a function,"[0.0615652, 0.064298615, -0.025828376, -0.0069..."
3,Parse JSON data,"[0.013707104, 0.047950722, 0.050785195, -0.028..."
4,Read and write CSV files,"[0.03940622, 0.036223784, -0.11427616, 0.03644..."
...,...,...
2105,Develop real-time video processing models for ...,"[-0.017905205, -0.14254817, -0.06480117, -0.03..."
2106,Build a recommendation system for podcasts and...,"[-0.008535604, -0.10827821, -0.07162933, -0.04..."
2107,Create innovative AI frameworks for blockchain...,"[-0.020531366, 0.039624482, -0.05616963, -0.01..."
2108,Implement a focus mode in iOS apps using AI as...,"[-0.042403456, -0.078028865, -0.025398547, -0...."


In [16]:
sim_mat = get_sim_mat(df)

In [17]:
coords = get_coords(sim_mat)

In [20]:
coords

[(3, 764, 0.9999997615814209),
 (3, 1098, 0.9999997615814209),
 (4, 765, 0.9999998211860657),
 (6, 778, 0.9999998211860657),
 (7, 1604, 0.8798014521598816),
 (9, 1099, 0.9212795495986938),
 (11, 84, 0.8473995923995972),
 (11, 769, 0.9999998807907104),
 (11, 835, 0.8820209503173828),
 (11, 1112, 0.7919924259185791),
 (11, 1144, 0.8419245481491089),
 (11, 1440, 0.9033075571060181),
 (12, 1100, 0.9325422644615173),
 (13, 787, 0.8708759546279907),
 (13, 1096, 0.9999998807907104),
 (14, 805, 0.9999998807907104),
 (15, 24, 0.8090447187423706),
 (16, 828, 0.8939151763916016),
 (17, 837, 0.9846643209457397),
 (19, 79, 0.8601760268211365),
 (19, 777, 0.8552052974700928),
 (21, 1102, 0.9999998807907104),
 (22, 793, 0.7578144073486328),
 (22, 1106, 0.7578144073486328),
 (24, 116, 0.7654465436935425),
 (24, 234, 0.8054049015045166),
 (24, 791, 0.8278607130050659),
 (24, 855, 0.8090447187423706),
 (24, 1142, 0.8278607130050659),
 (26, 785, 0.842914342880249),
 (28, 852, 0.9160180687904358),
 (28, 1

In [21]:
with open('data/merged_coords.json', 'w') as f:
    json.dump(coords, f)

In [22]:
df.to_csv('merged_dataset.csv')