In [1]:
import pandas as pd

# Read the tab-separated file; it is parsed without a header row and the
# column is given the name 'AST' explicitly.
df = pd.read_csv(
    'parsed_smell_modified.csv',
    sep='\t',
    names=['AST'],
    header=None,
)
df

Unnamed: 0,AST
0,\n\t ClassDeclaration\n\t\t BaseList\n\t\t\t S...
1,\n\t\t FieldDeclaration\n\t\t\t VariableDeclar...
2,\n\t\t\t\t\t\t LocalDeclarationStatement\n\t\t...
3,\n\t ClassDeclaration\n\t\t BaseList\n\t\t\t S...
4,\n\t\t FieldDeclaration\n\t\t\t VariableDeclar...
5,\n\t ClassDeclaration\n\t\t BaseList\n\t\t\t S...
6,\n\t ClassDeclaration\n\t\t BaseList\n\t\t\t S...
7,\n\t ClassDeclaration\n\t\t BaseList\n\t\t\t S...
8,\n\t ClassDeclaration\n\t\t BaseList\n\t\t\t S...
9,\n\t\t MethodDeclaration\n\t\t\t PredefinedTyp...


In [2]:
# Extract the 'AST' column as a plain Python list, then trim leading and
# trailing whitespace from every entry.
ast_list = df['AST'].tolist()
preprocessed_ast_list = list(map(lambda raw_ast: raw_ast.strip(), ast_list))

In [3]:
from sentence_transformers import SentenceTransformer, util
import torch


In [4]:
# Instantiate the pretrained sentence-embedding model by checkpoint name; it
# is used below to encode the AST strings.
model_name = 'bert-base-nli-mean-tokens'
model = SentenceTransformer(model_name)


In [5]:
# Encode every preprocessed AST string into a single tensor of embeddings and
# write that tensor to disk.
save_path = 'smell_embeddings.pt'
ast_embeddings = model.encode(
    preprocessed_ast_list,
    convert_to_tensor=True,
)
torch.save(ast_embeddings, save_path)


In [6]:
# Reload the embeddings tensor from the file written by the previous cell.
# torch.load unpickles the file, so weights_only=True restricts loading to
# tensor data instead of arbitrary pickled Python objects.
loaded_ast_embeddings = torch.load(save_path, weights_only=True)
loaded_ast_embeddings

tensor([[-0.0942,  0.5134,  0.9372,  ..., -0.3240, -0.3524,  0.4132],
        [ 0.0629, -0.4134,  1.3098,  ..., -0.0182, -0.4260,  0.3129],
        [-0.3730,  0.6687,  0.9115,  ..., -0.7725, -0.4094,  0.5863],
        ...,
        [-0.0528, -0.2218,  1.2738,  ..., -0.4359, -0.4130,  0.4544],
        [-0.0950,  0.6250,  0.5913,  ..., -0.2796, -0.2381,  0.4071],
        [-0.0950,  0.6250,  0.5913,  ..., -0.2796, -0.2381,  0.4071]])

In [7]:
# The size of the first dimension of the reloaded tensor is reported as the
# number of saved embeddings.
num_embeddings = loaded_ast_embeddings.size(0)
print(f"Number of saved embeddings: {num_embeddings}")

Number of saved embeddings: 12


In [8]:
# Example: compute the cosine similarity between two ASTs chosen by index.
# The indices are named once so the printed label always matches the pair
# that was actually compared.
first_index, second_index = 6, 5
similarity_score = util.cos_sim(ast_embeddings[first_index], ast_embeddings[second_index])
print(f"Similarity score between AST {first_index} and AST {second_index}: {similarity_score}")

Similarity score between AST 1 and AST 2: tensor([[1.]])


In [9]:
import numpy as np

# For every AST, list all other ASTs ordered from most to least similar.
num_embeddings = len(ast_embeddings)

# A single batched call produces the whole pairwise matrix: entry [a, b] is the
# cosine similarity between AST a and AST b. Converting it to a NumPy array
# here avoids writing 1x1 tensors into NumPy float slots one at a time, which
# relies on an implicit array-to-scalar conversion that NumPy deprecates.
similarity_matrix = (
    util.cos_sim(ast_embeddings, ast_embeddings).detach().cpu().numpy()
)

for ast_index in range(num_embeddings):
    # Work on a float64 copy of this AST's row; its own slot is zeroed because
    # self-similarity is not reported.
    similarity_scores = similarity_matrix[ast_index].astype(float)
    similarity_scores[ast_index] = 0.0

    # Sort the similarity scores in descending order. The stable sort keeps
    # equal scores in a deterministic order (higher index first once reversed).
    sorted_indices = np.argsort(similarity_scores, kind='stable')[::-1]

    # List the similarity scores for the specific AST in descending order
    print(f"Ast {ast_index}:")
    for idx in sorted_indices:
        if idx != ast_index:
            score = similarity_scores[idx]
            print(f"AST {idx}  {score:.2f}")
    print()

Ast 0:
AST 11  0.98
AST 10  0.98
AST 8  0.98
AST 7  0.98
AST 6  0.98
AST 5  0.98
AST 3  0.98
AST 9  0.81
AST 4  0.79
AST 1  0.79
AST 2  0.66

Ast 1:
AST 4  1.00
AST 9  0.83
AST 0  0.79
AST 11  0.74
AST 10  0.74
AST 8  0.74
AST 7  0.74
AST 6  0.74
AST 5  0.74
AST 3  0.74
AST 2  0.70

Ast 2:
AST 9  0.72
AST 4  0.70
AST 1  0.70
AST 0  0.66
AST 11  0.64
AST 10  0.64
AST 8  0.64
AST 7  0.64
AST 6  0.64
AST 5  0.64
AST 3  0.64

Ast 3:
AST 11  1.00
AST 10  1.00
AST 8  1.00
AST 7  1.00
AST 6  1.00
AST 5  1.00
AST 0  0.98
AST 9  0.80
AST 4  0.74
AST 1  0.74
AST 2  0.64

Ast 4:
AST 1  1.00
AST 9  0.83
AST 0  0.79
AST 11  0.74
AST 10  0.74
AST 8  0.74
AST 7  0.74
AST 6  0.74
AST 5  0.74
AST 3  0.74
AST 2  0.70

Ast 5:
AST 11  1.00
AST 10  1.00
AST 8  1.00
AST 7  1.00
AST 6  1.00
AST 3  1.00
AST 0  0.98
AST 9  0.80
AST 4  0.74
AST 1  0.74
AST 2  0.64

Ast 6:
AST 11  1.00
AST 10  1.00
AST 8  1.00
AST 7  1.00
AST 5  1.00
AST 3  1.00
AST 0  0.98
AST 9  0.80
AST 4  0.74
AST 1  0.74
AST 2  0.64

Ast 7: