In [32]:
import torch
import pickle
from sentence_transformers import util

# Load the embeddings
loaded_ast_embeddings = torch.load('parsed_smell_embeddings.pt')

# Load the message encoding vocabulary
with open('message_encoding_vocabulary.pkl', 'rb') as file:
    message_encoding_vocabulary = pickle.load(file)

# Get a subset of message types for testing
message_types = message_encoding_vocabulary[3:6]  # Change the range as needed

# Randomly select 100 examples for testing
example_indices = torch.randperm(loaded_ast_embeddings.shape[0])[:100]

# Perform cosine similarity accuracy test
correct_predictions = 0
for index in example_indices:
    embedding = loaded_ast_embeddings[index]
    similarity_scores = util.cos_sim(embedding, loaded_ast_embeddings)

    # Find the indices of top-k similar embeddings
    _, top_indices = torch.topk(similarity_scores, k=len(message_types)+1)

    # Check if the correct message type is among the top-k similar embeddings
    if index in top_indices:
        correct_predictions += 1

accuracy = correct_predictions / len(example_indices)
print(f"Accuracy: {accuracy}")

Accuracy: 0.63


In [34]:
# Load the embeddings
loaded_ast_embeddings = torch.load('parsed_smell_identifier_embeddings.pt')

# Load the message encoding vocabulary
with open('message_encoding_vocabulary.pkl', 'rb') as file:
    message_encoding_vocabulary = pickle.load(file)

# Get a subset of message types for testing
message_types = message_encoding_vocabulary[3:6]  # Change the range as needed

# Randomly select 100 examples for testing
example_indices = torch.randperm(loaded_ast_embeddings.shape[0])[:100]

# Perform cosine similarity accuracy test
correct_predictions = 0
for index in example_indices:
    embedding = loaded_ast_embeddings[index]
    similarity_scores = util.cos_sim(embedding, loaded_ast_embeddings)

    # Find the indices of top-k similar embeddings
    _, top_indices = torch.topk(similarity_scores, k=len(message_types)+1)

    # Check if the correct message type is among the top-k similar embeddings
    if index in top_indices:
        correct_predictions += 1

accuracy = correct_predictions / len(example_indices)
print(f"Accuracy: {accuracy}")

Accuracy: 0.94


In [40]:
import torch
import pickle
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn import metrics

# Load the message encoding vocabulary
with open('message_encoding_vocabulary.pkl', 'rb') as file:
    message_encoding_vocabulary = pickle.load(file)

# Get a subset of message types for testing
message_types = message_encoding_vocabulary[3:6]  # Change the range as needed

# Load the embeddings
loaded_ast_embeddings = torch.load('parsed_smell_identifier_embeddings.pt')

# Convert the message encoding vocabulary to a list
message_encoding_vocabulary = message_encoding_vocabulary.tolist()

# Get the indices of the selected message types in the vocabulary
message_type_indices = np.where(np.isin(message_encoding_vocabulary, message_types))[0]

# Select the embeddings for the selected message types
selected_embeddings = loaded_ast_embeddings[message_type_indices]

# Select a subset of the data (100 examples)
subset_selected_embeddings = selected_embeddings[:100]

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(subset_selected_embeddings, message_type_indices[:100],
                                                    test_size=0.30, random_state=1)

# Create Decision Tree classifier object
clf_dt = DecisionTreeClassifier()

# Train Decision Tree classifier
clf_dt.fit(X_train, y_train)

# Predict the response for the test dataset
y_pred_dt = clf_dt.predict(X_test)

print("Accuracy Decision Tree:", metrics.accuracy_score(y_test, y_pred_dt))

# Create SVM classifier object
clf_svm = SVC(kernel='poly', degree=15, C=1.0)

# Train SVM classifier
clf_svm.fit(X_train, y_train)

# Predict the response for the test dataset
y_pred_svm = clf_svm.predict(X_test)

print("Accuracy SVM:", metrics.accuracy_score(y_test, y_pred_svm))

Accuracy Decision Tree: 0.0
Accuracy SVM: 0.0
