In [93]:
import pandas as pd
import random

# Build a small, reproducible sample of ASTs and write it to 'input.csv',
# then echo the file so the tab/newline structure of each AST is visible.
SAMPLE_SIZE = 5
SAMPLE_SEED = 42  # fixed so a fresh Restart & Run All selects the same rows

# Read the original CSV file
df = pd.read_csv('test-data_encoded.csv')

# Select random ASTs from the 'parsed_smell' column.
# A private RNG keeps the global `random` state untouched, and the sample size
# is capped so a file with fewer than SAMPLE_SIZE rows does not raise ValueError.
sample_rng = random.Random(SAMPLE_SEED)
random_ast_indices = sample_rng.sample(range(len(df)), min(SAMPLE_SIZE, len(df)))

# The sampled values are row positions, so select with .iloc (position-based)
# rather than .loc (label-based).
random_asts = df['parsed_smell'].iloc[random_ast_indices]

# Create a new DataFrame with the selected ASTs
input_df = pd.DataFrame({'AST': random_asts})

# Save the DataFrame to 'input.csv'
input_df.to_csv('input.csv', index=False)

# Print the contents of the 'input.csv' file with tabs and spaces represented
# (repr() shows '\t' and '\n' explicitly instead of rendering them).
with open('input.csv', 'r') as file:
    content = file.readlines()

for line in content:
    print(repr(line))

'AST\n'
'"\n'
'\t\t FieldDeclaration\n'
'\t\t\t VariableDeclaration\n'
'\t\t\t\t IdentifierName\n'
'\t\t\t\t VariableDeclarator\n'
'\t\t\t\t\t EqualsValueClause\n'
'\t\t\t\t\t\t ObjectCreationExpression\n'
'\t\t\t\t\t\t\t IdentifierName"\n'
'"\n'
'\t\t FieldDeclaration\n'
'\t\t\t VariableDeclaration\n'
'\t\t\t\t IdentifierName\n'
'\t\t\t\t VariableDeclarator\n'
'\t\t\t\t\t EqualsValueClause\n'
'\t\t\t\t\t\t ObjectCreationExpression\n'
'\t\t\t\t\t\t\t IdentifierName"\n'
'"\n'
'\t\t FieldDeclaration\n'
'\t\t\t VariableDeclaration\n'
'\t\t\t\t PredefinedType\n'
'\t\t\t\t VariableDeclarator"\n'
'"\n'
'\t\t MethodDeclaration\n'
'\t\t\t PredefinedType\n'
'\t\t\t ParameterList\n'
'\t\t\t\t Parameter\n'
'\t\t\t\t\t IdentifierName\n'
'\t\t\t ArrowExpressionClause\n'
'\t\t\t\t InvocationExpression\n'
'\t\t\t\t\t IdentifierName\n'
'\t\t\t\t\t ArgumentList\n'
'\t\t\t\t\t\t Argument\n'
'\t\t\t\t\t\t\t IdentifierName"\n'
'"\n'
'\t ClassDeclaration\n'
'\t\t BaseList\n'
'\t\t\t SimpleBaseType\n'
'\t\t

In [94]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer

import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader
import pickle

# Load the stored embeddings and encoded messages, then build the
# train/test tensors consumed by the feedforward classifier.
#
# NOTE(security): torch.load and pickle.load may execute arbitrary code while
# deserialising; only point them at artifacts produced by this project.

# Load the embeddings from parsed_smell_identifier_embeddings.pt
embeddings = torch.load('parsed_smell_identifier_embeddings.pt')

# Load the encoded messages from messagelist.pkl
with open('messagelist.pkl', 'rb') as f:
    message_list = pickle.load(f)

# Load the message encoding vocabulary
with open('message_encoding_vocabulary.pkl', 'rb') as f:
    message_encoding_vocabulary = pickle.load(f)

# Convert the embeddings into a suitable format for comparison
existing_embeddings = np.array(embeddings.tolist())

# Create a DataFrame with the embeddings and encoded messages
# (the constructor raises if the two inputs differ in length)
data = pd.DataFrame({
    'parsed_smell_identifier_embeddings': existing_embeddings.tolist(),
    'message_encoded': message_list
})

# Prepare the data for the neural network
X = np.array(data['parsed_smell_identifier_embeddings'].tolist())
y = np.array(data['message_encoded'])

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Convert data to PyTorch tensors.
# Labels are cast to int64 explicitly: nn.CrossEntropyLoss expects long class
# indices, and NumPy's default integer type is not int64 on every platform.
X_train_tensor = torch.tensor(X_train).float()
y_train_tensor = torch.tensor(y_train).long()
X_test_tensor = torch.tensor(X_test).float()
y_test_tensor = torch.tensor(y_test).long()

# Define the neural network architecture
class FeedforwardNN(nn.Module):
    """Feedforward network with one hidden layer: Linear -> ReLU -> Linear.

    Args:
        input_dim: size of each input feature vector.
        hidden_dim: width of the hidden layer.
        output_dim: number of output scores produced per sample.
    """

    def __init__(self, input_dim, hidden_dim, output_dim):
        super().__init__()
        # Attribute names are kept as-is because they determine the state_dict keys.
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        """Return the raw (un-normalised) output scores for the batch ``x``."""
        hidden = self.relu(self.fc1(x))
        return self.fc2(hidden)

# Train the feedforward classifier on the training split and report its
# accuracy on the held-out test split.

# Seed torch so weight initialisation and batch shuffling are repeatable
# (42 matches the random_state used for the train/test split).
torch.manual_seed(42)

# Define the model parameters
input_dim = X_train_tensor.shape[1]
hidden_dim = 128
# Size the output layer from the largest label in the whole data set.
# Counting unique labels in y_train alone under-sizes the layer when a class
# is absent from the training split or the label ids are not contiguous, and
# CrossEntropyLoss then fails on the out-of-range target.
output_dim = int(np.max(y)) + 1

# Create the neural network model
model = FeedforwardNN(input_dim, hidden_dim, output_dim)

# Define the loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# Create DataLoader objects
train_data = TensorDataset(X_train_tensor, y_train_tensor)
train_loader = DataLoader(train_data, batch_size=64, shuffle=True)

# Training the neural network model
num_epochs = 10
model.train()  # explicit, so re-running after an evaluation restores training mode
for epoch in range(num_epochs):
    for inputs, labels in train_loader:
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

# Evaluate the neural network model
model.eval()
with torch.no_grad():
    outputs = model(X_test_tensor)
    # Index of the highest score per row is the predicted class
    _, predicted = torch.max(outputs, 1)
    accuracy = (predicted == y_test_tensor).sum().item() / len(y_test_tensor)

print(f"Accuracy: {accuracy:.2f}")

# Embed the ASTs stored in 'input.csv' and label each one with the message of
# its most similar stored embedding (cosine similarity).
# NOTE(review): these predictions come from the nearest-neighbour lookup only;
# the network trained earlier in this cell is not consulted here.

# Similarity below which an AST is reported as matching no known message type
SIMILARITY_THRESHOLD = 0.9

# Load the new ASTs from the CSV file
new_data = pd.read_csv('input.csv')

# Preprocess the new ASTs (loop variable renamed so it does not shadow the
# stdlib `ast` module name)
new_ast_list = new_data['AST'].tolist()
preprocessed_ast_list = [ast_text.strip() for ast_text in new_ast_list]

# Convert the new ASTs into embeddings with a sentence transformer.
# Bound to `model_bert` so the trained classifier held in `model` is not
# overwritten.
# NOTE(review): presumably the same model that produced the stored embeddings — confirm.
model_bert = SentenceTransformer('bert-base-nli-mean-tokens')
new_ast_embeddings = model_bert.encode(preprocessed_ast_list, convert_to_tensor=True)

# Move to CPU before converting: Tensor.numpy() raises for a CUDA tensor
new_ast_vectors = new_ast_embeddings.detach().cpu().numpy()

# Find the nearest stored embedding once per AST; both reports below reuse it.
# (X holds the same values as existing_embeddings, so one search serves both.)
nearest_matches = []
for new_embedding in new_ast_vectors:
    similarities = cosine_similarity([new_embedding], existing_embeddings)
    max_similarity_index = int(np.argmax(similarities))
    max_similarity = float(similarities[0, max_similarity_index])
    nearest_matches.append((max_similarity_index, max_similarity))

# Classify the new ASTs based on similarity to existing embeddings
for i, (max_similarity_index, max_similarity) in enumerate(nearest_matches):
    predicted_label = y[max_similarity_index]

    if max_similarity >= SIMILARITY_THRESHOLD:
        print(f"AST {i+1} belongs to message type: {predicted_label}")
    else:
        print(f"AST {i+1} does not belong to any known message type.")

# Report the decoded message of the nearest stored embedding and its similarity
for i, (max_similarity_index, max_similarity) in enumerate(nearest_matches):
    predicted_message = message_encoding_vocabulary[y[max_similarity_index]]
    print(f"AST {i+1} belongs to smell type: {predicted_message} with similarity {max_similarity:.2f}")

Accuracy: 0.83
AST 1 belongs to message type: 2
AST 2 belongs to message type: 2
AST 3 belongs to message type: 0
AST 4 belongs to message type: 4
AST 5 belongs to message type: 2
AST 1 belongs to smell type: Refactor with similarity 0.96
AST 2 belongs to smell type: Refactor with similarity 0.96
AST 3 belongs to smell type: Adjustment with similarity 1.00
AST 4 belongs to smell type: Rename with similarity 0.98
AST 5 belongs to smell type: Refactor with similarity 0.98


In [95]:
import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import pickle

# Load the stored embeddings, encoded messages and label vocabulary, then
# split the data for the LSTM model.
#
# NOTE(security): torch.load and pickle.load may execute arbitrary code while
# deserialising; only point them at artifacts produced by this project.

# Load the embeddings from parsed_smell_identifier_embeddings.pt
embeddings = torch.load('parsed_smell_identifier_embeddings.pt')

# Load the encoded messages from messagelist.pkl
with open('messagelist.pkl', 'rb') as f:
    message_list = pickle.load(f)

# Load the message encoding vocabulary. It is used further down in this cell
# to decode predicted labels; loading it here means the cell does not depend
# on a variable left behind by an earlier cell.
with open('message_encoding_vocabulary.pkl', 'rb') as f:
    message_encoding_vocabulary = pickle.load(f)

# Convert the embeddings into a suitable format for comparison
existing_embeddings = np.array(embeddings.tolist())

# Create a DataFrame with the embeddings and encoded messages
# (the constructor raises if the two inputs differ in length)
data = pd.DataFrame({
    'parsed_smell_identifier_embeddings': existing_embeddings.tolist(),
    'message_encoded': message_list
})

# Prepare the data for the LSTM model
X = np.array(data['parsed_smell_identifier_embeddings'].tolist())
y = np.array(data['message_encoded'])

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define the LSTM model architecture
class LSTMModel(nn.Module):
    """LSTM followed by a linear layer applied to the last time step.

    Args:
        input_dim: number of features per time step.
        hidden_dim: size of the LSTM hidden state.
        output_dim: number of output scores produced per sample.
    """

    def __init__(self, input_dim, hidden_dim, output_dim):
        super().__init__()
        self.hidden_dim = hidden_dim
        # batch_first=True: inputs are laid out as (batch, time, features)
        self.lstm = nn.LSTM(input_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        """Return the raw output scores computed from the final time step of ``x``."""
        sequence_out, _state = self.lstm(x)
        last_step = sequence_out[:, -1, :]
        return self.fc(last_step)

# Train the LSTM classifier on the training split and report its accuracy on
# the held-out test split.

# Seed torch so weight initialisation and batch shuffling are repeatable
# (42 matches the random_state used for the train/test split).
torch.manual_seed(42)

# Convert data to PyTorch tensors.
# Labels are cast to int64 explicitly: nn.CrossEntropyLoss expects long class
# indices, and NumPy's default integer type is not int64 on every platform.
X_train_tensor = torch.tensor(X_train).float()
y_train_tensor = torch.tensor(y_train).long()
X_test_tensor = torch.tensor(X_test).float()
y_test_tensor = torch.tensor(y_test).long()

# Reshape X_train_tensor and X_test_tensor to include the time steps dimension
# (a length-1 axis is inserted at position 1, so each sample is a one-step sequence)
X_train_tensor = X_train_tensor.unsqueeze(1)
X_test_tensor = X_test_tensor.unsqueeze(1)

# Size the output layer from the largest label in the whole data set.
# Counting unique labels in y_train alone under-sizes the layer when a class
# is absent from the training split or the label ids are not contiguous, and
# CrossEntropyLoss then fails on the out-of-range target.
num_classes = int(np.max(y)) + 1

# Create the LSTM model and move it to CPU
model = LSTMModel(X_train_tensor.shape[2], 128, num_classes)
device = torch.device("cpu")
model.to(device)

# Define the loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# Create DataLoader objects
train_data = TensorDataset(X_train_tensor, y_train_tensor)
train_loader = DataLoader(train_data, batch_size=64, shuffle=True)

# Training the LSTM model
num_epochs = 10
model.train()  # explicit, so re-running after an evaluation restores training mode
for epoch in range(num_epochs):
    for inputs, labels in train_loader:
        optimizer.zero_grad()
        inputs = inputs.to(device)
        labels = labels.to(device)
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

# Evaluation on the test set
model.eval()
with torch.no_grad():
    X_test_tensor = X_test_tensor.to(device)
    y_test_tensor = y_test_tensor.to(device)
    outputs = model(X_test_tensor)
    # Index of the highest score per row is the predicted class
    _, predicted = torch.max(outputs, 1)
    accuracy = (predicted == y_test_tensor).sum().item() / len(y_test_tensor)

print(f"Accuracy: {accuracy:.2f}")

# Embed the ASTs stored in 'input.csv' and label each one with the message of
# its most similar stored embedding (cosine similarity).
# NOTE(review): these predictions come from the nearest-neighbour lookup only;
# the LSTM trained earlier in this cell is not consulted here.
# NOTE(review): message_encoding_vocabulary must already be defined when this runs.

# Similarity below which an AST is reported as matching no known message type
SIMILARITY_THRESHOLD = 0.9

# Load the new ASTs from the CSV file
new_data = pd.read_csv('input.csv')

# Preprocess the new ASTs (loop variable renamed so it does not shadow the
# stdlib `ast` module name)
new_ast_list = new_data['AST'].tolist()
preprocessed_ast_list = [ast_text.strip() for ast_text in new_ast_list]

# Convert the new ASTs into embeddings with a sentence transformer.
# NOTE(review): presumably the same model that produced the stored embeddings — confirm.
model_bert = SentenceTransformer('bert-base-nli-mean-tokens')
new_ast_embeddings = model_bert.encode(preprocessed_ast_list, convert_to_tensor=True)

# Move to CPU before converting: Tensor.numpy() raises for a CUDA tensor
new_ast_vectors = new_ast_embeddings.detach().cpu().numpy()

# Find the nearest stored embedding once per AST; both reports below reuse it.
# (X holds the same values as existing_embeddings, so one search serves both.)
nearest_matches = []
for new_embedding in new_ast_vectors:
    similarities = cosine_similarity([new_embedding], existing_embeddings)
    max_similarity_index = int(np.argmax(similarities))
    max_similarity = float(similarities[0, max_similarity_index])
    nearest_matches.append((max_similarity_index, max_similarity))

# Classify the new ASTs based on similarity to existing embeddings
for i, (max_similarity_index, max_similarity) in enumerate(nearest_matches):
    predicted_label = y[max_similarity_index]

    if max_similarity >= SIMILARITY_THRESHOLD:
        print(f"AST {i+1} belongs to message type: {predicted_label}")
    else:
        print(f"AST {i+1} does not belong to any known message type.")

# Report the decoded message of the nearest stored embedding and its similarity
for i, (max_similarity_index, max_similarity) in enumerate(nearest_matches):
    predicted_message = message_encoding_vocabulary[y[max_similarity_index]]
    print(f"AST {i+1} belongs to smell type: {predicted_message} with similarity {max_similarity:.2f}")


Accuracy: 0.76
AST 1 belongs to message type: 2
AST 2 belongs to message type: 2
AST 3 belongs to message type: 0
AST 4 belongs to message type: 4
AST 5 belongs to message type: 2
AST 1 belongs to smell type: Refactor with similarity 0.96
AST 2 belongs to smell type: Refactor with similarity 0.96
AST 3 belongs to smell type: Adjustment with similarity 1.00
AST 4 belongs to smell type: Rename with similarity 0.98
AST 5 belongs to smell type: Refactor with similarity 0.98
