In [53]:
from sentence_transformers import SentenceTransformer

sentence_model = SentenceTransformer('all-mpnet-base-v2')

In [62]:
sentences = ['A',
    'Plus',
    """This function calculates the hyperbolic sine of the given value and returns the result as a float.

        Params:
            x (float): The value for which the hyperbolic sine will be calculated.

        Returns:
            float: The hyperbolic sine of the given value."""]
sentence_embeddings = sentence_model.encode(sentences, show_progress_bar=True)

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

In [63]:
for sentence, embedding in zip(sentences, sentence_embeddings):
    print("Sentence:", sentence)

    embedding_shape = embedding.shape
    print(embedding_shape)

    embedding_length = len(embedding)
    print(embedding_length)

    embedding_size = embedding.size
    print(embedding_size)

    # print("Embedding:", embedding)
    # print("")

Sentence: A
(768,)
768
768
Embedding: [ 3.53296846e-02 -2.05445923e-02 -1.83823844e-03 -2.71462575e-02
  6.16479339e-03 -3.60328034e-02  3.06082591e-02 -9.25765489e-04
  1.18018892e-02  1.45487553e-02  7.91234523e-02 -2.88334358e-02
  4.42319438e-02 -6.42942311e-03  6.98543014e-03 -4.52310257e-02
  3.18876803e-02 -1.87862180e-02 -5.66599034e-02 -6.34150133e-02
 -3.73920538e-02 -7.78477523e-04 -8.39884393e-03  7.32258603e-04
 -1.16894795e-02  1.20182363e-02  1.39553705e-02 -8.86995904e-03
 -6.52159750e-02 -2.12626830e-02 -4.83942218e-02  1.82901626e-03
 -9.32788011e-03 -4.24987786e-02  1.99232318e-06 -2.84662889e-03
 -2.76185083e-03 -1.53111881e-02 -2.29954869e-02  1.31616760e-02
 -3.28875822e-03  3.97803225e-02  3.50378491e-02 -1.17266355e-02
  6.96415594e-03 -3.04653328e-02  4.79897037e-02 -1.96443517e-02
 -3.15647088e-02 -1.55131053e-02 -2.44174413e-02 -6.24586307e-02
 -2.27751210e-02 -3.46547295e-03 -2.90139252e-03  1.62657611e-02
  6.08357042e-03  7.83892814e-03  2.95693185e-02 -5.

In [1]:
from transformers import AutoTokenizer, AutoModelForMaskedLM
from transformers import RobertaConfig
tokenizer = AutoTokenizer.from_pretrained("microsoft/graphcodebert-base")

config = RobertaConfig.from_pretrained('microsoft/graphcodebert-base', output_hidden_states=True)
model = AutoModelForMaskedLM.from_pretrained("microsoft/graphcodebert-base", config=config)


In [4]:
import functions_manager as fm
import torch

function_manager = fm.FunctionManager()
addition_func = function_manager.getNameToReference().get("prime_factors")
addition_func_str = function_manager.getFunctionAsStringWithoutDocString(addition_func)

# print(addition_func_str)
inputs = tokenizer(addition_func_str, return_tensors="pt")

outputs = model(**inputs)
hidden_states = outputs.hidden_states

logits = outputs.logits
# output_hidden_states = outputs.output_hidden_states
print(hidden_states[0].shape)
print(hidden_states[0].shape[1])
print(len(hidden_states))
print(logits.shape)
# print(hidden_states.size)
# print(len(hidden_states))
# print(outputs.logits)

# Taking max token length == 400


torch.Size([1, 226, 768])
226
13
torch.Size([1, 226, 50265])


In [11]:
# Add padding to hidden_state[0] to create [300*768] vector input
import torch
import torch.nn.functional as F



# Pad the tensor to the desired shape [300, 768]
padded_tensor = F.pad(hidden_states[0], (0, 0, 0, 300 - hidden_states[0].shape[1]))
reshaped_tensor = padded_tensor.squeeze()

print(reshaped_tensor.shape)

torch.Size([300, 768])


In [3]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import pytorch_lightning as pl

class SiameseNetwork(pl.LightningModule):
    def __init__(self, embedding_size=768):
        super(SiameseNetwork, self).__init__()
        self.embedding_size = embedding_size
        self.encoder = nn.Sequential(
            nn.Linear(300 * embedding_size, 3 * embedding_size),
            nn.ReLU(inplace=True),
            nn.Linear(3 * embedding_size, embedding_size)
        )

    def forward(self, x):
        x = x.view(x.size(0), -1)  # Flatten the input
        return self.encoder(x)

    def triplet_loss(self, anchor, positive, negative, margin=0.2):
        distance_positive = F.pairwise_distance(anchor, positive)
        distance_negative = F.pairwise_distance(anchor, negative)
        loss = torch.mean(torch.relu(distance_positive - distance_negative + margin))
        return loss

    def training_step(self, batch, batch_idx):
        anchor, positive, negative = batch
        anchor_embedding = self(anchor)
        positive_embedding = self(positive)
        negative_embedding = self(negative)

        loss = self.triplet_loss(anchor_embedding, positive_embedding, negative_embedding)
        self.log('train_loss', loss)
        return loss

    def configure_optimizers(self):
        optimizer = torch.optim.Adam(self.parameters(), lr=0.001)
        return optimizer

    def init_network(self, init_weights=True):
        if init_weights:
            self.apply(self._init_weights)

    def _init_weights(self, module):
        if isinstance(module, nn.Linear):
            nn.init.xavier_uniform_(module.weight.data)
            if module.bias is not None:
                nn.init.constant_(module.bias.data, 0.0)

    def save_network(self, checkpoint_path):
        torch.save(self.state_dict(), checkpoint_path)

    def resume_training(self, checkpoint_path):
        self.load_state_dict(torch.load(checkpoint_path))

    def similarity_inference(self, input1, input2):
        embedding1 = self(input1)
        embedding2 = self(input2)
        similarity_score = F.pairwise_distance(embedding1, embedding2)
        return similarity_score

import torch
from torch.utils.data import Dataset, DataLoader

class MyDataset(Dataset):
    def __init__(self, num_samples, input_size):
        self.num_samples = num_samples
        self.input_size = input_size

    def __len__(self):
        return self.num_samples

    def __getitem__(self, index):
        # Generate random dummy data
        anchor = torch.randn(self.input_size)
        # print(anchor.shape)
        positive = anchor
        negative = anchor
        return anchor, positive, negative

# Create a dummy dataset with 100 samples and input size [300, 768]
dataset = MyDataset(num_samples=100, input_size=[300, 768])

# Create a DataLoader for batching the data
batch_size = 8
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

# Create an instance of the SiameseNetwork
model = SiameseNetwork()

# Initialize the network
model.init_network()

# Train the network
trainer = pl.Trainer(max_epochs=10)
trainer.fit(model, dataloader)


In [5]:
model.save_network("siamese_network.pth")

In [4]:
import torch

# Create an instance of the SiameseNetwork
model = SiameseNetwork()

# Load the saved model state dict
model.load_state_dict(torch.load("siamese_network.pth"))

# Set the model to evaluation mode
model.eval()

FileNotFoundError: [Errno 2] No such file or directory: 'siamese_network.pth'

In [10]:
# Perform similarity inference
input1 = torch.randn(1, 300, 768)
input2 = torch.randn(1, 300, 768)
similarity_score = model.similarity_inference(input1, input2)
print("Similarity score:", similarity_score)

Similarity score: tensor([2.7713e-05], grad_fn=<NormBackward1>)


In [3]:
print(function_manager.getPositiveNegativeFunctionMap()["a_plus_b_whole_square"])

([<function MathFunctions.a_plus_b_whole_square at 0x15f939a20>, <function MathFunctions.a_squared_plus_2ab_plus_b_squared at 0x15f939ab0>, <function MathFunctions.a_minus_b_whole_squared_plus_4ab at 0x15f939b40>], [<function MathFunctions.addition at 0x15c12a290>, <function MathFunctions.subtraction at 0x15c12a170>, <function MathFunctions.multiplication at 0x15c12b0a0>, <function MathFunctions.division at 0x15c12b130>, <function MathFunctions.exponentiation at 0x15c12b1c0>, <function MathFunctions.absolute_value at 0x15c12b250>, <function MathFunctions.square_root at 0x15c12b2e0>, <function MathFunctions.floor_division at 0x15c12b370>, <function MathFunctions.modulus at 0x15c12b400>, <function MathFunctions.logarithm at 0x15c12b490>, <function MathFunctions.sine at 0x15c12b520>, <function MathFunctions.cosine at 0x15c12b5b0>, <function MathFunctions.tangent at 0x15c12b640>, <function MathFunctions.arcsine at 0x15c12b6d0>, <function MathFunctions.arccosine at 0x15c12b760>, <function M