Data Collection/Creation

In [1]:
from datasets import load_dataset

In [2]:
# Download only the small C++ training split of The Vault (function-level) dataset.
# trust_remote_code=True lets the dataset's own loading script execute locally.
dataset = load_dataset(
    "Fsoft-AIC/the-vault-function",
    split_set=["train/small"],
    languages=['cpp'],
    trust_remote_code=True,
)

In [3]:
# Pull the raw code column and the one-line docstring column out of the training split.
code_samples_large, docstring_samples_large = (
    dataset['train_small'][column] for column in ('code', 'short_docstring')
)

In [4]:
# The full split is too large to process on a modest machine, so the rest of the
# notebook works on a small random subset of it.
import random

SUBSET_FRACTION = 0.05  # keep 5% of the original samples
SUBSET_SEED = 42        # fixed seed so Restart & Run All selects the same subset

reduced_size = int(len(code_samples_large) * SUBSET_FRACTION)

# Create a list of indices for the samples in the original dataset
indices = list(range(len(code_samples_large)))

# Shuffle with a dedicated, seeded generator: the selection is reproducible and
# the global `random` state used elsewhere is left untouched.
random.Random(SUBSET_SEED).shuffle(indices)

# Keep the first `reduced_size` shuffled indices (SUBSET_FRACTION of the data)
selected_indices = indices[:reduced_size]

# Build the reduced dataset; using the same indices for both lists keeps every
# code sample aligned with its docstring.
code_samples = [code_samples_large[i] for i in selected_indices]
docstring_samples = [docstring_samples_large[i] for i in selected_indices]

In [5]:
# Sanity check: number of samples kept after sub-sampling.
print(len(code_samples))

4371


In [None]:
# Inspect the first (code, docstring) pair of the reduced dataset.
print(code_samples[0])
print(docstring_samples[0])

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np

In [None]:
import re
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

Data Preprocessing

In [None]:
# The stop-word set and the lemmatizer do not depend on the input text, so they
# are created once on first use and shared by every call instead of being
# rebuilt for each docstring.
_TEXT_RESOURCES = {}


def _get_text_resources():
    """Return the shared ``(stop_words, lemmatizer)`` pair, building it lazily."""
    if not _TEXT_RESOURCES:
        _TEXT_RESOURCES['stop_words'] = frozenset(stopwords.words('english'))
        _TEXT_RESOURCES['lemmatizer'] = WordNetLemmatizer()
    return _TEXT_RESOURCES['stop_words'], _TEXT_RESOURCES['lemmatizer']


def preprocess_text(text):
    """Normalise a natural-language docstring before feature extraction.

    The text is lower-cased, tokenized with NLTK's ``word_tokenize``, stripped
    of English stop words, lemmatized with ``WordNetLemmatizer`` and joined
    back together with single spaces.

    Args:
        text: Docstring to clean. ``None`` and ``""`` both yield ``""``.

    Returns:
        The cleaned docstring as a single space-separated string.
    """
    if not text:
        return ''

    stop_words, lemmatizer = _get_text_resources()

    # Punctuation is deliberately kept: some non-alphanumeric characters can be
    # crucial when the description refers to code constructs.
    tokens = word_tokenize(text.lower())

    lemmatized_tokens = [
        lemmatizer.lemmatize(word) for word in tokens if word not in stop_words
    ]
    return ' '.join(lemmatized_tokens)

Tokenization

In [None]:
# Matches, trying the alternatives in this order at each position: a line
# comment, a block comment, a double-quoted string literal, a single-quoted
# character literal. Literals are matched only so that comment markers inside
# them (e.g. "http://...") are not mistaken for comments.
_CPP_COMMENT_OR_LITERAL = re.compile(
    r'//[^\n]*|/\*.*?\*/|"(?:\\.|[^"\\])*"|\'(?:\\.|[^\'\\])*\'',
    flags=re.DOTALL,
)


def _drop_comment(match):
    """Replace a comment with one space; return string/char literals unchanged."""
    matched = match.group(0)
    return ' ' if matched.startswith('/') else matched


def tokenize_cpp(code):
    """Strip C++ comments and split the code into space-separated tokens.

    A token is either a run of word characters (identifiers, keywords, numbers)
    or a single non-word, non-whitespace character (operators, brackets,
    quotes). Whitespace never appears in the output except as the separator.

    Args:
        code: C++ source text. ``None`` and ``""`` both yield ``""``.

    Returns:
        The tokens joined by single spaces.

    Note:
        Raw string literals (``R"(...)"``) and digit separators (``1'000``)
        are not recognised specially by the comment/literal scanner.
    """
    if not code:
        return ''

    # Comments become a single space rather than nothing, so that
    # ``int/*c*/x`` stays two tokens instead of collapsing into ``intx``.
    # ``//[^\n]*`` also removes a trailing line comment with no final newline.
    code = _CPP_COMMENT_OR_LITERAL.sub(_drop_comment, code)

    # Tokenize by splitting on whitespace and symbols
    tokens = re.findall(r'\w+|[^\w\s]', code)
    return ' '.join(tokens)

In [None]:
# Clean every docstring in the reduced dataset, preserving order.
preprocessed_docstrings = list(map(preprocess_text, docstring_samples))

In [None]:
# Tokenize every C++ sample; each result is one space-separated string.
tokenized_code_samples = list(map(tokenize_cpp, code_samples))

In [None]:
# Compare the cleaned docstring with the tokenized code for the first sample.
print("Preprocessed: ", preprocessed_docstrings[0])
print("Tokenized: ", tokenized_code_samples[0])

Feature Extraction

In [None]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

In [None]:
#CountVectorizer converts text into a matrix of token counts.
#It tokenizes the text, builds a vocabulary of known words and encodes each text as a vector where each element represents the count of a word.
#fit_transform fits the model to the data and transforms the input text into a sparse matrix representation.
#TF-IDF (Term Frequency-Inverse Document Frequency) is a numerical statistic that reflects the importance of a
#word in a text relative to the whole collection of texts.

In [None]:
# Docstring features: raw term counts first, then TF-IDF re-weighting of those counts.
count_vectorizer, tfidf_transformer = CountVectorizer(), TfidfTransformer()
X_counts = count_vectorizer.fit_transform(preprocessed_docstrings)
X_tfidf = tfidf_transformer.fit_transform(X_counts)

In [None]:
#Feature extraction of cpp code
# tokenize_cpp already separated every token with a single space, so each
# whitespace-delimited chunk is treated as one token here. CountVectorizer's
# default settings lower-case the text and keep only words of two or more
# alphanumeric characters, which would silently discard every operator,
# bracket and one-letter identifier (and merge names differing only by case).
count_vectorizer_cpp = CountVectorizer(token_pattern=r'\S+', lowercase=False)
X_counts_cpp = count_vectorizer_cpp.fit_transform(tokenized_code_samples)

# Re-weight the raw counts by how distinctive each token is across samples.
tfidf_transformer_cpp = TfidfTransformer()
X_tfidf_cpp = tfidf_transformer_cpp.fit_transform(X_counts_cpp)

In [None]:
# Both lines print the same label: the first shape belongs to the docstring
# TF-IDF matrix, the second to the C++ code TF-IDF matrix.
print("Shape of transformed data:", X_tfidf.shape)
print("Shape of transformed data:", X_tfidf_cpp.shape)

In [None]:
from torch.nn import TransformerDecoder, TransformerDecoderLayer
import math

Decoder-only Transformer model

In [None]:
# Define the Decoder Model
class DecoderTransformer(nn.Module):
    """Transformer decoder that maps target token ids to vocabulary logits.

    Target tokens are embedded, scaled by ``sqrt(hidden_size)``, given
    positional encodings, run through a stack of ``TransformerDecoderLayer``s
    that attend to ``src`` as memory, and projected to ``output_size`` logits.

    Args:
        input_size: Number of distinct target token ids (embedding rows).
        hidden_size: Embedding / model width. Must be divisible by ``nhead``.
        num_layers: Number of stacked decoder layers.
        output_size: Size of the output space (number of logits per position).
        nhead: Number of attention heads per layer (default 8, the value that
            was previously hard-coded).

    Raises:
        ValueError: If ``hidden_size`` is not divisible by ``nhead``.
    """

    def __init__(self, input_size, hidden_size, num_layers, output_size, nhead=8):
        super().__init__()

        if hidden_size % nhead != 0:
            raise ValueError(
                f"hidden_size ({hidden_size}) must be divisible by nhead ({nhead})"
            )

        # Remembered so forward() can scale embeddings without being told the width again
        self.hidden_size = hidden_size

        #Embedding layer converts input tokens into dense vectors
        self.embedding = nn.Embedding(input_size, hidden_size)

        #Adds positional information to input embeddings
        self.positional_encoding = PositionalEncoding(hidden_size)

        #A single decoder layer; TransformerDecoder clones it num_layers times
        decoder_layer = TransformerDecoderLayer(hidden_size, nhead=nhead)
        self.transformer_decoder = TransformerDecoder(decoder_layer, num_layers)

        #Output layer transforms decoder outputs into final output space
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, src, tgt, hidden_size=None, tgt_mask=None):
        """Run the decoder.

        Args:
            src: Memory the decoder attends to. With the PyTorch default
                ``batch_first=False`` this is expected as
                ``(src_len, batch, hidden_size)``.
            tgt: Integer token ids, expected as ``(tgt_len, batch)``.
            hidden_size: Optional override for the embedding scale factor.
                Kept for backward compatibility; defaults to the width given
                to the constructor.
            tgt_mask: Optional attention mask for the target sequence (e.g. a
                causal mask). ``None`` applies no mask, as before.

        Returns:
            Logits of shape ``(tgt_len, batch, output_size)``.
        """
        scale = math.sqrt(self.hidden_size if hidden_size is None else hidden_size)

        #Embedding and positional encoding for target input
        tgt = self.embedding(tgt) * scale
        tgt = self.positional_encoding(tgt)

        #Processes the target input and source context
        output = self.transformer_decoder(tgt, src, tgt_mask=tgt_mask)

        #Final output transformation
        return self.fc(output)

In [None]:
class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal position information to a sequence of embeddings.

    Even feature columns hold ``sin(pos * freq)`` and odd columns hold
    ``cos(pos * freq)``. The table is stored as a non-trainable buffer of
    shape ``(max_len, 1, d_model)`` so it broadcasts over the batch dimension.

    Args:
        d_model: Embedding width. Both even and odd widths are supported.
        max_len: Longest sequence length the table covers.
        dropout: Dropout probability applied after the addition (default 0.1,
            the value that was previously hard-coded).
    """

    def __init__(self, d_model, max_len=5000, dropout=0.1):
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)

        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        angles = position * div_term  # (max_len, ceil(d_model / 2))

        pe = torch.zeros(max_len, d_model)
        pe[:, 0::2] = torch.sin(angles)
        # For odd d_model there is one cosine column fewer than sine columns,
        # so the angle table is trimmed to match instead of raising a shape error.
        pe[:, 1::2] = torch.cos(angles[:, : d_model // 2])

        # Insert a singleton batch axis: (max_len, 1, d_model)
        self.register_buffer('pe', pe.unsqueeze(1))

    def forward(self, x):
        """Return ``x`` plus positional encodings, followed by dropout.

        Args:
            x: Tensor whose first dimension is the sequence position and whose
                last dimension is ``d_model``, i.e. ``(seq_len, batch, d_model)``.
        """
        x = x + self.pe[:x.size(0), :]
        return self.dropout(x)

In [None]:
# Build the code vocabulary: the sorted list of unique tokens across all samples.
vocabulary_set = set()
for tokens in tokenized_code_samples:
    # Each tokenized sample is ONE space-joined string, so it must be split back
    # into tokens; passing the string itself to set.update() would add its
    # individual characters instead of whole tokens.
    vocabulary_set.update(tokens.split())

vocabulary = sorted(vocabulary_set)
print(len(vocabulary))

In [None]:
# Build the model; input and output both range over the code vocabulary.
hidden_size = 32  # embedding / model width
num_layers = 6    # number of stacked decoder layers
input_size = output_size = len(vocabulary)
model = DecoderTransformer(input_size, hidden_size, num_layers, output_size)

In [None]:
# Training objective (cross-entropy) and the Adam optimiser over all model parameters
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [None]:
# Convert the sparse TF-IDF matrices to dense tensors.
# (Disabled experiment below: train on only the first `sample_size` rows.)
#sample_size = 500
#X_subset = X_tfidf[:sample_size]
#y_subset = X_tfidf_cpp[:sample_size]

# .toarray() densifies the whole sparse matrix: one row per sample, one column
# per vocabulary term.
X_train_tensor = torch.tensor(X_tfidf.toarray(), dtype=torch.float32)
# NOTE(review): casting fractional TF-IDF weights to an integer dtype truncates
# them, so presumably almost every entry becomes 0. These are also not token
# ids, although the model later feeds this tensor to nn.Embedding. Confirm the
# intended target encoding.
y_train_tensor = torch.tensor(X_tfidf_cpp.toarray(), dtype=torch.int)

# Despite the name, this is the number of docstring feature columns
# (X_train_tensor.shape[1]), not a sequence length.
seq_length = X_train_tensor.shape[1]

# Turns (samples, features) into a (samples, features, features) view by
# repeating each row along a new middle axis.
# NOTE(review): the last dimension is the feature count, not hidden_size, so it
# likely does not match what the decoder expects as memory; verify.
X_train_tensor = X_train_tensor.unsqueeze(1).expand(-1, seq_length, -1)
#y_train_tensor = y_train_tensor.squeeze(1)

In [None]:
batch_size = 32

# Batches per epoch, rounded up so a final partial batch is still counted
# (integer ceiling division).
num_batches = -(-len(X_train_tensor) // batch_size)

In [None]:
# Training loop: one optimiser step per mini-batch, mean batch loss reported per epoch.
# Known issue (from the author): this cell currently fails and was left undebugged.
# NOTE(review): batch_X is passed as `src` (decoder memory) and batch_y as `tgt`;
# batch_y was built from TF-IDF values rather than token ids, and batch_X's last
# dimension is the docstring feature count rather than hidden_size. These
# mismatches are the likely cause of the failure; confirm before debugging further.
num_epochs = 1
for epoch in range(num_epochs):
    total_loss = 0.0
    for i in range(num_batches):
        # Get batch indices
        start_idx = i * batch_size
        # Clamp so the final batch stops at the end of the data
        end_idx = min((i + 1) * batch_size, len(X_train_tensor))

        # Extract batch data
        batch_X = X_train_tensor[start_idx:end_idx]
        batch_y = y_train_tensor[start_idx:end_idx]

        # Forward pass (gradients from the previous step are cleared first)
        optimizer.zero_grad()
        outputs = model(batch_X, batch_y, hidden_size)
        # NOTE(review): the same tensor is used as decoder input and as the loss
        # target, with no shift between them; verify this is intended.
        loss = criterion(outputs, batch_y)

        # Backward pass and optimization
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    # Print training loss after each epoch
    print(f'Epoch [{epoch + 1}/{num_epochs}], Loss: {total_loss / num_batches}')

In [None]:
# Evaluation
# NOTE(review): X_val_tensor and y_val_tensor are not created in any cell shown
# above; a validation split has to be defined before this cell can run.
model.eval()  # switch dropout off so the validation loss is deterministic
with torch.no_grad():
    # hidden_size is passed explicitly, matching the call in the training loop
    outputs = model(X_val_tensor, y_val_tensor, hidden_size)
    val_loss = criterion(outputs, y_val_tensor)
    print(f'Validation Loss: {val_loss.item()}')

In [None]:
# Persist the model's state_dict (its parameters and buffers) to disk.
torch.save(model.state_dict(), 'trained_model.pth')

In [None]:
def generate(description):
    """Return the code produced by the global ``model`` for ``description``.

    Used as the click callback of the Gradio interface.
    """
    # NOTE(review): the DecoderTransformer class defined in this notebook has no
    # generate_code method, and `model` must already exist in the kernel;
    # confirm where generate_code is meant to come from.
    generated_code = model.generate_code(description)

    return generated_code
    

In [6]:
import gradio as gr

In [7]:
# Minimal Gradio UI: the description box and the generated-code box sit side by
# side, with a submit button underneath.
with gr.Blocks(title="C++ Code Generation") as server:
    gr.Markdown("Specify the type of code you want to generate")
    with gr.Row():
        inp = gr.Textbox(label="Description", placeholder="Input Text here")
        out = gr.Textbox(label="Code")
    btn = gr.Button("Submit")
    # Clicking the button passes the description to generate() and shows its result.
    btn.click(fn=generate, inputs=inp, outputs=out)

NameError: name 'model' is not defined

In [None]:
# Start the Gradio app defined above.
server.launch()