In [53]:
import pandas as pd
import re
from transformer import Transformer

In [54]:
def code_tokenizer(code):
    return re.findall(r'\s+|[a-zA-Z_]\w*|[^\s\w]', code)

In [55]:
# Load your dataset
merged_df = pd.read_csv("data/merged_df.csv")  # contains 'question' and 'code' columns
print(merged_df.shape)

merged_df = merged_df[merged_df['question'].apply(lambda x : len(x.split(" "))) <= 48 ]
merged_df.shape

(16348, 2)


(16336, 2)

In [56]:
merged_df.head()

Unnamed: 0,question,code
0,Write a function to find squares of individual...,def square_nums(nums):\r\n square_nums = list(...
1,Create a function in Python which takes a list...,def sum_squares(nums):\n result = 0\n fo...
2,Design a function in Python that sorts a list ...,def sort_list_alphabetically(input_list): \n ...
3,Develop a greedy strategy in Python to find th...,def min_cost(points):\n points.sort() # so...
4,Write a Python program to create a singly link...,"class Node():\n def __init__(self, data):\n ..."


In [57]:
# Apply tokenization to both question and code
merged_df['question_tokens'] = merged_df['question'].apply(lambda x : x.split(" "))
merged_df['code_tokens'] = merged_df['code'].apply(code_tokenizer)

# filtering
merged_df = merged_df[merged_df['code_tokens'].apply(lambda x : len(x)) <= 200]

In [58]:
merged_df.head(1)

Unnamed: 0,question,code,question_tokens,code_tokens
0,Write a function to find squares of individual...,def square_nums(nums):\r\n square_nums = list(...,"[Write, a, function, to, find, squares, of, in...","[def, , square_nums, (, nums, ), :, \r\n , sq..."


In [59]:
# 3. Build vocabulary (from both question and code tokens)
from collections import Counter

# vocab for Question
question_tokens = merged_df['question_tokens'].explode().tolist() 
question_token_freq = Counter(question_tokens)

question_vocab = {token: idx + 2 for idx, (token, _) in enumerate(question_token_freq.items())}  # +2 to reserve <pad>=0 and <unk>=1
question_vocab['<pad>'] = 0
question_vocab['<unk>'] = 1
print("vocab length : ", len(question_vocab))

# Vocab for Code
code_tokens = merged_df['code_tokens'].explode().tolist()
code_token_freq = Counter(code_tokens)

code_vocab = {token: idx + 1 for idx, (token, _) in enumerate(code_token_freq.items())}  # +1 to reserve <pad>=0 
code_vocab['<pad>'] = 0
# code_vocab['<unk>'] = 1
print("vocab length : ", len(code_vocab))

vocab length :  8522
vocab length :  18560


In [60]:
# Convert sentences to indexes
def integer_encoding(sentence, vocab):
    int_encoding = []
    for word in sentence:
        if word in vocab:
            int_encoding.append(vocab[word])
        else:
            int_encoding.append(vocab['<unknown>'])
    return int_encoding

In [61]:
merged_df['question_ids'] = merged_df['question_tokens'].apply(lambda x: integer_encoding(x, question_vocab))
merged_df['code_ids'] = merged_df['code_tokens'].apply(lambda x: integer_encoding(x, code_vocab))

In [62]:
merged_df['question_ids'].head(1)

0    [2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 3, 12, 13, 14...
Name: question_ids, dtype: object

In [63]:
merged_df.head(1)

Unnamed: 0,question,code,question_tokens,code_tokens,question_ids,code_ids
0,Write a function to find squares of individual...,def square_nums(nums):\r\n square_nums = list(...,"[Write, a, function, to, find, squares, of, in...","[def, , square_nums, (, nums, ), :, \r\n , sq...","[2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 3, 12, 13, 14...","[1, 2, 3, 4, 5, 6, 7, 8, 3, 2, 9, 2, 10, 4, 11..."


In [64]:
import torch
from torch.nn.utils.rnn import pad_sequence

# Pad question_ids
question_tensors = [torch.tensor(seq, dtype=torch.long) for seq in merged_df['question_ids']]
padded_question_ids = pad_sequence(question_tensors, batch_first=True, padding_value=0)

# Pad code_ids
code_tensors = [torch.tensor(seq, dtype=torch.long) for seq in merged_df['code_ids']]
padded_code_ids = pad_sequence(code_tensors, batch_first=True, padding_value=0)

In [65]:
type(padded_question_ids)

torch.Tensor

In [66]:
from torch.utils.data import Dataset

class QACodeDataset(Dataset):
    def __init__(self, padded_question_ids, padded_code_ids):
        self.questions = padded_question_ids
        self.codes = padded_code_ids

    def __len__(self):
        return len(self.questions)

    def __getitem__(self, idx):
        question = self.questions[idx]
        code = self.codes[idx]
        return question, code

In [67]:
from torch.utils.data import DataLoader

dataset = QACodeDataset(padded_question_ids , padded_code_ids)
dataloader = DataLoader(dataset , batch_size = 32 , shuffle = True)

# Model

In [68]:
model= Transformer(
    embed_dim=512,
    src_vocab_size=len(question_vocab),
    target_vocab_size=len(code_vocab),
    en_seq_length=48,        # or more if needed
    de_seq_length=200,        # or more if needed
    num_layers=6,
    expansion_factor=4,   # because 512 * 4 = 2048
    n_heads=8
)

In [69]:
model

Transformer(
  (encoder): TransformerEncoder(
    (embedding_layer): Embedding(
      (embed): Embedding(8522, 512)
    )
    (positional_encoder): PositionalEmbedding()
    (layers): ModuleList(
      (0-5): 6 x TransformerBlock(
        (attention): MultiHeadAttention(
          (query_matrix): Linear(in_features=512, out_features=512, bias=False)
          (key_matrix): Linear(in_features=512, out_features=512, bias=False)
          (value_matrix): Linear(in_features=512, out_features=512, bias=False)
          (out): Linear(in_features=512, out_features=512, bias=True)
        )
        (norm1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (norm2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (feed_forward): Sequential(
          (0): Linear(in_features=512, out_features=2048, bias=True)
          (1): ReLU()
          (2): Linear(in_features=2048, out_features=512, bias=True)
        )
        (dropout1): Dropout(p=0.1, inplace=False)
        (dro