In [4]:
import pandas as pd
import re
from transformer import Transformer

In [6]:
def code_tokenizer(code):
    return re.findall(r'\s+|[a-zA-Z_]\w*|[^\s\w]', code)

In [8]:
# Load your dataset
merged_df = pd.read_csv("data/merged_df.csv")  # contains 'question' and 'code' columns
merged_df.shape

(16348, 2)

In [9]:
merged_df.head()

Unnamed: 0,question,code
0,Write a function to find squares of individual...,def square_nums(nums):\r\n square_nums = list(...
1,Create a function in Python which takes a list...,def sum_squares(nums):\n result = 0\n fo...
2,Design a function in Python that sorts a list ...,def sort_list_alphabetically(input_list): \n ...
3,Develop a greedy strategy in Python to find th...,def min_cost(points):\n points.sort() # so...
4,Write a Python program to create a singly link...,"class Node():\n def __init__(self, data):\n ..."


In [10]:
# Apply tokenization to both question and code
merged_df['question_tokens'] = merged_df['question'].apply(code_tokenizer)
merged_df['code_tokens'] = merged_df['code'].apply(code_tokenizer)

In [15]:
merged_df.head(1)

Unnamed: 0,question,code,question_tokens,code_tokens
0,Write a function to find squares of individual...,def square_nums(nums):\r\n square_nums = list(...,"[Write, , a, , function, , to, , find, , ...","[def, , square_nums, (, nums, ), :, \r\n , sq..."


In [31]:
# 3. Build vocabulary (from both question and code tokens)
from collections import Counter

all_tokens = merged_df['question_tokens'].explode().tolist() + merged_df['code_tokens'].explode().tolist()
token_freq = Counter(all_tokens)
vocab = {token: idx + 2 for idx, (token, _) in enumerate(token_freq.items())}  # +2 to reserve <pad>=0 and <unk>=1
vocab['<pad>'] = 0
vocab['<unk>'] = 1
print("vocab length : ", len(vocab))
vocab

vocab length :  23477


{'Write': 2,
 ' ': 3,
 'a': 4,
 'function': 5,
 'to': 6,
 'find': 7,
 'squares': 8,
 'of': 9,
 'individual': 10,
 'elements': 11,
 'in': 12,
 'list': 13,
 'using': 14,
 'lambda': 15,
 '.': 16,
 'Create': 17,
 'Python': 18,
 'which': 19,
 'takes': 20,
 'numbers': 21,
 'and': 22,
 'returns': 23,
 'the': 24,
 'sum': 25,
 'those': 26,
 'Design': 27,
 'that': 28,
 'sorts': 29,
 'strings': 30,
 'alphabetical': 31,
 'order': 32,
 'Develop': 33,
 'greedy': 34,
 'strategy': 35,
 'minimum': 36,
 'total': 37,
 'cost': 38,
 'connecting': 39,
 'n': 40,
 'given': 41,
 'points': 42,
 'on': 43,
 'D': 44,
 'plane': 45,
 'number': 46,
 'lines': 47,
 'program': 48,
 'create': 49,
 'singly': 50,
 'linked': 51,
 'GPT': 52,
 '-': 53,
 'model': 54,
 ',': 55,
 'for': 56,
 'shuffeling': 57,
 'an': 58,
 'array': 59,
 'Edit': 60,
 'following': 61,
 'code': 62,
 'such': 63,
 'it': 64,
 'will': 65,
 'print': 66,
 'containing': 67,
 'seven': 68,
 'days': 69,
 'week': 70,
 'calculator': 71,
 'script': 72,
 'two': 73

In [32]:
# Convert sentences to indexes
def integer_encoding(sentence, vocab):
    int_encoding = []
    for word in sentence:
        if word in vocab:
            int_encoding.append(vocab[word])
        else:
            int_encoding.append(vocab['<unknown>'])
    return int_encoding

In [33]:
merged_df['question_ids'] = merged_df['question_tokens'].apply(lambda x: integer_encoding(x, vocab))
merged_df['code_ids'] = merged_df['code_tokens'].apply(lambda x: integer_encoding(x, vocab))

In [36]:
merged_df['question_ids'].head(1)

Unnamed: 0,question,code,question_tokens,code_tokens,question_ids,code_ids
0,Write a function to find squares of individual...,def square_nums(nums):\r\n square_nums = list(...,"[Write, , a, , function, , to, , find, , ...","[def, , square_nums, (, nums, ), :, \r\n , sq...","[2, 3, 4, 3, 5, 3, 6, 3, 7, 3, 8, 3, 9, 3, 10,...","[2271, 3, 5509, 186, 3756, 187, 113, 5510, 550..."


In [None]:
import torch
from torch.nn.utils.rnn import pad_sequence

# Convert list of IDs to padded tensors
def prepare_batch(batch_df):
    q_ids = [torch.tensor(x) for x in batch_df['question_ids']]
    c_ids = [torch.tensor(x) for x in batch_df['code_ids']]

    q_ids_padded = pad_sequence(q_ids, batch_first=True, padding_value=vocab['<pad>'])
    c_ids_padded = pad_sequence(c_ids, batch_first=True, padding_value=vocab['<pad>'])

    return q_ids_padded, c_ids_padded


In [None]:
for batch_start in range(0, len(merged_df), batch_size):
    batch_df = merged_df.iloc[batch_start:batch_start+batch_size]
    src_batch, tgt_batch = prepare_batch(batch_df)

    output = model(src_batch.to(device), tgt_batch.to(device))
    # Continue training logic ...


In [None]:
import json
with open("vocab.json", "w") as f:
    json.dump(vocab, f)