In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [1]:
!pip install gradio torch pandas scikit-learn tqdm

In [13]:
# C2P
import math
import torch
import torch.nn as nn
import torch.optim as optim
import pandas as pd
from collections import Counter
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader, Dataset
from tqdm import tqdm
import gradio as gr
from torch.cuda.amp import GradScaler, autocast

In [14]:
class PositionalEncoding(nn.Module):
    def __init__(self, d_model, dropout=0.1, max_len=5000):
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0)/d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        self.register_buffer('pe', pe.unsqueeze(0))

    def forward(self, x):
        return self.dropout(x + self.pe[:, :x.size(1)])

class Transformer(nn.Module):
    def __init__(self, src_vocab_size, tgt_vocab_size, d_model=512, nhead=8,
                 num_encoder_layers=3, num_decoder_layers=3, dim_feedforward=2048, dropout=0.1):
        super().__init__()
        self.d_model = d_model
        self.src_embedding = nn.Embedding(src_vocab_size, d_model)
        self.tgt_embedding = nn.Embedding(tgt_vocab_size, d_model)
        self.pos_encoder = PositionalEncoding(d_model, dropout)
        self.pos_decoder = PositionalEncoding(d_model, dropout)
        self.transformer = nn.Transformer(d_model, nhead, num_encoder_layers, num_decoder_layers,
                                          dim_feedforward, dropout, batch_first=True)
        self.fc_out = nn.Linear(d_model, tgt_vocab_size)

    def generate_square_subsequent_mask(self, sz):
        mask = (torch.triu(torch.ones(sz, sz)) == 1).transpose(0, 1)
        mask = mask.float().masked_fill(mask==0, float('-inf')).masked_fill(mask==1, float(0.0))
        return mask

    def forward(self, src, tgt):
        src_emb = self.src_embedding(src) * math.sqrt(self.d_model)
        src_emb = self.pos_encoder(src_emb)
        tgt_emb = self.tgt_embedding(tgt) * math.sqrt(self.d_model)
        tgt_emb = self.pos_decoder(tgt_emb)
        tgt_mask = self.generate_square_subsequent_mask(tgt.size(1)).to(src.device)
        output = self.transformer(src_emb, tgt_emb, tgt_mask=tgt_mask)
        return self.fc_out(output)

In [15]:
PAD_TOKEN = "<pad>"
SOS_TOKEN = "<sos>"
EOS_TOKEN = "<eos>"

def simple_tokenizer(text):
    return text.strip().split()

def build_vocab(sentences, min_freq=1):
    counts = Counter(token for sentence in sentences for token in sentence)
    vocab = {PAD_TOKEN: 0, SOS_TOKEN: 1, EOS_TOKEN: 2}
    idx = len(vocab)
    for token, count in counts.items():
        if count >= min_freq and token not in vocab:
            vocab[token] = idx
            idx += 1
    return vocab

def numericalize(sentence, vocab):
    return [vocab[SOS_TOKEN]] + [vocab.get(token, vocab[PAD_TOKEN]) for token in sentence] + [vocab[EOS_TOKEN]]

class PrepareDataset(Dataset):
    def __init__(self, data, src_vocab=None, tgt_vocab=None, build_vocabs=False):
        self.df = data.copy()
        self.df["code"] = self.df["code"].fillna("")
        self.df["text"] = self.df["text"].fillna("")
        self.df["src_tokens"] = self.df["code"].apply(simple_tokenizer)
        self.df["tgt_tokens"] = self.df["text"].apply(simple_tokenizer)

        if build_vocabs:
            self.src_vocab = build_vocab(self.df["src_tokens"].tolist())
            self.tgt_vocab = build_vocab(self.df["tgt_tokens"].tolist())
        else:
            self.src_vocab = src_vocab
            self.tgt_vocab = tgt_vocab

        self.df["src_indices"] = self.df["src_tokens"].apply(lambda tokens: numericalize(tokens, self.src_vocab))
        self.df["tgt_indices"] = self.df["tgt_tokens"].apply(lambda tokens: numericalize(tokens, self.tgt_vocab))
        self.data = list(zip(self.df["src_indices"].tolist(), self.df["tgt_indices"].tolist()))

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        return self.data[idx]

def collate_fn(batch):
    src_batch, tgt_batch = zip(*batch)
    src_tensors = [torch.tensor(seq, dtype=torch.long) for seq in src_batch]
    tgt_tensors = [torch.tensor(seq, dtype=torch.long) for seq in tgt_batch]
    src_padded = pad_sequence(src_tensors, batch_first=True, padding_value=0)
    tgt_padded = pad_sequence(tgt_tensors, batch_first=True, padding_value=0)
    return src_padded, tgt_padded

In [16]:
train_df = pd.read_csv("/content/drive/MyDrive/PseudoCPP/Code2Pseudo/spoc-train.tsv", sep="\t")
eval_df = pd.read_csv("/content/drive/MyDrive/PseudoCPP/Code2Pseudo/spoc-train-eval.tsv", sep="\t")
test_df = pd.read_csv("/content/drive/MyDrive/PseudoCPP/Code2Pseudo/spoc-train-test.tsv", sep="\t")

print("Train Data Sample:")
print(train_df[["code", "text"]].head())

Train Data Sample:
                             code  \
0         int gcd(int a, int b) {   
1  return !b ? a : gcd(b, a % b);   
2                               }   
3                    int main() {   
4             int n, nn, ans = 0;   

                                              text  
0                in the function gcd(a,b=integers)  
1  if b=1 return a, else call function gcd(b, a%b)  
2                                              NaN  
3                                              NaN  
4               n , nn, ans = integers with ans =0  


In [18]:
train_dataset = PrepareDataset(train_df, build_vocabs=True)
eval_dataset = PrepareDataset(eval_df, train_dataset.src_vocab, train_dataset.tgt_vocab)
test_dataset = PrepareDataset(test_df, train_dataset.src_vocab, train_dataset.tgt_vocab)

BATCH_SIZE = 64
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_fn)
eval_loader = DataLoader(eval_dataset, batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate_fn)

In [19]:
import pickle

vocab_data = {
    "src_vocab": train_dataset.src_vocab,
    "tgt_vocab": train_dataset.tgt_vocab
}

# Save both vocabularies in one file
with open("/content/drive/MyDrive/PseudoCPP/Code2Pseudo/vocab.pkl", "wb") as f:
    pickle.dump(vocab_data, f)

In [None]:
def train_epoch(model, dataloader, criterion, optimizer, scaler, device):
    model.train()
    total_loss = 0
    for src_batch, tgt_batch in tqdm(dataloader, desc="Training"):
        src_batch, tgt_batch = src_batch.to(device), tgt_batch.to(device)
        optimizer.zero_grad()
        tgt_input = tgt_batch[:, :-1]
        tgt_expected = tgt_batch[:, 1:]
        with autocast():
            output = model(src_batch, tgt_input)
            loss = criterion(output.reshape(-1, output.size(-1)), tgt_expected.reshape(-1))
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()
        total_loss += loss.item()
    return total_loss / len(dataloader)

def evaluate(model, dataloader, criterion, device):
    model.eval()
    total_loss = 0
    with torch.no_grad():
        for src_batch, tgt_batch in tqdm(dataloader, desc="Evaluating"):
            src_batch, tgt_batch = src_batch.to(device), tgt_batch.to(device)
            tgt_input = tgt_batch[:, :-1]
            tgt_expected = tgt_batch[:, 1:]
            with autocast():
                output = model(src_batch, tgt_input)
                loss = criterion(output.reshape(-1, output.size(-1)), tgt_expected.reshape(-1))
            total_loss += loss.item()
    return total_loss / len(dataloader)

In [None]:
model = Transformer(len(train_dataset.src_vocab), len(train_dataset.tgt_vocab)).to(DEVICE)
criterion = nn.CrossEntropyLoss(ignore_index=train_dataset.src_vocab[PAD_TOKEN])
optimizer = optim.Adam(model.parameters(), lr=1e-4)
scaler = GradScaler()

NUM_EPOCHS = 25
patience = 3
best_eval_loss = float('inf')
patience_counter = 0

for epoch in range(1, NUM_EPOCHS + 1):
    print(f"\nEpoch {epoch}")
    train_loss = train_epoch(model, train_loader, criterion, optimizer, scaler, DEVICE)
    eval_loss = evaluate(model, eval_loader, criterion, DEVICE)
    print(f"Train Loss: {train_loss:.4f}, Eval Loss: {eval_loss:.4f}")

    if eval_loss < best_eval_loss:
        best_eval_loss = eval_loss
        torch.save(model.state_dict(), "/content/drive/MyDrive/PseudoCPP/Code2Pseudo/transformer_code_to_pseudo.pth")
        patience_counter = 0
    else:
        patience_counter += 1
        if patience_counter >= patience:
            print("Early stopping triggered.")
            break

print("Model saved as '/content/drive/MyDrive/PseudoCPP/Code2Pseudo/transformer_code_to_pseudo.pth'")

  scaler = GradScaler()



Epoch 1


  with autocast():
Training: 100%|██████████| 4592/4592 [05:05<00:00, 15.02it/s]
  with autocast():
Evaluating: 100%|██████████| 427/427 [00:06<00:00, 67.26it/s]


Train Loss: 2.1491, Eval Loss: 1.3711

Epoch 2


Training: 100%|██████████| 4592/4592 [05:04<00:00, 15.10it/s]
Evaluating: 100%|██████████| 427/427 [00:06<00:00, 67.32it/s]


Train Loss: 1.4344, Eval Loss: 1.1010

Epoch 3


Training: 100%|██████████| 4592/4592 [05:03<00:00, 15.14it/s]
Evaluating: 100%|██████████| 427/427 [00:06<00:00, 67.55it/s]


Train Loss: 1.2064, Eval Loss: 0.9357

Epoch 4


Training: 100%|██████████| 4592/4592 [05:03<00:00, 15.14it/s]
Evaluating: 100%|██████████| 427/427 [00:06<00:00, 64.36it/s]


Train Loss: 1.0543, Eval Loss: 0.8285

Epoch 5


Training: 100%|██████████| 4592/4592 [05:02<00:00, 15.18it/s]
Evaluating: 100%|██████████| 427/427 [00:06<00:00, 66.68it/s]


Train Loss: 0.9408, Eval Loss: 0.7366

Epoch 6


Training: 100%|██████████| 4592/4592 [05:03<00:00, 15.15it/s]
Evaluating: 100%|██████████| 427/427 [00:06<00:00, 67.31it/s]


Train Loss: 0.8482, Eval Loss: 0.6672

Epoch 7


Training: 100%|██████████| 4592/4592 [05:02<00:00, 15.16it/s]
Evaluating: 100%|██████████| 427/427 [00:06<00:00, 67.69it/s]


Train Loss: 0.7749, Eval Loss: 0.6012

Epoch 8


Training: 100%|██████████| 4592/4592 [05:02<00:00, 15.17it/s]
Evaluating: 100%|██████████| 427/427 [00:06<00:00, 67.64it/s]


Train Loss: 0.7129, Eval Loss: 0.5553

Epoch 9


Training: 100%|██████████| 4592/4592 [05:02<00:00, 15.20it/s]
Evaluating: 100%|██████████| 427/427 [00:06<00:00, 67.42it/s]


Train Loss: 0.6604, Eval Loss: 0.5078

Epoch 10


Training: 100%|██████████| 4592/4592 [05:01<00:00, 15.21it/s]
Evaluating: 100%|██████████| 427/427 [00:06<00:00, 66.77it/s]


Train Loss: 0.6163, Eval Loss: 0.4818

Epoch 11


Training: 100%|██████████| 4592/4592 [05:01<00:00, 15.23it/s]
Evaluating: 100%|██████████| 427/427 [00:06<00:00, 65.65it/s]


Train Loss: 0.5788, Eval Loss: 0.4507

Epoch 12


Training: 100%|██████████| 4592/4592 [05:01<00:00, 15.22it/s]
Evaluating: 100%|██████████| 427/427 [00:06<00:00, 64.49it/s]


Train Loss: 0.5481, Eval Loss: 0.4293

Epoch 13


Training: 100%|██████████| 4592/4592 [05:01<00:00, 15.23it/s]
Evaluating: 100%|██████████| 427/427 [00:06<00:00, 63.70it/s]


Train Loss: 0.5225, Eval Loss: 0.4149

Epoch 14


Training: 100%|██████████| 4592/4592 [05:02<00:00, 15.19it/s]
Evaluating: 100%|██████████| 427/427 [00:06<00:00, 63.61it/s]


Train Loss: 0.5027, Eval Loss: 0.4000

Epoch 15


Training: 100%|██████████| 4592/4592 [05:01<00:00, 15.21it/s]
Evaluating: 100%|██████████| 427/427 [00:06<00:00, 63.34it/s]


Train Loss: 0.4839, Eval Loss: 0.3857

Epoch 16


Training: 100%|██████████| 4592/4592 [05:01<00:00, 15.21it/s]
Evaluating: 100%|██████████| 427/427 [00:06<00:00, 68.12it/s]


Train Loss: 0.4683, Eval Loss: 0.3733

Epoch 17


Training: 100%|██████████| 4592/4592 [05:01<00:00, 15.23it/s]
Evaluating: 100%|██████████| 427/427 [00:06<00:00, 68.00it/s]


Train Loss: 0.4550, Eval Loss: 0.3636

Epoch 18


Training: 100%|██████████| 4592/4592 [05:02<00:00, 15.18it/s]
Evaluating: 100%|██████████| 427/427 [00:06<00:00, 65.03it/s]


Train Loss: 0.4456, Eval Loss: 0.3543

Epoch 19


Training: 100%|██████████| 4592/4592 [05:01<00:00, 15.22it/s]
Evaluating: 100%|██████████| 427/427 [00:06<00:00, 64.41it/s]


Train Loss: 0.4353, Eval Loss: 0.3461

Epoch 20


Training: 100%|██████████| 4592/4592 [05:02<00:00, 15.19it/s]
Evaluating: 100%|██████████| 427/427 [00:06<00:00, 65.77it/s]


Train Loss: 0.4270, Eval Loss: 0.3378

Epoch 21


Training: 100%|██████████| 4592/4592 [05:02<00:00, 15.19it/s]
Evaluating: 100%|██████████| 427/427 [00:06<00:00, 67.22it/s]


Train Loss: 0.4188, Eval Loss: 0.3326

Epoch 22


Training: 100%|██████████| 4592/4592 [05:02<00:00, 15.19it/s]
Evaluating: 100%|██████████| 427/427 [00:06<00:00, 67.57it/s]


Train Loss: 0.4116, Eval Loss: 0.3261

Epoch 23


Training: 100%|██████████| 4592/4592 [04:59<00:00, 15.35it/s]
Evaluating: 100%|██████████| 427/427 [00:06<00:00, 68.02it/s]


Train Loss: 3.3031, Eval Loss: 5.6323

Epoch 24


Training: 100%|██████████| 4592/4592 [04:00<00:00, 19.07it/s]
Evaluating: 100%|██████████| 427/427 [00:06<00:00, 70.22it/s]


Train Loss: nan, Eval Loss: nan

Epoch 25


Training: 100%|██████████| 4592/4592 [03:10<00:00, 24.15it/s]
Evaluating: 100%|██████████| 427/427 [00:06<00:00, 68.11it/s]

Train Loss: nan, Eval Loss: nan
Early stopping triggered.
Model saved as '/content/drive/MyDrive/transformer_code_to_pseudo.pth'





In [7]:
def generate_output(model, src_sentence, src_vocab, tgt_vocab, device, max_len=50):
    model.eval()
    tokens = simple_tokenizer(src_sentence)
    src_indices = numericalize(tokens, src_vocab)
    src_tensor = torch.tensor(src_indices, dtype=torch.long).unsqueeze(0).to(device)
    tgt_indices = [tgt_vocab[SOS_TOKEN]]
    with torch.no_grad():
        for _ in range(max_len):
            tgt_tensor = torch.tensor(tgt_indices, dtype=torch.long).unsqueeze(0).to(device)
            output = model(src_tensor, tgt_tensor)
            next_token = torch.argmax(output[0, -1, :]).item()
            tgt_indices.append(next_token)
            if next_token == tgt_vocab[EOS_TOKEN]:
                break
    inv_tgt_vocab = {v: k for k, v in tgt_vocab.items()}
    return " ".join([inv_tgt_vocab[idx] for idx in tgt_indices if idx not in (tgt_vocab[SOS_TOKEN], tgt_vocab[EOS_TOKEN])])

In [None]:
import pandas as pd
train_df = pd.read_csv("spoc-train.tsv", sep="\t")
train_dataset = PrepareDataset(train_df, build_vocabs=True)
print(train_dataset.src_vocab)

{'<pad>': 0, '<sos>': 1, '<eos>': 2, 'int': 3, 'gcd(int': 4, 'a,': 5, 'b)': 6, '{': 7, 'return': 8, '!b': 9, '?': 10, 'a': 11, ':': 12, 'gcd(b,': 13, '%': 14, 'b);': 15, '}': 16, 'main()': 17, 'n,': 18, 'nn,': 19, 'ans': 20, '=': 21, '0;': 22, 'cin': 23, '>>': 24, 'n;': 25, 'for': 26, '(int': 27, 'i': 28, '2;': 29, '<=': 30, 'n': 31, '-': 32, '1;': 33, '++i)': 34, 'nn': 35, 'while': 36, '(nn)': 37, '+=': 38, 'i,': 39, '/=': 40, 'i;': 41, 'o': 42, 'gcd(ans,': 43, '2);': 44, 'cout': 45, '<<': 46, '/': 47, '"/"': 48, '(n': 49, '2)': 50, '"\\n";': 51, 'string': 52, 'b;': 53, 'set<string>': 54, 'st;': 55, '<': 56, 'i++)': 57, 'st.insert(a': 58, '+': 59, '"': 60, 'st.size()': 61, 'endl;': 62, 'm,': 63, 'su': 64, '0,': 65, 'su2': 66, 'b,': 67, 'c;': 68, 'm': 69, '>': 70, '0)': 71, 'm;': 72, 'c': 73, 'su;': 74, 'b': 75, '(su2': 76, 'su2;': 77, 'gcd1(int': 78, 'if': 79, '(a': 80, '==': 81, 'gcd1(b': 82, 'a);': 83, 'long': 84, 'modx(long': 85, 'base,': 86, 'ex)': 87, '1LL,': 88, 'val': 89, 'base

In [20]:
def generate_output(model, src_sentence, src_vocab, tgt_vocab, device, max_len=50):
    model.eval()
    tokens = simple_tokenizer(src_sentence)
    src_indices = numericalize(tokens, src_vocab)
    src_tensor = torch.tensor(src_indices, dtype=torch.long).unsqueeze(0).to(device)
    tgt_indices = [tgt_vocab[SOS_TOKEN]]
    with torch.no_grad():
        for _ in range(max_len):
            tgt_tensor = torch.tensor(tgt_indices, dtype=torch.long).unsqueeze(0).to(device)
            output = model(src_tensor, tgt_tensor)
            next_token = torch.argmax(output[0, -1, :]).item()
            tgt_indices.append(next_token)
            if next_token == tgt_vocab[EOS_TOKEN]:
                break
    inv_tgt_vocab = {v: k for k, v in tgt_vocab.items()}
    return " ".join([inv_tgt_vocab[idx] for idx in tgt_indices if idx not in (tgt_vocab[SOS_TOKEN], tgt_vocab[EOS_TOKEN])])

with open("/content/drive/MyDrive/PseudoCPP/Code2Pseudo/vocab.pkl", "rb") as f:
    vocab_data = pickle.load(f)

# Extract src and tgt vocabs
src_vocab = vocab_data["src_vocab"]
tgt_vocab = vocab_data["tgt_vocab"]
train_dataset = PrepareDataset(train_df, build_vocabs=True)
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = Transformer(len(src_vocab), len(tgt_vocab)).to(DEVICE)
model.load_state_dict(torch.load("/content/drive/MyDrive/PseudoCPP/Code2Pseudo/transformer_code_to_pseudo.pth", map_location=DEVICE))
model.eval()

def generate_pseudocode(code):
    return generate_output(model, code, src_vocab, tgt_vocab, DEVICE)

demo = gr.Interface(
    fn=generate_pseudocode,
    inputs=gr.Textbox(lines=5, placeholder="Enter C++ code here..."),
    outputs=gr.Textbox(label="Generated Pseudocode"),
    title="C++ Code to Pseudocode Converter",
    description="Input C++ code to generate corresponding pseudocode."
)
demo.launch(share=True)

  model.load_state_dict(torch.load("/content/drive/MyDrive/PseudoCPP/Code2Pseudo/transformer_code_to_pseudo.pth", map_location=DEVICE))


Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://320a0e6b2cb2de0294.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


