In [14]:
%pip install transformers datasets sentencepiece pandas
%pip install -q pytorch-lightning wandb

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


In [15]:
import pandas as pd
import numpy as np

In [16]:
training_sample = pd.read_table("/home/sopmod/clgp/sem6/spoc/train/split/spoc-train-train.tsv", usecols=["text","code"])
test_sample = pd.read_table("/home/sopmod/clgp/sem6/spoc/train/split/spoc-train-test.tsv", usecols=["text","code"])
eval_sample = pd.read_table("/home/sopmod/clgp/sem6/spoc/train/split/spoc-train-eval.tsv", usecols=["text","code"])

training_sample = training_sample.dropna()
test_sample = test_sample.dropna()
eval_sample = eval_sample.dropna()

training_sample = training_sample.reset_index(drop=True)
test_sample = test_sample.reset_index(drop=True)
eval_sample = eval_sample.reset_index(drop=True)

bleu_sample = training_sample[100000:]
bleu_sample = bleu_sample[:1000]
bleu_test_set = bleu_sample['code']
bleu_refs = bleu_sample['text']
bleu_test_set = bleu_test_set.to_numpy()
bleu_refs = bleu_refs.to_numpy()
print(bleu_refs[:5])


training_sample = training_sample.iloc[:100000]
test_sample = test_sample.iloc[:15000]
eval_sample = eval_sample.iloc[:15000]

['loop n times' 'a = int' 'read a' 'if a >= 0' 'increment sum by a']


In [6]:
from transformers import T5Tokenizer

tokenizer = T5Tokenizer.from_pretrained("t5-small")
max_input_length = 256
max_target_length = 128

def preprocess_samples(dataset):
    text = dataset["text"]
    code = dataset["code"]

    model_inputs = tokenizer(code, max_length = max_input_length, padding="max_length", truncation=True)
    labels = tokenizer(text, max_length=max_target_length, padding="max_length", truncation=True).input_ids

    labels_with_ignore_index = []
    for labels_sample in labels:
      labels_sample = [label if label != 0 else -100 for label in labels_sample]
      labels_with_ignore_index.append(labels_sample)

    model_inputs["labels"] = labels_with_ignore_index

    return model_inputs



  from .autonotebook import tqdm as notebook_tqdm


In [7]:
from datasets import Dataset, load_dataset, DatasetDict
train = Dataset.from_dict(training_sample)
test = Dataset.from_dict(test_sample)
eval = Dataset.from_dict(eval_sample)

dataset = DatasetDict({"train" : train, "test": test,"eval": eval})
dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'code'],
        num_rows: 100000
    })
    test: Dataset({
        features: ['text', 'code'],
        num_rows: 15000
    })
    eval: Dataset({
        features: ['text', 'code'],
        num_rows: 15000
    })
})

In [7]:
dataset = dataset.map(preprocess_samples, batched=True)
dataset

                                                                     

DatasetDict({
    train: Dataset({
        features: ['text', 'code', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 100000
    })
    test: Dataset({
        features: ['text', 'code', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 15000
    })
    eval: Dataset({
        features: ['text', 'code', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 15000
    })
})

In [8]:
from torch.utils.data import DataLoader

dataset.set_format(type="torch", columns=['input_ids','attention_mask','labels'])
train_dataloader = DataLoader(dataset['train'], batch_size=8)
valid_dataloader = DataLoader(dataset['eval'], batch_size=4)
test_dataloader = DataLoader(dataset['test'], batch_size=4)

In [9]:
batch = next(iter(train_dataloader))
print(batch.keys())
print(train_dataloader)

dict_keys(['input_ids', 'attention_mask', 'labels'])
<torch.utils.data.dataloader.DataLoader object at 0x7f4065dd2e60>


In [10]:
tokenizer.decode(batch['input_ids'][0])

'string s;</s> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <

In [11]:
labels = batch['labels'][0]
tokenizer.decode([label for label in labels if label != -100])

'create string s</s>'

In [17]:
from transformers import T5ForConditionalGeneration, AdamW, get_linear_schedule_with_warmup
import pytorch_lightning as pl

class T5Model(pl.LightningModule):
  def __init__(self, lr=5e-5, num_train_epochs=15, warmup_steps=1000):
        super().__init__()
        self.model = T5ForConditionalGeneration.from_pretrained("t5-small")
        self.save_hyperparameters()

  def forward(self, input_ids, attention_mask, labels=None):     
        outputs = self.model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        return outputs
    
  def common_step(self, batch, batch_idx):
        outputs = self(**batch)
        loss = outputs.loss

        return loss
      
  def training_step(self, batch, batch_idx):
        loss = self.common_step(batch, batch_idx)     
        # logs metrics for each training_step,
        # and the average across the epoch
        self.log("training_loss", loss)

        return loss

  def validation_step(self, batch, batch_idx):
        loss = self.common_step(batch, batch_idx)     
        self.log("validation_loss", loss, on_epoch=True)

        return loss

  def test_step(self, batch, batch_idx):
        loss = self.common_step(batch, batch_idx)     

        return loss

  def configure_optimizers(self):
        # create optimizer
        optimizer = AdamW(self.parameters(), lr=self.hparams.lr)
        # create learning rate scheduler
        num_train_optimization_steps = self.hparams.num_train_epochs * len(train_dataloader)
        lr_scheduler = {'scheduler': get_linear_schedule_with_warmup(optimizer,
                                                    num_warmup_steps=self.hparams.warmup_steps,
                                                    num_training_steps=num_train_optimization_steps),
                        'name': 'learning_rate',
                        'interval':'step',
                        'frequency': 1}
        
        return {"optimizer": optimizer, "lr_scheduler": lr_scheduler}

  def train_dataloader(self):
        return train_dataloader

  def val_dataloader(self):
        return valid_dataloader

  def test_dataloader(self):
        return test_dataloader

In [18]:
import wandb
wandb.login()

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33msopmod[0m ([33mthreekids[0m). Use [1m`wandb login --relogin`[0m to force relogin


True

In [18]:
model = T5Model()

In [20]:
from pytorch_lightning import Trainer
from pytorch_lightning.loggers import WandbLogger
from pytorch_lightning.callbacks import EarlyStopping, LearningRateMonitor

wandb_logger = WandbLogger(name='tf-logger', project='T5Model')

early_stop_callback = EarlyStopping(
    monitor='validation_loss',
    patience=3,
    strict=False,
    verbose=False,
    mode='min'
)
lr_monitor = LearningRateMonitor(logging_interval='step')

trainer = Trainer(
                  default_root_dir="~/Desktop", 
                  logger=wandb_logger, 
                  callbacks=[early_stop_callback, lr_monitor],
                  max_epochs=4)
trainer.fit(model)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
  rank_zero_warn("Detected KeyboardInterrupt, attempting graceful shutdown...")


In [10]:
# save_directory = "." # save in the current working directory, you can change this of course
# model.model.save_pretrained(save_directory)

In [19]:
model = model.model.from_pretrained(".")
model.eval()

T5ForConditionalGeneration(
  (shared): Embedding(32128, 512)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 512)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=512, out_features=512, bias=False)
              (k): Linear(in_features=512, out_features=512, bias=False)
              (v): Linear(in_features=512, out_features=512, bias=False)
              (o): Linear(in_features=512, out_features=512, bias=False)
              (relative_attention_bias): Embedding(32, 8)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseActDense(
              (wi): Linear(in_features=512, out_features=2048, bias=False)
              (wo): Linear(in_features=2048, out_features=512, bias=False)
              (dropout): Drop

In [20]:
#test_text = "string s; s = '12345'; for(int i = 0; i < s.length(); i++) { cout << s[i] << endl;}"
test_text = "int a = 10; int b = 20; cout << a * b << endl;"
input_code_encodings = tokenizer(test_text, max_length = max_input_length, return_tensors="pt")
output_ids = model.generate(input_code_encodings["input_ids"], attention_mask=input_code_encodings["attention_mask"])
output_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)
print(output_text)




a = 10; b = 20; print a * b


In [21]:
from nltk.translate.bleu_score import corpus_bleu

test_set = bleu_test_set.tolist()

references = bleu_refs.tolist()

test_encodings = tokenizer(test_set, max_length=max_input_length, truncation=True, padding=True, return_tensors="pt")

predicted_ids = model.generate(test_encodings["input_ids"], attention_mask=test_encodings["attention_mask"])
predicted_texts = tokenizer.batch_decode(predicted_ids, skip_special_tokens=True)
print(predicted_texts)

bleu_score = corpus_bleu([[ref] for ref in references], predicted_texts)
print("BLEU score:", bleu_score)



['while decrement n', 'create integer a', 'read a', 'if a >= 0', 'increment sum by a', 'else', 'decrement sum by a', 'print sum', 'create integer n', 'read n', 'create integer x', 'create integers b, c with b = 0, c = 0', 'for i = 0 to n exclusive', 'read x', 'if x is less than 0', 'set b to b + x', 'else', 'set c to c + x', 'print c - b', 'in function read that returns integer', 'create integers x, f with x = 0, f = 1', 'ch = character with ch = getchar()', 'while ch is less than 0 or ch is greater than 9', "if ch is '-', set f to -1", 'ch = read character', "while ch >= '0' and ch = '9'", "set x to 10 * x + ch - '0'", 'ch = read character', 'return x * f', 'create integer n', 'n = read()', 'create integer sum with sum = 0', 'while decrement n', 'h = integer with h = read()', 'if h is greater than 0', 'increment sum by h', 'else', 'decrement sum by h', 'print sum', 'create integer n', 'read n', 'create integer a', 'create integer sum with sum = 0', 'cnt = integer with cnt = 0', 'while

In [22]:
%pip install rouge

Note: you may need to restart the kernel to use updated packages.


In [23]:
from rouge import Rouge

test_set = bleu_test_set.tolist()

references = bleu_refs.tolist()

test_encodings = tokenizer(test_set, max_length=max_input_length, truncation=True, padding=True, return_tensors="pt")

predicted_ids = model.generate(test_encodings["input_ids"], attention_mask=test_encodings["attention_mask"])
predicted_texts = tokenizer.batch_decode(predicted_ids, skip_special_tokens=True)

rouge = Rouge()
rouge_scores = rouge.get_scores(predicted_texts, references, avg=True, ignore_empty=True)

print("ROUGE-1 score:", rouge_scores["rouge-1"]["f"])
print("ROUGE-2 score:", rouge_scores["rouge-2"]["f"])

ROUGE-1 score: 0.7064574480411799
ROUGE-2 score: 0.4582567689951759


In [23]:
folder_path = "../dataset/"

In [28]:
import os
files = os.listdir(folder_path)
files.sort()
txt_data = []
for file in files:
    file_path = os.path.join(folder_path, file)
    if file.lower().endswith(".txt") and ("e" not in file):
        try:
            with open(file_path, 'r') as f:
                data = f.read()
                txt_data.append({"name":file, "data":data})
        except Exception as e:
            print("Error reading file: ", file, " : ", e)
txt_data[0]

{'name': '1.txt',
 'data': '/*\n * C Program to Check Number is Palindrome or Not using While Loop.\n */\n \n#include <stdio.h>\nint main()\n{\n    int n, rev = 0;\n    printf("Enter the number: ");\n    scanf("%d", &n);\n \n    int num= n;  //To store the original number in the variable num\n \n    //Reverse the number and store it in variable rev\n    while (n > 0)\n    {\n        rev = rev * 10 + n % 10;\n        n = n / 10;\n    }\n \n    // check if original number is same as reversed number or not\n    if (num == rev)\n        printf("%d is a palindrome number.", num);\n    else\n        printf("%d is not a palindrome number.", num);\n \n    return 0;\n}'}

In [31]:
def generate_pseudo(test_text):
    file_path = "./outputs/" + test_text["name"]
    input_code_encodings = tokenizer(test_text['data'], max_length=max_input_length, return_tensors="pt")
    output_ids = model.generate(input_code_encodings["input_ids"],max_new_tokens=1024 ,attention_mask=input_code_encodings["attention_mask"])
    output_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)
    with open(file_path, 'w') as f:
        f.write(output_text)
for d in txt_data:
    generate_pseudo(d)