In [1]:
!pip install transformers datasets sentencepiece
!pip install -q pytorch-lightning wandb
!pip install transformers

Defaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packages is not writeable


In [1]:
from pytorch_lightning import Trainer
from transformers import DataCollatorForLanguageModeling
from transformers import GPT2Tokenizer, GPT2LMHeadModel
from transformers import Trainer, TrainingArguments
from torch.utils.data import DataLoader
import pandas as pd

2023-05-08 02:05:51.349582: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-05-08 02:05:53.171477: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory
2023-05-08 02:05:53.171721: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory


#### parameters

In [2]:
train_file_path = "Datasets_PART2/SPoC/train/split/spoc-train-train.tsv"
test_file_path = "Datasets_PART2/SPoC/train/split/spoc-train-test.tsv"
eval_file_path = "Datasets_PART2/SPoC/train/split/spoc-train-eval.tsv"
model_name = 'gpt2'
output_dir = 'result/'
overwrite_output_dir = True
per_device_train_batch_size = 1
num_train_epochs = 5.0
save_steps = 20000
max_input_length = 256
max_target_length = 128

#### load datasets

In [5]:
training_sample = pd.read_table(train_file_path, usecols=["text","code"])
test_sample = pd.read_table(test_file_path, usecols=["text","code"])
eval_sample = pd.read_table(eval_file_path, usecols=["text","code"])

training_sample = training_sample.dropna()
test_sample = test_sample.dropna()
eval_sample = eval_sample.dropna()

training_sample = training_sample.reset_index(drop=True)
test_sample = test_sample.reset_index(drop=True)
eval_sample = eval_sample.reset_index(drop=True)

bleu_sample = training_sample[100000:]
bleu_sample = bleu_sample[:1000]
bleu_test_set = bleu_sample['code']
bleu_refs = bleu_sample['text']
bleu_test_set = bleu_test_set.to_numpy()
bleu_refs = bleu_refs.to_numpy()

training_sample = training_sample.iloc[:100000]
test_sample = test_sample.iloc[:15000]
eval_sample = eval_sample.iloc[:15000]

#### preprocess dataset

In [6]:
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.save_pretrained(output_dir)


def preprocess_samples(dataset):
    text = dataset["text"]
    code = dataset["code"]

    model_inputs = tokenizer(code, max_length = max_input_length, padding="max_length", truncation=True)
    labels = tokenizer(text, max_length=max_target_length, padding="max_length", truncation=True).input_ids
    
    labels_with_ignore_index = []
    for labels_sample in labels:
        labels_sample = [label if label != 0 else -100 for label in labels_sample]
        labels_with_ignore_index.append(labels_sample)
  
    model_inputs["labels"] = labels_with_ignore_index
    return model_inputs

In [7]:
from datasets import Dataset, load_dataset, DatasetDict
train = Dataset.from_dict(training_sample)
test = Dataset.from_dict(test_sample)
eval = Dataset.from_dict(eval_sample)

dataset = DatasetDict({"train" : train, "test": test,"eval": eval})
dataset = dataset.map(preprocess_samples, batched=True)
dataset

Map:   0%|          | 0/100000 [00:00<?, ? examples/s]

Map:   0%|          | 0/15000 [00:00<?, ? examples/s]

Map:   0%|          | 0/15000 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'code', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 100000
    })
    test: Dataset({
        features: ['text', 'code', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 15000
    })
    eval: Dataset({
        features: ['text', 'code', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 15000
    })
})

In [24]:
dataset.set_format(type="torch", columns=['input_ids','attention_mask','labels'])
train_dataloader = DataLoader(dataset['train'], batch_size=8)
valid_dataloader = DataLoader(dataset['eval'], batch_size=4)
test_dataloader = DataLoader(dataset['test'], batch_size=4)

#### training

In [25]:
def train_func(train_file_path,
                model_name,
                output_dir,
                overwrite_output_dir,
                per_device_train_batch_size,
                num_train_epochs,
                save_steps):
        
        model = GPT2LMHeadModel.from_pretrained(model_name)
        model.save_pretrained(output_dir)

        training_args = TrainingArguments(
                output_dir=output_dir,
                overwrite_output_dir=overwrite_output_dir,
                per_device_train_batch_size=per_device_train_batch_size,
                num_train_epochs=num_train_epochs,
                save_steps=save_steps,
            )

        trainer = Trainer(
                model=model,
                args=training_args,
                data_collator=DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False),
                train_dataset=train_dataloader,
        )
        
        trainer.train()
        trainer.save_model()

In [None]:
train_func(
    train_file_path=train_file_path,
    model_name=model_name,
    output_dir=output_dir,
    overwrite_output_dir=overwrite_output_dir,
    per_device_train_batch_size=per_device_train_batch_size,
    num_train_epochs=num_train_epochs,
    save_steps=save_steps
)

In [8]:
model = GPT2LMHeadModel.from_pretrained(output_dir)
model.eval()

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0): GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
      (1): GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dro

#### test generations

In [3]:
def generate_pseudocode(input_code, model_path):
    model = GPT2LMHeadModel.from_pretrained(model_path)
    tokenizer = GPT2Tokenizer.from_pretrained(model_path)
    input_tokens = tokenizer.encode(input_code, return_tensors='pt')

    output_tokens = model.generate(
        input_ids=input_tokens,
        max_length=128,
        temperature=0.7,
        num_return_sequences=1,
        pad_token_id=tokenizer.eos_token_id
    )

    output_text = tokenizer.decode(output_tokens[0], skip_special_tokens=True)
    return output_text

In [4]:
input_code = "int add(int a, int b){ return a + b; }"
output = generate_pseudocode(input_code=input_code, model_path=output_dir)
print(output+"\n")
pseudo_code = output.split("/t")[0]
print(pseudo_code)

int add(int a, int b){ return a + b; }	54	267A	8491894	1	1
return b	return b;	54	267A	8491894	2	1
	}	54	267A	8491894	3	0
	int main() {	54	267A	8491894	4	0
declare integer variables n and m	int n, m;	54	267A	8491894	5	1
read n and m	cin >> n >> m;	54	267A	8491894	6	1
declare integer variable called count = 0	int count = 0;	54	267A	8491894	7	1
for i = 0 to n exclusive	for (int i = 0; i < n; i++) {	54	267A	8491894	8	8	1
declare integer called counta = 0	int counta = 0;	54	267A	8491894	9	2
declare integer called countb = 0	int countb = 0;	

int add(int a, int b){ return a + b; }	54	267A	8491894	1	1
return b	return b;	54	267A	8491894	2	1
	}	54	267A	8491894	3	0
	int main() {	54	267A	8491894	4	0
declare integer variables n and m	int n, m;	54	267A	8491894	5	1
read n and m	cin >> n >> m;	54	267A	8491894	6	1
declare integer variable called count = 0	int count = 0;	54	267A	8491894	7	1
for i = 0 to n exclusive	for (int i = 0; i < n; i++) {	54	267A	8491894	8	8	1
declare integer called counta = 0	in

#### evaluation metrics

In [9]:
from nltk.translate.bleu_score import corpus_bleu

test_set = bleu_test_set.tolist()

references = bleu_refs.tolist()

test_encodings = tokenizer(test_set, max_length=max_input_length, truncation=True, padding=True, return_tensors="pt")

predicted_ids = model.generate(test_encodings["input_ids"], attention_mask=test_encodings["attention_mask"])
predicted_texts = tokenizer.batch_decode(predicted_ids, skip_special_tokens=True)
print(predicted_texts)

bleu_score = corpus_bleu([[ref] for ref in references], predicted_texts)
print("BLEU score:", bleu_score)

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
Input length of input_ids is 32, but `max_length` is set to 20. This can lead to unexpected behavior. You should consider increasing `max_new_tokens`.


['while (n--) {01', 'int a;a', 'cin >> a;cin', 'if (a >= 0)a', 'sum += a;sum', 'else a', 'sum -= a;01', 'cout << sum << endl;\t', 'int n;n', 'cin >> n;cin', 'int x;31', 'int b = 0, c = 0;while', 'for (int i = 0; i < n; ++i) {for', 'cin >> x;cin', 'if (x < 0) {min', 'b = b + x;a', '} else {54', 'c = c + x;\t', 'cout << c - b << endl;\t', 'inline int read() {inline', 'int x = 0, f = 1;while', 'char ch = getchar();an', "while (ch < '0' || ch > '9') {while", "if (ch == '-') f = -1;\t", 'ch = getchar();an', "while (ch >= '0' && ch <= '9') {while", "x = 10 * x + ch - '0';\t", 'ch = getchar();an', 'return x * f;01', 'int n;n', 'n = read();\t', 'int sum = 0;a', 'while (n--) {01', 'int h = read();\t', 'if (h > 0)ss', 'sum += h;sum', 'else a', 'sum -= h;sum', 'cout << sum << endl;\t', 'int n;n', 'cin >> n;cin', 'int a;a', 'int sum = 0;a', 'int cnt = 0;a', 'while (n--) {01', 'cin >> a;cin', 'if (a > 0)a', 'sum += a;sum', 'else a', 'cnt += a;a', 'cout << sum - cnt << endl;\t', 'int a[102];\t', 'in

In [11]:
from rouge import Rouge

test_set = bleu_test_set.tolist()

references = bleu_refs.tolist()

test_encodings = tokenizer(test_set, max_length=max_input_length, truncation=True, padding=True, return_tensors="pt")

predicted_ids = model.generate(test_encodings["input_ids"], attention_mask=test_encodings["attention_mask"])
predicted_texts = tokenizer.batch_decode(predicted_ids, skip_special_tokens=True)

rouge = Rouge()
rouge_scores = rouge.get_scores(predicted_texts, references, avg=True, ignore_empty=True)

print("ROUGE-1 score:", rouge_scores["rouge-1"]["f"])
print("ROUGE-2 score:", rouge_scores["rouge-2"]["f"])

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
Input length of input_ids is 32, but `max_length` is set to 20. This can lead to unexpected behavior. You should consider increasing `max_new_tokens`.


ROUGE-1 score: 0.3097778333694708
ROUGE-2 score: 0.12138070523932558
