In [2]:
!pip install transformers datasets sentencepiece
!pip install -q pytorch-lightning wandb

Defaulting to user installation because normal site-packages is not writeable


In [3]:
import pandas as pd
import numpy as np

In [4]:
def _load_split(tsv_path, max_rows):
    """Load one SPoC split.

    Reads only the "text" and "code" columns of the tab-separated file,
    drops rows with a missing value, renumbers the index from zero and keeps
    the first ``max_rows`` rows.
    """
    frame = pd.read_table(tsv_path, usecols=["text", "code"])
    return frame.dropna().reset_index(drop=True).iloc[:max_rows]


training_sample = _load_split("Datasets_PART2/SPoC/train/split/spoc-train-train.tsv", 100000)
test_sample = _load_split("Datasets_PART2/SPoC/train/split/spoc-train-test.tsv", 15000)
eval_sample = _load_split("Datasets_PART2/SPoC/train/split/spoc-train-eval.tsv", 15000)

In [5]:
from transformers import T5Tokenizer

# Tokenizer shared by preprocess_samples for both the "text" and "code" columns.
tokenizer = T5Tokenizer.from_pretrained("t5-small")
# Fixed lengths (in tokens) that preprocess_samples pads/truncates to:
# max_input_length for the "text" column, max_target_length for the "code" column.
max_input_length = 256
max_target_length = 128

def preprocess_samples(dataset):
    """Tokenize a batch of text/code pairs for sequence-to-sequence training.

    Args:
        dataset: mapping with "text" and "code" entries, each a list of
            strings (as supplied by ``Dataset.map(..., batched=True)``).

    Returns:
        The tokenizer output for the "text" column (padded/truncated to
        ``max_input_length``) with an added "labels" entry: the token ids of
        the "code" column (padded/truncated to ``max_target_length``) in which
        every padding id is replaced by -100.
    """
    text = dataset["text"]
    code = dataset["code"]

    model_inputs = tokenizer(text, max_length=max_input_length, padding="max_length", truncation=True)
    target_ids = tokenizer(code, max_length=max_target_length, padding="max_length", truncation=True).input_ids

    # Ask the tokenizer for its padding id instead of hard-coding 0, so the
    # masking stays correct if a tokenizer with a different pad id is used.
    pad_id = tokenizer.pad_token_id

    # -100 marks positions that the loss should ignore (the padding).
    model_inputs["labels"] = [
        [token_id if token_id != pad_id else -100 for token_id in sequence]
        for sequence in target_ids
    ]

    return model_inputs

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-small automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


In [6]:
from datasets import Dataset, load_dataset, DatasetDict
# Wrap each pandas split in a `datasets.Dataset`, in train/test/eval order.
# NOTE(review): `eval` shadows the Python builtin of the same name; the name is
# kept here because other cells may refer to it.
train, test, eval = (
    Dataset.from_dict(frame)
    for frame in (training_sample, test_sample, eval_sample)
)

# Bundle the three splits under the keys used by the later cells.
dataset = DatasetDict({"train": train, "test": test, "eval": eval})
dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'code'],
        num_rows: 100000
    })
    test: Dataset({
        features: ['text', 'code'],
        num_rows: 15000
    })
    eval: Dataset({
        features: ['text', 'code'],
        num_rows: 15000
    })
})

In [7]:
# Run preprocess_samples over every split in batches; the result gains the
# 'input_ids', 'attention_mask' and 'labels' columns shown in the output below.
dataset = dataset.map(preprocess_samples, batched=True)
dataset

Map:   0%|          | 0/100000 [00:00<?, ? examples/s]

Map:   0%|          | 0/15000 [00:00<?, ? examples/s]

Map:   0%|          | 0/15000 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'code', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 100000
    })
    test: Dataset({
        features: ['text', 'code', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 15000
    })
    eval: Dataset({
        features: ['text', 'code', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 15000
    })
})

In [8]:
from torch.utils.data import DataLoader

# Return only the model-facing columns, as torch tensors, when indexing the splits.
dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])

# One loader per split: (split key, batch size).
# NOTE(review): the training loader does not shuffle — confirm that is intended.
train_dataloader, valid_dataloader, test_dataloader = (
    DataLoader(dataset[split_name], batch_size=split_batch_size)
    for split_name, split_batch_size in (("train", 8), ("eval", 4), ("test", 4))
)

In [9]:
# Sanity check: pull the first training batch and list the fields it carries.
batch = next(iter(train_dataloader))
print(batch.keys())

dict_keys(['input_ids', 'attention_mask', 'labels'])


In [10]:
# Pull the first training batch again and decode its first input sequence back
# to text; the decoded string includes the end-of-sequence and padding tokens.
batch = next(iter(train_dataloader))
print(batch.keys())
tokenizer.decode(batch['input_ids'][0])

dict_keys(['input_ids', 'attention_mask', 'labels'])


2023-03-20 22:28:41.321550: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-03-20 22:28:42.771587: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory
2023-03-20 22:28:42.771723: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory


'create string s</s><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad>

In [11]:
# Decode the first target sequence of the batch. The -100 entries are the
# ignore markers that preprocess_samples substituted for padding, so they are
# filtered out before decoding.
labels = batch['labels'][0]
tokenizer.decode([label for label in labels if label != -100])

'string s;</s>'

In [12]:
from pytorch_lightning import Trainer
from pytorch_lightning.loggers import WandbLogger
from pytorch_lightning.callbacks import EarlyStopping, LearningRateMonitor



In [13]:
!pip install transformers

Defaulting to user installation because normal site-packages is not writeable


In [14]:
from transformers import TextDataset, DataCollatorForLanguageModeling
from transformers import GPT2Tokenizer, GPT2LMHeadModel
from transformers import Trainer, TrainingArguments

In [15]:
def load_dataset(file_path, tokenizer, block_size = 128):
    """Build a ``TextDataset`` over ``file_path``.

    Note: this definition replaces the ``load_dataset`` name imported from
    ``datasets`` earlier in the notebook.

    Args:
        file_path: path handed to ``TextDataset`` as ``file_path``.
        tokenizer: tokenizer handed to ``TextDataset``.
        block_size: value handed to ``TextDataset`` as ``block_size``
            (default 128).

    Returns:
        The constructed ``TextDataset``.
    """
    return TextDataset(tokenizer=tokenizer, file_path=file_path, block_size=block_size)


def load_data_collator(tokenizer, mlm = False):
    """Build a ``DataCollatorForLanguageModeling`` for ``tokenizer``.

    Args:
        tokenizer: tokenizer handed to the collator.
        mlm: forwarded as the collator's ``mlm`` flag (default ``False``).

    Returns:
        The constructed ``DataCollatorForLanguageModeling``.
    """
    return DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=mlm)

In [16]:
def train_func(train_file_path,model_name,
          output_dir,
          overwrite_output_dir,
          per_device_train_batch_size,
          num_train_epochs,
          save_steps):
    """Fine-tune a GPT-2 language-model head on a text file and save the result.

    Args:
        train_file_path: path of the training file, passed to ``load_dataset``.
        model_name: name or path given to ``GPT2Tokenizer.from_pretrained``
            and ``GPT2LMHeadModel.from_pretrained``.
        output_dir: directory that receives the tokenizer, the initial model
            weights, and the trainer's output.
        overwrite_output_dir: forwarded to ``TrainingArguments``.
        per_device_train_batch_size: forwarded to ``TrainingArguments``.
        num_train_epochs: forwarded to ``TrainingArguments``.
        save_steps: forwarded to ``TrainingArguments``.

    Returns:
        None. The tokenizer and model are written to ``output_dir``.
    """
    tokenizer = GPT2Tokenizer.from_pretrained(model_name)
    train_dataset = load_dataset(train_file_path, tokenizer)
    data_collator = load_data_collator(tokenizer)

    # Store the tokenizer next to the model so output_dir is self-contained.
    tokenizer.save_pretrained(output_dir)

    model = GPT2LMHeadModel.from_pretrained(model_name)

    # Write the starting weights; trainer.save_model() below saves again after training.
    model.save_pretrained(output_dir)

    training_args = TrainingArguments(
        output_dir=output_dir,
        overwrite_output_dir=overwrite_output_dir,
        per_device_train_batch_size=per_device_train_batch_size,
        num_train_epochs=num_train_epochs,
        # Previously accepted but never forwarded, so the caller's value was
        # silently ignored in favour of the library default.
        # NOTE(review): a small save_steps on a long run writes many checkpoints.
        save_steps=save_steps,
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        data_collator=data_collator,
        train_dataset=train_dataset,
    )

    trainer.train()
    trainer.save_model()

In [17]:
# Arguments for the train_func call in the next cell; edit these to configure the run.
# train_file_path is handed unchanged to TextDataset via load_dataset.
train_file_path = "Datasets_PART2/SPoC/train/split/spoc-train-train.tsv"
model_name = 'gpt2'
output_dir = 'result/'
overwrite_output_dir = True
per_device_train_batch_size = 8
num_train_epochs = 5.0
save_steps = 100

In [18]:
# Start fine-tuning with the parameters set in the previous cell.
# The recorded output below shows this run was stopped by hand (KeyboardInterrupt).
train_func(
    train_file_path=train_file_path,
    model_name=model_name,
    output_dir=output_dir,
    overwrite_output_dir=overwrite_output_dir,
    per_device_train_batch_size=per_device_train_batch_size,
    num_train_epochs=num_train_epochs,
    save_steps=save_steps
)

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mpaxx[0m. Use [1m`wandb login --relogin`[0m to force relogin


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.01666930653300369, max=1.0)…

  0%|          | 0/34955 [00:00<?, ?it/s]

{'loss': 1.2984, 'learning_rate': 4.928479473608926e-05, 'epoch': 0.07}
{'loss': 0.9341, 'learning_rate': 4.8569589472178516e-05, 'epoch': 0.14}
{'loss': 0.886, 'learning_rate': 4.785438420826778e-05, 'epoch': 0.21}
{'loss': 0.8507, 'learning_rate': 4.713917894435703e-05, 'epoch': 0.29}
{'loss': 0.828, 'learning_rate': 4.6423973680446294e-05, 'epoch': 0.36}
{'loss': 0.8168, 'learning_rate': 4.5708768416535544e-05, 'epoch': 0.43}
{'loss': 0.8007, 'learning_rate': 4.499356315262481e-05, 'epoch': 0.5}
{'loss': 0.7932, 'learning_rate': 4.4278357888714065e-05, 'epoch': 0.57}
{'loss': 0.7779, 'learning_rate': 4.356315262480332e-05, 'epoch': 0.64}
{'loss': 0.7652, 'learning_rate': 4.284794736089258e-05, 'epoch': 0.72}


KeyboardInterrupt: 