In [None]:
!pip install -q -U trl transformers accelerate git+https://github.com/huggingface/peft.git
!pip install -q datasets einops wandb
!pip install evaluate
!pip install rouge_score

In [None]:
from datasets import load_dataset
from transformers import RobertaTokenizer
from transformers import DataCollatorForSeq2Seq
from peft import LoraConfig, LoraModel, PeftModel, get_peft_model
from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer
from transformers import EarlyStoppingCallback

In [None]:
# --- LoRA adapter settings (consumed by LoraConfig) ---
LORA_R = 16
LORA_ALPHA = 16
LORA_DROPOUT = 0.01
LORA_TARGET_MODULES = ["q", "v"]

# --- Hyperparameter sweep search space ---
BATCH_SIZES = [8, 16, 32]
LEARNING_RATE_MIN = 1e-6
LEARNING_RATE_MAX = 1e-4
WEIGHT_DECAY = [0.01, 0.02, 0.03, 0.04, 0.05]
NUM_EXPERIMENTS = 15  # number of sweep runs the agent executes

# --- Early stopping ---
PATIENCE = 3

# --- Sampling settings used at generation time ---
TOP_K = 30
TOP_P = 0.95

In [None]:
# Device the model and the generation inputs are moved to.
device = "cuda"

In [None]:
# Load the 'train' split of the source dataset.
dataset = load_dataset(path="Dataset...", split="train")

In [None]:
# Split the data into 80% training / 10% validation / 10% test.
# Every random step is seeded: without a seed on train_test_split the membership
# of the three subsets would change on each run.
# NOTE(review): this cell rebinds `dataset`, so re-run the load cell before re-running it.
dataset = dataset.shuffle(seed = 42)
dataset = dataset.train_test_split(test_size = 0.2, seed = 42)
training_set = dataset["train"]
# Halve the 20% hold-out into validation and test subsets.
auxillary_set = dataset["test"].train_test_split(test_size = 0.5, seed = 42)
validation_set = auxillary_set["train"]
test_set = auxillary_set["test"]

In [None]:
# Shuffle each subset again with a fixed seed.
training_set, validation_set, test_set = (
    subset.shuffle(seed = 42)
    for subset in (training_set, validation_set, test_set)
)

In [None]:
# Tokenizer of the CodeT5-small checkpoint.
tokenizer = RobertaTokenizer.from_pretrained(
    pretrained_model_name_or_path="Salesforce/codet5-small"
)
# Pad with the end-of-sequence token.
# NOTE(review): confirm the checkpoint's own pad token should really be replaced.
tokenizer.pad_token = tokenizer.eos_token

In [None]:
def preprocess_examples(examples):
  """Tokenize a batch of instruction/output pairs for seq2seq training.

  The 'Instruction' column is used as the model input and the 'Output' column
  as the target text. Relies on the notebook-level `tokenizer`.

  Args:
    examples: a batch that maps the column names 'Instruction' and 'Output'
      to sequences of strings.

  Returns:
    The tokenizer's encoding of the batch, including the tokenized targets.
  """
  sources = list(examples['Instruction'])
  targets = list(examples['Output'])
  return tokenizer(
    sources,
    text_target=targets,
    max_length=128,
    truncation=True,
  )

In [None]:
# Tokenize the training and validation subsets batch by batch.
tokenized_training_set = training_set.map(function=preprocess_examples, batched=True)
tokenized_validation_set = validation_set.map(function=preprocess_examples, batched=True)

In [None]:
# Pads inputs and labels dynamically per batch.
# The collator's `model` argument expects a model instance, not a checkpoint
# name; the model object is only created in a later cell, so it is omitted here.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer)

In [None]:
# LoRA adapter configuration built from the constants defined above.
config = LoraConfig(
    peft_type = "LORA",
    task_type = "SEQ_2_SEQ_LM",
    target_modules = LORA_TARGET_MODULES,
    r = LORA_R,
    lora_alpha = LORA_ALPHA,
    lora_dropout = LORA_DROPOUT,
    bias = "none",
)

In [None]:
# Load the pretrained CodeT5-small model and move it to the target device.
base_model = AutoModelForSeq2SeqLM.from_pretrained("Salesforce/codet5-small")
base_model = base_model.to(device)

# Wrap the base model with the LoRA adapters and report the trainable parameter count.
model = get_peft_model(base_model, config)
model.print_trainable_parameters()

In [None]:
# Random-search sweep over batch size, learning rate and weight decay.
parameters_dict = {
    'batch_size': {
        'values': BATCH_SIZES
    },
    # The bounds must use the constant names defined in the configuration cell
    # (LEARNING_RATE_MIN / LEARNING_RATE_MAX); the previous MIN_/MAX_ spellings
    # were undefined and raised NameError.
    'learning_rate': {
        'distribution': 'log_uniform_values',
        'min': LEARNING_RATE_MIN,
        'max': LEARNING_RATE_MAX
    },
    'weight_decay': {
        'values': WEIGHT_DECAY
    }
}
sweep_config = {
    'method': 'random',
    'parameters': parameters_dict
}

In [None]:
import wandb

# Do not store the API key in the notebook. Called without `key`, wandb.login()
# uses already-configured credentials (e.g. the WANDB_API_KEY environment
# variable) or prompts for the key interactively.
wandb.login()

# Register the sweep; the returned id is what the agent cell runs.
sweep_id = wandb.sweep(sweep_config, project='project-name')

In [None]:
# Early-stopping callback configured with PATIENCE as its patience value.
early_stop = EarlyStoppingCallback(early_stopping_patience = PATIENCE)

In [None]:
def train(config=None):
  """Run one W&B sweep trial.

  Opens a W&B run, reads the learning rate, batch size and weight decay from
  `wandb.config`, and trains the notebook-level `model` on
  `tokenized_training_set`, evaluating on `tokenized_validation_set` every
  epoch with early stopping on the evaluation loss.

  Args:
    config: optional default configuration forwarded to `wandb.init`.
  """
  # NOTE(review): `model` is the single notebook-level PEFT model, so each
  # sweep trial continues from the weights left by the previous trial and the
  # trials are not independent. TODO: consider building a fresh model per run.
  with wandb.init(config=config) as run:
    # Hyperparameters for this trial (kept under a separate name so the
    # `config` parameter is not shadowed).
    hparams = wandb.config

    training_args = Seq2SeqTrainingArguments(
      report_to = 'wandb',
      # One checkpoint directory per run, so checkpoints written by different
      # sweep trials do not mix (or get rotated away) in a shared folder.
      output_dir = f"./results/{run.id}",
      learning_rate = hparams.learning_rate,
      evaluation_strategy = "epoch",
      per_device_train_batch_size = hparams.batch_size,
      per_device_eval_batch_size = 8,
      weight_decay = hparams.weight_decay,
      save_total_limit = 3,
      num_train_epochs = 150,
      logging_strategy = "epoch",
      logging_steps = 1,
      fp16 = True,
      save_strategy = "epoch",
      save_steps = 1,
      metric_for_best_model = "eval_loss",
      load_best_model_at_end = True
    )

    trainer = Seq2SeqTrainer(
        model = model,
        args = training_args,
        train_dataset = tokenized_training_set,
        eval_dataset = tokenized_validation_set,
        tokenizer = tokenizer,
        data_collator = data_collator,
        # A new callback instance per run keeps early-stopping state isolated
        # between sweep trials.
        callbacks = [EarlyStoppingCallback(early_stopping_patience = PATIENCE)]
    )

    trainer.train()


In [None]:
# Execute NUM_EXPERIMENTS sweep runs, each one calling train().
wandb.agent(sweep_id, function=train, count=NUM_EXPERIMENTS)

In [None]:
# Generate an output for one test-set example using top-k / top-p sampling.
idx = "Some Random index into the test set"
prompt = test_set[idx]['Instruction']
encoded = tokenizer(prompt, max_length = 128, truncation = True, return_tensors = "pt")
inputs = encoded.input_ids.to(device)
outputs = model.generate(
    input_ids = inputs,
    max_new_tokens = 40,
    do_sample = True,
    top_k = TOP_K,
    top_p = TOP_P,
)
# The decoded text of the first returned sequence is the cell's displayed value.
tokenizer.decode(outputs[0], skip_special_tokens=True)

In [None]:
import locale

# Make the process report UTF-8 as its preferred encoding.
# The replacement accepts the same optional `do_setlocale` argument as the
# standard-library function, so callers using `locale.getpreferredencoding(False)`
# keep working instead of raising TypeError.
locale.getpreferredencoding = lambda do_setlocale=True: "UTF-8"
# Log in to the Hugging Face Hub through the CLI (interactive shell command).
!huggingface-cli login

In [None]:
# Upload the PEFT model and the tokenizer to the Hugging Face Hub.
model.push_to_hub("Some name...")
# The closing quote was missing here, which made the cell a SyntaxError.
tokenizer.push_to_hub("Some name...")