In [1]:
from huggingface_hub import notebook_login

notebook_login()

Token is valid.
Your token has been saved in your configured git credential helpers (store).
Your token has been saved to /user/pnekkala/.cache/huggingface/token
Login successful


In [2]:
import torch
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, DataCollatorForLanguageModeling
from transformers import TrainingArguments, Trainer

2023-05-11 12:33:33.470704: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F AVX512_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-05-11 12:33:33.615869: I tensorflow/core/util/port.cc:104] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-05-11 12:33:36.627708: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /projects/academic/courses/cse546s23/pnekkala/anaconda3/envs/rl_tf/lib:/user/pnekkala/.

Streaming enabled for datasets

In [3]:
train_dataset = load_dataset('json', data_files='./train_gen.json', field='data', streaming=True)['train']
val_dataset = load_dataset('json', data_files='./val_gen.json', field='data', streaming=True)['train']
features = ['text']

In [4]:
model = "microsoft/DialoGPT-small"
tokenizer = AutoTokenizer.from_pretrained(model)#, model_max_length=256)
tokenizer.pad_token = tokenizer.eos_token
model = AutoModelForCausalLM.from_pretrained(model)
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

In [5]:
def preprocess_function(examples):
    return tokenizer(examples["text"])

In [6]:
train_dataset = train_dataset.map(preprocess_function, batched=True, remove_columns=features)
val_dataset = val_dataset.map(preprocess_function, batched=True, remove_columns=features)

In [7]:
block_size = 32


def group_texts(examples):
    # Concatenate all texts.
    
    for token_sent in examples['input_ids']:
      token_sent.append(tokenizer.eos_token_id)     
    concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
    # customize this part to your needs.
    if total_length >= block_size:
        total_length = (total_length // block_size) * block_size
    # Split by chunks of block_size.
    result = {
        k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
        for k, t in concatenated_examples.items()
    }
    # result['labels'] = []
    # for i in range(0, total_length, block_size):
    #   result['labels'].append(concatenated_examples['input_ids'][i : i + block_size+1])
    #   if i+block_size == total_length-1:
    #     result['labels'][-1].append(tokenizer.eos_token)
    # result["labels"] = [concatenated_examples['input_ids'][i : i + block_size+1] for i in range(0, total_length, block_size)]
        
    
    return result

In [8]:
train_dataset = train_dataset.map(group_texts, batched=True)
train_dataset = train_dataset.filter(lambda x:len(x['attention_mask'])==block_size)
val_dataset = val_dataset.map(group_texts, batched=True)
val_dataset = val_dataset.filter(lambda x:len(x['attention_mask'])==block_size)

In [9]:
import evaluate
rouge = evaluate.load('rouge')
perplexity = evaluate.load('perplexity', module_type='metric', add_start_token=False)
bleu = evaluate.load('bleu')
# bleurt = evaluate.load('bleurt', module_type='metric', checkpoint='bleurt-tiny-128')
# bert = evaluate.load('bertscore')

In [10]:
import numpy as np

def preprocess_logits_for_metrics(preds, labels):
    preds = preds.argmax(-1).squeeze()
    return preds

def calculate_metrics(eval_pred):
  preds, refs= eval_pred
  refs[refs == -100] = tokenizer.eos_token_id
#   shifted_preds = preds[...,:-1]
#   shifted_refs = refs[...,1:]
#   print(preds.shape)
#   print(refs[(torch.tensor(refs) < 0).tolist()])
  # preds = np.argmax(preds, axis=-1).squeeze()
#   preds = tokenizer.batch_decode(preds.argmax(-1).squeeze(), skip_special_tokens=True)
  preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
  refs = tokenizer.batch_decode(refs, skip_special_tokens=True)
  rouge_score = rouge.compute(predictions=preds, references=refs)
  perplexity_score = perplexity.compute(predictions=preds, model_id="gpt2")
  bleu_score = bleu.compute(predictions=preds, references=refs)
#   bleurt_score = bleurt.compute(predictions=preds, references=refs)
#   bert_score = bert.compute(predictions=preds, references=refs, model_type="distilbert-base-uncased")
  return {
      'rouge':rouge_score,
      'perplexity':perplexity_score['mean_perplexity'],
      'bleu':bleu_score
#       'bleurt':bleurt_score,
#       'bert':bert_score
  }

In [12]:
# from transformers.utils import logging

# logging.set_verbosity_info()

device = "cuda" if torch.cuda.is_available() else "cpu"
batch_size = 512
training_args = TrainingArguments(
    output_dir="reddit_gen_final",
    overwrite_output_dir=True,
    max_steps = int(551668*1/batch_size),
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=4,
    logging_strategy="steps",
    save_strategy="no",
    evaluation_strategy="steps",
    gradient_accumulation_steps=32,
    logging_steps=320,
#     save_steps=320,
    eval_steps=320,
    fp16=True,
    fp16_full_eval=False,
    learning_rate=1e-3,
    warmup_steps=100,
    lr_scheduler_type="cosine",
    weight_decay=0.01,
    disable_tqdm=False,
    push_to_hub=True,
    hub_strategy="end"
)

model = model.to(device)

trainer = Trainer(
    model,
    training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=calculate_metrics,
    preprocess_logits_for_metrics=preprocess_logits_for_metrics
)
trainer.train()

Cloning https://huggingface.co/sentientconch/reddit_gen_final into local empty directory.


Step,Training Loss,Validation Loss,Rouge,Perplexity,Bleu
320,3.9872,3.140657,"{'rouge1': 0.4236605668840905, 'rouge2': 0.17676647154634773, 'rougeL': 0.37369585565755137, 'rougeLsum': 0.38069226779493875}",1007.280263,"{'bleu': 0.1795660368641941, 'precisions': [0.4476203532341719, 0.1996849620846328, 0.12947506323794772, 0.097952801903731], 'brevity_penalty': 0.9786100068791068, 'length_ratio': 0.9788355449530342, 'translation_length': 131301, 'reference_length': 134140}"
640,3.0112,2.669343,"{'rouge1': 0.5006690963461402, 'rouge2': 0.2845737029774397, 'rougeL': 0.4598926127632702, 'rougeLsum': 0.46623659707701914}",891.638719,"{'bleu': 0.28259351848586683, 'precisions': [0.5153005174673647, 0.2977358252901072, 0.22869830241856198, 0.19400129812455164], 'brevity_penalty': 0.9838352619991267, 'length_ratio': 0.9839645146861488, 'translation_length': 131989, 'reference_length': 134140}"
960,2.5776,2.504953,"{'rouge1': 0.5318334167452433, 'rouge2': 0.3266503490464716, 'rougeL': 0.4940196552424935, 'rougeLsum': 0.49965823775029017}",810.216084,"{'bleu': 0.3233116246700081, 'precisions': [0.5456588886510291, 0.3399931653275477, 0.273607307447275, 0.2384403661808989], 'brevity_penalty': 0.9747575251310703, 'length_ratio': 0.975070821529745, 'translation_length': 130796, 'reference_length': 134140}"


Using pad_token, but it is not set yet.


  0%|          | 0/311 [00:00<?, ?it/s]

Trainer is attempting to log a value of "{'rouge1': 0.4236605668840905, 'rouge2': 0.17676647154634773, 'rougeL': 0.37369585565755137, 'rougeLsum': 0.38069226779493875}" of type <class 'dict'> for key "eval/rouge" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "{'bleu': 0.1795660368641941, 'precisions': [0.4476203532341719, 0.1996849620846328, 0.12947506323794772, 0.097952801903731], 'brevity_penalty': 0.9786100068791068, 'length_ratio': 0.9788355449530342, 'translation_length': 131301, 'reference_length': 134140}" of type <class 'dict'> for key "eval/bleu" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Using pad_token, but it is not set yet.


  0%|          | 0/311 [00:00<?, ?it/s]

Trainer is attempting to log a value of "{'rouge1': 0.5006690963461402, 'rouge2': 0.2845737029774397, 'rougeL': 0.4598926127632702, 'rougeLsum': 0.46623659707701914}" of type <class 'dict'> for key "eval/rouge" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "{'bleu': 0.28259351848586683, 'precisions': [0.5153005174673647, 0.2977358252901072, 0.22869830241856198, 0.19400129812455164], 'brevity_penalty': 0.9838352619991267, 'length_ratio': 0.9839645146861488, 'translation_length': 131989, 'reference_length': 134140}" of type <class 'dict'> for key "eval/bleu" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Using pad_token, but it is not set yet.


  0%|          | 0/311 [00:00<?, ?it/s]

Trainer is attempting to log a value of "{'rouge1': 0.5318334167452433, 'rouge2': 0.3266503490464716, 'rougeL': 0.4940196552424935, 'rougeLsum': 0.49965823775029017}" of type <class 'dict'> for key "eval/rouge" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "{'bleu': 0.3233116246700081, 'precisions': [0.5456588886510291, 0.3399931653275477, 0.273607307447275, 0.2384403661808989], 'brevity_penalty': 0.9747575251310703, 'length_ratio': 0.975070821529745, 'translation_length': 130796, 'reference_length': 134140}" of type <class 'dict'> for key "eval/bleu" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.


TrainOutput(global_step=1077, training_loss=3.112978822784282, metrics={'train_runtime': 38236.2752, 'train_samples_per_second': 922.975, 'train_steps_per_second': 0.028, 'total_flos': 5.76055061397504e+17, 'train_loss': 3.112978822784282, 'epoch': 63.01})

In [15]:
trainer.evaluate()

Using pad_token, but it is not set yet.


  0%|          | 0/311 [00:00<?, ?it/s]

Trainer is attempting to log a value of "{'rouge1': 0.5329842669787219, 'rouge2': 0.3281385117791451, 'rougeL': 0.49516084305471186, 'rougeLsum': 0.5007050542613292}" of type <class 'dict'> for key "eval/rouge" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "{'bleu': 0.32533667450429854, 'precisions': [0.5468573349222354, 0.3416679238691133, 0.2754205414341806, 0.24029617881063003], 'brevity_penalty': 0.9756131882587468, 'length_ratio': 0.9759057700909498, 'translation_length': 130908, 'reference_length': 134140}" of type <class 'dict'> for key "eval/bleu" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.


{'eval_loss': 2.4935691356658936,
 'eval_rouge': {'rouge1': 0.5329842669787219,
  'rouge2': 0.3281385117791451,
  'rougeL': 0.49516084305471186,
  'rougeLsum': 0.5007050542613292},
 'eval_perplexity': 788.5101907617036,
 'eval_bleu': {'bleu': 0.32533667450429854,
  'precisions': [0.5468573349222354,
   0.3416679238691133,
   0.2754205414341806,
   0.24029617881063003],
  'brevity_penalty': 0.9756131882587468,
  'length_ratio': 0.9759057700909498,
  'translation_length': 130908,
  'reference_length': 134140},
 'eval_runtime': 94.439,
 'eval_samples_per_second': 52.605,
 'eval_steps_per_second': 6.576,
 'epoch': 63.01}

In [14]:
# model.push_to_hub("reddit_gen_final")
# trainer.save_model(sentientconch/re)
trainer.push_to_hub()

Several commits (2) will be pushed upstream.
The progress bars may be unreliable.


Upload file pytorch_model.bin:   0%|          | 1.00/487M [00:00<?, ?B/s]

To https://user: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@huggingface.co/sentientconch/reddit_gen_final
   565d018..7cabe6b  main -> main

   565d018..7cabe6b  main -> main



In [2]:
from evaluate import evaluator
task_evaluator = evaluator("text-generation")

2023-05-11 23:47:33.462162: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F AVX512_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-05-11 23:47:33.689382: I tensorflow/core/util/port.cc:104] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-05-11 23:47:37.203546: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /projects/academic/courses/cse546s23/pnekkala/anaconda3/envs/rl_tf/lib:/user/pnekkala/.