In [1]:
import PyPDF2
import torch
from transformers import AutoTokenizer

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

model_checkpoint = "distilgpt2"

tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, use_fast=True)

dataset_file_name = 'ns_dataset.txt'




  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import os

# Create dataset
path_to_doc_folder = '../docs'
with open(dataset_file_name, 'w') as dataset_file:
    for root, dirs, files in os.walk(path_to_doc_folder):
        for file_name in files:
            with open(os.path.join(root, file_name), 'rb') as pdf_file:
                pdf_reader = PyPDF2.PdfReader(pdf_file)
                for page_num in range(len(pdf_reader.pages)):
                    dataset_file.write(pdf_reader.pages[page_num].extract_text())
                    






In [2]:
from datasets import load_dataset

dataset = load_dataset("text", data_files={"train": dataset_file_name, "validation": dataset_file_name})

dataset


DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 18285
    })
    validation: Dataset({
        features: ['text'],
        num_rows: 18285
    })
})

In [3]:
def tokenize_function(examples):
    return tokenizer(examples["text"])

tokenized_datasets = dataset.map(tokenize_function, batched=True, num_proc=4, remove_columns=["text"])
block_size = 128

def group_texts(examples):
    # Concatenate all texts.
    concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
        # customize this part to your needs.
    total_length = (total_length // block_size) * block_size
    # Split by chunks of max_len.
    result = {
        k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
        for k, t in concatenated_examples.items()
    }
    result["labels"] = result["input_ids"].copy()
    return result


Map (num_proc=4): 100%|██████████| 18285/18285 [00:00<00:00, 86767.41 examples/s]
Map (num_proc=4): 100%|██████████| 18285/18285 [00:00<00:00, 91020.69 examples/s]


In [4]:
lm_datasets = tokenized_datasets.map(
    group_texts,
    batched=True,
    batch_size=1000,
    num_proc=4,
)

Map (num_proc=4): 100%|██████████| 18285/18285 [00:00<00:00, 89022.67 examples/s]
Map (num_proc=4): 100%|██████████| 18285/18285 [00:00<00:00, 88477.12 examples/s]


In [5]:
from transformers import AutoModelForCausalLM


model = AutoModelForCausalLM.from_pretrained(model_checkpoint)
model.to('cuda')



GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-5): 6 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2SdpaAttention(
          (c_attn): Conv1D(nf=2304, nx=768)
          (c_proj): Conv1D(nf=768, nx=768)
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D(nf=3072, nx=768)
          (c_proj): Conv1D(nf=768, nx=3072)
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)

In [None]:
import numpy as np
import evaluate

metric = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

In [6]:
from transformers import TrainingArguments, Trainer

model_name = model_checkpoint.split("/")[-1]
training_args = TrainingArguments(
    f"{model_name}-finetuned-wikitext2",
    evaluation_strategy = "epoch",
    learning_rate=2e-5,
    weight_decay=0.01,
)


2024-10-05 14:45:20.030164: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-10-05 14:45:20.037729: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-10-05 14:45:20.045837: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-10-05 14:45:20.048387: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-10-05 14:45:20.055500: I tensorflow/core/platform/cpu_feature_guar

In [7]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=lm_datasets["train"],
    eval_dataset=lm_datasets["validation"],
)

trainer.train()

Epoch,Training Loss,Validation Loss
1,No log,3.465703
2,3.705200,3.352556
3,3.705200,3.322689


TrainOutput(global_step=882, training_loss=3.6052229485544216, metrics={'train_runtime': 108.8446, 'train_samples_per_second': 64.716, 'train_steps_per_second': 8.103, 'total_flos': 230071788896256.0, 'train_loss': 3.6052229485544216, 'epoch': 3.0})