In [1]:
%load_ext tensorboard

In [None]:
import PyPDF2
import torch
import tensorflow as tf
import datetime
from transformers import AutoTokenizer

# Checkpoint name shared by the tokenizer here and the model load further down.
model_checkpoint = "distilgpt2"

# Plain-text corpus file that the dataset-creation cell writes and the loader reads.
dataset_file_name = 'ns_dataset.txt'

# Prefer the GPU when CUDA is usable, otherwise stay on the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# Fast tokenizer matching the checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, use_fast=True)




In [3]:
import os

# Build the plain-text training corpus from every file under ../docs.
# The work is skipped when the corpus file already exists (simple on-disk cache).
if not os.path.isfile(dataset_file_name):
    path_to_doc_folder = '../docs'
    # Write to a scratch file first: if extraction fails half-way, no truncated
    # corpus is left behind for the isfile() guard above to mistake for a
    # finished one on the next run.
    partial_file_name = dataset_file_name + '.partial'
    # UTF-8 is set explicitly so the output does not depend on the platform's
    # default encoding (extracted PDF text routinely contains non-ASCII characters).
    with open(partial_file_name, 'w', encoding='utf-8') as dataset_file:
        for root, dirs, files in os.walk(path_to_doc_folder):
            # Sorting in place makes os.walk descend in a stable order, so the
            # corpus is identical from run to run.
            dirs.sort()
            for file_name in sorted(files):
                with open(os.path.join(root, file_name), 'rb') as pdf_file:
                    pdf_reader = PyPDF2.PdfReader(pdf_file)
                    for page in pdf_reader.pages:
                        # Guard against a falsy result (e.g. a page with no
                        # extractable text) so write() never receives None.
                        page_text = page.extract_text() or ''
                        # Terminate every page with a newline so the last line of
                        # one page is not glued to the first line of the next.
                        dataset_file.write(page_text + '\n')
    # Publish the finished corpus under its real name in a single step.
    os.replace(partial_file_name, dataset_file_name)
                    






In [None]:
from datasets import load_dataset

# Load the corpus file through the generic "text" loader as a single split.
dataset = load_dataset("text", data_files=dataset_file_name, split='train')
# Hold out 30% of the examples for evaluation. The fixed seed makes the shuffle,
# and therefore the train/test membership, reproducible across kernel restarts.
dataset = dataset.train_test_split(test_size=0.3, seed=42)

dataset


In [None]:
def tokenize_function(examples):
    """Run the notebook-level ``tokenizer`` over the ``"text"`` column of a batch.

    Args:
        examples: Mapping that holds the raw strings under the ``"text"`` key.

    Returns:
        Whatever the tokenizer returns for those strings.
    """
    texts = examples["text"]
    encoded = tokenizer(texts)
    return encoded

# Length, in tokens, of each training block; group_texts reads this global.
block_size = 128

# Tokenize every split in batches using 4 worker processes, dropping the raw
# "text" column so only the tokenizer outputs remain.
tokenized_datasets = dataset.map(
    tokenize_function,
    batched=True,
    num_proc=4,
    remove_columns=["text"],
)

def group_texts(examples):
    """Concatenate a batch of tokenized texts and cut it into fixed-size blocks.

    Args:
        examples: Mapping from column name (e.g. ``"input_ids"``) to a list of
            per-example lists, one inner list per input text.

    Returns:
        dict with the same columns, each a list of chunks holding exactly
        ``block_size`` items (``block_size`` is the module-level setting), plus
        a ``"labels"`` column that is a shallow copy of ``"input_ids"``.
        Trailing items that do not fill a whole block are dropped.
    """
    from itertools import chain

    # Flatten every column into one long list. chain.from_iterable does this in
    # linear time; sum(lists, []) re-copies the accumulated list on each
    # addition and is quadratic in the size of the batch.
    concatenated_examples = {k: list(chain.from_iterable(examples[k])) for k in examples.keys()}
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    # Drop the small remainder so every block is full. Padding could be used
    # instead if the model supported it; customize this part to your needs.
    total_length = (total_length // block_size) * block_size
    # Split each column into consecutive chunks of block_size items.
    result = {
        k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
        for k, t in concatenated_examples.items()
    }
    # The training targets are the inputs themselves (outer list copied only).
    result["labels"] = result["input_ids"].copy()
    return result

# Last expression of the cell: show the tokenized dataset object as cell output.
tokenized_datasets

In [None]:
# Regroup the tokenized examples into fixed-length blocks: group_texts receives
# 128 examples per call and the work is spread over 4 worker processes.
lm_datasets = tokenized_datasets.map(group_texts, batched=True, batch_size=128, num_proc=4)

# Show the resulting dataset object as the cell output.
lm_datasets

In [None]:
from transformers import AutoModelForCausalLM


# Load the pretrained causal language model for the configured checkpoint.
model = AutoModelForCausalLM.from_pretrained(model_checkpoint)
# Move the model to the device selected at the top of the notebook instead of
# a hard-coded 'cuda', so this cell also runs on machines without a GPU.
model.to(device)



In [7]:
import numpy as np
import evaluate

# Accuracy metric object; compute_metrics below calls its compute() method.
metric = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    """Compute accuracy of the arg-max predictions against the labels.

    Args:
        eval_pred: Pair ``(logits, labels)``. ``logits`` has the class/vocabulary
            scores on its last axis; ``labels`` holds integer targets with the
            same shape as ``logits`` minus that last axis.

    Returns:
        The result of ``metric.compute`` on the flattened predictions and labels.

    Notes:
        * When predictions have a sequence axis (the causal-LM case in this
          notebook), the scores at position ``t`` are compared with the label
          at position ``t + 1`` (next-token prediction). This is the wrong
          alignment for per-token tasks where labels line up with positions.
        * Inputs are flattened to 1-D before scoring, because the metric is
          given flat integer sequences rather than (batch, seq) arrays.
        * Labels equal to ``-100`` (the conventional "ignore" value) are
          excluded from the score.
        * Plain ``(batch, classes)`` logits with 1-D labels behave as before.
    """
    logits, labels = eval_pred
    # Some models return a tuple of outputs; the scores come first.
    if isinstance(logits, tuple):
        logits = logits[0]
    predictions = np.argmax(logits, axis=-1)
    references = np.asarray(labels)
    if predictions.ndim > 1:
        # Align each prediction with the token that follows it.
        predictions = predictions[..., :-1]
        references = references[..., 1:]
    predictions = predictions.reshape(-1)
    references = references.reshape(-1)
    # Skip positions that are marked as "do not score".
    scored = references != -100
    return metric.compute(predictions=predictions[scored], references=references[scored])

In [14]:
from transformers import TrainingArguments, Trainer

# Keep only the part of the checkpoint id after the last "/" (if any).
model_name = model_checkpoint.rsplit("/", 1)[-1]

# NOTE(review): the output directory suffix says "wikitext2" although this
# notebook trains on the corpus in dataset_file_name; rename if that matters.
training_args = TrainingArguments(
    output_dir=f"{model_name}-finetuned-wikitext2",
    num_train_epochs=10.0,
    learning_rate=2e-5,
    weight_decay=0.01,
    eval_strategy="epoch",  # evaluate at the end of every epoch
)


In [None]:
# Fine-tune on the "train" split and evaluate on the held-out "test" split.
# NOTE(review): compute_metrics is defined in this notebook but is not passed
# to the Trainer here, so it is not used during evaluation. Confirm that is
# intended.
trainer = Trainer(
    args=training_args,
    model=model,
    eval_dataset=lm_datasets["test"],
    train_dataset=lm_datasets["train"],
)

# Start training; the returned value is shown as the cell output.
trainer.train()

In [None]:
# Run a final evaluation pass with the trainer; the result is the cell output.
trainer.evaluate()