All code in this notebook is original; the only external reference is the perplexity calculation, which follows the Hugging Face guide: https://huggingface.co/docs/transformers/en/perplexity

# 1. Dataset creation and preprocessing
First of all, we needed to compose a dataset of size suitable for our task and our computational resources availability.

Here we download the dataset of 70k books from the Hugging Face repository "manu/project_gutenberg" and select only the English books.

In [None]:
from datasets import load_dataset

# Load the English Gutenberg split in streaming mode
ds = load_dataset("manu/project_gutenberg", split="en", streaming=True)

# Print the first sample to see the structure
print(next(iter(ds)))

### Create Raw Dataset
Create a new dataset of 200 books, writing them to disk in chunks to avoid memory overflow. The raw dataset is saved to a text file called "combined_gutenberg_dataset_small.txt".

In [None]:
from tqdm import tqdm

# Specify the path to save the combined text file
output_file = "combined_gutenberg_dataset_small.txt"

# Number of books to take from the stream
NUM_BOOKS = 200

# Write every book to disk as soon as it arrives, so memory use is bounded by
# a single book instead of the whole corpus.
with open(output_file, "w", encoding="utf-8") as f:
    progress = tqdm(ds, desc="Downloading & Processing Books", unit="books", total=NUM_BOOKS)
    for i, sample in enumerate(progress, start=1):
        # A blank line after each book keeps the books separated in the file
        f.write(sample['text'] + "\n\n")

        # Stop once the requested number of books has been written
        if i == NUM_BOOKS:
            break

print(f"\nAll books have been saved successfully to '{output_file}'!")

Here we load the raw dataset and then start preprocessing part

In [None]:
# Function to load text files
def load_text(file_path):
    """Return the full contents of the UTF-8 text file at ``file_path``."""
    with open(file_path, 'r', encoding='utf-8') as source:
        contents = source.read()
    return contents

In [None]:
gutenberg_text = load_text("combined_gutenberg_dataset_small.txt")

Import and download nltk library functions for preprocessing

In [None]:
import nltk
import os
# Set a custom directory for NLTK data (resolved relative to the working directory)
nltk_data_path = os.path.abspath("./nltk_data")

os.makedirs(nltk_data_path, exist_ok=True)
# Add the directory to NLTK's resource search path
nltk.data.path.append(nltk_data_path)
# Download the 'punkt' tokenizer resource into that directory
# NOTE(review): recent NLTK releases appear to look up 'punkt_tab' instead of
# 'punkt' — confirm against the installed version if tokenization fails below.
nltk.download('punkt', download_dir=nltk_data_path)

# Verify the tokenizer works
tokens = nltk.word_tokenize("This is a test sentence.")
print(tokens)

### Preprocess and save the new dataset
Create the whole preprocessing function for pipeline

In [None]:
import re

def preprocess_text(data, min_length=5, max_length=100):
    """Clean raw Project Gutenberg text and split it into filtered sentences.

    The text first goes through a fixed sequence of regex substitutions
    (line-break repair, header/footer, license, URL, metadata, bracketed note
    and number removal), is then split with ``nltk.sent_tokenize``, and each
    sentence is finally kept only if it passes the quality filters below.

    Args:
        data: Raw text of one or more books.
        min_length: Minimum number of whitespace-separated words to keep a sentence.
        max_length: Maximum number of whitespace-separated words to keep a sentence.

    Returns:
        list[str]: Sentences with internal whitespace collapsed to single spaces.
    """
    header_flags = re.IGNORECASE | re.DOTALL

    # Ordered (pattern, replacement, flags) table. Each substitution runs on the
    # output of the previous one, so the order must be preserved.
    substitutions = (
        # Fix incorrect line breaks: a line starting in lowercase continues the previous one
        (r"\n([a-z])", r" \1", 0),
        # Merge quoted passages that are split across lines
        (r'(".*?)(\n\s*)(.*?")', r'\1 \3', re.DOTALL),
        # Project Gutenberg header and footer markers
        (r"(START OF THE PROJECT GUTENBERG EBOOK.*?\n)|(\*\*\*START OF.*?\*\*\*)", "", header_flags),
        (r"(End of the Project Gutenberg EBook.*?\n)|(\*\*\*END OF.*?\*\*\*)", "", header_flags),
        # URLs and license boilerplate
        (r"http[s]?://\S+", "", 0),
        (r"Project Gutenberg.*?License", "", header_flags),
        # Metadata lines (Title, Author, Release Date, ...)
        (r"(Title:.*?(\n|$))", "", re.IGNORECASE),
        (r"(Author:.*?(\n|$))", "", re.IGNORECASE),
        (r"(Release Date:.*?(\n|$))", "", re.IGNORECASE),
        (r"(Language:.*?(\n|$))", "", re.IGNORECASE),
        (r"(Character set encoding:.*?(\n|$))", "", re.IGNORECASE),
        # Transcriber's notes and bracketed footnotes/comments
        (r"Transcriber's note:.*?(\n|$)", "", re.IGNORECASE),
        (r"\[.*?\]", "", 0),
        # Standalone numbers (e.g. page numbers) and blank lines
        (r"\b[0-9]+\b", "", 0),
        (r"\n\s*\n", "\n", 0),
    )
    for pattern, replacement, flags in substitutions:
        data = re.sub(pattern, replacement, data, flags=flags)

    heading = re.compile(r"^(CHAPTER|ADDISON|[A-Z\s]+)$")
    kept = []

    for raw in nltk.sent_tokenize(data):
        text = raw.strip()

        # Drop headings/chapter titles, all-uppercase sentences and sentences
        # without any letter or digit
        if heading.match(text) or text.isupper() or re.search(r"[a-zA-Z0-9]", text) is None:
            continue

        # Collapse runs of whitespace inside the sentence
        text = re.sub(r"\s+", " ", text)

        # Keep only sentences that start with a capital letter and whose word
        # count lies within [min_length, max_length]
        word_count = len(text.split())
        if text[0].isupper() and min_length <= word_count <= max_length:
            kept.append(text)

    return kept


Preprocess and save the preprocessed dataset of 200 Gutenberg books

In [None]:
gutenberg_sentences = preprocess_text(gutenberg_text)

In [None]:
from tqdm import tqdm

output_file = "preprocessed_gutenberg_sentences_small.txt"

# Write one sentence per line. The file object already buffers writes, so no
# manual string buffer is needed, and tqdm tracks progress per sentence.
with open(output_file, "w", encoding="utf-8") as f:
    for sentence in tqdm(gutenberg_sentences, desc="Saving Sentences", unit="sentence"):
        f.write(sentence + "\n")

print(f"\nAll sentences have been successfully saved to {output_file}!")

### Load the preprocessed dataset and split into train and test
Here we load the dataset and clean it once more by removing artifacts such as special symbols. We also remove short sentences (fewer than 10 words). Then we split the data and save it into train and test txt files.

In [None]:
# Path to the preprocessed sentences text file (built from the Project Gutenberg books)
dataset_path = "../dataset_prep/data/preprocessed_gutenberg_sentences_small.txt"

In [None]:
# Read the text file
with open(dataset_path, 'r', encoding='utf-8') as file:
    data = file.read()

In [None]:
import re
# Split the text into sentences (assuming '\n' as sentence separator)
sentences = data.split('\n')

In [None]:
def clean_text(sentence):
    """Normalize one sentence to letters, digits, spaces and ``. , ! ?``.

    Curly quotes are first mapped to straight quotes; every character outside
    the allowed set (which does not include quotes) is then removed, and
    whitespace runs are collapsed to a single space with the ends trimmed.
    """
    # Replace smart quotes with standard quotes in a single pass
    quote_map = str.maketrans({"“": "\"", "”": "\"", "‘": "'", "’": "'"})
    straightened = sentence.translate(quote_map)

    # Keep only letters, digits, whitespace and basic punctuation
    allowed_only = re.sub(r"[^a-zA-Z0-9\s.,!?]", "", straightened)

    # Collapse whitespace runs and trim both ends
    return re.sub(r'\s+', ' ', allowed_only).strip()

In [None]:
# Clean every line that is not blank (blank lines are dropped here)
cleaned_sentences = [clean_text(sentence) for sentence in sentences if len(sentence.strip()) > 0]

# Filter out very short sentences (less than 10 words), counted after cleaning
filtered_sentences = [sentence for sentence in cleaned_sentences if len(sentence.split()) >= 10]

In [None]:
from sklearn.model_selection import train_test_split

# Split the dataset into Train (95%) and Test (5%)
train_sentences, test_sentences = train_test_split(filtered_sentences, test_size=0.05, random_state=424)


In [None]:
# Save the split datasets for easy access later
with open("train.txt", "w", encoding='utf-8') as file:
    file.write('\n'.join(train_sentences))

with open("test.txt", "w", encoding='utf-8') as file:
    file.write('\n'.join(test_sentences))

# 2. Training and evaluation of models

### First of all we need to define evaluation function for BLEU, BERT and Perplexity

NOTE: The BLEU score was eventually not used for most models, because initial evaluations showed that BLEU is not suitable for our project.

In [None]:
# Import all libraries
import math
import re

import torch
from datasets import Dataset, DatasetDict, load_dataset
from huggingface_hub import notebook_login
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from peft import LoraConfig, get_peft_model
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader
from tqdm import tqdm
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    DataCollatorForLanguageModeling,
    TrainingArguments,
    Trainer
)

In [None]:
def calculate_perplexity(model, tokenizer, sentences, device="cpu"):
    """Compute the token-level perplexity of a causal LM over ``sentences``.

    Each sentence is tokenized and scored on its own with a sliding window of
    at most ``model.config.max_position_embeddings`` tokens. The negative
    log-likelihood is accumulated over all scored tokens of all sentences and
    exponentiated once at the end.

    Args:
        model: Causal language model whose forward pass returns ``.loss`` when
            called with ``labels``.
        tokenizer: Tokenizer matching ``model``.
        sentences: Iterable of strings; blank entries are skipped.
        device: Device the model and inputs are moved to.

    Returns:
        float: ``exp(total NLL / number of scored tokens)``, or ``nan`` when no
        token could be scored (e.g. empty input or only one-token sentences).
    """
    model.to(device)
    model.eval()

    # Window settings do not depend on the sentence, so compute them once.
    max_length = model.config.max_position_embeddings
    stride = min(512, max_length)  # never step further than one window

    nll_sum = 0.0
    n_tokens = 0

    for sentence in tqdm(sentences, desc="Calculating Perplexity"):
        sentence = sentence.strip()
        if not sentence:
            continue

        input_ids = tokenizer(sentence, return_tensors="pt").input_ids.to(device)
        seq_len = input_ids.size(1)
        prev_end_loc = 0

        for begin_loc in range(0, seq_len, stride):
            end_loc = min(begin_loc + max_length, seq_len)
            trg_len = end_loc - prev_end_loc  # tokens not yet scored by an earlier window
            input_chunk = input_ids[:, begin_loc:end_loc]

            # Everything except the last trg_len tokens is context only
            target_chunk = input_chunk.clone()
            target_chunk[:, :-trg_len] = -100

            # The model shifts labels by one position internally, so the first
            # position of a window is never predicted. Count exactly the
            # labels that contribute to the averaged loss.
            num_loss_tokens = (target_chunk[:, 1:] != -100).sum().item()

            # A window without any predicted token (e.g. a one-token sentence)
            # has an undefined loss; skip it so it cannot turn the sum into NaN.
            if num_loss_tokens > 0:
                with torch.no_grad():
                    outputs = model(input_chunk, labels=target_chunk)
                # outputs.loss is the mean over the contributing labels
                nll_sum += outputs.loss.item() * num_loss_tokens
                n_tokens += num_loss_tokens

            prev_end_loc = end_loc
            if end_loc == seq_len:
                break

    if n_tokens == 0:
        return float("nan")

    avg_nll = nll_sum / n_tokens  # Average negative log-likelihood per token
    return torch.exp(torch.tensor(avg_nll)).item()


In [None]:
def calculate_bleu_score(model, tokenizer, sentences, device="cpu", prompt_length=8, max_length=100):
    """Average sentence-level BLEU of model continuations against the true ones.

    For every sentence with more than ``prompt_length`` words, the first
    ``prompt_length`` words are used as the prompt and the remaining words as
    the reference. The model's continuation is compared to the reference with
    smoothed sentence BLEU on lowercased, whitespace-split tokens.

    Args:
        model: Causal language model exposing ``generate``.
        tokenizer: Tokenizer matching ``model``.
        sentences: Iterable of strings; blank or too-short entries are skipped.
        device: Device the model and inputs are moved to.
        prompt_length: Number of leading words used as the prompt.
        max_length: ``max_length`` passed to ``generate``.

    Returns:
        float: Mean BLEU over the sentences that were actually prompted
        (an empty generation counts as 0), or ``0.0`` if none qualified.
    """
    model.to(device)
    model.eval()

    total_bleu = 0.0
    num_scored = 0  # only sentences that produced a prompt/reference pair
    smoothing = SmoothingFunction().method1

    for sentence in tqdm(sentences, desc="Calculating BLEU Score", leave=True):
        sentence = sentence.strip()
        if not sentence:
            continue

        # The prompt is measured in words, not characters
        words = sentence.split()
        if len(words) <= prompt_length:
            continue  # Too short to leave any reference text after the prompt

        prompt = ' '.join(words[:prompt_length])
        actual_continuation = ' '.join(words[prompt_length:])

        input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to(device)

        # NOTE(review): do_sample is not passed here, so temperature/top_p
        # presumably only take effect if the model's own generation config
        # enables sampling — confirm this is intended.
        with torch.no_grad():
            output = model.generate(
                input_ids=input_ids,
                max_length=max_length,
                temperature=1,
                top_p=0.8,
                pad_token_id=tokenizer.eos_token_id
            )

        generated_text = tokenizer.decode(output[0], skip_special_tokens=True)

        # Strip the prompt so only the continuation is scored
        if generated_text.startswith(prompt):
            generated_continuation = generated_text[len(prompt):].strip()
        else:
            generated_continuation = generated_text

        reference_tokens = actual_continuation.lower().split()
        generated_tokens = generated_continuation.lower().split()

        num_scored += 1
        if reference_tokens and generated_tokens:
            total_bleu += sentence_bleu([reference_tokens], generated_tokens, smoothing_function=smoothing)

    # Average over evaluated sentences only; skipped inputs must not dilute the
    # score, and an empty evaluation must not divide by zero.
    if num_scored == 0:
        return 0.0
    return total_bleu / num_scored

In [None]:
import evaluate 

def calculate_bertscore(model, tokenizer, sentences, device="cpu", prompt_length=8, max_new_tokens=50):
    """Average BERTScore of model continuations against the true continuations.

    For every sentence with more than ``prompt_length`` words, the first
    ``prompt_length`` words form the prompt and the rest is the reference. The
    generated continuation (prompt removed) is scored against the reference
    with the ``bertscore`` metric using ``microsoft/deberta-xlarge-mnli``.

    Args:
        model: Causal language model exposing ``generate``.
        tokenizer: Tokenizer matching ``model``.
        sentences: Iterable of strings; blank or too-short entries are skipped.
        device: Device the model and inputs are moved to.
        prompt_length: Number of leading words used as the prompt.
        max_new_tokens: Maximum number of tokens to generate per prompt.

    Returns:
        tuple[float, float, float]: Mean precision, recall and F1, in that
        order; ``(0.0, 0.0, 0.0)`` when no sentence qualified for scoring.
    """
    model.to(device)
    model.eval()

    references = []
    predictions = []

    for sentence in tqdm(sentences, desc="Calculating BERTScore", leave=True):
        sentence = sentence.strip()
        if not sentence:
            continue

        words = sentence.split()
        if len(words) <= prompt_length:
            continue  # skip too-short examples

        prompt = ' '.join(words[:prompt_length])
        actual_continuation = ' '.join(words[prompt_length:])

        # Encode and generate
        input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to(device)
        with torch.no_grad():
            output = model.generate(
                input_ids=input_ids,
                max_new_tokens=max_new_tokens,
                temperature=1,
                top_p=0.8,
                pad_token_id=tokenizer.eos_token_id
            )

        generated_text = tokenizer.decode(output[0], skip_special_tokens=True)

        # Remove the prompt part so only the continuation is compared
        if generated_text.startswith(prompt):
            generated_continuation = generated_text[len(prompt):].strip()
        else:
            generated_continuation = generated_text.strip()

        references.append(actual_continuation)
        predictions.append(generated_continuation)

    # Nothing to score: avoid loading the metric model and dividing by zero.
    if not predictions:
        return 0.0, 0.0, 0.0

    bertscore = evaluate.load("bertscore")

    results = bertscore.compute(
        predictions=predictions,
        references=references,
        lang="en",
        batch_size=4,
        model_type="microsoft/deberta-xlarge-mnli"
    )

    count = len(predictions)
    avg_p = sum(results["precision"]) / count
    avg_r = sum(results["recall"]) / count
    avg_f1 = sum(results["f1"]) / count

    return avg_p, avg_r, avg_f1

This function generates text for qualitative evaluation

In [None]:
def generate_text(model, tokenizer, prompt, max_new_tokens=50, temperature=1, top_p=0.8):
    """Sample a continuation of ``prompt`` and return the decoded text.

    The prompt is tokenized, moved to the model's device and passed to
    ``model.generate`` with sampling enabled. The first returned sequence is
    decoded with special tokens removed and returned as a string.
    """
    encoded_prompt = tokenizer(prompt, return_tensors="pt").to(model.device)

    # Sampling settings forwarded to generate(); EOS doubles as the padding id
    generation_options = {
        "max_new_tokens": max_new_tokens,
        "temperature": temperature,
        "top_p": top_p,
        "do_sample": True,
        "pad_token_id": tokenizer.eos_token_id,
    }

    with torch.no_grad():
        sequences = model.generate(**encoded_prompt, **generation_options)

    first_sequence = sequences[0]
    return tokenizer.decode(first_sequence, skip_special_tokens=True)

## 2.1 Training GPT-2 Full-Finetuning

In [None]:
# Base checkpoint for full fine-tuning
model_name = "gpt2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
# ensure we have a pad token: fall back to the end-of-sequence token
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.pad_token_id = tokenizer.eos_token_id

model = AutoModelForCausalLM.from_pretrained(model_name)
# Keep the embedding matrix size in sync with the tokenizer's vocabulary size
model.resize_token_embeddings(len(tokenizer))
# Record the padding id on the model config as well
model.config.pad_token_id = tokenizer.pad_token_id

We need to split train data to training and validation set

In [None]:
# Load train.txt with the "text" loader under a single split named "all"
dataset = load_dataset("text", data_files={"all": "train.txt"})["all"]
# Hold out 10% of the training texts for validation; the fixed random_state
# keeps the split reproducible
train_texts, val_texts = train_test_split(dataset["text"],
                                          test_size=0.1,
                                          random_state=182)
# Wrap both parts as a DatasetDict with "train" and "validation" splits
dataset_dict = DatasetDict({
    "train": Dataset.from_dict({"text": train_texts}),
    "validation": Dataset.from_dict({"text": val_texts}),
})

Tokenization step

In [None]:
def tokenize_fn(examples):
    """Tokenize a batch of text rows, truncating each to at most 512 tokens.

    No padding is added here. Padding every sentence to 512 tokens makes each
    training step process mostly padding; the language-modeling data collator
    used by the trainer pads each batch to its own longest sequence instead.
    """
    return tokenizer(
        examples["text"],
        truncation=True,
        max_length=512,
        return_attention_mask=True
    )

# Tokenize both splits in batches and drop the raw text column
tokenized = dataset_dict.map(
    tokenize_fn,
    batched=True,
    remove_columns=["text"],
)
# Return the tokenized columns as torch tensors
tokenized.set_format(
    type="torch",
    columns=["input_ids", "attention_mask"],
)

Define training arguments

In [None]:
# Collator for causal language modeling (mlm=False disables masked-LM masking)
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,
)

# Checkpoints are written and evaluation is run every 5000 steps; at most two
# checkpoints are kept on disk
training_args = TrainingArguments(
    output_dir="./outputs/finetuned_gpt2",
    overwrite_output_dir=True,
    num_train_epochs=5,
    per_device_train_batch_size=16,   
    learning_rate=5e-4,              
    save_steps=5000,
    save_total_limit=2,
    logging_steps=200,
    eval_strategy="steps",        
    eval_steps=5000,
    fp16=True,  # mixed-precision training
    report_to="none",  # no external experiment tracker
)

Train the model

In [None]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized["train"],
    eval_dataset=tokenized["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
)

trainer.train()

Evaluation of the model with test data: Perplexity

In [None]:

# 1) Load the fine-tuned checkpoint
checkpoint_dir = ".../outputs/finetuned_gpt2/checkpoint-38465"
tokenizer = AutoTokenizer.from_pretrained(checkpoint_dir)
# Padding below needs a pad token; fall back to EOS if the checkpoint has none
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
model = AutoModelForCausalLM.from_pretrained(checkpoint_dir)
model.eval().cuda()

# 2) Tokenize your validation data
max_length = 128
def tokenize_fn(examples):
    """Tokenize a batch of text rows, truncated/padded to ``max_length`` tokens."""
    return tokenizer(
        examples["text"],
        truncation=True,
        max_length=max_length,
        padding="max_length"
    )

ds = load_dataset("text", data_files={"validation": "test.txt"})
tokenized_val = ds["validation"].map(tokenize_fn, batched=True, remove_columns=["text"])

# 3) Create DataLoader
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
dataloader = DataLoader(tokenized_val, batch_size=16, collate_fn=data_collator)

# 4) Manually evaluate, accumulating a token-weighted negative log-likelihood.
#    outputs.loss is a mean over the scored tokens of one batch, so averaging
#    those means directly would give batches with few real tokens (and the
#    smaller last batch) the same weight as full ones.
total_nll = 0.0
total_tokens = 0

with torch.no_grad():
    for batch in tqdm(dataloader, desc="Evaluating"):
        batch = {k: v.cuda() for k, v in batch.items()}
        # Labels are shifted by one inside the model, so position 0 of each
        # row is never scored; padding positions are marked with -100.
        scored_tokens = (batch["labels"][:, 1:] != -100).sum().item()
        if scored_tokens == 0:
            continue  # nothing to score in this batch; its loss is undefined
        outputs = model(**batch)
        total_nll += outputs.loss.item() * scored_tokens
        total_tokens += scored_tokens

# 5) Compute perplexity
avg_loss = total_nll / total_tokens
perplexity = math.exp(avg_loss)
print(f"Manual evaluation perplexity: {perplexity:.2f}")

BERT, BLEU and PPL scores Calculation

In [None]:
with open("test.txt", "r", encoding="utf-8") as f:
    test_sentences = f.readlines()

In [None]:
bert_score = calculate_bertscore(model, tokenizer, test_sentences[:350], device="cuda", prompt_length=8)

In [None]:
# calculate_bertscore returns (precision, recall, f1); a tuple cannot be
# formatted with ":.2f", so unpack it and report each value.
bert_precision, bert_recall, bert_f1 = bert_score
print(f"BERT: precision={bert_precision:.2f} recall={bert_recall:.2f} f1={bert_f1:.2f}")

In [None]:
perplexity_score = calculate_perplexity(model, tokenizer, test_sentences, device="cuda")

In [None]:
print(f"Perplexity: {perplexity_score:.2f}")

In [None]:
bleu_score = calculate_bleu_score(model, tokenizer, test_sentences[:350], device="cuda")

In [None]:
print(f"BLEU: {bleu_score :.2f}")

Sentences generation

In [None]:
prompt = "When inquisitors punish heretics it is not with the desire to"
reference = "When inquisitors punish heretics it is not with the desire to destroy them, but that they shall be converted and live."
generated_sentence = generate_text(model, tokenizer, prompt)
print(f"Prompt: {prompt}")
print(f"Generated Text: {generated_sentence}")
print(f"Reference Text: {reference}")

In [None]:
prompt = "The most personally courageous become bullies and"
reference = "The most personally courageous become bullies and the terror of the community."
generated_sentence = generate_text(model, tokenizer, prompt)
print(f"Prompt: {prompt}")
print(f"Generated Text: {generated_sentence}")
print(f"Reference Text: {reference}")

In [None]:
prompt = "The whisper of the wind that stirred the willows made soft accompaniment of the splash of"
reference = "The whisper of the wind that stirred the willows made soft accompaniment of the splash of paddle in the stream the birds sang lustily amid the gentle rustle of the garden trees, and when the thrush retired to roost the nightingale took up the tale."

generated_sentence = generate_text(model, tokenizer, prompt)

print(f"Prompt: {prompt}")
print(f"Generated: {generated_sentence}")
print(f"Reference: {reference}")

## 2.2 Training GPT-NEO Full-Finetuning

In [None]:
model_name = "EleutherAI/gpt-neo-125M"

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
    model.resize_token_embeddings(len(tokenizer))

Dataset preparation and tokenization are the same as for GPT-2

Defining training arguments

In [None]:
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# Define training arguments.
training_args = TrainingArguments(
    output_dir="./gptneo_finetuned",
    overwrite_output_dir=True,
    num_train_epochs=5,
    per_device_train_batch_size=16, 
    learning_rate=5e-4,                          
    save_steps=5000,
    save_total_limit=2,
    logging_steps=200,
    eval_strategy="epoch",
    fp16=True,
    report_to="none",
)

## 2.3. Train GPT-2 with LoRA

In [None]:
from datasets import load_from_disk
from peft import LoraConfig, get_peft_model, TaskType

tokenized_datasets = load_from_disk("tokenized_datasets")

Define LoRa configurations

In [None]:
# LoRA adapter settings: rank-8 adapters on the modules named c_attn and c_proj
lora_config = LoraConfig(
    r=8,
    lora_alpha=16,
    target_modules=["c_attn", "c_proj"],  # Targeting the right layers for GPT-2
    lora_dropout=0.1,
    bias="none",
    task_type="CAUSAL_LM",
)

# NOTE(review): `model` is whatever an earlier cell left in scope; this section
# does not reload GPT-2 itself — confirm the base model before running.
model = get_peft_model(model, lora_config)
# Freeze every parameter reachable from base_model ...
for param in model.base_model.parameters():
    param.requires_grad = False

# Unfreeze only the LoRA adapter parameters
for name, param in model.named_parameters():
    if "lora" in name:
        param.requires_grad = True  # Keep adapters trainable

Tokenization step

In [None]:
tokenized_datasets.set_format(type="torch", columns=["input_ids", "attention_mask"])

Define Training Arguments

In [None]:
# NOTE(review): output_dir says "distilgpt2-lora" although this section is
# titled GPT-2 with LoRA — confirm which base model was trained here.
training_args = TrainingArguments(
    output_dir="/kaggle/working/distilgpt2-lora",
    overwrite_output_dir=True,
    num_train_epochs=5,
    per_device_train_batch_size=16,
    save_steps=5000,
    save_total_limit=2,
    logging_steps=200,
    eval_strategy="epoch",
    learning_rate=5e-4,
    fp16=True,  # Use mixed precision
    push_to_hub=False,  # Do not push checkpoints to the Hub
    report_to="none"
)
# Collator for causal language modeling (mlm=False disables masked-LM masking)
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,
)

Train GPT-2 with LoRA

In [None]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
)

trainer.train()

Evaluation by calculating score for quantitative results

In [None]:
bert_score = calculate_bertscore(model, tokenizer, test_sentences[:350], device="cuda", prompt_length=8)

In [None]:
# calculate_bertscore returns (precision, recall, f1); a tuple cannot be
# formatted with ":.2f", so unpack it and report each value.
bert_precision, bert_recall, bert_f1 = bert_score
print(f"BERT: precision={bert_precision:.2f} recall={bert_recall:.2f} f1={bert_f1:.2f}")

In [None]:
perplexity_score = calculate_perplexity(model, tokenizer, test_sentences, device="cuda")

In [None]:
print(f"Perplexity: {perplexity_score:.2f}")

In [None]:
bleu_score = calculate_bleu_score(model, tokenizer, test_sentences[:350], device="cuda")

In [None]:
print(f"BLEU: {bleu_score :.2f}")

Getting generated sentences for qualitative results

In [None]:
prompt = "When inquisitors punish heretics it is not with the desire to"
reference = "When inquisitors punish heretics it is not with the desire to destroy them, but that they shall be converted and live."
generated_sentence = generate_text(model, tokenizer, prompt)
print(f"Prompt: {prompt}")
print(f"Generated Text: {generated_sentence}")
print(f"Reference Text: {reference}")

In [None]:
prompt = "The most personally courageous become bullies and"
reference = "The most personally courageous become bullies and the terror of the community."
generated_sentence = generate_text(model, tokenizer, prompt)
print(f"Prompt: {prompt}")
print(f"Generated Text: {generated_sentence}")
print(f"Reference Text: {reference}")

In [None]:
prompt = "The whisper of the wind that stirred the willows made soft accompaniment of the splash of"
reference = "The whisper of the wind that stirred the willows made soft accompaniment of the splash of paddle in the stream the birds sang lustily amid the gentle rustle of the garden trees, and when the thrush retired to roost the nightingale took up the tale."

generated_sentence = generate_text(model, tokenizer, prompt)

print(f"Prompt: {prompt}")
print(f"Generated: {generated_sentence}")
print(f"Reference: {reference}")

## 2.4. Train GPT-Neo with LoRA

In [None]:
model_name = "EleutherAI/gpt-neo-125M"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token
model.resize_token_embeddings(len(tokenizer))

LoRa configuration for GPT-Neo

In [None]:
lora_config = LoraConfig(
  r=8,
  lora_alpha=32,
  target_modules=["q_proj", "k_proj", "v_proj", "out_proj"],
  lora_dropout=0.1,
  bias="none",
  task_type=TaskType.CAUSAL_LM,
)
model = get_peft_model(model, lora_config)

Tokenization step is similar to GPT2 LoRA

In [None]:
# Training configuration for the GPT-Neo LoRA run (batch size 4 per device)
training_args = TrainingArguments(
    output_dir="./gptneo-lora",
    overwrite_output_dir=True,
    num_train_epochs=5,
    per_device_train_batch_size=4,
    save_steps=5000,
    save_total_limit=2,
    logging_steps=200,
    eval_strategy="epoch",
    learning_rate=5e-4,
    fp16=True,  # Use mixed precision
    push_to_hub=False,  # Do not push checkpoints to the Hub
    report_to="none"
)
# Collator for causal language modeling (mlm=False disables masked-LM masking)
data_collator = DataCollatorForLanguageModeling(
  tokenizer=tokenizer, mlm=False
)

# Train on the "train" split and evaluate on the "validation" split of the
# tokenized datasets loaded earlier
trainer = Trainer(
  model=model,
  args=training_args,
  train_dataset=tokenized_datasets["train"],
  eval_dataset= tokenized_datasets["validation"],
  data_collator=data_collator,
)

trainer.train()

Evaluation part

Calculate quantitative scores

In [None]:
bert_score = calculate_bertscore(model, tokenizer, test_sentences[:350], device="cuda", prompt_length=8)

In [None]:
# calculate_bertscore returns (precision, recall, f1); a tuple cannot be
# formatted with ":.2f", so unpack it and report each value.
bert_precision, bert_recall, bert_f1 = bert_score
print(f"BERT: precision={bert_precision:.2f} recall={bert_recall:.2f} f1={bert_f1:.2f}")

In [None]:
perplexity_score = calculate_perplexity(model, tokenizer, test_sentences, device="cuda")

In [None]:
print(f"Perplexity: {perplexity_score:.2f}")

In [None]:
bleu_score = calculate_bleu_score(model, tokenizer, test_sentences[:350], device="cuda")

In [None]:
print(f"BLEU: {bleu_score :.2f}")

Get Qualitative Results by text generation

In [None]:
prompt = "When inquisitors punish heretics it is not with the desire to"
reference = "When inquisitors punish heretics it is not with the desire to destroy them, but that they shall be converted and live."
generated_sentence = generate_text(model, tokenizer, prompt)
print(f"Prompt: {prompt}")
print(f"Generated Text: {generated_sentence}")
print(f"Reference Text: {reference}")

In [None]:
prompt = "The most personally courageous become bullies and"
reference = "The most personally courageous become bullies and the terror of the community."
generated_sentence = generate_text(model, tokenizer, prompt)
print(f"Prompt: {prompt}")
print(f"Generated Text: {generated_sentence}")
print(f"Reference Text: {reference}")

In [None]:
prompt = "The whisper of the wind that stirred the willows made soft accompaniment of the splash of"
reference = "The whisper of the wind that stirred the willows made soft accompaniment of the splash of paddle in the stream the birds sang lustily amid the gentle rustle of the garden trees, and when the thrush retired to roost the nightingale took up the tale."

generated_sentence = generate_text(model, tokenizer, prompt)

print(f"Prompt: {prompt}")
print(f"Generated: {generated_sentence}")
print(f"Reference: {reference}")

## 2.5 Train Distilled Deepseek-R1 with LoRA

Load the DeepSeek-R1-Distill-Qwen-1.5B model from Hugging Face

In [None]:
model_name = "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token

Define a bitsandbytes (bnb) quantization config to reduce memory use during training, because the model is too large otherwise

In [None]:
# 4-bit quantization
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16
)

In [None]:
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map="auto",
    trust_remote_code=True
)

Example of how pre-trained models were evaluated

In [None]:
# Select subset for quick evaluation
eval_samples = [x['text'] for x in test_dataset]

In [None]:
print("Pre-Training Metrics")
base_perplexity = calculate_perplexity(model, tokenizer, eval_samples, device="cuda")

In [None]:
print(f"Base Model Perplexity: {base_perplexity:.2f}")

In [None]:
base_bleu = calculate_bleu_score(model, tokenizer, eval_samples[:350], device="cuda")

In [None]:
print(f"Base Model BLEU: {base_bleu:.2f}")

In [None]:
base_bert = calculate_bertscore(model, tokenizer, test_sentences[:350], device="cuda", prompt_length=8)

In [None]:
# calculate_bertscore returns (precision, recall, f1); a tuple cannot be
# formatted with ":.2f", so unpack it and report each value.
base_precision, base_recall, base_f1 = base_bert
print(f"Base Model BERT: precision={base_precision:.2f} recall={base_recall:.2f} f1={base_f1:.2f}")

Here we print generation results for qualitative analysis

In [None]:
prompt = "When inquisitors punish heretics it is not with the desire to"
reference = "When inquisitors punish heretics it is not with the desire to destroy them, but that they shall be converted and live."
generated_sentence = generate_text(model, tokenizer, prompt)
print(f"Prompt: {prompt}")
print(f"Generated Text: {generated_sentence}")
print(f"Reference Text: {reference}")

In [None]:
prompt = "The most personally courageous become bullies and"
reference = "The most personally courageous become bullies and the terror of the community."
generated_sentence = generate_text(model, tokenizer, prompt)
print(f"Prompt: {prompt}")
print(f"Generated Text: {generated_sentence}")
print(f"Reference Text: {reference}")

In [None]:
prompt = "The whisper of the wind that stirred the willows made soft accompaniment of the splash of"
reference = "The whisper of the wind that stirred the willows made soft accompaniment of the splash of paddle in the stream the birds sang lustily amid the gentle rustle of the garden trees, and when the thrush retired to roost the nightingale took up the tale."

generated_sentence = generate_text(model, tokenizer, prompt)

print(f"Prompt: {prompt}")
print(f"Generated: {generated_sentence}")
print(f"Reference: {reference}")

Here defined LoRA config as for other models above

In [None]:
# LoRA setup
peft_config = LoraConfig(
    r=8,
    lora_alpha=32,
    target_modules=["q_proj", "v_proj"],
    lora_dropout=0.1,
    bias="none",
    task_type="CAUSAL_LM"
)
model = get_peft_model(model, peft_config)

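Here is an example of controlling which parameters are trained: any LoRA adapter parameter that ended up frozen is made trainable again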

In [None]:
# Manually enable LoRA params if needed
for name, param in model.named_parameters():
    if param.requires_grad is False and "lora" in name:
        param.requires_grad = True

In [None]:
def tokenize_function(examples):
    """Tokenize a batch of raw text rows for causal-LM training.

    Only the tokenizer outputs are returned. A ``labels`` column is not added
    here: the language-modeling collator (``mlm=False``) builds the labels
    from the padded ``input_ids`` of each batch, and a pre-computed labels
    column of varying length cannot be stacked into a batch tensor.
    """
    return tokenizer(
        examples['text'],
    )

In [None]:
tokenized_train = train_dataset.map(tokenize_function, batched=True, remove_columns=['text'])
tokenized_test = test_dataset.map(tokenize_function, batched=True, remove_columns=['text'])

In [None]:
from transformers import DataCollatorForLanguageModeling

# Batch collator for language modelling; mlm=False turns off masked-LM
# masking, i.e. the causal-LM setting.
data_collator = DataCollatorForLanguageModeling(mlm=False, tokenizer=tokenizer)

Training part

In [None]:
# Training configuration for the LoRA fine-tuning run.
training_args = TrainingArguments(
    output_dir="/kaggle/working/distil-deepseek-lora",
    per_device_train_batch_size=16,
    gradient_accumulation_steps=2,  # gradients accumulated over 2 batches per optimizer step
    num_train_epochs=5,
    eval_strategy="epoch",  # evaluate on the test split after every epoch
    # Log once per epoch. `logging_steps` is intentionally not set: it only
    # applies to step-based logging and would be ignored with this strategy.
    logging_strategy="epoch",
    learning_rate=2e-4,
    fp16=True,
    save_steps=1000,
    optim="paged_adamw_8bit",
    report_to="none",  # no external experiment tracker
    remove_unused_columns=False,
)

# Trainer wiring: LoRA-wrapped model, tokenized splits and the LM collator.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test,
    data_collator=data_collator,
)

In [None]:
# Run the fine-tuning loop configured by `training_args` above.
trainer.train()

Saving the model to the Hugging Face Hub for safety (the reason is explained further below)

In [None]:
# Save the trained model to the Kaggle working directory.
trainer.save_model("/kaggle/working/final_model")

In [None]:
# Log in to Hugging Face from the notebook; needed before the push below.
notebook_login()

In [None]:
# Push the model and its tokenizer to the same Hub repository so they can be
# reloaded together later.
model.push_to_hub("dudessa/deepseek-1.5b-gutenberg-lora")
tokenizer.push_to_hub("dudessa/deepseek-1.5b-gutenberg-lora")

This part of the code was used to continue training the model after it had been saved to the Hugging Face Hub. This step was essential in order not to lose training results after very long training sessions, as online notebooks tend to crash after several hours of training

In [None]:
from peft import PeftModel

# Tokenizer stored in the adapter repository; EOS doubles as the padding token.
tokenizer = AutoTokenizer.from_pretrained("dudessa/deepseek-1.5b-gutenberg-lora")
tokenizer.pad_token = tokenizer.eos_token

# Base checkpoint, loaded with the notebook's `bnb_config` quantization settings.
base_model = AutoModelForCausalLM.from_pretrained(
    "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B",
    device_map="auto",
    quantization_config=bnb_config,
    trust_remote_code=True,
)

# Attach the LoRA adapter that was pushed to the Hub onto the base model.
# NOTE(review): PeftModel.from_pretrained loads the adapter frozen unless
# is_trainable=True is passed -- confirm before resuming training from here.
model = PeftModel.from_pretrained(base_model, "dudessa/deepseek-1.5b-gutenberg-lora")

Evaluation after fine-tuning

In [None]:
# Collect the raw text of every test example (the whole split, no sub-sampling).
eval_samples = [row['text'] for row in test_dataset]

In [None]:
print("Post-Training Metrics")
# Perplexity of the fine-tuned model over all of `eval_samples`, computed with
# the notebook's `calculate_perplexity` helper on the GPU.
post_perplexity = calculate_perplexity(model, tokenizer, eval_samples, device="cuda")

In [None]:
# Report the perplexity rounded to two decimals.
print(f"Post Model Perplexity: {post_perplexity:.2f}")

In [None]:
# BLEU on the first 350 evaluation texts only.
post_bleu = calculate_bleu_score(model, tokenizer, eval_samples[:350], device="cuda")

In [None]:
# Report the BLEU score rounded to two decimals.
print(f"Post Model BLEU: {post_bleu:.2f}")

In [None]:
# BERTScore on the first 350 items; prompt_length=8 is forwarded to the helper
# (presumably the size of the prompt prefix -- confirm in calculate_bertscore).
# NOTE(review): this scores `test_sentences`, while the perplexity and BLEU
# cells of this section use `eval_samples` -- confirm both hold the same data
# and that the pre-training baseline used the same input.
post_bert = calculate_bertscore(model, tokenizer, test_sentences[:350], device="cuda", prompt_length=8)

In [None]:
# Report the BERTScore rounded to two decimals.
print(f"Post Model BERT: {post_bert:.2f}")

In [None]:
# Same qualitative prompt as before fine-tuning, now run through the
# fine-tuned model.
reference = "When inquisitors punish heretics it is not with the desire to destroy them, but that they shall be converted and live."
prompt = "When inquisitors punish heretics it is not with the desire to"
generated_sentence = generate_text(model, tokenizer, prompt)
print(
    f"Prompt: {prompt}",
    f"Generated Text: {generated_sentence}",
    f"Reference Text: {reference}",
    sep="\n",
)

In [None]:
# Short-prompt qualitative check with the fine-tuned model.
reference = "The most personally courageous become bullies and the terror of the community."
prompt = "The most personally courageous become bullies and"
generated_sentence = generate_text(model, tokenizer, prompt)
print(
    f"Prompt: {prompt}",
    f"Generated Text: {generated_sentence}",
    f"Reference Text: {reference}",
    sep="\n",
)

In [None]:
# Long descriptive prompt, run through the fine-tuned model.
reference = "The whisper of the wind that stirred the willows made soft accompaniment of the splash of paddle in the stream the birds sang lustily amid the gentle rustle of the garden trees, and when the thrush retired to roost the nightingale took up the tale."
prompt = "The whisper of the wind that stirred the willows made soft accompaniment of the splash of"
generated_sentence = generate_text(model, tokenizer, prompt)
print(
    f"Prompt: {prompt}",
    f"Generated Text: {generated_sentence}",
    f"Reference Text: {reference}",
    sep="\n",
)

## 2.6 Training DistilGPT-2 with LoRA

Loading the DistilGPT-2 model and its tokenizer, in the same way as for the previous models

In [None]:
# Checkpoint name shared by the tokenizer and the model.
model_name = "distilgpt2"
# Load the pre-trained tokenizer and model from that checkpoint.
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name)
# Put the model in evaluation mode (kept as the last expression of the cell).
model.eval()

In [None]:
# NOTE(review): this loads the same "distilgpt2" tokenizer that the previous
# cell already loaded via `model_name`; the reload looks redundant.
tokenizer = GPT2Tokenizer.from_pretrained("distilgpt2")

In [None]:
# Reuse the end-of-sequence token as the padding token so that batches can be padded.
tokenizer.pad_token = tokenizer.eos_token 

Tokenize dataset

In [None]:
# Tokenize `dataset_dict` in batches with the notebook's `tokenize` helper
# (defined elsewhere in this notebook).
tokenized_datasets = dataset_dict.map(tokenize, batched=True)

Define LoRA config for DistilGPT-2 again as for other models

In [None]:
# Define the adapter configuration (LoRA) for DistilGPT-2.
lora_config = LoraConfig(
    r=8,  # adapter rank
    lora_alpha=32,  # adapter scaling factor
    target_modules=["c_attn", "c_proj"],
    # GPT-2 implements these projections as Conv1D layers, which store their
    # weights as (fan_in, fan_out). Declaring it explicitly avoids relying on
    # PEFT's warn-and-override behaviour for Conv1D targets.
    fan_in_fan_out=True,
    lora_dropout=0.1,
    bias="none",
    task_type="CAUSAL_LM",
)

Freezing layers

In [None]:
# Wrap the model with the LoRA adapters described by `lora_config`.
model = get_peft_model(model, lora_config)

# Step 1: turn gradients off for every parameter under the wrapped base model.
model.base_model.requires_grad_(False)

# Step 2: turn gradients back on for the adapter weights only, identified by
# "lora" appearing in the parameter name.
for param_name, weight in model.named_parameters():
    if "lora" in param_name:
        weight.requires_grad = True

Training part of the model with Trainer

In [None]:
# Causal-LM batch collator (mlm=False disables masked-LM masking).
data_collator = DataCollatorForLanguageModeling(mlm=False, tokenizer=tokenizer)

# Hyper-parameters for the DistilGPT-2 LoRA run, grouped by concern.
training_args = TrainingArguments(
    # where checkpoints go and how many are kept
    output_dir="/kaggle/working/distilgpt2-lora",
    overwrite_output_dir=True,
    save_steps=5000,
    save_total_limit=2,
    push_to_hub=False,
    # optimisation
    num_train_epochs=5,
    per_device_train_batch_size=16,
    learning_rate=5e-4,
    fp16=True,
    # logging and evaluation cadence
    logging_steps=200,
    eval_strategy="epoch",
    report_to="none",
)


In [None]:
# Wire the LoRA-wrapped model, the tokenized splits and the collator into a
# Trainer: it trains on "train" and evaluates on "validation".
trainer = Trainer(
    args=training_args,
    model=model,
    data_collator=data_collator,
    eval_dataset=tokenized_datasets["validation"],
    train_dataset=tokenized_datasets["train"],
)

In [None]:
# Run the DistilGPT-2 LoRA fine-tuning configured above.
trainer.train()

Save the model locally and on the Hugging Face Hub

In [None]:
# Save the model and the tokenizer side by side in the Kaggle working
# directory (same folder as the training output_dir).
model.save_pretrained("/kaggle/working/distilgpt2-lora")
tokenizer.save_pretrained("/kaggle/working/distilgpt2-lora")

In [None]:
from huggingface_hub import HfApi

api = HfApi()
# Target Hub repository. Note that this rebinds `model_name`, which held the
# base checkpoint name earlier in this section.
model_name = "dudessa/distilgpt2-lora-finetuned"  # Replace with your model name

# Make sure the repository exists; exist_ok=True makes this a no-op when it
# already does.
api.create_repo(repo_id=model_name, exist_ok=True)

# Upload the model first, then the tokenizer, to that repository.
model.push_to_hub(model_name)
tokenizer.push_to_hub(model_name)

Evaluate the model

In [None]:
# Perplexity of the fine-tuned DistilGPT-2 over all of `test_sentences`.
# NOTE(review): no `device` argument is passed here, unlike the BERTScore call
# below -- this relies on the helper's default; confirm it matches the model's device.
ppl_score = calculate_perplexity(model, tokenizer, test_sentences)
print(f"Perplexity: {ppl_score}")

In [None]:
# BLEU on the first 350 test sentences only.
bleu_score = calculate_bleu_score(model, tokenizer, test_sentences[:350]) 
print(f"BLEU Score: {bleu_score}")

In [None]:
# BERTScore on the first 350 test sentences; prompt_length=8 is forwarded to
# the helper (presumably the size of the prompt prefix -- confirm in
# calculate_bertscore). The result is printed rounded to two decimals.
bert_score = calculate_bertscore(model, tokenizer, test_sentences[:350], device="cuda", prompt_length=8)
print(f"Model BERT: {bert_score:.2f}")

In [None]:
# Qualitative check of the fine-tuned DistilGPT-2 on the first demo prompt.
reference = "When inquisitors punish heretics it is not with the desire to destroy them, but that they shall be converted and live."
prompt = "When inquisitors punish heretics it is not with the desire to"
generated_sentence = generate_text(model, tokenizer, prompt)
print(
    f"Prompt: {prompt}",
    f"Generated Text: {generated_sentence}",
    f"Reference Text: {reference}",
    sep="\n",
)

In [None]:
# Qualitative check of the fine-tuned DistilGPT-2 on the short demo prompt.
reference = "The most personally courageous become bullies and the terror of the community."
prompt = "The most personally courageous become bullies and"
generated_sentence = generate_text(model, tokenizer, prompt)
print(
    f"Prompt: {prompt}",
    f"Generated Text: {generated_sentence}",
    f"Reference Text: {reference}",
    sep="\n",
)

In [None]:
# Qualitative check of the fine-tuned DistilGPT-2 on the long demo prompt.
reference = "The whisper of the wind that stirred the willows made soft accompaniment of the splash of paddle in the stream the birds sang lustily amid the gentle rustle of the garden trees, and when the thrush retired to roost the nightingale took up the tale."
prompt = "The whisper of the wind that stirred the willows made soft accompaniment of the splash of"
generated_sentence = generate_text(model, tokenizer, prompt)
print(
    f"Prompt: {prompt}",
    f"Generated Text: {generated_sentence}",
    f"Reference Text: {reference}",
    sep="\n",
)