In [15]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel, AdamW
import torch
from torch.utils.data import Dataset, DataLoader
from torch.utils.data.sampler import SubsetRandomSampler
import csv
import json
import random
import math
import numpy as np
from tqdm import tqdm
from google.colab import drive
import json
import torch.nn.functional as F
from tqdm import trange


# Mount Google Drive
drive.mount('/content/drive')

# Define the path to your JSON file
json_file_path = '/content/drive/MyDrive/Quote-Generation-ML/data/quotes.json'

def load_quotes(num_quotes):
    # Load the JSON file and select the first `num_quotes` quotes
    with open(json_file_path, 'r') as file:
        quotes_data = json.load(file)

    # Select the first `num_quotes` quotes
    quotes_data = quotes_data[:num_quotes]

    # Extract quotes and categories from the dataset
    quotes = [entry['quote'] for entry in quotes_data]
    categories = [entry['categories'] for entry in quotes_data]

    # Combine quotes and categories into a single input for the model
    inputs = [f"{quote} Categories: {', '.join(category)}" for quote, category in zip(quotes, categories)]

    return inputs

# Set up GPT-2 model and tokenizer
MODEL_NAME = 'distilgpt2'  # 'distilgpt2' or 'gpt2-medium'
tokenizer = GPT2Tokenizer.from_pretrained(MODEL_NAME)
model = GPT2LMHeadModel.from_pretrained(MODEL_NAME)

# Declare special tokens for padding and separating the context from the quote:
SPECIAL_TOKENS_DICT = {
    'pad_token': '<pad>',
    'additional_special_tokens': ['<context>', '<quote>'],
}

# Add these special tokens to the vocabulary and resize model's embeddings:
tokenizer.add_special_tokens(SPECIAL_TOKENS_DICT)
model.resize_token_embeddings(len(tokenizer))

# Create a custom dataset for quote generation
class QuoteDataset(Dataset):
    def __init__(self, quotes, tokenizer, seq_length=64):
        context_tkn = tokenizer.additional_special_tokens_ids[0]
        quote_tkn = tokenizer.additional_special_tokens_ids[1]
        pad_tkn = tokenizer.pad_token_id
        eos_tkn = tokenizer.eos_token_id

        self.examples = []
        for quote in quotes:
            # Build the context and quote segments:
            context = [context_tkn] + tokenizer.encode(quote, max_length=seq_length // 2 - 1)
            quote = [quote_tkn] + tokenizer.encode(quote, max_length=seq_length // 2 - 2) + [eos_tkn]

            # Concatenate the two parts together:
            tokens = context + quote + [pad_tkn] * (seq_length - len(context) - len(quote))

            # Annotate each token with its corresponding segment:
            segments = [context_tkn] * len(context) + [quote_tkn] * (seq_length - len(context))

            # Ignore the context, padding, and <quote> tokens by setting their labels to -100
            labels = [-100] * (len(context) + 1) + quote[1:] + [-100] * (seq_length - len(context) - len(quote))

            # Add the preprocessed example to the dataset
            self.examples.append((tokens, segments, labels))

    def __len__(self):
        return len(self.examples)

    def __getitem__(self, item):
        return torch.tensor(self.examples[item])

def fit(model, optimizer, train_dl, val_dl, epochs=1, device=torch.device('cpu')):
    for i in range(epochs):
        print('\n--- Starting epoch #{} ---'.format(i))
        model.train()
        losses = []
        nums = []

        for batch in tqdm(train_dl, desc="Training"):
            inputs = batch.to(device)
            outputs = model(inputs[:, 0, :], token_type_ids=inputs[:, 1, :], labels=inputs[:, 2, :])
            loss = outputs[0]
            losses.append(loss.item())
            nums.append(len(batch))

            loss.backward()
            optimizer.step()
            model.zero_grad()

        train_cost = np.sum(np.multiply(losses, nums)) / sum(nums)

        model.eval()
        with torch.no_grad():
            losses = []
            nums = []

            for batch in tqdm(val_dl, desc="Validation"):
                inputs = batch.to(device)
                outputs = model(inputs[:, 0, :], token_type_ids=inputs[:, 1, :], labels=inputs[:, 2, :])
                losses.append(outputs[0].item())
                nums.append(len(batch))

        val_cost = np.sum(np.multiply(losses, nums)) / sum(nums)

        print('\n--- Epoch #{} finished --- Training cost: {} / Validation cost: {}'.format(i, train_cost, val_cost))

def top_k_top_p_filtering(logits, top_k=0, top_p=0.0, filter_value=-float('Inf')):
    top_k = min(top_k, logits.size(-1))
    if top_k > 0:
        indices_to_remove = logits < torch.topk(logits, top_k)[0][..., -1, None]
        logits[indices_to_remove] = filter_value

    if top_p > 0.0:
        sorted_logits, sorted_indices = torch.sort(logits, descending=True)
        cumulative_probs = torch.cumsum(F.softmax(sorted_logits, dim=-1), dim=-1)

        sorted_indices_to_remove = cumulative_probs > top_p
        sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[..., :-1].clone()
        sorted_indices_to_remove[..., 0] = 0

        indices_to_remove = sorted_indices_to_remove.scatter(dim=1, index=sorted_indices, src=sorted_indices_to_remove)
        logits[indices_to_remove] = filter_value
    return logits

def sample_sequence(model, length, context, segments_tokens=None, num_samples=1, temperature=1, top_k=0, top_p=0.0, repetition_penalty=1.0, device='cpu'):
    context = torch.tensor(context, dtype=torch.long, device=device)
    context = context.unsqueeze(0).repeat(num_samples, 1)
    generated = context

    with torch.no_grad():
        for _ in trange(length):
            inputs = {'input_ids': generated}
            if segments_tokens is not None:
                inputs['token_type_ids'] = torch.tensor(segments_tokens[:generated.shape[1]]).unsqueeze(0).repeat(num_samples, 1)

            outputs = model(**inputs)
            next_token_logits = outputs[0][:, -1, :] / (temperature if temperature > 0 else 1.)

            for i in range(num_samples):
                for _ in set(generated[i].tolist()):
                    next_token_logits[i, _] /= repetition_penalty

            filtered_logits = top_k_top_p_filtering(next_token_logits, top_k=top_k, top_p=top_p)
            if temperature == 0:
                next_token = torch.argmax(filtered_logits, dim=-1).unsqueeze(-1)
            else:
                next_token = torch.multinomial(F.softmax(filtered_logits, dim=-1), num_samples=1)
            generated = torch.cat((generated, next_token), dim=1)
    return generated


# Number of quotes to load
num_quotes_to_load = 50000  # Change this number as needed

# Load quotes using the function
inputs = load_quotes(num_quotes_to_load)

# Build the dataset and display the dimensions of the 1st batch for verification:
quote_dataset = QuoteDataset(inputs, tokenizer)
print(next(iter(quote_dataset)).size())

# Create data indices for training and validation splits:
indices = list(range(len(quote_dataset)))
random.seed(42)
random.shuffle(indices)
split = math.floor(0.1 * len(quote_dataset))
train_indices, val_indices = indices[split:], indices[:split]

# Build the PyTorch data loaders:
train_sampler = SubsetRandomSampler(train_indices)
val_sampler = SubsetRandomSampler(val_indices)

train_loader = DataLoader(quote_dataset, batch_size=32, sampler=train_sampler)
val_loader = DataLoader(quote_dataset, batch_size=64, sampler=val_sampler)

# Move the model to the GPU:
device = torch.device('cuda')
model.to(device)

# Fine-tune GPT2 for two epochs:
optimizer = AdamW(model.parameters())
fit(model, optimizer, train_loader, val_loader, epochs=2, device=device)


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


torch.Size([3, 64])

--- Starting epoch #0 ---


Training: 100%|██████████| 1407/1407 [08:05<00:00,  2.90it/s]
Validation: 100%|██████████| 79/79 [00:17<00:00,  4.47it/s]



--- Epoch #0 finished --- Training cost: 0.18449879344767994 / Validation cost: 0.020559473633766173

--- Starting epoch #1 ---


Training: 100%|██████████| 1407/1407 [08:08<00:00,  2.88it/s]
Validation: 100%|██████████| 79/79 [00:17<00:00,  4.48it/s]


--- Epoch #1 finished --- Training cost: 0.04173726387065318 / Validation cost: 0.015725482780486345





In [16]:

# Sample quotes using the trained model
context = "The purpose of"

context_tkn = tokenizer.additional_special_tokens_ids[0]
quote_tkn = tokenizer.additional_special_tokens_ids[1]

input_ids = [context_tkn] + tokenizer.encode(context)

segments = [quote_tkn] * 64
segments[:len(input_ids)] = [context_tkn] * len(input_ids)

input_ids += [quote_tkn]

# Move the model back to the CPU for inference:
model.to(torch.device('cpu'))

# Generate 10 samples of max length 20 with a higher repetition penalty
generated = sample_sequence(model, length=20, context=input_ids, segments_tokens=segments, num_samples=10, repetition_penalty=2.0)

print('\n\n--- Generated Quotes ---\n')

for g in generated:
    quote = tokenizer.decode(g.squeeze().tolist())
    print(quote)



100%|██████████| 20/20 [00:11<00:00,  1.81it/s]



--- Generated Quotes ---

<quote> The purpose of <context> the purposeian,manuel Houston Rockets Rockets Bullets bullets arrows ink tags hand markings flash snow clipped talk numb
<quote> The purpose of <context> the purposeful <quote>  results from September September autumn autumn Reynolds •• Sketchz z ZZzn Zombie
<quote> The purpose of <context> the purpose about<|endoftext|>people different from Timeline to make forever forever forever forever forever, Mama Mom booted shut
<quote> The purpose of <context> the purpose for<|endoftext|>that means meant meaning meaning meaning reason which later next season Conference instructswards Mayo
<quote> The purpose of <context> the reason why why why why why why why why why why why why why why why why Why why
<quote> The purpose of <context> the purpose, between both either either neither neither neither neither either neither neither neither misunderstand pity if before then
<quote> The purpose of <context> the purpose most important <conte


