In [2]:
!pip install transformers torch

Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch)
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl (731.7 MB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch)
  Using cached nvidia_cublas_cu12-12.1.3.1-py3-none-manylinux1_x86_64.whl (410.6 MB)
Collecting nvidia-cufft-cu12==11.0.2.54 (from torch)
  Using cached nvidia_cufft_cu12-11.0.2.54-py3-none-manylinux1_x86_64.whl (121.6 MB)
Collecting nvidia-curand-cu12==10.3.2.106 (from torch)
  Using cached nvidia_curand_cu12-10.3.2.106-py3-none-manylinux1_x86_64.whl (56.5 MB)
Collectin

In [3]:
import os
from torch.utils.data import Dataset, DataLoader
from transformers import GPT2Tokenizer, GPT2LMHeadModel, AdamW, get_linear_schedule_with_warmup , TextDataset, DataCollatorForLanguageModeling, Trainer, TrainingArguments
from torch.nn.utils.rnn import pad_sequence
import torch

import logging
logging.getLogger().setLevel(logging.CRITICAL)

import warnings
warnings.filterwarnings('ignore')

device = 'cuda'
# if torch.cuda.is_available():
#     device = 'cuda'

In [4]:
# Load the pre-trained GPT-2 model and tokenizer
model_name = 'gpt2'
model = GPT2LMHeadModel.from_pretrained(model_name)
tokenizer = GPT2Tokenizer.from_pretrained(model_name)


config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

In [5]:
class QuotesDataset(Dataset):
    def __init__(self, file_path, tokenizer, end_of_text_token="<|endoftext|>"):
        super().__init__()

        self.quote_list = []
        self.end_of_text_token = end_of_text_token
        self.tokenizer = tokenizer

        with open(file_path, 'r', encoding='utf-8') as file:
            for line in file:
                quote_str = f"{line.strip()} {self.end_of_text_token}"
                self.quote_list.append(quote_str)

    def __len__(self):
        return len(self.quote_list)

    def __getitem__(self, item):
        return self.quote_list[item]
# Custom collate function to pad sequences
def collate_fn(batch):
    input_ids = [torch.tensor(tokenizer.encode(item)) for item in batch]
    input_ids_padded = pad_sequence(input_ids, batch_first=True, padding_value=tokenizer.pad_token_id)
    return input_ids_padded

In [10]:
tokenizer.add_special_tokens({'pad_token': '[PAD]'})

# Paths to the dataset files
train_file_path = 'train2.txt'
valid_file_path = 'valid2.txt'
test_file_path = 'test2.txt'

# Load the datasets
train_dataset = QuotesDataset(train_file_path, tokenizer)
valid_dataset = QuotesDataset(valid_file_path, tokenizer)
test_dataset = QuotesDataset(test_file_path, tokenizer)

# Create DataLoaders
train_dataloader = DataLoader(train_dataset, batch_size=1, shuffle=True, collate_fn=collate_fn)
valid_dataloader = DataLoader(valid_dataset, batch_size=1, collate_fn=collate_fn)

In [14]:
# Training parameters
EPOCHS = 3
LEARNING_RATE = 5e-5
WARMUP_STEPS = 500  # You can keep this as it doesn't affect training time significantly
BATCH_SIZE = 2
MAX_SEQ_LEN = 400
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the pre-trained GPT-2 model
model = GPT2LMHeadModel.from_pretrained('gpt2')
model.resize_token_embeddings(len(tokenizer))
model = model.to(device)

# Optimizer and scheduler
optimizer = AdamW(model.parameters(), lr=LEARNING_RATE)
total_steps = len(train_dataloader) * EPOCHS
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=WARMUP_STEPS, num_training_steps=total_steps)

model.train()
proc_seq_count = 0
sum_loss = 0.0
batch_count = 0

tmp_quotes_tens = None
models_folder = "trained_models"
if not os.path.exists(models_folder):
    os.mkdir(models_folder)

for epoch in range(EPOCHS):

    print(f"EPOCH {epoch} started" + '=' * 30)

    for idx, quotes in enumerate(train_dataloader):

        quote_tens = quotes.to(device)

        # Skip sample if it's longer than MAX_SEQ_LEN
        if quote_tens.size()[1] > MAX_SEQ_LEN:
            continue

        # First quote sequence
        if not torch.is_tensor(tmp_quotes_tens):
            tmp_quotes_tens = quote_tens
            continue
        else:
            # Sequence too long
            if tmp_quotes_tens.size()[1] + quote_tens.size()[1] > MAX_SEQ_LEN:
                work_quotes_tens = tmp_quotes_tens
                tmp_quotes_tens = quote_tens
            else:
                # Add quote to sequence
                tmp_quotes_tens = torch.cat([tmp_quotes_tens, quote_tens[:, 1:]], dim=1)
                continue

        # Process sequence
        outputs = model(work_quotes_tens, labels=work_quotes_tens)
        loss, logits = outputs[:2]
        loss.backward()
        sum_loss += loss.detach().data

        proc_seq_count += 1
        if proc_seq_count == BATCH_SIZE:
            proc_seq_count = 0
            batch_count += 1
            optimizer.step()
            scheduler.step()
            optimizer.zero_grad()
            model.zero_grad()

        if batch_count == 10:
            print(f"sum loss {sum_loss}")
            batch_count = 0
            sum_loss = 0.0

    # Save the model after each epoch
    torch.save(model.state_dict(), os.path.join(models_folder, f"gpt2_quotes_{epoch}.pt"))

# Evaluate the model on the validation set
model.eval()
validation_loss = 0.0
for idx, quotes in enumerate(valid_dataloader):
    quote_tens = quotes.to(device)
    with torch.no_grad():
        outputs = model(quote_tens, labels=quote_tens)
        loss, logits = outputs[:2]
        validation_loss += loss.item()

print(f"Validation loss: {validation_loss / len(valid_dataloader)}")

sum loss 1967.8048095703125
sum loss 1917.9769287109375
sum loss 1748.870849609375
sum loss 1442.05712890625
sum loss 930.58740234375
sum loss 460.0806884765625
sum loss 220.7571563720703
sum loss 155.84884643554688
sum loss 133.4817657470703
sum loss 131.26492309570312
sum loss 124.9834976196289
sum loss 121.26736450195312
sum loss 116.33351135253906
sum loss 115.99312591552734
sum loss 109.31539154052734
sum loss 110.83807373046875
sum loss 104.39925384521484
sum loss 98.64334106445312
sum loss 101.19227600097656
sum loss 96.76641845703125
sum loss 94.550537109375
sum loss 95.84516906738281
sum loss 93.18386840820312
sum loss 90.11491394042969
sum loss 88.74517059326172
sum loss 89.97457122802734
sum loss 87.17593383789062
sum loss 86.93054962158203
sum loss 86.68843841552734
sum loss 84.2596206665039
sum loss 84.90476989746094
sum loss 86.72714233398438
sum loss 83.08370971679688
sum loss 83.32669830322266
sum loss 78.30046081542969
sum loss 78.4620590209961
sum loss 80.842765808105

In [16]:
import os
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel
import numpy as np

def choose_from_top(probs, n=5):
    """
    Choose the next token from the top n candidates
    """
    ind = np.argpartition(probs, -n)[-n:]
    top_prob = probs[ind]
    top_prob = top_prob / np.sum(top_prob)
    chosen_index = np.random.choice(n, 1, p=top_prob)
    token_id = ind[chosen_index][0]
    return int(token_id)

# Load the tokenizer and model
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
tokenizer.add_special_tokens({'pad_token': '[PAD]', 'eos_token': '<|endoftext|>'})

model = GPT2LMHeadModel.from_pretrained('gpt2')
model.resize_token_embeddings(len(tokenizer))

# Load the fine-tuned model
MODEL_EPOCH = 2  # Change this to the epoch you want to load
models_folder = "trained_models"
model_path = os.path.join(models_folder, f"gpt2_quotes_{MODEL_EPOCH}.pt")
model.load_state_dict(torch.load(model_path))

# Generate quotes
quotes_output_file_path = f'generated_quotes_epoch_{MODEL_EPOCH}.txt'

model.eval()
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

if os.path.exists(quotes_output_file_path):
    os.remove(quotes_output_file_path)

with torch.no_grad():
    for quote_idx in range(1000):
        quote_finished = False
        cur_ids = torch.tensor(tokenizer.encode("QUOTE:")).unsqueeze(0).to(device)

        for i in range(100):
            outputs = model(cur_ids)
            logits = outputs.logits
            softmax_logits = torch.softmax(logits[0, -1], dim=0)  # Get the logits of the last token

            if i < 3:
                n = 20
            else:
                n = 3
            next_token_id = choose_from_top(softmax_logits.to('cpu').numpy(), n=n)  # Select the next token
            cur_ids = torch.cat([cur_ids, torch.ones((1, 1)).long().to(device) * next_token_id], dim=1)  # Add the token to the sequence

            if next_token_id == tokenizer.eos_token_id:
                quote_finished = True
                break

        if quote_finished:
            output_list = list(cur_ids.squeeze().to('cpu').numpy())
            output_text = tokenizer.decode(output_list)

            with open(quotes_output_file_path, 'a') as f:
                f.write(f"{output_text}\n\n")

print(f"Generated quotes saved to {quotes_output_file_path}")


Generated quotes saved to generated_quotes_epoch_2.txt
