In [1]:
!pip install transformers torch tqdm nltk 
!pip install -U scikit-learn

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.30.1-py3-none-any.whl (7.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.2/7.2 MB[0m [31m89.2 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.14.1 (from transformers)
  Downloading huggingface_hub-0.15.1-py3-none-any.whl (236 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m236.8/236.8 kB[0m [31m32.8 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m120.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.3.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
with open('/content/drive/MyDrive/text.txt', 'r') as file:
    text = file.read()



In [5]:
import nltk
nltk.download('punkt')
from sklearn.model_selection import train_test_split
# Split the text into sentences
sentences = nltk.tokenize.sent_tokenize(text)

# Split sentences into training and validation sets
train_sentences, val_sentences = train_test_split(sentences, test_size=0.2, random_state=42)

print(f"Number of training sentences: {len(train_sentences)}")
print(f"Number of validation sentences: {len(val_sentences)}")



[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


Number of training sentences: 9968
Number of validation sentences: 2492


In [6]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel

# Define the model and tokenizer
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2LMHeadModel.from_pretrained('gpt2')

# Add a padding token to the tokenizer
tokenizer.pad_token = tokenizer.eos_token


Downloading (…)olve/main/vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

In [7]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel, AdamW, get_linear_schedule_with_warmup
import torch
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm

EPOCHS = 10  # Replace with the number of epochs you want to train for

# Define the PyTorch Dataset
class ShakespeareDataset(Dataset):
    def __init__(self, sentences, tokenizer):
        self.sentences = sentences
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.sentences)

    def __getitem__(self, idx):
        sentence = self.sentences[idx]
        inputs = self.tokenizer(sentence, return_tensors='pt', truncation=True, padding='max_length', max_length=100)
        input_ids = inputs["input_ids"].squeeze().to(device)
        attention_mask = inputs["attention_mask"].squeeze().to(device)
        return {"input_ids": input_ids, "attention_mask": attention_mask}

# Setup GPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)

# Define the PyTorch DataLoaders
train_dataset = ShakespeareDataset(train_sentences, tokenizer)
val_dataset = ShakespeareDataset(val_sentences, tokenizer)

train_dataloader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=32)

# Define the optimizer and learning rate scheduler
optimizer = AdamW(model.parameters(), lr=1e-4)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=100, num_training_steps=EPOCHS*len(train_dataloader))

# Training loop
for epoch in range(EPOCHS):
    # Training
    model.train()
    total_loss = 0
    progress_bar = tqdm(train_dataloader, desc=f"Epoch {epoch + 1}", position=0, leave=True)
    for batch in progress_bar:
        optimizer.zero_grad()
        outputs = model(**batch, labels=batch["input_ids"])
        loss = outputs.loss
        total_loss += loss.item()
        loss.backward()
        optimizer.step()
        scheduler.step()

        # Update the progress bar
        progress_bar.set_postfix({'training_loss': '{:.3f}'.format(loss.item()/len(batch))})

    # Validation
    model.eval()
    total_val_loss = 0
    val_progress_bar = tqdm(val_dataloader, desc="Validation", position=0, leave=True)
    with torch.no_grad():
        for batch in val_progress_bar:
            outputs = model(**batch, labels=batch["input_ids"])
            loss = outputs.loss
            total_val_loss += loss.item()

            # Update the progress bar
            val_progress_bar.set_postfix({'validation_loss': '{:.3f}'.format(loss.item()/len(batch))})

    # Print losses
    print(f"Epoch: {epoch+1}")
    print(f"Training loss: {total_loss/len(train_dataloader)}")
    print(f"Validation loss: {total_val_loss/len(val_dataloader)}")

# Save the model
model.save_pretrained('/content/drive/MyDrive/gpt2_finetuned_shakespeare')



Epoch 1: 100%|██████████| 312/312 [01:07<00:00,  4.65it/s, training_loss=0.630]
Validation: 100%|██████████| 78/78 [00:05<00:00, 13.20it/s, validation_loss=0.544]


Epoch: 1
Training loss: 1.3295054842646306
Validation loss: 0.9318607235566164


Epoch 2: 100%|██████████| 312/312 [01:03<00:00,  4.90it/s, training_loss=0.545]
Validation: 100%|██████████| 78/78 [00:05<00:00, 13.39it/s, validation_loss=0.525]


Epoch: 2
Training loss: 0.9309176233334419
Validation loss: 0.9016876900807406


Epoch 3: 100%|██████████| 312/312 [01:03<00:00,  4.91it/s, training_loss=0.321]
Validation: 100%|██████████| 78/78 [00:05<00:00, 13.43it/s, validation_loss=0.514]


Epoch: 3
Training loss: 0.8857865495941578
Validation loss: 0.8874738629047687


Epoch 4: 100%|██████████| 312/312 [01:03<00:00,  4.91it/s, training_loss=0.226]
Validation: 100%|██████████| 78/78 [00:05<00:00, 13.40it/s, validation_loss=0.509]


Epoch: 4
Training loss: 0.8533925985296568
Validation loss: 0.8802453210720649


Epoch 5: 100%|██████████| 312/312 [01:03<00:00,  4.91it/s, training_loss=0.219]
Validation: 100%|██████████| 78/78 [00:05<00:00, 13.38it/s, validation_loss=0.508]


Epoch: 5
Training loss: 0.8237666098926312
Validation loss: 0.8778254382121258


Epoch 6: 100%|██████████| 312/312 [01:03<00:00,  4.91it/s, training_loss=0.475]
Validation: 100%|██████████| 78/78 [00:05<00:00, 13.38it/s, validation_loss=0.508]


Epoch: 6
Training loss: 0.799078297156554
Validation loss: 0.8785081574550042


Epoch 7: 100%|██████████| 312/312 [01:03<00:00,  4.91it/s, training_loss=0.518]
Validation: 100%|██████████| 78/78 [00:05<00:00, 13.28it/s, validation_loss=0.508]


Epoch: 7
Training loss: 0.7760350404259486
Validation loss: 0.8801791622088506


Epoch 8: 100%|██████████| 312/312 [01:03<00:00,  4.91it/s, training_loss=0.450]
Validation: 100%|██████████| 78/78 [00:05<00:00, 13.38it/s, validation_loss=0.510]


Epoch: 8
Training loss: 0.7558198089783008
Validation loss: 0.8830085007043985


Epoch 9: 100%|██████████| 312/312 [01:03<00:00,  4.91it/s, training_loss=0.398]
Validation: 100%|██████████| 78/78 [00:05<00:00, 13.46it/s, validation_loss=0.513]


Epoch: 9
Training loss: 0.7398923797867237
Validation loss: 0.8874471584955851


Epoch 10: 100%|██████████| 312/312 [01:03<00:00,  4.91it/s, training_loss=0.237]
Validation: 100%|██████████| 78/78 [00:05<00:00, 13.41it/s, validation_loss=0.514]


Epoch: 10
Training loss: 0.7278081828202957
Validation loss: 0.8907636098372631


In [13]:
import torch
import torch.nn as nn
from torch.optim import Adam
from torch.utils.data import DataLoader
from tqdm import tqdm

class LSTMModel(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim):
        super(LSTMModel, self).__init__()
        self.hidden_dim = hidden_dim
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, vocab_size)

    def forward(self, input):
        embedded = self.embedding(input)
        lstm_out, _ = self.lstm(embedded)
        out = self.fc(lstm_out)
        return out


# Model parameters
VOCAB_SIZE = len(tokenizer.get_vocab())  # You need to define the tokenizer
EMBEDDING_DIM = 256
HIDDEN_DIM = 512

# Training parameters
EPOCHS = 10
LEARNING_RATE = 0.001
BATCH_SIZE = 64  # You may need to adjust this

# Prepare your data loader
train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)

# Create the model, criterion, optimizer
lstm_model = LSTMModel(VOCAB_SIZE, EMBEDDING_DIM, HIDDEN_DIM).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = Adam(lstm_model.parameters(), lr=LEARNING_RATE)

# Start the training loop
for epoch in range(EPOCHS):
    print(f"Epoch {epoch+1}")
    lstm_model.train()
    total_loss = 0
    progress_bar = tqdm(train_dataloader, desc="Training", position=0, leave=True)

    for batch in progress_bar:
        input_ids = batch["input_ids"].to(device)
        labels = input_ids  # The labels are the input_ids itself
        optimizer.zero_grad()
        outputs = lstm_model(input_ids)
        loss = criterion(outputs.view(-1, VOCAB_SIZE), labels.view(-1))
        total_loss += loss.item()
        loss.backward()
        optimizer.step()
        progress_bar.set_postfix({'training_loss': '{:.3f}'.format(loss.item() / len(batch))})

    avg_train_loss = total_loss / len(train_dataloader)
    print(f"Training loss: {avg_train_loss}")

# Save the model
torch.save(lstm_model.state_dict(), "/content/drive/MyDrive/lstm_model.pth")


Epoch 1


Training: 100%|██████████| 156/156 [00:16<00:00,  9.62it/s, training_loss=0.224]


Training loss: 1.3835429749809778
Epoch 2


Training: 100%|██████████| 156/156 [00:15<00:00,  9.93it/s, training_loss=0.065]


Training loss: 0.25258838108334786
Epoch 3


Training: 100%|██████████| 156/156 [00:15<00:00,  9.91it/s, training_loss=0.031]


Training loss: 0.09916129777542292
Epoch 4


Training: 100%|██████████| 156/156 [00:15<00:00,  9.85it/s, training_loss=0.019]


Training loss: 0.04625307370980199
Epoch 5


Training: 100%|██████████| 156/156 [00:15<00:00,  9.78it/s, training_loss=0.010]


Training loss: 0.022789316347394235
Epoch 6


Training: 100%|██████████| 156/156 [00:16<00:00,  9.62it/s, training_loss=0.006]


Training loss: 0.010523830188247256
Epoch 7


Training: 100%|██████████| 156/156 [00:16<00:00,  9.70it/s, training_loss=0.002]


Training loss: 0.004356227504710357
Epoch 8


Training: 100%|██████████| 156/156 [00:16<00:00,  9.64it/s, training_loss=0.002]


Training loss: 0.002464224524700489
Epoch 9


Training: 100%|██████████| 156/156 [00:16<00:00,  9.63it/s, training_loss=0.001]


Training loss: 0.0016278049498886014
Epoch 10


Training: 100%|██████████| 156/156 [00:16<00:00,  9.64it/s, training_loss=0.001]


Training loss: 0.0011425493105959434
