In [5]:
import os
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

In [6]:
# Check device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Using device: cuda


In [7]:
# Define a simple WikiText dataset class
class WikiTextDataset(Dataset):
    def __init__(self, file_path, seq_length):
        with open(file_path, 'r', encoding='utf-8') as f:
            text = f.read()
        tokens = text.split()
        self.vocab = {word: idx for idx, word in enumerate(set(tokens))}
        self.data = [self.vocab[word] for word in tokens]
        self.seq_length = seq_length
    
    def __len__(self):
        return len(self.data) - self.seq_length
    
    def __getitem__(self, idx):
        x = torch.tensor(self.data[idx:idx + self.seq_length], dtype=torch.long)
        y = torch.tensor(self.data[idx + 1:idx + self.seq_length + 1], dtype=torch.long)
        return x, y

In [8]:
# Paths to the dataset files
train_file = '../data/wikitext/train.txt'
valid_file = '../data/wikitext/valid.txt'
test_file = '../data/wikitext/test.txt'

In [9]:
# Parameters
seq_length = 30
batch_size = 32
learning_rate = 0.001
num_epochs = 5

In [10]:
# Load datasets
train_dataset = WikiTextDataset(train_file, seq_length)
valid_dataset = WikiTextDataset(valid_file, seq_length)
test_dataset = WikiTextDataset(test_file, seq_length)

In [11]:
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
valid_loader = DataLoader(valid_dataset, batch_size=batch_size)
test_loader = DataLoader(test_dataset, batch_size=batch_size)

In [18]:
# Define the Transformer-based language model
class TransformerLanguageModel(nn.Module):
    def __init__(self, vocab_size, embedding_dim, num_heads, hidden_dim, num_layers, seq_length):
        super(TransformerLanguageModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.positional_encoding = nn.Parameter(torch.zeros(1, seq_length, embedding_dim))
        encoder_layers = nn.TransformerEncoderLayer(d_model=embedding_dim, nhead=num_heads, dim_feedforward=hidden_dim, batch_first=True)
        self.transformer = nn.TransformerEncoder(encoder_layers, num_layers=num_layers)
        self.fc = nn.Linear(embedding_dim, vocab_size)
    
    def forward(self, x):
        x = self.embedding(x) + self.positional_encoding[:, :x.size(1), :]
        x = self.transformer(x)
        x = self.fc(x)
        return x

In [19]:
# Model parameters
vocab_size = len(train_dataset.vocab)
embedding_dim = 128
num_heads = 8
hidden_dim = 512
num_layers = 3

In [20]:
# Instantiate the model, loss function, and optimizer
model = TransformerLanguageModel(vocab_size, embedding_dim, num_heads, hidden_dim, num_layers, seq_length).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

In [23]:
# Training loop
model.train()
iterator = 0 
for epoch in range(num_epochs):
    total_loss = 0
    for batch, (x, y) in enumerate(train_loader):
        x, y = x.to(device), y.to(device)
        optimizer.zero_grad()
        outputs = model(x)
        loss = criterion(outputs.view(-1, vocab_size), y.view(-1))
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
        iterator+=1
        print(f"{iterator} | {len(train_loader)}")
    print(f'Epoch {epoch+1}/{num_epochs}, Loss: {total_loss/len(train_loader):.4f}')

1 | 64122
2 | 64122
3 | 64122
4 | 64122
5 | 64122
6 | 64122
7 | 64122
8 | 64122
9 | 64122
10 | 64122
11 | 64122
12 | 64122
13 | 64122
14 | 64122
15 | 64122
16 | 64122
17 | 64122
18 | 64122
19 | 64122
20 | 64122
21 | 64122
22 | 64122
23 | 64122
24 | 64122
25 | 64122
26 | 64122
27 | 64122
28 | 64122
29 | 64122
30 | 64122
31 | 64122
32 | 64122
33 | 64122
34 | 64122
35 | 64122
36 | 64122
37 | 64122
38 | 64122
39 | 64122
40 | 64122
41 | 64122
42 | 64122
43 | 64122
44 | 64122
45 | 64122
46 | 64122
47 | 64122
48 | 64122
49 | 64122
50 | 64122
51 | 64122
52 | 64122
53 | 64122
54 | 64122
55 | 64122
56 | 64122
57 | 64122
58 | 64122
59 | 64122
60 | 64122
61 | 64122
62 | 64122
63 | 64122
64 | 64122
65 | 64122
66 | 64122
67 | 64122
68 | 64122
69 | 64122
70 | 64122
71 | 64122
72 | 64122
73 | 64122
74 | 64122
75 | 64122
76 | 64122
77 | 64122
78 | 64122
79 | 64122
80 | 64122
81 | 64122
82 | 64122
83 | 64122
84 | 64122
85 | 64122
86 | 64122
87 | 64122
88 | 64122
89 | 64122
90 | 64122
91 | 64122
92 | 641

KeyboardInterrupt: 

In [16]:
# Evaluation function
def evaluate(loader):
    model.eval()
    total_loss = 0
    with torch.no_grad():
        for x, y in loader:
            x, y = x.to(device), y.to(device)
            outputs = model(x)
            loss = criterion(outputs.view(-1, vocab_size), y.view(-1))
            total_loss += loss.item()
    return total_loss / len(loader)

In [None]:
# Evaluate on the validation and test sets
valid_loss = evaluate(valid_loader)
test_loss = evaluate(test_loader)
print(f'Validation Loss: {valid_loss:.4f}')
print(f'Test Loss: {test_loss:.4f}')