<a href="https://colab.research.google.com/github/mohamed-ssafini/Transformers/blob/main/1_data_loader.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import torch
import torch.nn as nn
import torch.nn.functional as F

# Use CUDA if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [3]:
print(f"Using device: {device}")

Using device: cpu


In [4]:
import requests

# URL of tiny-shakespeare.txt hosted online
url = "https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt"

# Make a GET request to fetch the file content
response = requests.get(url)

# Raise an error if the request was unsuccessful
response.raise_for_status()

# Get the text content of the file
text = response.text

# Now the variable 'text' holds the contents of 'tiny-shakespeare.txt'
print(text[:500])  # printing the first 500 characters to verify

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You are all resolved rather to die than to famish?

All:
Resolved. resolved.

First Citizen:
First, you know Caius Marcius is chief enemy to the people.

All:
We know't, we know't.

First Citizen:
Let us kill him, and we'll have corn at our own price.
Is't a verdict?

All:
No more talking on't; let it be done: away, away!

Second Citizen:
One word, good citizens.

First Citizen:
We are accounted poor


In [None]:
print(text[0:500])

In [5]:
text[0:1000]

"First Citizen:\nBefore we proceed any further, hear me speak.\n\nAll:\nSpeak, speak.\n\nFirst Citizen:\nYou are all resolved rather to die than to famish?\n\nAll:\nResolved. resolved.\n\nFirst Citizen:\nFirst, you know Caius Marcius is chief enemy to the people.\n\nAll:\nWe know't, we know't.\n\nFirst Citizen:\nLet us kill him, and we'll have corn at our own price.\nIs't a verdict?\n\nAll:\nNo more talking on't; let it be done: away, away!\n\nSecond Citizen:\nOne word, good citizens.\n\nFirst Citizen:\nWe are accounted poor citizens, the patricians good.\nWhat authority surfeits on would relieve us: if they\nwould yield us but the superfluity, while it were\nwholesome, we might guess they relieved us humanely;\nbut they think we are too dear: the leanness that\nafflicts us, the object of our misery, is as an\ninventory to particularise their abundance; our\nsufferance is a gain to them Let us revenge this with\nour pikes, ere we become rakes: for the gods know I\nspeak this in hunger 

In [6]:
import torch

class CharTokenizer:
  def __init__(self, vocabulary):
    self.id_for_char = {chr: id for id, chr in enumerate(vocabulary)}
    self.char_for_id = {id: chr for id, chr in enumerate(vocabulary)}

  @staticmethod
  def train_from_text(text):
    vocabulary = set(text)
    return CharTokenizer(vocabulary)

  def encode(self, text):
    ids = []
    for chr in text:
      ids.append(self.id_for_char[chr])
    return torch.tensor(ids, dtype=torch.long)

  def decode(self, ids):
    chars = []
    for id in ids.tolist():
      chars.append(self.char_for_id[id])
    return ''.join(chars)

  def vocabulary_size(self):
    return len(self.id_for_char)

In [7]:
tokenizer = CharTokenizer.train_from_text(text)

In [8]:
print(tokenizer.encode("Hello world"))
print(tokenizer.decode(tokenizer.encode("Hello world")))

tensor([24, 38, 63, 63, 13,  6, 14, 13, 17, 63, 53])
Hello world


In [9]:
print(f"Vocabulary size: {tokenizer.vocabulary_size()}")

Vocabulary size: 65


In [10]:
from torch.utils.data import Dataset

class IndexesDataset(Dataset):
    def __init__(self, data, block_size):
        self.data = data
        self.block_size = block_size

    def __len__(self):
        # Ensure all sequences generated are complete by reducing length
        return len(self.data) - self.block_size

    def __getitem__(self, pos):
        assert pos < len(self.data) - self.block_size

        x = self.data[pos:pos + self.block_size]
        y = self.data[pos + 1:pos + 1 + self.block_size]
        return x, y

In [11]:
tokenized_text = tokenizer.encode(text)
dataset = IndexesDataset(tokenized_text, 64)

In [17]:
x,y = dataset[0]

In [18]:
x

tensor([ 3, 29, 17, 19, 51,  6, 60, 29, 51, 29, 64, 38, 33, 48,  4, 43, 38, 54,
        13, 17, 38,  6, 14, 38,  6, 39, 17, 13,  8, 38, 38, 53,  6,  2, 33, 35,
         6, 54, 61, 17, 51, 30, 38, 17, 36,  6, 30, 38,  2, 17,  6, 16, 38,  6,
        19, 39, 38,  2, 25, 62,  4,  4, 57, 63])

In [19]:
y

tensor([29, 17, 19, 51,  6, 60, 29, 51, 29, 64, 38, 33, 48,  4, 43, 38, 54, 13,
        17, 38,  6, 14, 38,  6, 39, 17, 13,  8, 38, 38, 53,  6,  2, 33, 35,  6,
        54, 61, 17, 51, 30, 38, 17, 36,  6, 30, 38,  2, 17,  6, 16, 38,  6, 19,
        39, 38,  2, 25, 62,  4,  4, 57, 63, 63])

In [20]:
tokenizer.decode(x)

'First Citizen:\nBefore we proceed any further, hear me speak.\n\nAl'

In [21]:
tokenizer.decode(y)

'irst Citizen:\nBefore we proceed any further, hear me speak.\n\nAll'

In [12]:
from torch.utils.data import DataLoader, RandomSampler

sampler = RandomSampler(dataset, replacement=True)
dataloader = DataLoader(dataset, batch_size=2, sampler=sampler)

In [13]:
x, y = next(iter(dataloader))

In [14]:
x

tensor([[17,  6, 19, 30, 13, 61, 63, 53,  6, 30, 38, 17,  6, 10, 13, 53, 35,  6,
         19, 51, 13, 13, 39,  4, 58, 13,  6, 19, 61,  8, 30,  6,  2, 10, 30, 13,
         17, 17, 41, 53,  6, 39, 13, 63, 63, 61, 51, 29, 13, 33, 62,  4, 58, 30,
         38, 33, 36,  6, 49, 19,  2, 10, 38, 63],
        [35,  6, 30, 13, 63, 35,  6, 19, 29, 17, 36,  6, 33, 13, 33, 38,  6, 10,
         38, 51, 51, 38, 17,  6, 25, 33, 13, 14, 19,  6, 51, 30,  2, 33,  6, 35,
         13, 61,  4, 24, 13, 14,  6, 49,  6, 30,  2, 12, 38,  6, 38, 12, 38, 17,
          6, 63, 13, 12, 38, 53,  6, 51, 30, 38]])

In [15]:
tokenizer.decode(x[0])

"r should her body stoop\nTo such abhorr'd pollution.\nThen, Isabel"

In [16]:
tokenizer.decode(y[0])

" should her body stoop\nTo such abhorr'd pollution.\nThen, Isabel,"