In [31]:
!pip install tiktoken

Collecting tiktoken
  Downloading tiktoken-0.4.0-cp310-cp310-win_amd64.whl (635 kB)
     ---------------------------------------- 0.0/635.3 kB ? eta -:--:--
      --------------------------------------- 10.2/635.3 kB ? eta -:--:--
     -- ---------------------------------- 41.0/635.3 kB 393.8 kB/s eta 0:00:02
     ------ ----------------------------- 112.6/635.3 kB 819.2 kB/s eta 0:00:01
     ---------------- --------------------- 276.5/635.3 kB 1.5 MB/s eta 0:00:01
     ------------------------------ ------- 501.8/635.3 kB 2.2 MB/s eta 0:00:01
     --------------------------------- ---- 563.2/635.3 kB 2.1 MB/s eta 0:00:01
     -------------------------------------- 635.3/635.3 kB 2.1 MB/s eta 0:00:00
Collecting regex>=2022.1.18
  Downloading regex-2023.6.3-cp310-cp310-win_amd64.whl (268 kB)
     ---------------------------------------- 0.0/268.0 kB ? eta -:--:--
     ------------------------------------ --- 245.8/268.0 kB ? eta -:--:--
     -------------------------------------- 268.0

In [52]:
!pip install torch

Collecting torch
  Downloading torch-2.0.1-cp310-cp310-win_amd64.whl (172.3 MB)
     ---------------------------------------- 0.0/172.3 MB ? eta -:--:--
     ---------------------------------------- 0.0/172.3 MB 1.3 MB/s eta 0:02:15
     -------------------------------------- 0.1/172.3 MB 787.7 kB/s eta 0:03:39
     ---------------------------------------- 0.1/172.3 MB 1.1 MB/s eta 0:02:39
     ---------------------------------------- 0.2/172.3 MB 1.1 MB/s eta 0:02:31
     ---------------------------------------- 0.3/172.3 MB 1.5 MB/s eta 0:01:58
     ---------------------------------------- 0.5/172.3 MB 2.0 MB/s eta 0:01:26
     ---------------------------------------- 0.6/172.3 MB 1.9 MB/s eta 0:01:29
     ---------------------------------------- 0.8/172.3 MB 2.3 MB/s eta 0:01:17
     ---------------------------------------- 1.0/172.3 MB 2.5 MB/s eta 0:01:08
     ---------------------------------------- 1.1/172.3 MB 2.4 MB/s eta 0:01:12
     ---------------------------------------- 1

ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
ultralytics 8.0.54 requires opencv-python>=4.6.0, which is not installed.
ultralytics 8.0.54 requires PyYAML>=5.3.1, which is not installed.
ultralytics 8.0.54 requires seaborn>=0.11.0, which is not installed.
ultralytics 8.0.54 requires sentry-sdk, which is not installed.
ultralytics 8.0.54 requires thop>=0.1.1, which is not installed.
ultralytics 8.0.54 requires torchvision>=0.8.1, which is not installed.
ultralytics 8.0.54 requires tqdm>=4.64.0, which is not installed.


# Creating a Generative Pretrained Autoencoder


In [56]:
import torch


# Our Data

In [57]:
import urllib.request
input_file, HTTP_reply = urllib.request.urlretrieve('https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt','input.txt')
#print (HTTP_reply)

## Loading the Data
after obtaining the text file, we will load the raw data and use it to create our dataset

In [58]:
with open(input_file, 'r', encoding='utf-8') as f:
    raw_data = f.read()

## Playing With the Data


### trying a sample
let's print the first 10 lines of the dataset

In [60]:
head = ''
first_20_lines = raw_data.split('\n')[:20]
first_20_lines = [l+'\n' for l in first_20_lines]
head = head.join(first_20_lines)
print(head)

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You are all resolved rather to die than to famish?

All:
Resolved. resolved.

First Citizen:
First, you know Caius Marcius is chief enemy to the people.

All:
We know't, we know't.

First Citizen:
Let us kill him, and we'll have corn at our own price.



let's find out what characters appear in the text

In [61]:
chars = sorted(list(set(raw_data)))
print(''.join(chars))


 !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz


# Data processing

## Tokenizer
first we will use a very simple character level encoding-decoding scheme. later on we will examine the differences and benefits of using a more sophisticated tokenizer.

In [62]:
stoi = {c:i for i, c in enumerate(chars)}
itos = {i:c for i, c in enumerate(chars)}
encode = lambda s: [stoi[c] for c in s]
decode = lambda t: ''.join([itos[i] for i in t])

lets verify the encoder correctness condition: $$ D(x) = E^{-1}(x) \rightarrow D(E(x)) = x $$

In [63]:
print(decode(encode("ahoy")))

ahoy


### another tokenizer

In [64]:
# import tiktoken
# enc = tiktoken.get_encoding("gpt2")
# assert enc.decode(enc.encode("hello world")) == "hello world"


### encode the data

In [66]:
data = torch.tensor(encode(raw_data))
print (data[:20])

tensor([18, 47, 56, 57, 58,  1, 15, 47, 58, 47, 64, 43, 52, 10,  0, 14, 43, 44,
        53, 56])


## Separating to Train and Validation Sets

In [110]:
N = len(data)
BLOCK_SIZE  = 40
BATCH_SIZE = 5
N_TRAIN = int(0.9*N)
train = data[:N_TRAIN]
val = data[N_TRAIN:]


## Dataloader

In [111]:
def get_batch(dataset,batch_size = BATCH_SIZE):
    n = len(dataset)
    indices = torch.randint(n - BLOCK_SIZE, (batch_size,))
    x = torch.stack([dataset[i : i + BLOCK_SIZE] for i in indices])
    y = torch.stack([dataset[i + BLOCK_SIZE +1 ]  for i in indices ])

    return x,y


In [112]:
batch = get_batch(train)
print(batch)

(tensor([[ 1, 20, 59, 52, 45, 43, 56, 44, 53, 56, 42,  8,  0,  0, 23, 21, 26, 19,
          1, 17, 16, 35, 13, 30, 16,  1, 21, 34, 10,  0, 13, 63,  6,  1, 61, 46,
         39, 58,  1, 53],
        [52, 45,  6,  0, 54, 56, 53, 59, 42,  6,  1, 60, 47, 53, 50, 43, 52, 58,
          6,  1, 58, 43, 57, 58, 63,  1, 51, 39, 45, 47, 57, 58, 56, 39, 58, 43,
         57,  6,  1, 39],
        [52, 43, 42,  1, 61, 47, 58, 46,  1, 58, 46, 43,  1, 46, 53, 54, 43,  1,
         58, 53,  1, 46, 39, 60, 43,  0, 32, 46, 43,  1, 54, 56, 43, 57, 43, 52,
         58,  1, 40, 43],
        [ 1, 58, 53,  1, 58, 46, 43,  1, 15, 39, 54, 59, 50, 43, 58, 57, 10,  0,
         30, 39, 47, 57, 43,  1, 59, 54,  1, 58, 46, 43,  1, 25, 53, 52, 58, 39,
         45, 59, 43, 57],
        [53, 60, 43, 56,  1, 39, 40, 53, 59, 58,  1, 46, 43, 56, 11,  1, 57, 39,
         63,  6,  1, 58, 46, 39, 58,  1, 56, 47, 45, 46, 58,  1, 44, 53, 56,  1,
         56, 47, 45, 46]]), tensor([ 1, 47, 43,  1,  0]))


In [114]:

d = ''
ys =batch[1].tolist()
print(ys)
for i,s in enumerate(batch[0]):
    d = decode(s.tolist()+[ys[i]])
    print(f'{i}. {d}\n')

[1, 47, 43, 1, 0]
0.  Hungerford.

KING EDWARD IV:
Ay, what o 

1. ng,
proud, violent, testy magistrates, ai

2. ned with the hope to have
The present bee

3.  to the Capulets:
Raise up the Montagues 

4. over about her; say, that right for righ




seems legit.