# Import Requirements

In [56]:
import torch
import torch.nn as nn
from torch.nn import functional as F

# Import Data

In [37]:
# Read in tiny shakespeare dataset

with open(r"C:\Users\logan\Documents\OneDrive BackUp\My Personal Stuff\Github Repos\personal_projects\minGPT_clone\tinyshakespeare.txt",'r',encoding='utf-8') as f:
    text = f.read()


# Explore Data

In [38]:
#  Number of Characters in the dataset

print(f"Length of dataset in characters: {str(len(text))}")



Length of dataset in characters: 1115393


In [39]:
# First 100 characters

print(text[:100])

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You


In [58]:
# Show unique characters in corpus

chars = sorted(list(set(text)))

vocab_size = len(chars)

print("Unique characters in the corpus: ")
print(''.join(chars))

Unique characters in the corpus: 

 !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz


# Mapping Characters to Integers (Creating an Encoder & Decoder)


#### Since this is a character-level language model, we will encode each individual character as an integer (as opposed to a token-level or sentence-level lanaguage model)

In [41]:
# create the mapping from characters to integers

str_to_int = { ch:i for i,ch in enumerate(chars) }   # for each unique character, assign it to an integer value using enumerate()
int_to_str = { i:ch for i,ch in enumerate(chars) }   # reverse mapping to assign each integer value back to the unique character assigned to it 

# Encode a string to a list of integers
# -- for each character (c) in a given string (s), find the value at index c in the str_to_int mapping
encode = lambda s: [str_to_int[c] for c in s]    

# Decode a list of integers back to a string
# -- for each integer (i) in a list of int (l), find the value at index i in the int_to_str mapping
decode = lambda l: ''.join([int_to_str[i] for i in l])

print(encode("hello world"))
print(decode(encode("hello world")))

[46, 43, 50, 50, 53, 1, 61, 53, 56, 50, 42]
hello world


# Encode the full corpus and store in a torch.Tensor

In [42]:

# -- Encode the full text corpus, then store the encoding in a torch.Tensor
data = torch.tensor(encode(text),dtype=torch.long)

print(data.shape, data.dtype)
# Show first 100 characters in the tensor
print(data[:100])

torch.Size([1115393]) torch.int64
tensor([18, 47, 56, 57, 58,  1, 15, 47, 58, 47, 64, 43, 52, 10,  0, 14, 43, 44,
        53, 56, 43,  1, 61, 43,  1, 54, 56, 53, 41, 43, 43, 42,  1, 39, 52, 63,
         1, 44, 59, 56, 58, 46, 43, 56,  6,  1, 46, 43, 39, 56,  1, 51, 43,  1,
        57, 54, 43, 39, 49,  8,  0,  0, 13, 50, 50, 10,  0, 31, 54, 43, 39, 49,
         6,  1, 57, 54, 43, 39, 49,  8,  0,  0, 18, 47, 56, 57, 58,  1, 15, 47,
        58, 47, 64, 43, 52, 10,  0, 37, 53, 59])


# Split Data into Train & Test Sets


#### 90 / 10 split

In [43]:
n = int(.9*len(data))

train_data = data[:n]
test_data = data[n:]

# Set Block Size

#### Size of string (in # of characters) to be used for training the model

In [44]:
block_size = 8

train_data[:block_size+1]

tensor([18, 47, 56, 57, 58,  1, 15, 47, 58])

# Example to Understand How Sequences are Parsed to Context & Targets

#### A single batch input of block_size length is parsed

In [46]:

# Inputs to the transformer
x = train_data[:block_size]
# The next input to the transformer (ie. the value being predicted)
y = train_data[block_size:]


# -- For a range of t to block_size, print the context (what is shown to the model) and the target (next value in the sequence)
for t in range(block_size):
    context = x[:t+1]
    target = y[t]

    print(f"when input is {context}, the target is: {target}")

when input is tensor([18]), the target is: 58
when input is tensor([18, 47]), the target is: 47
when input is tensor([18, 47, 56]), the target is: 64
when input is tensor([18, 47, 56, 57]), the target is: 43
when input is tensor([18, 47, 56, 57, 58]), the target is: 52
when input is tensor([18, 47, 56, 57, 58,  1]), the target is: 10
when input is tensor([18, 47, 56, 57, 58,  1, 15]), the target is: 0
when input is tensor([18, 47, 56, 57, 58,  1, 15, 47]), the target is: 14


# Example to Understand How Batches of Sequences are Parsed to Context & Targets

#### Multiple batches of input of tensor stacks (batch_size by block_size matrix) are parsed

In [54]:
torch.manual_seed(1337)

# -- batch_size = # of independent sequences to be processed in parallel
batch_size = 4
# -- maximum context length (length of characters shown to model) for predictions
block_size = 8

def get_batch(split): 


    if split == 'train':
        data = train_data
    else:
        data = test_data

    # Generate random positions to select chunks of characters to pass to the model
    # -- generate batch_size length chunks of character, with each character representing an integer between 0 and len(data) - block_size
    # -- ix = batch_size length of characters of random numbers between 0 and len(data) - block_size
    ix = torch.randint(len(data) - block_size, (batch_size,))

    # For each index i in ix, extract a sequence of characters from data for index i through i + block_size
    # -- user torch.stack to stack each individual tensor as a row in a batch_size by block_size matrix
    x = torch.stack([data[i:i+block_size] for i in ix])
    # For each index i in ix, offset by 1 to get the next sequence of characters (the target value)
    # -- user torch.stack to stack each individual tensor as a row in a batch_size by block_size matrix
    y = torch.stack([data[i+1:i+block_size+1] for i in ix])

    return x, y

xbatch, ybatch = get_batch('train')

print('Example of a batch of inputs to the model: ')
print(xbatch.shape)
print(xbatch)
print('Example of batch of targets for the model to predict: ')
print(ybatch.shape)
print(ybatch)

print(' =================== ')

for b in range(batch_size):     # batch (b) dimension
    for t in range(block_size):     # time (t) dimension

        context = xbatch[b, :t+1]
        target = ybatch[b,t]
        print(f"when input is {context.tolist()}, the target is: {target}")

Example of a batch of inputs to the model: 
torch.Size([4, 8])
tensor([[53, 59,  6,  1, 58, 56, 47, 40],
        [49, 43, 43, 54,  1, 47, 58,  1],
        [13, 52, 45, 43, 50, 53,  8,  0],
        [ 1, 39,  1, 46, 53, 59, 57, 43]])
Example of batch of targets for the model to predict: 
torch.Size([4, 8])
tensor([[59,  6,  1, 58, 56, 47, 40, 59],
        [43, 43, 54,  1, 47, 58,  1, 58],
        [52, 45, 43, 50, 53,  8,  0, 26],
        [39,  1, 46, 53, 59, 57, 43,  0]])
when input is [53], the target is: 59
when input is [53, 59], the target is: 6
when input is [53, 59, 6], the target is: 1
when input is [53, 59, 6, 1], the target is: 58
when input is [53, 59, 6, 1, 58], the target is: 56
when input is [53, 59, 6, 1, 58, 56], the target is: 47
when input is [53, 59, 6, 1, 58, 56, 47], the target is: 40
when input is [53, 59, 6, 1, 58, 56, 47, 40], the target is: 59
when input is [49], the target is: 43
when input is [49, 43], the target is: 43
when input is [49, 43, 43], the target is:

# Instanciate a Neural Network Model

#### For ease, use a Bigram Language Model

# !!! Refer to Andrej Karpathy's makemore series on youtube to better understand the Bigram Language Model created below

In [62]:
# set seed
torch.manual_seed(1337)

class BigramLanguageModel(nn.Module):

    def __init__(self, vocab_size): 
        super().__init__()

        # Upon initializing this class, create a token_embedding_table of  size vocab_size x vocab_size
        self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)

    def forward(self, idx, targets):

        # From the token_embedding_table (of size vocab_size x vocab_size)...
        # ... pass each index value (idx) to extract the idx row of the token_embedding_table...
        #... then, arrange them in a B x T x C matrix
        logits = self.token_embedding_table(idx)  # (B - batch, T - time, C - channel)

        return logits

# Instantiate a model using the class we created
model = BigramLanguageModel(vocab_size)

# Pass inputs (xbatch) and targets (ybatch) to the model
output = model(xbatch, ybatch)

print(output.shape)


torch.Size([4, 8, 65])
