In [3]:
#Get the dataset
# !wget https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt

zsh:1: command not found: wget


In [4]:
#Get Input Text
with open('input.txt', 'r') as f:
    text = f.read()

In [5]:
#Get All Unique Characters
chars = sorted(list(set(text)))
print(chars)

['\n', ' ', '!', '$', '&', "'", ',', '-', '.', '3', ':', ';', '?', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']


In [13]:
#Tokenize the text- This will be used to convert the text to numbers
char_to_idx = {ch:i for i,ch in enumerate(chars)}
idx_to_char = {i:ch for i,ch in enumerate(chars)}

#Encode and Decode Functions for Strings
def encode(s):
    return [char_to_idx[ch] for ch in s]
def decode(l):
    return ''.join([idx_to_char[i] for i in l])

#Test
encodedText = encode("Hello!")
print(encodedText)
print(decode(encodedText))

#Note: This is a very simple tokenization method. There are better ways to do this. Example: Using tiktoken from OpenAI. (Sub-word tokenization = Don't need a new token for every letter)


[20, 43, 50, 50, 53, 2]
Hello!


In [14]:
#Some Imports
import numpy as np
import torch

  from .autonotebook import tqdm as notebook_tqdm


In [24]:
#Encode the input text and store it into a Torch Tensor
#A tensor is a multi-dimensional matrix containing elements of a single data type

#Convert the text to numbers
encodedText = encode(text)
print("Length of encoded text: ", len(encodedText))

#Convert the list to a tensor
data = torch.tensor(encodedText, dtype=torch.long)
print(data.shape)
print(data[:10]) #First 10 elements (First 10 characters in the text)

Length of encoded text:  1115394
torch.Size([1115394])
tensor([18, 47, 56, 57, 58,  1, 15, 47, 58, 47])


In [25]:
#Get Training and Validation Data
#We will use the first 90% of the data for training and the last 10% for validation

#Get the length of the data
dataLen = len(data)
print("Length of data: ", dataLen)

#Get the length of the training data  
trainLen = int(dataLen * 0.9)

#Get the training data
train_data = data[:trainLen]
print("Length of training data: ", len(train_data))

#Get the validation data
val_data = data[trainLen:]  
print("Length of validation data: ", len(val_data))

Length of data:  1115394
Length of training data:  1003854
Length of validation data:  111540


In [31]:
block_size = 8 #The length of the sequence we want to predict
#Example: If block_size = 8, then we want to predict the 9th character in the sequence
#This is why we will need to train the model on sequences of length block_size + 1

x = train_data[:block_size]
y = train_data[1:block_size+1]
for t in range(block_size): 
    context = x[:t+1]
    target = y[t]
    print('Context: ', str(context), ' Target: ', str(target)) 
    #Ex. If data is {1,2,3} and we are given 1, then the target (predicted output) should be 2

Context:  tensor([18])  Target:  tensor(47)
Context:  tensor([18, 47])  Target:  tensor(56)
Context:  tensor([18, 47, 56])  Target:  tensor(57)
Context:  tensor([18, 47, 56, 57])  Target:  tensor(58)
Context:  tensor([18, 47, 56, 57, 58])  Target:  tensor(1)
Context:  tensor([18, 47, 56, 57, 58,  1])  Target:  tensor(15)
Context:  tensor([18, 47, 56, 57, 58,  1, 15])  Target:  tensor(47)
Context:  tensor([18, 47, 56, 57, 58,  1, 15, 47])  Target:  tensor(58)


In [35]:
from sympy import *
# exp(-1/100*300)*((1/100)*300)**x*(factorial(x))**-1
#Add the above from x = 0 to x = 2
sum = 0
for x in range(3):
    sum += exp(-1/100*300)*((1/100)*300)**x*(factorial(x))**-1
print(1-sum)

0.576809918873156


In [None]:
#