In [1]:
# Split raw_text into tokens
import re
with open("the-verdict.txt","r",encoding="utf-8") as f:
    raw_text = f.read()

# The pattern is wrapped in a capture group, so re.split also returns the
# delimiters themselves (whitespace, "--", punctuation) as separate pieces.
pieces = re.split(r"(\s|--|[,.:;?_!'\"()])",raw_text)

# Strip every piece and drop the ones that end up empty (pure whitespace
# and the empty strings produced between adjacent delimiters).
preprocessed = [
    stripped
    for stripped in (piece.strip() for piece in pieces)
    if stripped
]

# Total number of tokens in the text.
context_length = len(preprocessed)

In [2]:
# Build the vocabulary: the sorted unique tokens, followed by the two
# special tokens, each mapped to its position in that list.
special_tokens = ["<|endoftext|>","<|unk|>"]
all_words = sorted(set(preprocessed)) + special_tokens
print(len(all_words))
vocab = dict(zip(all_words, range(len(all_words))))
vocab_size = len(vocab)

1132


In [3]:
# Map every token of the text to its integer id in the vocabulary.
token_ids = [
    vocab[token]
    for token in preprocessed
]

In [4]:
# Build the dataset of input and target chunks
import torch
from torch.utils.data import Dataset
class GPTDatasetV1(Dataset):
    """Sliding-window dataset of (input, target) token-id chunks.

    Each sample pairs a window of ``max_length`` token ids with the same
    window shifted one position to the right.

    Args:
        token_ids: sequence of integer token ids; must support ``len`` and slicing.
        max_length: number of tokens in every chunk.
        stride: distance between the start positions of consecutive chunks.
    """

    def __init__(self,token_ids,max_length,stride):
        # Window starts stop early enough that the shifted target window
        # still fits entirely inside token_ids.
        window_starts = range(0, len(token_ids) - max_length, stride)
        self.input_ids = [
            torch.tensor(token_ids[start:start + max_length])
            for start in window_starts
        ]
        self.target_ids = [
            torch.tensor(token_ids[start + 1:start + 1 + max_length])
            for start in window_starts
        ]

    # __len__ and __getitem__ are required; DataLoader calls them.
    def __len__(self):
        """Return the number of (input, target) pairs."""
        return len(self.input_ids)

    def __getitem__(self, idx):
        """Return the (input, target) tensor pair stored at position ``idx``."""
        pair = (self.input_ids[idx], self.target_ids[idx])
        return pair


In [5]:
# Group the samples into batches
from torch.utils.data import DataLoader
def create_dataloader_v1(token_ids,max_length,stride,batch_size,
                         shuffle=True,drop_last=True,num_workers=0):
    """Wrap ``token_ids`` in a GPTDatasetV1 and return a DataLoader over it.

    Args:
        token_ids: token ids handed to GPTDatasetV1.
        max_length: chunk length handed to GPTDatasetV1.
        stride: window step handed to GPTDatasetV1.
        batch_size: number of samples per batch.
        shuffle: forwarded to DataLoader.
        drop_last: forwarded to DataLoader.
        num_workers: forwarded to DataLoader.

    Returns:
        The DataLoader built over the dataset.
    """
    return DataLoader(
        GPTDatasetV1(token_ids, max_length, stride),
        batch_size=batch_size,
        shuffle=shuffle,
        drop_last=drop_last,
        num_workers=num_workers,
    )
max_length = 4
batch_size = 4

# Stride of 1 with shuffling disabled, so batches follow the text order.
dataloader = create_dataloader_v1(
    token_ids,
    max_length,
    stride=1,
    batch_size=batch_size,
    shuffle=False,
)

# Pull the first batch out of the loader and show it.
data_iter = iter(dataloader)
first_batch = next(data_iter)
print(first_batch)


[tensor([[  53,   44,  149, 1003],
        [  44,  149, 1003,   57],
        [ 149, 1003,   57,   38],
        [1003,   57,   38,  818]]), tensor([[  44,  149, 1003,   57],
        [ 149, 1003,   57,   38],
        [1003,   57,   38,  818],
        [  57,   38,  818,  115]])]


In [6]:
# Embedding layers, demonstrated on a single batch.
torch.manual_seed(123)
output_dim = 3
token_embedding_layer = torch.nn.Embedding(vocab_size,output_dim)
# NOTE(review): context_length is the token count of the whole text, so this
# table has one row per corpus token although only the first max_length rows
# are looked up below. The size is kept as-is so the seeded values do not change.
pos_embedding_layer = torch.nn.Embedding(context_length,output_dim)
print(token_embedding_layer.weight)

# Only the first batch is needed here.
batch = next(iter(dataloader))
x,y = batch

# x is already a tensor of token ids; pass it straight to the layer.
# Wrapping it in torch.tensor() made a needless copy and raised a UserWarning.
token_embeddings = token_embedding_layer(x)
pos_embeddings = pos_embedding_layer(torch.arange(max_length))

# The positional embeddings broadcast over the batch dimension.
input_embeddings = token_embeddings+pos_embeddings
print(input_embeddings)

Parameter containing:
tensor([[ 0.3374, -0.1778, -0.3035],
        [-0.5880,  0.3486,  0.6603],
        [-0.2196, -0.3792,  0.7671],
        ...,
        [-0.1045,  2.2521,  0.4200],
        [ 1.7311, -0.8560,  0.0338],
        [ 0.0447, -0.6207,  1.2244]], requires_grad=True)
tensor([[[ 0.6425,  0.3770,  1.8868],
         [-1.9874,  0.2148, -0.9491],
         [ 0.2694,  0.3936,  0.3067],
         [-3.4180, -0.4049, -0.7350]],

        [[ 0.3319, -0.5958, -0.7199],
         [-0.5046,  0.3557,  1.0224],
         [-2.0575,  0.3227, -1.0671],
         [ 0.0433, -0.0988, -1.9320]],

        [[ 1.8147, -0.4549,  1.2516],
         [-2.8314,  0.2848, -0.3515],
         [ 1.4039,  0.6287, -2.2641],
         [-0.3598, -0.4701, -0.2356]],

        [[-0.5121, -0.5259, -0.1223],
         [ 0.6299,  0.5909, -1.5485],
         [ 1.0008,  0.2574, -0.5677],
         [-0.7568, -1.2115,  0.3045]]], grad_fn=<AddBackward0>)


  token_embeddings = token_embedding_layer(torch.tensor(x))
