In [1]:
import re

from torchaudio.models.wav2vec2.components import SelfAttention

with open("the-verdict.txt", "r") as file:
    raw_text = file.read()

preprocessed = re.split(r'([,.;:?_"()\']|--|\s)', raw_text)
preprocessed = [item.strip() for item in preprocessed if item.strip()]
print(preprocessed[:30])

['I', 'HAD', 'always', 'thought', 'Jack', 'Gisburn', 'rather', 'a', 'cheap', 'genius', '--', 'though', 'a', 'good', 'fellow', 'enough', '--', 'so', 'it', 'was', 'no', 'great', 'surprise', 'to', 'me', 'to', 'hear', 'that', ',', 'in']


In [2]:
all_words = sorted(set(preprocessed))
vocab_size = len(all_words)
vocab = {token: integer for integer, token in enumerate(all_words)}
print(vocab_size)
for i, item in enumerate(vocab.items()):
    print(item)
    if i >= 10:
        break

1148
('"', 0)
("'", 1)
('(', 2)
(')', 3)
(',', 4)
('--', 5)
('.', 6)
(':', 7)
(';', 8)
('?', 9)
('A', 10)


In [3]:
class SimpleTokenizerV1:
    def __init__(self, vocab):
        self.str2id = vocab
        self.id2str = {i: s for s, i in vocab.items()}

    def encode(self, text):
        preprocessed = re.split(r'[, .?_!"\'()|\s]', text)
        preprocessed = [item for item in preprocessed if item.strip()]
        ids = []
        for s in preprocessed:
            if s in self.str2id:
                ids.append(self.str2id[s])
            else:
                print(f"警告: 词汇 '{s}' 在词汇表中没有找到")
                ids.append(-1)  # 可以给未知词汇一个默认值，或者做其他处理
        return ids

    def decode(self, ids):
        text = " ".join(self.id2str.get(i, '[UNK]') for i in ids)  # 对于未知的 ID，使用 [UNK]
        text = re.sub(r'\s+([, .?_!"\'()|\s])', r'\1', text)
        return text

In [4]:
tokenizer = SimpleTokenizerV1(vocab)
text = raw_text[:99]
ids = tokenizer.encode(text)
print(ids)
print(tokenizer.decode(ids))

警告: 词汇 'genius--though' 在词汇表中没有找到
警告: 词汇 'enough--so' 在词汇表中没有找到
[53, 44, 154, 1016, 57, 37, 829, 120, 261, -1, 120, 505, 440, -1, 592, 1091, 717]
I HAD always thought Jack Gisburn rather a cheap [UNK] a good fellow [UNK] it was no


In [5]:
text = "genius--though"
print(tokenizer.encode(text))

警告: 词汇 'genius--though' 在词汇表中没有找到
[-1]


In [6]:
import tiktoken

tokenizer = tiktoken.get_encoding("gpt2")
text = "Hello, do you like tea? <|endoftext|> In the sunlit terraces of someunknownPlace."
integers = tokenizer.encode(text, allowed_special={"<|endoftext|>"})
print(integers)

strings = tokenizer.decode(integers)
print(strings)

[15496, 11, 466, 345, 588, 8887, 30, 220, 50256, 554, 262, 4252, 18250, 8812, 2114, 286, 617, 34680, 27271, 13]
Hello, do you like tea? <|endoftext|> In the sunlit terraces of someunknownPlace.


In [7]:
with open("the-verdict.txt", "r", encoding="utf-8") as f:
    raw_text = f.read()

enc_text = tokenizer.encode(raw_text)
print(len(enc_text))

5145


In [8]:
enc_sample = enc_text[50:]
context_size = 4
for i in range(1, context_size + 1):
    context = enc_sample[:i]
    desired = enc_sample[i]
    print(tokenizer.decode(context), "---->", tokenizer.decode([desired]))

 and ---->  established
 and established ---->  himself
 and established himself ---->  in
 and established himself in ---->  a


In [9]:
import torch
from torch.utils.data import Dataset, DataLoader

class GPTDatasetV1(Dataset):
    def __init__(self, text, tokenizer, max_length, stride):
        self.input_ids = []
        self.target_ids = []
        token_ids = tokenizer.encode(text)
        for i in range(0, len(token_ids) - max_length, stride):
            input = token_ids[i:i+max_length]
            target = token_ids[i+1:i+max_length+1]
            self.input_ids.append(torch.tensor(input))
            self.target_ids.append(torch.tensor(target))

    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, idx):
        return self.input_ids[idx], self.target_ids[idx]

In [10]:
with open("the-verdict.txt", "r", encoding="utf-8") as f:
    raw_text = f.read()

tokenizer = tiktoken.get_encoding("gpt2")
dataset = GPTDatasetV1(raw_text, tokenizer, max_length=4, stride=1)

dataloader = DataLoader(dataset, batch_size=3, shuffle=False)

for i, (input, target) in enumerate(dataloader):
    if i > 2:
        break
    print(f'Batch {i+1} Input: {input}')
    print(f'Batch {i+1} Target: {target}')

Batch 1 Input: tensor([[  40,  367, 2885, 1464],
        [ 367, 2885, 1464, 1807],
        [2885, 1464, 1807, 3619]])
Batch 1 Target: tensor([[ 367, 2885, 1464, 1807],
        [2885, 1464, 1807, 3619],
        [1464, 1807, 3619,  402]])
Batch 2 Input: tensor([[ 1464,  1807,  3619,   402],
        [ 1807,  3619,   402,   271],
        [ 3619,   402,   271, 10899]])
Batch 2 Target: tensor([[ 1807,  3619,   402,   271],
        [ 3619,   402,   271, 10899],
        [  402,   271, 10899,  2138]])
Batch 3 Input: tensor([[  402,   271, 10899,  2138],
        [  271, 10899,  2138,   257],
        [10899,  2138,   257,  7026]])
Batch 3 Target: tensor([[  271, 10899,  2138,   257],
        [10899,  2138,   257,  7026],
        [ 2138,   257,  7026, 15632]])


In [12]:
import tiktoken
import torch

with open("the-verdict.txt",'r',encoding='utf-8') as f:
    raw_text = f.read()

tokenizer = tiktoken.get_encoding("gpt2")
context_length = 4
dataset = GPTDatasetV1(raw_text, tokenizer, max_length=4, stride=1)
dataloader = DataLoader(dataset, batch_size=8, shuffle=False)

vocab_size = tokenizer.n_vocab
output_dim = 256
token_embedding_layer = torch.nn.Embedding(vocab_size, output_dim)
inputs, target = next(iter(dataloader))
token_embeddings = token_embedding_layer(inputs)
pos_embedding_layer = torch.nn.Embedding(context_length, output_dim)
pos_embeddings = pos_embedding_layer(torch.arange(context_length))
input_embeddings = token_embeddings + pos_embeddings
print(input_embeddings.shape)

torch.Size([8, 4, 256])


In [14]:
import torch

inputs = torch.tensor(

[[0.43,0.15,0.89], # Your(X^1)
 [0.55,0.87,0.66], #journey(x^2)
 [0.57, 0.85,0.64], # starts(x^3)
 [0.22, 0.58, 0.33], # with(x^4)
 [0.77, 0.25, 0.10], # one(x^5)
 [0.05, 0.80,0.55]] # step(x^6)
)

query=inputs[1]#将第二个输入词元作为当前查询
attn_scores_2 = torch.empty(inputs. shape[0])

for i, x_i in enumerate(inputs):

    attn_scores_2[i]=torch.dot(x_i,query)#通过向量内积计算注意力分数
print("注意力分数：",attn_scores_2)

attn_weights_2_tmp=attn_scores_2/attn_scores_2.sum()#归化注意力分数
print("注意力权重：",attn_weights_2_tmp)

print("权重求和：",attn_weights_2_tmp.sum())
context_vec_2 = torch.zeros(query.shape)

for i, x_i in enumerate(inputs):

    context_vec_2 += attn_weights_2_tmp[i]*x_i #加权求和

print("当前查询的上下文向量：",context_vec_2)

attn_score = inputs @ inputs.T
attn_weights = torch.softmax(attn_score,dim=-1)
all_context_vecs = attn_weights @ inputs
print(all_context_vecs)

注意力分数： tensor([0.9544, 1.4950, 1.4754, 0.8434, 0.7070, 1.0865])
注意力权重： tensor([0.1455, 0.2278, 0.2249, 0.1285, 0.1077, 0.1656])
权重求和： tensor(1.0000)
当前查询的上下文向量： tensor([0.4355, 0.6451, 0.5680])
tensor([[0.4421, 0.5931, 0.5790],
        [0.4419, 0.6515, 0.5683],
        [0.4431, 0.6496, 0.5671],
        [0.4304, 0.6298, 0.5510],
        [0.4671, 0.5910, 0.5266],
        [0.4177, 0.6503, 0.5645]])


In [18]:
class SelfAttention(torch.nn.Module):

    def __init__(self, d_in, d_out, qkv_bias=False):
        super().__init__()
        self.W_query = torch.nn.Linear(d_in, d_out, bias=qkv_bias)
        self.W_key = torch.nn.Linear(d_in, d_out, bias=qkv_bias)
        self.value = torch.nn.Linear(d_in, d_out, bias=qkv_bias)

    def forward(self, x):
        keys = self.W_key(x)
        queries = self.W_query(x)
        values = self.value(x)

        attn_scores = queries @ keys.T
        attn_weights = torch.softmax(attn_scores / keys.shape[-1]**0.5,dim=-1)

        context_vec = attn_weights @ values
        return context_vec

torch.manual_seed(789)
d_in = inputs.shape[1]
d_out = 2
sa = SelfAttention(d_in, d_out)
print(sa(inputs))

tensor([[-0.0739,  0.0713],
        [-0.0748,  0.0703],
        [-0.0749,  0.0702],
        [-0.0760,  0.0685],
        [-0.0763,  0.0679],
        [-0.0754,  0.0693]], grad_fn=<MmBackward0>)
