In [101]:
import urllib.request
import zipfile
import os
from pathlib import Path

# Remote archive location and the local names used while fetching it.
url = "https://archive.ics.uci.edu/static/public/228/sms+spam+collection.zip"
zip_path = "sms_spam_collection.zip"
extracted_path = "sms_spam_collection"
# Final location of the dataset file inside the extraction directory.
data_file_path = Path(extracted_path).joinpath("SMSSpamCollection.tsv")

def download_and_unzip_spam_data(url, zip_path, extracted_path, data_file_path:Path):
    """Download a zip archive, extract it, and rename the dataset file.

    Does nothing (apart from printing a message) when ``data_file_path``
    already exists.

    Args:
        url: Address of the zip archive to download.
        zip_path: Local path the downloaded archive is written to.
        extracted_path: Directory the archive is extracted into.
        data_file_path: Destination of the extracted ``SMSSpamCollection``
            file; a ``Path`` or a plain string.

    Returns:
        None.
    """
    import shutil  # local import: only the download path needs it

    # Accept plain strings too; the original called ``.exists()`` on the raw argument.
    data_file_path = Path(data_file_path)
    if data_file_path.exists():
        print(f'{data_file_path} already exists. Skipping download and extraction.')
        return

    # Stream the response to disk in chunks instead of buffering it all in memory.
    with urllib.request.urlopen(url) as response, open(zip_path, 'wb') as out_file:
        shutil.copyfileobj(response, out_file)

    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
        zip_ref.extractall(extracted_path)

    # The extracted file is named without an extension; rename it to the requested path.
    original_file_path = Path(extracted_path) / "SMSSpamCollection"
    os.rename(original_file_path, data_file_path)
    print(f"File downloaded and saved as {data_file_path}")

# Fetch and extract the dataset; the function returns early if the .tsv file already exists.
download_and_unzip_spam_data(url, zip_path, extracted_path, data_file_path)

sms_spam_collection\SMSSpamCollection.tsv already exists. Skipping download and extraction.


In [102]:
# Show the dataset path object.
data_file_path

WindowsPath('sms_spam_collection/SMSSpamCollection.tsv')

In [103]:
import pandas as pd
# The file is tab-separated and has no header row, so column names are supplied here.
# NOTE(review): this load yields 5,572 rows, while the dataset is commonly described as
# having 5,574 messages; default quote handling may be merging lines that contain a
# bare double quote. Confirm, and consider passing quoting=csv.QUOTE_NONE.
df = pd.read_csv(
    data_file_path,
    sep='\t',
    header=None,
    names=["Label", "Text"],
)
df

Unnamed: 0,Label,Text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [104]:
# Count each class by summing its boolean comparison mask.
(df['Label'] == 'ham').sum(), (df['Label'] == 'spam').sum()

(4825, 747)

In [105]:
# Number of rows per label value.
df['Label'].value_counts()

Label
ham     4825
spam     747
Name: count, dtype: int64

In [106]:
# Shape (rows, columns) of the subset of rows labelled 'spam'.
df[df['Label']=='spam'].shape

(747, 2)

In [107]:
def create_balanced_dataset(df:pd.DataFrame, random_state=123):
    """Return a frame with equally many 'ham' and 'spam' rows.

    The larger class is randomly downsampled to the size of the smaller one.
    Rows keep their original index labels; ham rows come first in the result.

    Args:
        df: Frame with a ``Label`` column holding the strings 'ham' / 'spam'.
        random_state: Seed forwarded to ``DataFrame.sample`` (default 123,
            the value that was previously hard-coded).

    Returns:
        A new DataFrame: the sampled ham rows followed by the spam rows.
    """
    spam_rows = df[df['Label'] == 'spam']
    ham_rows = df[df['Label'] == 'ham']
    target = min(len(spam_rows), len(ham_rows))

    # Ham is always passed through sample(), as before, so results for the
    # ham-majority case are unchanged.
    ham_subset = ham_rows.sample(target, random_state=random_state)
    # Previously a ham-minority frame raised ValueError (sample larger than
    # population); downsample spam instead in that case.
    if len(spam_rows) > target:
        spam_rows = spam_rows.sample(target, random_state=random_state)

    return pd.concat([ham_subset, spam_rows])

# Balance the classes, then encode the string labels as integers (ham -> 0, spam -> 1).
balanced_df = create_balanced_dataset(df).assign(
    Label=lambda frame: frame["Label"].map({"ham":0,"spam":1})
)
balanced_df['Label'].value_counts()

Label
0    747
1    747
Name: count, dtype: int64

In [108]:
# Because of the earlier sampling, the index still holds the row labels of the
# originally loaded data rather than 0, 1, 2, 3, ...
balanced_df['Label']

4307    0
4138    0
4831    0
4461    0
5440    0
       ..
5537    1
5540    1
5547    1
5566    1
5567    1
Name: Label, Length: 1494, dtype: int64

In [109]:
def random_split(df:pd.DataFrame, train_frac, validation_frac):
    """Shuffle ``df`` and cut it into train / validation / test partitions.

    The rows are shuffled with a fixed seed (123) and re-indexed from 0. The
    first ``train_frac`` share becomes the training set, the next
    ``validation_frac`` share the validation set, and whatever remains the
    test set.

    Returns:
        Tuple ``(train_df, validation_df, test_df)``.
    """
    shuffled = df.sample(frac=1, random_state=123).reset_index(drop=True)
    total = len(shuffled)

    # Both sizes are truncated towards zero; the remainder goes to the test split.
    n_train = int(total * train_frac)
    n_validation = int(total * validation_frac)
    validation_stop = n_train + n_validation

    return (
        shuffled.iloc[:n_train],
        shuffled.iloc[n_train:validation_stop],
        shuffled.iloc[validation_stop:],
    )

# 70% train, 10% validation; the remaining rows form the test split.
train_df, validation_df, test_df = random_split(balanced_df, 0.7, 0.1)

# Persist each split without the index column.
train_df.to_csv("train.csv", index=False)
validation_df.to_csv("validation.csv", index=False)
test_df.to_csv("test.csv", index=False)

In [110]:
import torch
from torch.utils.data import Dataset
import tiktoken

# Load tiktoken's 'gpt2' encoding; it is passed to the datasets for tokenising the messages.
tokenizer = tiktoken.get_encoding('gpt2')

class SpamDataset(Dataset):
    """Dataset of tokenised messages, truncated/padded to one common length.

    The CSV file must provide a ``Text`` column (the messages) and a ``Label``
    column (the targets).

    Args:
        csv_file: Path of the CSV file to read.
        tokenizer: Object with an ``encode(text)`` method returning a list of token ids.
        max_length: Length every sequence is brought to. When ``None`` the
            longest encoded message in the file is used and nothing is truncated.
        pad_token_id: Token id appended to sequences shorter than the common length.
    """

    def __init__(self, csv_file, tokenizer, max_length=None, pad_token_id=50256):
        self.data = pd.read_csv(csv_file)
        self.encoded_texts = [tokenizer.encode(message) for message in self.data['Text']]

        if max_length is not None:
            # Fixed length requested: cut off anything beyond it.
            self.max_length = max_length
            self.encoded_texts = [ids[:max_length] for ids in self.encoded_texts]
        else:
            self.max_length = self._longest_encoded_length()

        # Right-pad every sequence up to the common length.
        padded = []
        for ids in self.encoded_texts:
            missing = self.max_length - len(ids)
            padded.append(ids + [pad_token_id] * missing)
        self.encoded_texts = padded

    def __getitem__(self, index):
        """Return ``(token_ids, label)`` for row ``index`` as two long tensors."""
        row = self.data.iloc[index]
        label = row['Label']
        token_ids = self.encoded_texts[index]
        inputs = torch.tensor(token_ids, dtype=torch.long)
        target = torch.tensor(label, dtype=torch.long)
        return (inputs, target)

    def __len__(self):
        """Number of examples in the dataset."""
        return len(self.encoded_texts)

    def _longest_encoded_length(self):
        """Length of the longest entry in ``encoded_texts`` (0 when there are none)."""
        return max((len(ids) for ids in self.encoded_texts), default=0)

In [111]:
# max_length=None makes the dataset use its longest tokenised message as the padding length.
train_dataset = SpamDataset(csv_file="train.csv", max_length=None, tokenizer=tokenizer)
print(train_dataset.max_length)

120


In [112]:
# Reuse the training padding length so all splits share one sequence length;
# validation/test messages longer than that are truncated by the dataset.
val_dataset = SpamDataset(csv_file="validation.csv", max_length=train_dataset.max_length, tokenizer=tokenizer)
test_dataset = SpamDataset(csv_file="test.csv", max_length=train_dataset.max_length, tokenizer=tokenizer)

In [113]:
from torch.utils.data import DataLoader

num_workers = 0
batch_size = 8

# Seed the global RNG before the loaders are built.
torch.manual_seed(123)

# Training batches are shuffled and the incomplete final batch is dropped.
train_loader = DataLoader(
    train_dataset,
    batch_size=batch_size,
    shuffle=True,
    num_workers=num_workers,
    drop_last=True,
)
# Evaluation loaders keep file order and keep the final partial batch.
val_loader = DataLoader(
    val_dataset,
    batch_size=batch_size,
    shuffle=False,
    num_workers=num_workers,
    drop_last=False,
)
test_loader = DataLoader(
    test_dataset,
    batch_size=batch_size,
    shuffle=False,
    num_workers=num_workers,
    drop_last=False,
)

In [114]:
# Inspect the first training example: a (token ids, label) pair of tensors.
train_dataset[0]

(tensor([   35,  2507,   703,   466,   345,   588,   262,  6940,  2344,    13,
         50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
         50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
         50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
         50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
         50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
         50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
         50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
         50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
         50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
         50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
         50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256]),
 tensor(0))

In [115]:
# Run through every training batch; the loop body is intentionally empty, so after
# the loop the two names hold the last batch the loader produced.
for input_batch, target_batch in train_loader:
    pass
print("Input batch dimensions:", input_batch.shape)
print("Label batch dimensions:", target_batch.shape)

Input batch dimensions: torch.Size([8, 120])
Label batch dimensions: torch.Size([8])


In [117]:
# len() of a DataLoader is its number of batches, not its number of samples.
print(f'Training batch: {len(train_loader)}')
print(f'Validation batch: {len(val_loader)}')
print(f'Test batch: {len(test_loader)}')

Training batch: 130
Validation batch: 19
Test batch: 38


In [118]:
CHOOSE_MODEL = "gpt2-small (124M)"
INPUT_PROMPT = "Every effort moves"

# Settings that differ between the GPT-2 model sizes.
model_configs = {
    "gpt2-small (124M)": {"emb_dim": 768, "n_layers": 12, "n_heads": 12},
    "gpt2-medium (355M)": {"emb_dim": 1024, "n_layers": 24, "n_heads": 16},
    "gpt2-large (774M)": {"emb_dim": 1280, "n_layers": 36, "n_heads": 20},
    "gpt2-xl (1558M)": {"emb_dim": 1600, "n_layers": 48, "n_heads": 25},
}

# Settings shared by every size, followed by the entries of the selected size.
BASE_CONFIG = {
    "vocab_size": 50257, # Vocabulary size
    "context_length": 1024, # Context length
    "drop_rate": 0.0, # Dropout rate
    "qkv_bias": True, # Query-key-value bias
    **model_configs[CHOOSE_MODEL],
}

# The padded sequence length must not exceed the configured context length.
assert train_dataset.max_length <= BASE_CONFIG["context_length"], (
    f"Dataset length {train_dataset.max_length} exceeds model's context "
    f"length {BASE_CONFIG['context_length']}. Reinitialize data sets with "
    f"`max_length={BASE_CONFIG['context_length']}`"
)

In [119]:
# Scratch check: torch.arange(n) yields the integers 0 .. n-1.
torch.arange(10)

tensor([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])

In [125]:
import torch.nn as nn
# Scratch check of embedding shapes: look up 10 positions in a 100 x 256 table,
# add a leading axis, then swap the last two axes.
pos_embed_layer = nn.Embedding(100, 256)
pos_embed_layer(torch.arange(10)).unsqueeze(0).transpose(-2,-1).shape
# torch.tensor([1]).unsqueeze

torch.Size([1, 256, 10])

In [133]:
# Scratch check of causal masking: entries strictly above the diagonal are set to -inf.
torch.rand(5,5).masked_fill_(torch.triu(torch.ones(5,5),diagonal=1).bool(), -torch.inf)

tensor([[0.8486,   -inf,   -inf,   -inf,   -inf],
        [0.5940, 0.2000,   -inf,   -inf,   -inf],
        [0.1176, 0.6319, 0.3527,   -inf,   -inf],
        [0.9003, 0.8077, 0.3576, 0.3578,   -inf],
        [0.9110, 0.7447, 0.8595, 0.4132, 0.9939]])

In [136]:
# Scratch check: slice the 5x5 upper-triangular mask down to its top-left 4x4 corner.
torch.triu(torch.ones(5,5),diagonal=1).bool()[:4,:4]

tensor([[False,  True,  True,  True],
        [False, False,  True,  True],
        [False, False, False,  True],
        [False, False, False, False]])

In [None]:
## Attempt to re-implement the GPT model by hand

import torch
import torch.nn as nn

class MultiHeadAttention(nn.Module):
    """Causal multi-head self-attention over inputs of shape (B, L, D).

    Args:
        emb_dim: Embedding size D; must be divisible by ``n_heads``.
        n_heads: Number of attention heads.
        qkv_bias: Whether the query/key/value projections carry a bias term.
        drop_rate: Dropout probability applied to the attention weights.
        context_length: Side length of the pre-built causal mask.
    """

    def __init__(self, emb_dim, n_heads, qkv_bias, drop_rate, context_length):
        super().__init__()
        assert emb_dim % n_heads == 0, 'error'
        self.emb_dim = emb_dim
        self.n_heads = n_heads
        self.head_dim = emb_dim // n_heads
        self.W_q = nn.Linear(emb_dim, emb_dim, bias=qkv_bias)
        self.W_k = nn.Linear(emb_dim, emb_dim, bias=qkv_bias)
        self.W_v = nn.Linear(emb_dim, emb_dim, bias=qkv_bias)
        self.W_o = nn.Linear(emb_dim, emb_dim, bias=False)
        self.droput = nn.Dropout(drop_rate)
        # True strictly above the diagonal, i.e. at the positions to hide.
        # register_buffer() returns None, so its result must not be assigned.
        future_positions = torch.ones(context_length, context_length).triu(diagonal=1).bool()
        self.register_buffer('mask', future_positions)

    def forward(self, x:torch.Tensor): # shape (B,L,D)
        batch, seq_len, dim = x.shape

        def split_heads(projected):
            # (B, L, D) -> (B, L, H, H_D) -> (B, H, L, H_D)
            return projected.view(batch, seq_len, self.n_heads, self.head_dim).transpose(1, 2)

        # Project first, then split into heads.
        q = split_heads(self.W_q(x))
        k = split_heads(self.W_k(x))
        v = split_heads(self.W_v(x))

        # Scaled dot-product scores, shape (B, H, L, L); masked positions become -inf.
        scores = (q @ k.transpose(-1, -2)) / (self.head_dim ** 0.5)
        scores = scores.masked_fill(self.mask[:seq_len, :seq_len], -torch.inf)

        # Dropout belongs AFTER softmax: it zeroes some attention weights so the
        # corresponding value rows are ignored when the context vector is built.
        # Applied before softmax it would not have that effect, because a score
        # that was zeroed is no longer zero once softmax has been applied.
        weights = self.droput(torch.softmax(scores, dim=-1))

        # Merge the heads back: (B, H, L, H_D) -> (B, L, H, H_D) -> (B, L, D).
        merged = (weights @ v).transpose(1, 2).contiguous().view(batch, seq_len, dim)
        return self.W_o(merged)

class FeedLayer(nn.Module):
    """Feed-forward sub-layer: Linear -> GELU -> Linear.

    The hidden width is four times ``emb_dim``; the output width equals the
    input width.
    """

    def __init__(self, emb_dim):
        super().__init__()
        hidden_dim = emb_dim * 4
        self.lin1 = nn.Linear(emb_dim, hidden_dim)
        self.lin2 = nn.Linear(hidden_dim, emb_dim)
        self.act = nn.GELU()

    def forward(self, x): # (B,L,D)
        # Expand, apply the non-linearity, project back down.
        return self.lin2(self.act(self.lin1(x)))

class TransformerBlk(nn.Module):
    """Transformer block: attention and feed-forward sub-layers with residuals.

    Each sub-layer follows the same pattern: LayerNorm on the input, the
    sub-layer itself, dropout on its output, then the residual addition.

    Args:
        cfg: Dict providing ``emb_dim``, ``n_heads``, ``qkv_bias``,
            ``drop_rate`` and ``context_length``.
    """

    def __init__(self, cfg:dict):
        super().__init__()
        self.norm1 = nn.LayerNorm(cfg['emb_dim'])
        self.norm2 = nn.LayerNorm(cfg['emb_dim'])
        # MultiHeadAttention takes context_length as its fifth argument (it sizes
        # the causal mask); leaving it out raised TypeError on construction.
        self.attn = MultiHeadAttention(
            cfg['emb_dim'],
            cfg['n_heads'],
            cfg['qkv_bias'],
            cfg['drop_rate'],
            cfg['context_length'],
        )
        self.feed = FeedLayer(cfg['emb_dim'])
        self.dropout = nn.Dropout(cfg['drop_rate'])

    def forward(self, x):
        # Attention sub-layer.
        residual = x
        x = self.norm1(x)
        x = self.attn(x)
        # Dropout is applied to the attention output BEFORE the residual is
        # added, not to the sum (an earlier draft had these two steps swapped).
        x = self.dropout(x)
        x = x + residual

        # Feed-forward sub-layer, same pattern.
        residual = x
        x = self.norm2(x)
        x = self.feed(x)
        x = self.dropout(x)
        x = x + residual
        return x

class GPTModel(nn.Module):
    """GPT-style model: embeddings, a stack of transformer blocks, output head.

    Args:
        cfg: Dict providing ``vocab_size``, ``emb_dim``, ``context_length``,
            ``drop_rate`` and ``n_layers``, plus whatever ``TransformerBlk``
            reads from it.
    """

    def __init__(self, cfg:dict):
        super().__init__()
        self.embed_layer = nn.Embedding(cfg['vocab_size'], cfg['emb_dim'])
        self.pos_embed_layer = nn.Embedding(cfg['context_length'], cfg['emb_dim'])
        self.embed_drop = nn.Dropout(cfg['drop_rate'])
        self.trf_blks = nn.Sequential(*[TransformerBlk(cfg) for _ in range(cfg['n_layers'])])
        self.final_norm = nn.LayerNorm(cfg['emb_dim'])
        self.out_head = nn.Linear(cfg['emb_dim'], cfg['vocab_size'])

    def forward(self, inputs): # inputs shape: (B, L)
        """Map token ids of shape (B, L) to logits of shape (B, L, vocab_size)."""
        # Embeddings.
        length = inputs.shape[1]
        embed = self.embed_layer(inputs)
        # Build the position ids on the same device as the inputs; a bare
        # torch.arange() lives on the CPU and breaks once the model is moved
        # to another device.
        positions = torch.arange(length, device=inputs.device)
        pos_embed = self.pos_embed_layer(positions).unsqueeze(0)
        x = embed + pos_embed # shape: (B, L, D)
        # Dropout on the embeddings, so the model does not rely on specific input positions.
        x = self.embed_drop(x)
        # Transformer blocks.
        x = self.trf_blks(x) # shape: (B, L, D)
        x = self.final_norm(x)
        # Output projection.
        x = self.out_head(x)
        return x # shape: (B, L, V)
