In [1]:
from fastai.text.all import *
from tokenizer import BytePairTokenizer
from bpe_train_fast import train_bpe_fast

In [2]:
# --- 1. Load the IMDB sample and train the BPE vocabulary ---
# `uantar_datapath` and `path` are two names bound to the same location.
uantar_datapath = untar_data(URLs.IMDB_SAMPLE)
path = uantar_datapath
df = pd.read_csv(path/'texts.csv')

# Currently disabled: drop rows whose text is blank and rebuild a contiguous index.
# NOTE(review): an earlier note credited this filter with fixing an indexing
# issue -- confirm whether it should be re-enabled.
# df = df[df['text'].str.strip().ne('')].reset_index(drop=True)

# Train byte-pair encoding on the `text` column, requesting vocab_size=2048.
# The three results are consumed later in the notebook: `itos` and `merges`
# build the tokenizer, `stoi` is indexed by byte-string tokens.
itos, stoi, merges = train_bpe_fast(df["text"], vocab_size=2048)

--- Starting Corpus Build ---
  Line 0: Found 85 tokens. First 5: [b'Un', b'-', b'bleeping', b'-', b'believable']
  Line 1: Found 441 tokens. First 5: [b'This', b' is', b' a', b' extremely', b' well']
  Line 2: Found 204 tokens. First 5: [b'Every', b' once', b' in', b' a', b' long']
  Line 3: Found 179 tokens. First 5: [b'Name', b' just', b' says', b' it', b' all']
  Line 4: Found 382 tokens. First 5: [b'This', b' movie', b' succeeds', b' at', b' being']
--- Corpus Build Finished. Found 137626 unique pairs. ---
--- Initial State ---
Heap has 137626 items.
Top 5 most frequent pairs in heap:
  Pair: (b' /><', b'br'), Freq: 2108
  Pair: (b'br', b' /><'), Freq: 2108
  Pair: (b'br', b' />'), Freq: 2010
  Pair: (b' of', b' the'), Freq: 1692
  Pair: (b'.<', b'br'), Freq: 1548
-------------------------


In [None]:
# --- 2. Tokenizer, string vocabulary, and padding id ---
# Build the tokenizer from the trained vocabulary and merge table.
bpe_tok = BytePairTokenizer(itos, merges)

# Decode every byte-string token with latin1, which maps each byte value
# 0-255 to exactly one character, so decoding cannot fail.
vocab_str = list(map(lambda tok_bytes: tok_bytes.decode('latin1'), itos))

# Id of the padding token; raises KeyError if b'<pad>' is absent from `stoi`.
pad_tok_id = stoi[b'<pad>']


class Trncate(Transform):
    "Transform that keeps at most `max_len` leading tokens of a token list"
    def __init__(self, max_len):
        # Upper bound on the number of tokens kept per sample.
        self.max_len = max_len

    def encodes(self, tokens:list) -> list:
        # Plain slice: inputs no longer than the limit come back as a full copy.
        limit = self.max_len
        return tokens[:limit]

class BPENumericalize(Numericalize):
    "Numericalize variant whose `decodes` yields a list of token strings"
    def decodes(self, o:TensorText) -> list:
        # Map each id to its vocab entry and keep the entries separate
        # (no joining into one string). Falsy entries, e.g. empty strings,
        # are skipped.
        tokens = []
        for idx in o:
            tok = self.vocab[idx]
            if tok:
                tokens.append(tok)
        return tokens

# --- 3. Numericalizer and DataBlock ---
# Numericalize takes no padding argument; padding is handled per batch below.
bpe_num = BPENumericalize(vocab=vocab_str)

# Text pipeline: BPE-tokenize, truncate to at most 128 tokens, map to ids.
text_block = TransformBlock(type_tfms=[bpe_tok, Trncate(128), bpe_num])

dblock = DataBlock(
    blocks=(text_block, CategoryBlock),
    get_x=ColReader('text'),
    get_y=ColReader('label'),
    splitter=RandomSplitter(seed=42),
)

# --- 4. DataLoaders, padding each batch via the `before_batch` hook ---
# NOTE(review): `seq_len=128` is passed to `dataloaders`, not to `Pad_Chunk`,
# which has its own `seq_len` parameter -- confirm which one was intended.
pad_batch = Pad_Chunk(pad_idx=pad_tok_id)
dls = dblock.dataloaders(df, bs=16, seq_len=128, before_batch=pad_batch)

# --- 5. Inspect a few decoded samples ---
dls.show_batch(max_n=4)

--- Creating DataLoaders with the CORRECT padding hook ---
--- DataLoaders created successfully! ---


Unnamed: 0,text,category
0,"Come on. The new twist is nearly ok, but from avenging the Elm Street children Freddy is just killing people now. More of the sa",negative
1,"""Some day, we'll walk in the rays of a beautiful sun. Some day, when the world is much brighter""- The 5 Stairsteps ""O-o-h Child""",positive
2,"The pre-release version of 1933's ""Baby Face"" would make an ideal introduction to a corporate seminar on sexual harassment. Ment",positive
3,"Erich Rohmer's ""L'Anglaise et le duc"" makes a perfect companion piece to Peter Watkins' ""La Commune (Paris 1871)."" Both films -s",positive


In [25]:
# Draw a single batch from `dls` and unpack it into inputs `x` and targets `y`.
x, y = dls.one_batch()

In [26]:
# Display the shapes of the input and target tensors from the batch above.
x.shape, y.shape

(torch.Size([16, 128]), torch.Size([16]))

In [27]:
# Display the raw input tensor of the batch (token ids after numericalization).
x

TensorText([[ 65, 109, 101,  ...,  99, 114,  97],
            [ 84, 104, 105,  ..., 105, 115,  32],
            [ 65, 108, 109,  ...,  97, 108, 108],
            ...,
            [119, 101, 108,  ...,  32,  97, 115],
            [ 74,  97,  99,  ...,  66, 117, 116],
            [ 84, 104, 105,  ..., 100,  32, 116]], device='mps:0')