## Overview

This tutorial is adapted from [https://github.com/bentrevett/pytorch-sentiment-analysis](https://github.com/bentrevett/pytorch-sentiment-analysis).

In [13]:
import time
import random
import collections

import torch
import torch.nn as nn
import torch.optim as optim

import torchtext
from torchtext import datasets
from torchtext.data import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator

In [2]:
# Single seed shared by every random number generator used in this notebook.
SEED = 1234

# Seed Python's own RNG as well as PyTorch's: `random` is imported above, so
# anything that draws from it would otherwise differ between runs.
random.seed(SEED)
torch.manual_seed(SEED)

# Ask cuDNN for deterministic kernels and turn off its auto-tuner, which may
# otherwise pick different algorithms from one run to the next.
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

In [3]:
# Materialise each split exactly once. Counting with `len(list(split))` reads
# the whole split just to throw it away, and on torchtext releases where
# `IMDB()` hands back one-shot iterators it would leave `train_data` /
# `test_data` exhausted for every later cell. Lists can be re-iterated freely.
train_data, test_data = (list(split) for split in datasets.IMDB())
print(f"Number of training examples: {len(train_data)}")
print(f"Number of testing examples: {len(test_data)}")



Number of training examples: 25000
Number of testing examples: 25000


In [7]:
class Tokenizer:
    """Thin wrapper around a torchtext tokenizer with optional post-processing.

    Args:
        tokenize_fn: Tokenizer name (or callable) forwarded to
            ``torchtext.data.get_tokenizer``.
        lower: When true, every token is lower-cased after tokenisation.
        max_length: When not ``None``, only the first ``max_length`` tokens
            are kept.
        language: Language / model name forwarded to ``get_tokenizer``.
            Defaults to ``"en_core_web_sm"`` (the previously hard-coded
            value); pass ``"en"`` when using ``tokenize_fn="basic_english"``,
            which ``get_tokenizer`` only accepts for that language.
    """

    def __init__(self, tokenize_fn="spacy", lower=True, max_length=None,
                 language="en_core_web_sm"):
        self.tokenize_fn = get_tokenizer(tokenize_fn, language)
        self.lower = lower
        self.max_length = max_length

    def tokenize(self, s):
        """Split ``s`` into tokens, then apply lower-casing and truncation.

        Args:
            s: The raw text to tokenise.

        Returns:
            A list of tokens, lower-cased if ``self.lower`` is set and cut to
            at most ``self.max_length`` entries if a limit was given.
        """
        tokens = self.tokenize_fn(s)

        if self.lower:
            tokens = [token.lower() for token in tokens]
        if self.max_length is not None:
            # Truncate after tokenising so the limit counts tokens, not characters.
            tokens = tokens[:self.max_length]

        return tokens

In [9]:
# Upper bound on the number of tokens kept per text; handed to Tokenizer as
# its `max_length`, which truncates the token list to this many entries.
max_length_ = 500
# Tokenizer instance used by the cells below (all other Tokenizer arguments
# keep their defaults).
tokenizer_ = Tokenizer(max_length=max_length_)

In [16]:
def yield_tokens(dataset, tokenizer=None):
    """Lazily yield the token list of every example in ``dataset``.

    Args:
        dataset: Iterable of 2-item examples; the first item is ignored and
            the second is the text to tokenise.
        tokenizer: Object exposing ``tokenize(text)``. When omitted, the
            notebook-level ``tokenizer_`` is used, as before.

    Yields:
        The list of tokens produced for each example's text.
    """
    # Fall back to the notebook global only when no tokenizer was passed, so
    # the function can also be used (and tested) on its own.
    active_tokenizer = tokenizer_ if tokenizer is None else tokenizer
    for _, text in dataset:
        yield active_tokenizer.tokenize(text)
# Build the vocabulary from the tokenised training examples.
# NOTE(review): no `specials` (e.g. "<unk>", "<pad>") and no default index are
# configured here; presumably out-of-vocabulary lookups will then fail unless a
# later cell sets them up — confirm against the torchtext Vocab docs.
vocab_ = build_vocab_from_iterator(yield_tokens(train_data))

In [None]:
from torchtext import data

# NOTE(review): unexecuted scratch cell. The function is called with no
# arguments; per the torchtext docs it appears to expect a loaded SentencePiece
# model (`sp_model`), so this presumably raises TypeError as written — confirm,
# then either pass a model or drop the cell so Restart & Run All succeeds.
data.functional.sentencepiece_numericalizer()
