Import libraries 

In [1]:
import torch
import torch.nn as nn
from torchtext import data
from torchtext import datasets

from torchtext.vocab import GloVe

Use the tokenizer from spaCy.<br />
After installing spaCy, you also need to download a language model separately.<br />
See https://spacy.io/usage/models for more info.

In [2]:
import spacy

# Load the English pipeline by its full package name.
# Shortcut links such as 'en' are not available in spaCy v3, whereas the
# package name works as long as the model has been installed
# (python -m spacy download en_core_web_sm).
spacy_en = spacy.load('en_core_web_sm')

# Tokenizer function that is handed to the text Field
def tokenizer(text):
    """Split raw text into a list of token strings using spaCy's tokenizer.

    HTML line breaks ("<br />") are replaced by a single space before
    tokenizing.
    """
    without_breaks = " ".join(text.split("<br />"))
    tokens = []
    for tok in spacy_en.tokenizer(without_breaks):
        tokens.append(tok.text)
    return tokens

Define the attributes (fields) of the dataset.<br />
Here we use the TREC dataset as an example. The dataset has only two attributes: text and label.<br />
We specify the preprocessing steps as parameters of each Field. <br />For detailed documentation of all the parameters, see http://torchtext.readthedocs.io/en/latest/data.html#field

In [3]:
# Field describing how the raw text is preprocessed:
#   sequential=True       the data is a sequence of tokens
#   tokenize=tokenizer    use our pre-defined tokenizer (the default is text.split())
#   lower=True            lowercase all words
#   include_lengths=True  also return a list holding the length of each sentence
#   init_token="<SOS>"    a <SOS> is added before the beginning of the text sequence
#   eos_token="<EOS>"     an <EOS> is added after the end of the text sequence
# "use_vocab" is not passed because it already defaults to True: every word of
# the text sequence is converted into an index, and the word -> index mapping
# can be inspected through TEXT.vocab.stoi once the vocabulary has been built.
TEXT = data.Field(
    sequential=True,
    tokenize=tokenizer,
    lower=True,
    include_lengths=True,
    init_token='<SOS>',
    eos_token='<EOS>',
)

# The label is our prediction target; no further processing is needed at this phase
LABEL = data.LabelField()

In [4]:
# Load the built-in TREC dataset, which ships as a train and a test split.
# Some other built-in datasets also provide a validation split, e.g. SST.
# Their splits() returns the datasets in the order train, validation, test:
# train, val, test = datasets.SST.splits(TEXT, LABEL)
train, test = datasets.TREC.splits(text_field=TEXT, label_field=LABEL)

Here we print the first example of the training dataset.<br />
You might notice that each item we get is a torchtext.data.example.Example object.<br />
To read a field of this object, use attribute access, e.g. ".text" or ".label".

In [5]:
# Show only the first item of the training set: its position, the Example
# object itself, its token list and its label (one per line).
for num, batch in enumerate(train):
    print(num, batch, batch.text, batch.label, sep="\n")
    break

0
<torchtext.data.example.Example object at 0x7f1980aa09b0>
['how', 'did', 'serfdom', 'develop', 'in', 'and', 'then', 'leave', 'russia', '?']
DESC


To convert words to indexes, we need to call the build_vocab() method of the torchtext.data.Field object.

In [6]:
# Build the vocabulary from both splits and attach the built-in pretrained
# GloVe embeddings (300 dimensions); max_size limits the vocabulary size.
# Your own embedding model can be plugged in through torchtext.vocab.Vectors.
TEXT.build_vocab(
    train,
    test,
    max_size=30000,
    vectors="glove.6B.300d",
)

# Serialize the labels (map every label string to an index)
LABEL.build_vocab(train, test)

In [7]:
# The pretrained word embeddings selected for our vocabulary are stored
# under TEXT.vocab.vectors (they are copied into an nn.Embedding later on)
TEXT.vocab.vectors

tensor([[ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        ...,
        [-0.2082,  0.1944,  0.2650,  ...,  0.4414,  0.8425,  0.4868],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.4210, -0.5008,  0.5342,  ...,  0.6741,  0.1884,  0.3817]])

In [8]:
# Build an index -> word lookup dictionary for later use.
# TEXT.vocab.itos is the vocabulary's own index -> word list, so we enumerate it.
# Inverting TEXT.vocab.stoi instead is fragile: stoi is a defaultdict, so every
# out-of-vocabulary word that gets looked up is inserted with the <unk> index,
# and an inverted stoi could then report that word for the <unk> index.
lookup = dict(enumerate(TEXT.vocab.itos))
lookup[30]

'first'

Split the dataset into batches

In [9]:
# batch_size = 32: each batch contains 32 instances
#       (training on mini-batches is faster and usually more stable than
#       learning from a single example at a time)
# device: use the GPU when one is available, otherwise fall back to the CPU
# repeat = False: do not repeat the iterator for multiple epochs
# sort_key = lambda...: a key used for sorting examples so that examples with similar lengths are batched
#       together and padding is minimized. The sort_key given to the iterator overrides the sort_key
#       attribute of the Dataset, or defers to it if None.
#       The tokens of an Example are stored in its lowercase `text` attribute.
train_iter, test_iter = data.BucketIterator.splits(
    (train, test),
    batch_size=32,
    device=torch.device("cuda" if torch.cuda.is_available() else "cpu"),
    repeat=False,
    sort_key=lambda x: len(x.text),
)

Here we print out the data in train_iter

In [10]:
for num, batch in enumerate(train_iter):
    # batch is a torchtext.data.batch.Batch object holding 32 instances.
    # It has two fields: text and label.
    # Because of include_lengths=True, the text field is a pair:
    #          the word indexes [len(longest sentence) x 32] and
    #          the length of each sentence [32] in this batch.
    # The label field contains the prediction targets [32].
    print("batch:")
    print(batch, "\n")

    # Print the labels
    print("batch.label:")
    print(batch.label, "\n")

    # Print the first component of batch.text (the sequences of word indexes)
    token_indexes = batch.text[0]
    print("batch.text[0]:")
    print(token_indexes, "\n")

    # Print the first row of batch.text[0], i.e. the first word of every sentence.
    # You may notice that the results are all 2:
    # that's because <SOS> marks the beginning of each sentence, and
    # TEXT.vocab.stoi shows that the index of <SOS> is 2.
    print("batch.text[0][0]:")
    print(token_indexes[0], "\n")

    # Print the actual content of the fifth sentence in this batch.
    # Sentences are stored as columns, so the fifth one is column index 4.
    # The lookup dictionary converts each word index back to its word.
    print("Fifth sentence: ")
    fifth_sentence = token_indexes[:, 4].tolist()
    print(" ".join(lookup[index] for index in fifth_sentence))
    # You might find that torchtext automatically adds padding tokens after <EOS>
    #           so that all sentences in this batch have identical length

    # Only the first batch is inspected
    break

batch:

[torchtext.data.batch.Batch of size 32 from TREC]
	[.text]:('[torch.cuda.LongTensor of size 26x32 (GPU 0)]', '[torch.cuda.LongTensor of size 32 (GPU 0)]')
	[.label]:[torch.cuda.LongTensor of size 32 (GPU 0)] 

batch.label:
tensor([ 1,  1,  4,  4,  0,  3,  1,  0,  3,  0,  1,  3,  4,  2,
         2,  5,  4,  1,  1,  2,  1,  0,  5,  3,  4,  4,  0,  1,
         0,  1,  0,  1], device='cuda:0') 

batch.text[0]:
tensor([[    2,     2,     2,     2,     2,     2,     2,     2,     2,
             2,     2,     2,     2,     2,     2,     2,     2,     2,
             2,     2,     2,     2,     2,     2,     2,     2,     2,
             2,     2,     2,     2,     2],
        [    6,     6,     6,    29,     6,    11,    11,     6,    11,
             6,    11,    31,     6,     6,     6,     6,     6,     6,
             6,    31,     6,    25,     6,    11,     6,     6,     6,
             6,     6,     6,     6,     6],
        [    7,     7,   132,     7,   212,    28,    24,   

Finally, copy the word embeddings into a torch.nn.Embedding object.
Then you can use this embedding layer in your RNN model! :)

In [11]:
# Read the sizes from the vocabulary instead of hard-coding them, so the layer
# always matches the loaded vectors (300 dimensions for glove.6B.300d) and the
# index of the padding token (1 for this vocabulary).
embedding_dim = TEXT.vocab.vectors.size(1)
pad_index = TEXT.vocab.stoi[TEXT.pad_token]

# First parameter: number of rows, one per word of the vocabulary
# Second parameter: number of dimensions of the embedding model
# padding_idx: index of the padding token
# max_norm = 1: embedding vectors with a norm larger than this value are renormalized to it
embed = nn.Embedding(len(TEXT.vocab), embedding_dim, padding_idx=pad_index, max_norm=1)

# Copy the word vectors from the vocabulary into the nn.Embedding object
embed.weight.data.copy_(TEXT.vocab.vectors)

tensor([[ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        ...,
        [-0.2082,  0.1944,  0.2650,  ...,  0.4414,  0.8425,  0.4868],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.4210, -0.5008,  0.5342,  ...,  0.6741,  0.1884,  0.3817]])