In [1]:
import argparse
import os, sys
import time
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torchtext import data
from torchtext import datasets
import math

from torchtext.vocab import GloVe

In [2]:
import spacy
spacy_en = spacy.load('en')
def tokenizer(text): # create a tokenizer function
    text = text.replace("<br />", " ")
    return [tok.text for tok in spacy_en.tokenizer(text)]

In [3]:
TEXT = data.Field(sequential=True, include_lengths=True, lower=True, 
                  tokenize=tokenizer, init_token='<SOS>', eos_token='<EOS>')
LABEL = data.LabelField()
train, test = datasets.TREC.splits(TEXT, LABEL)

In [4]:
for num, batch in enumerate(train):
    print(num)
    print(batch.text)
    print(batch.label)
    break

0
['how', 'did', 'serfdom', 'develop', 'in', 'and', 'then', 'leave', 'russia', '?']
DESC


In [5]:
TEXT.build_vocab(train, test, vectors="glove.6B.300d", max_size=30000) 
LABEL.build_vocab(train, test)

In [9]:
lookup = {b:a for a, b in TEXT.vocab.stoi.items()}
lookup

{0: '<unk>',
 1: '<pad>',
 2: '<SOS>',
 3: '<EOS>',
 4: '?',
 5: 'the',
 6: 'what',
 7: 'is',
 8: 'of',
 9: 'in',
 10: 'a',
 11: 'how',
 12: "'s",
 13: 'was',
 14: 'who',
 15: 'to',
 16: ',',
 17: 'are',
 18: 'for',
 19: 'and',
 20: 'did',
 21: 'does',
 22: "''",
 23: '``',
 24: 'do',
 25: 'name',
 26: '-',
 27: 'on',
 28: 'many',
 29: 'where',
 30: 'first',
 31: 'when',
 32: 'i',
 33: 'you',
 34: 'can',
 35: 'from',
 36: 'world',
 37: 'which',
 38: 'u.s.',
 39: 'that',
 40: 'most',
 41: 'by',
 42: 'an',
 43: 'country',
 44: 'as',
 45: 'city',
 46: 'with',
 47: 'have',
 48: 'has',
 49: '.',
 50: 'why',
 51: "'",
 52: 'it',
 53: 'there',
 54: 'state',
 55: 'year',
 56: 'be',
 57: 'called',
 58: 'president',
 59: 'at',
 60: 'people',
 61: 'get',
 62: 'were',
 63: 'find',
 64: 'his',
 65: 'american',
 66: 'mean',
 67: 'two',
 68: 'largest',
 69: 'made',
 70: 'war',
 71: 'new',
 72: 'much',
 73: 'fear',
 74: 'between',
 75: 'long',
 76: 'its',
 77: ':',
 78: 'used',
 79: 'word',
 80: 'know

In [8]:
train_iter, test_iter = data.BucketIterator.splits(
          (train, test), batch_size=32, device=torch.device("cuda"), repeat=False, sort_key=lambda x: len(x.Text))

In [18]:
for num, batch in enumerate(train_iter):
    print(batch)
    print(batch.text[0][0])
    for i in range(batch.text[0].size()[0]):
        print(lookup[batch.text[0][i].tolist()[5]])
    break


[torchtext.data.batch.Batch of size 32 from TREC]
	[.text]:('[torch.cuda.LongTensor of size 21x32 (GPU 0)]', '[torch.cuda.LongTensor of size 32 (GPU 0)]')
	[.label]:[torch.cuda.LongTensor of size 32 (GPU 0)]
tensor([ 2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,
         2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,
         2,  2,  2,  2], device='cuda:0')
<SOS>
what
city
has
a
newspaper
called
the
plain
dealer
?
<EOS>
<pad>
<pad>
<pad>
<pad>
<pad>
<pad>
<pad>
<pad>
<pad>
