In [1]:
from data_loader import DataLoader

## Load TSV data

In [4]:
# Options for building the train/validation loaders, collected in one place
# so they are easy to adjust before re-running the notebook.
loader_options = dict(
    train_fn='./review_sampled_train.tsv',  # input file for this notebook
    batch_size=128,
    valid_ratio=0.2,
    device=-1,  # set to 0 to run on the GPU
    max_vocab=999999,
    min_freq=5,
)
loaders = DataLoader(**loader_options)

### Check loader

In [5]:
# Report how many examples sit behind each loader; len() is taken on the
# loader's `.dataset`, not on the loader itself.
train_size = len(loaders.train_loader.dataset)
print("|train|=%d" % train_size)
valid_size = len(loaders.valid_loader.dataset)
print("|valid|=%d" % valid_size)

|train|=8000
|valid|=2000


In [6]:
# Report the number of entries in the text vocabulary and the label vocabulary.
text_vocab = loaders.text.vocab
print("|vocab|=%d" % len(text_vocab))
label_vocab = loaders.label.vocab
print("|label|=%d" % len(label_vocab))

|vocab|=3064
|label|=2


### Get mini-batch tensors

In [7]:
# Take the first mini-batch from the training loader; `data` is reused by
# the later cells that restore text from tensors.
train_batches = iter(loaders.train_loader)
data = next(train_batches)

# Show the shape of the text tensor, then the label tensor.
for field_tensor in (data.text, data.label):
    print(field_tensor.shape)

torch.Size([128, 3])
torch.Size([128])


### Use vocab

In [8]:
# List every attribute name the text vocabulary object exposes
# (shown as the cell's output).
text_vocab = loaders.text.vocab
dir(text_vocab)

['UNK',
 '__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getitem__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__len__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__setstate__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_default_unk_index',
 'extend',
 'freqs',
 'itos',
 'load_vectors',
 'set_vectors',
 'stoi',
 'unk_index',
 'vectors']

In [9]:
# `stoi` lookup: token string -> integer index in the vocabulary.
token = '배송'
loaders.text.vocab.stoi[token]

18

In [10]:
# `itos` lookup: integer index -> token string (the reverse of `stoi`).
token_index = 16
loaders.text.vocab.itos[token_index]

'어요'

#### Check most frequent words

In [11]:
# Print the first 50 entries of the vocabulary as "index: token<TAB>count".
# Iterating over the slice `itos[:50]` instead of a fixed range(50) keeps the
# cell from raising IndexError when the vocabulary has fewer than 50 tokens,
# and the vocabulary object is looked up once rather than on every iteration.
# In the recorded output the special tokens <unk> and <pad> show a count of 0.
vocab = loaders.text.vocab
for i, word in enumerate(vocab.itos[:50]):
    print('%5d: %s\t%d' % (i, word, vocab.freqs[word]))

    0: <unk>	0
    1: <pad>	0
    2: .	9325
    3: 고	5513
    4: 이	5090
    5: 하	4466
    6: 도	3658
    7: 네요	3651
    8: 좋	3438
    9: 에	3417
   10: 는	3134
   11: 가	2468
   12: 은	2467
   13: 는데	2064
   14: 잘	1975
   15: 게	1969
   16: 어요	1929
   17: 아요	1893
   18: 배송	1773
   19: 있	1580
   20: 했	1565
   21: 습니다	1458
   22: 안	1402
   23: 을	1402
   24: 한	1296
   25: ~	1288
   26: 구매	1224
   27: ,	1153
   28: 합니다	1148
   29: 같	1075
   30: !	1072
   31: 거	1067
   32: 지	1049
   33: 너무	1039
   34: 다	991
   35: ..	974
   36: 어	973
   37: 가격	854
   38: 되	851
   39: 제품	842
   40: 들	841
   41: 으로	835
   42: 받	829
   43: ?	827
   44: 아	821
   45: 것	815
   46: 았	806
   47: 만	803
   48: 요	792
   49: 나	786


#### Restore text from tensor

In [14]:
# Show the raw token indices of the first example in the mini-batch.
first_example = data.text[0]
print(first_example)

tensor([  8,  21, 350])


In [16]:
# Turn the first example's token indices back into readable text.
# `.tolist()` converts the 1-D index tensor into plain Python ints, so the
# `itos` lookup is indexed with ordinary integers instead of relying on
# implicit conversion of 0-dim tensors.
x = data.text[0]
line = [loaders.text.vocab.itos[x_i] for x_i in x.tolist()]

# Tokens are joined with single spaces, one example per printed line.
print(' '.join(line))

좋 습니다 ~~~
