In [2]:
import torchtext

# vectors 使用

In [12]:
glove_vocab = torchtext.vocab.Vectors(name='glove.6B.100d.txt',cache='H:\DBAI\word_vec\glove.6B')

In [13]:
examples = ['chip', 'baby', 'Beautiful']
ret = glove_vocab.get_vecs_by_tokens(examples, lower_case_backup=True)
ret.shape

torch.Size([3, 100])

# vocab 使用

In [9]:
from tqdm.notebook import tqdm
import random
import os
def read_imdb(folder='train', data_root=r"H:\DBAI\BenchMark_DataSet\imdb\aclImdb"): 
    data = []
    for label in ['pos', 'neg']:
        folder_name = os.path.join(data_root, folder, label)
        for file in tqdm(os.listdir(folder_name)):
            with open(os.path.join(folder_name, file), 'rb') as f:
                review = f.read().decode('utf-8').replace('\n', '').lower()
                data.append([review, 1 if label == 'pos' else 0])
    random.shuffle(data)
    return data

In [6]:
import collections
import re
def get_tokenized_imdb(data):
    """
    data: list of [string, label]
    """
    def tokenizer(text):
        text = re.sub('\.',' . ',text)
#         text = re.sub('\.',' .',text) 
        text = re.sub('<br />',' ',text) 
        return [tok.lower() for tok in text.split()]
    return [tokenizer(review) for review, _ in data]

def get_vocab_imdb(data):
    tokenized_data = get_tokenized_imdb(data)
    counter = collections.Counter([tk for st in tokenized_data for tk in st])
    return torchtext.vocab.Vocab(counter, min_freq=5)

In [10]:
train_data = read_imdb('train')
vocab = get_vocab_imdb(train_data)

HBox(children=(FloatProgress(value=0.0, max=12500.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=12500.0), HTML(value='')))




In [11]:
words = ['chip', 'baby', 'Beautiful']
[vocab.stoi[word] for word in words]

[9276, 1112, 0]

In [19]:
import torch
def load_pretrained_embedding(words, pretrained_vocab):
    """从预训练好的vocab中提取出words对应的词向量"""
    embed = torch.zeros(len(words), pretrained_vocab.vectors[0].shape[0]) # 初始化为0
    oov_count = 0 # out of vocabulary
    for i, word in enumerate(words):
        try:
            idx = pretrained_vocab.stoi[word]
            embed[i, :] = pretrained_vocab.vectors[idx]
        except KeyError:
            oov_count += 1
#             print(word)
    if oov_count > 0:
        print("There are %d oov words." % oov_count)
    return embed

load_pretrained_embedding(vocab.itos[8:9], glove_vocab)
# vocab.itos[8]

tensor([[-0.5426,  0.4148,  1.0322, -0.4024,  0.4669,  0.2182, -0.0749,  0.4733,
          0.0810, -0.2208, -0.1281, -0.1144,  0.5089,  0.1157,  0.0282, -0.3628,
          0.4382,  0.0475,  0.2028,  0.4986, -0.1007,  0.1327,  0.1697,  0.1165,
          0.3135,  0.2571,  0.0928, -0.5683, -0.5297, -0.0515, -0.6733,  0.9253,
          0.2693,  0.2273,  0.6636,  0.2622,  0.1972,  0.2609,  0.1877, -0.3454,
         -0.4263,  0.1398,  0.5634, -0.5691,  0.1240, -0.1289,  0.7248, -0.2610,
         -0.2631, -0.4360,  0.0789, -0.8415,  0.5160,  1.3997, -0.7646, -3.1453,
         -0.2920, -0.3125,  1.5129,  0.5243,  0.2146,  0.4245, -0.0884, -0.1780,
          1.1876,  0.1058,  0.7657,  0.2191,  0.3582, -0.1164,  0.0933, -0.6248,
         -0.2190,  0.2180,  0.7406, -0.4374,  0.1434,  0.1472, -1.1605, -0.0505,
          0.1268, -0.0144, -0.9868, -0.0913, -1.2054, -0.1197,  0.0478, -0.5400,
          0.5246, -0.7096, -0.3253, -0.1346, -0.4131,  0.3343, -0.0072,  0.3225,
         -0.0442, -1.2969,  

# 其他一整套原生的使用方式（不打算探究）
### Field本来是为了配置数据字段，但被赋予了过多功能
### 数据集的相关操作也被封装起来，但我觉得这在应用层面会有相当多的调整，并不适合封装
### 数据集的抓取倒是一个不错的资源管道


# dataset使用

In [20]:
# set up fields
TEXT = torchtext.data.Field(lower=True, include_lengths=False, batch_first=True)
LABEL = torchtext.data.Field(sequential=False)

# make splits for data
train, test = torchtext.datasets.IMDB.splits(TEXT, LABEL, root=r'H:\DBAI\BenchMark_DataSet')

# build the vocabulary
TEXT.build_vocab(train, 
                 vectors=torchtext.vocab.Vectors(name='glove.6B.100d.txt',
                                                 cache='H:\DBAI\word_vec\glove.6B'))
LABEL.build_vocab(train)

# make iterator for splits
train_iter, test_iter = torchtext.data.BucketIterator.splits(
    (train, test), batch_size=3, device=0)

The `device` argument should be set by using `torch.device` or passing a string as an argument. This behavior will be deprecated soon and currently defaults to cpu.
The `device` argument should be set by using `torch.device` or passing a string as an argument. This behavior will be deprecated soon and currently defaults to cpu.


In [24]:
for idx,batch in enumerate(train_iter):
    print(batch,idx)
    print(batch.text)
    if idx==0:break


[torchtext.data.batch.Batch of size 3 from IMDB]
	[.text]:[torch.LongTensor of size 3x629]
	[.label]:[torch.LongTensor of size 3] 0
tensor([[   208,     19,   1065,  ...,      1,      1,      1],
        [    50,     69,    759,  ...,     18,  20108,   1858],
        [204292,     10,      7,  ...,      1,      1,      1]])


In [25]:
len(train_iter)

8334

In [26]:
help(torchtext.datasets)

Help on package torchtext.datasets in torchtext:

NAME
    torchtext.datasets

PACKAGE CONTENTS
    babi
    imdb
    language_modeling
    nli
    sequence_tagging
    sst
    text_classification
    translation
    trec
    unsupervised_learning

CLASSES
    torch.utils.data.dataset.Dataset(builtins.object)
        torchtext.datasets.text_classification.TextClassificationDataset
        torchtext.datasets.unsupervised_learning.EnWik9
    torchtext.data.dataset.Dataset(torch.utils.data.dataset.Dataset)
        torchtext.datasets.babi.BABI20
        torchtext.datasets.imdb.IMDB
        torchtext.datasets.language_modeling.LanguageModelingDataset
            torchtext.datasets.language_modeling.PennTreebank
            torchtext.datasets.language_modeling.WikiText103
            torchtext.datasets.language_modeling.WikiText2
        torchtext.datasets.sequence_tagging.SequenceTaggingDataset
            torchtext.datasets.sequence_tagging.CoNLL2000Chunking
            torchtext.datasets.