用torchtext来创建vocabulary，然后把数据读成batch的格式。

In [1]:
import torchtext
import torch
from torchtext.vocab import Vectors
import numpy as np
import random

In [2]:
# Detect GPU support once; later cells branch on this flag.
USE_CUDA = torch.cuda.is_available()

# Seed every random number generator the notebook uses so runs are repeatable.
for _seed_fn in (random.seed, np.random.seed, torch.manual_seed):
    _seed_fn(44)
del _seed_fn
if USE_CUDA:
    torch.cuda.manual_seed(44)

In [3]:
# --- language-model hyperparameters ---
BATCH_SIZE = 32  # sequences per batch; each column of a batch tensor is one sequence
BPTT_LEN = 50  # tokens per chunk, i.e. how many steps back-propagation through time covers
MAX_VOCAB_SIZE = 50000
EMBEDDING_SIZE = 100
HIDDEN_SIZE = 100

# Prefer the GPU whenever CUDA was detected.
device = torch.device("cuda") if USE_CUDA else torch.device("cpu")

![QQ截图20200227131946.png](.\image\QQ截图20200227131946.png)

In [4]:
# Field that lower-cases the text before it is numericalised.
TEXT = torchtext.data.Field(lower=True)
# Read the three text8 splits as language-modeling datasets.
# The directory is written with a forward slash: ".\data" relied on the
# unrecognized escape sequence "\d" and only resolved on Windows, whereas
# "./data" names the same folder on every platform.
train,val,test = torchtext.datasets.LanguageModelingDataset.splits(path="./data",
                                                 train="text8.train.txt",
                                                 test="text8.test.txt",
                                                 validation="text8.dev.txt",
                                                 text_field=TEXT)

In [5]:
# Build the vocabulary from the training split, capped at MAX_VOCAB_SIZE words;
# this is roughly equivalent to Counter(text).most_common(SIZE).
TEXT.build_vocab(train,max_size=MAX_VOCAB_SIZE)

In [6]:
# Peek at the first ten entries of the index-to-token list.
TEXT.vocab.itos[:10]

['<unk>', '<pad>', 'the', 'of', 'and', 'one', 'in', 'a', 'to', 'zero']

![QQ截图20200227131245.png](.\image\QQ截图20200227131245.png)

In [7]:
TEXT.vocab.stoi.get("apple") # index assigned to "apple" in the vocabulary (stoi maps token -> index), not how often the word occurs

1273

定义一个iter，每个batch中有32个句子

In [8]:
# Build one BPTT iterator per split. Each batch exposes .text and .target
# tensors of shape [BPTT_LEN, BATCH_SIZE] (50x32 in the recorded output),
# already placed on `device`.
train_iter,val_iter,test_iter = torchtext.data.BPTTIterator.splits((train,val,test),
                                                                  batch_size=BATCH_SIZE,
                                                                  device=device,
                                                                  bptt_len=BPTT_LEN,
                                                                   repeat=False,
                                                                   shuffle=True
                                                                  )

In [9]:
# Take the first training batch for inspection; `it` is reused by later cells
# to pull the batches that follow.
it = iter(train_iter)
batch = next(it)

In [10]:
# Show the batch summary and the raw token indices of its text tensor.
print(batch)
print(batch.text)


[torchtext.data.batch.Batch of size 32]
	[.text]:[torch.cuda.LongTensor of size 50x32 (GPU 0)]
	[.target]:[torch.cuda.LongTensor of size 50x32 (GPU 0)]
tensor([[4815,   50,    6,  ..., 9116,   33,    7],
        [3143, 2748,  495,  ...,  893,  277,  317],
        [  13,    8,  850,  ...,  664,  824, 1602],
        ...,
        [   8,   34,  522,  ..., 5237,    3,   12],
        [3628, 1266,  968,  ...,    3,    2,    6],
        [   2,   54,   78,  ...,   12,  185, 3027]], device='cuda:0')


In [11]:
# Decode the first column of the batch back into words. The target line is the
# text line shifted by one token (see the recorded output).
print(" ".join(TEXT.vocab.itos[i] for i in batch.text[:,0].data.cpu()))
print(" ".join(TEXT.vocab.itos[i] for i in batch.target[:,0].data.cpu()))

anarchism originated as a term of abuse first used against early working class radicals including the diggers of the english revolution and the sans <unk> of the french revolution whilst the term is still used in a pejorative way to describe any act that used violent means to destroy the
originated as a term of abuse first used against early working class radicals including the diggers of the english revolution and the sans <unk> of the french revolution whilst the term is still used in a pejorative way to describe any act that used violent means to destroy the organization


In [12]:
# Decode the first column of the next five batches: in the recorded output each
# batch continues the text exactly where the previous one stopped.
for j in range(5):
    batch = next(it)
    print(j)
    print(" ".join(TEXT.vocab.itos[i] for i in batch.text[:,0].data.cpu()))
    print()
    print(" ".join(TEXT.vocab.itos[i] for i in batch.target[:,0].data.cpu()))    

0
organization of society it has also been taken up as a positive label by self defined anarchists the word anarchism is derived from the greek without archons ruler chief king anarchism as a political philosophy is the belief that rulers are unnecessary and should be abolished although there are differing

of society it has also been taken up as a positive label by self defined anarchists the word anarchism is derived from the greek without archons ruler chief king anarchism as a political philosophy is the belief that rulers are unnecessary and should be abolished although there are differing interpretations
1
interpretations of what this means anarchism also refers to related social movements that advocate the elimination of authoritarian institutions particularly the state the word anarchy as most anarchists use it does not imply chaos nihilism or <unk> but rather a harmonious anti authoritarian society in place of what are regarded

of what this means anarchism also refers to rela

![QQ截图20200227140704.png](.\image\QQ截图20200227140704.png)

In [13]:
import torch.nn as nn

In [14]:
class LSTMModel(nn.Module):
    """Single-layer LSTM language model: embedding -> LSTM -> projection onto the vocabulary."""

    def __init__(self,vocab_size,embed_size,hidden_size):
        """Create the layers.

        Args:
            vocab_size: number of rows in the embedding table and of output logits.
            embed_size: width of each word embedding.
            hidden_size: width of the LSTM hidden state.
        """
        super(LSTMModel,self).__init__()
        self.embed = nn.Embedding(vocab_size,embed_size)
        self.lstm = nn.LSTM(embed_size,hidden_size)
        self.linear = nn.Linear(hidden_size,vocab_size)
        self.hidden_size = hidden_size

    def forward(self,text,hidden):
        """Score the next token at every position.

        Args:
            text: token indices, shape [seq_length, batch_size].
            hidden: (h, c) pair, each of shape [1, batch_size, hidden_size].

        Returns:
            (logits, hidden): logits have shape [seq_length, batch_size, vocab_size];
            hidden is the updated (h, c) pair.
        """
        emb = self.embed(text)  # [seq_length, batch_size, embed_size]
        output,hidden = self.lstm(emb,hidden)  # output: [seq_length, batch_size, hidden_size]
        # nn.Linear acts on the last dimension, so no flatten/un-flatten round trip is needed
        out_vocab = self.linear(output)  # [seq_length, batch_size, vocab_size]
        return out_vocab,hidden

    def init_hidden(self,bsz,requires_grad=True):
        """Return an all-zero (h, c) pair for a batch of `bsz` sequences.

        The tensors are created from an existing parameter so they inherit the
        model's dtype and device. `requires_grad` is now honoured; it used to be
        ignored and the states always tracked gradients.
        """
        weight = next(self.parameters())
        return (weight.new_zeros((1,bsz,self.hidden_size),requires_grad=requires_grad),
                weight.new_zeros((1,bsz,self.hidden_size),requires_grad=requires_grad))

![QQ截图20200227142001.png](.\image\QQ截图20200227142001.png)

In [15]:
# Instantiate the language model with one output logit per vocabulary entry
# and move it to the GPU when CUDA is available.
model = LSTMModel(vocab_size=len(TEXT.vocab),
                 embed_size=EMBEDDING_SIZE,
                 hidden_size=HIDDEN_SIZE)
if USE_CUDA:
    model = model.to(device)

In [16]:
# Display the first parameter tensor the model yields (the same tensor init_hidden borrows dtype/device from).
next(model.parameters())

Parameter containing:
tensor([[ 1.5862,  1.1253,  1.8306,  ..., -0.4256, -0.5353, -0.5766],
        [ 1.9729,  0.1628, -0.5393,  ..., -0.8279, -1.1062, -0.5235],
        [ 1.4656,  0.6486,  1.0052,  ..., -0.3202, -1.2253, -1.3718],
        ...,
        [-0.2977,  1.0607,  0.1781,  ..., -1.1225, -0.6912,  0.0626],
        [-1.0276, -0.1802,  1.7577,  ...,  0.2666, -1.6965,  0.7423],
        [-0.3957,  0.6342,  0.0749,  ..., -2.2819,  0.9556,  0.9199]],
       device='cuda:0', requires_grad=True)

![QQ截图20200227152348.png](.\image\QQ截图20200227152348.png)
我们需要定义下面的一个function，帮助我们把一个hidden state和计算图之前的历史分离

![QQ截图20200227152529.png](.\image\QQ截图20200227152529.png)

In [17]:
def repackage_hidden(h):
    """Return `h` cut off from the autograd graph that produced it.

    A hidden state is a graph node linked to every earlier step; detaching it
    stops back-propagation from walking (and keeping in memory) that whole
    history. A tensor is detached directly; any other input is treated as an
    iterable of hidden states and rebuilt as a tuple, recursively.
    """
    if not isinstance(h,torch.Tensor):
        # e.g. the (h, c) pair an LSTM carries between batches
        return tuple(map(repackage_hidden,h))
    return h.detach()

In [18]:
# Loss, optimizer and learning-rate schedule for the language model.
loss_fn = nn.CrossEntropyLoss()
learning_rate = 0.001
optimizer = torch.optim.Adam(model.parameters(),lr=learning_rate)
# scheduler used to lower the learning rate automatically
scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer,0.5) # gamma=0.5: every scheduler.step() halves the learning rate

In [19]:
# Number of vocabulary entries; used to flatten the logits before computing the loss.
VOCAB_SIZE = len(TEXT.vocab)

In [20]:
def evaluate(model,data):
    """Return the average per-token cross-entropy of `model` over `data`.

    Args:
        model: language model exposing init_hidden() and returning (logits, hidden).
        data: iterable of batches with .text and .target tensors.

    Returns:
        Token-weighted mean loss over every batch as a float.

    The model is switched to eval mode for the pass and back to train mode
    before returning. Relies on the notebook globals BATCH_SIZE, VOCAB_SIZE,
    loss_fn and repackage_hidden.
    """
    model.eval()
    total_loss = 0.
    total_count = 0.
    with torch.no_grad():
        hidden = model.init_hidden(BATCH_SIZE,requires_grad=False)
        for batch in data:
            # distinct local name so the `data` argument is not shadowed
            text,target = batch.text,batch.target
            hidden = repackage_hidden(hidden)
            output,hidden = model(text,hidden)

            loss = loss_fn(output.view(-1,VOCAB_SIZE),target.view(-1))
            n_tokens = text.numel()
            # accumulate over all batches; plain assignment kept only the last batch
            total_loss += loss.item() * n_tokens
            total_count += n_tokens
    model.train()
    return total_loss / total_count

In [21]:
# NUM_EPOCHS = 2
# GRAD_CLIP = 5.0 #将所有参数梯度控制在5.0以下

# val_losses = []
# for epoch in range(NUM_EPOCHS):
#     model.train()
#     it = iter(train_iter)
#     hidden = model.init_hidden(BATCH_SIZE)
#     for i,batch in enumerate(it):
#         data,target = batch.text,batch.target
#         hidden = repackage_hidden(hidden)
#         output,hidden = model(data,hidden) #bacpropgate through all the iteration
        
#         loss = loss_fn(output.view(-1,VOCAB_SIZE),target.view(-1))#batch_size*target_class_dim*batch_size
#         optimizer.zero_grad()
#         loss.backward()
        
#         #将所有参数梯度控制在5.0以下
#         torch.nn.utils.clip_grad_norm_(model.parameters(),GRAD_CLIP)
        
#         optimizer.step()
#         if i % 100 == 0:
#             print("epoch",epoch,"iteration",i,"loss",loss.item())
            
#         #存模型
#         if i % 9500 == 0:
#             val_loss = evaluate(model,val_iter)
#             if len(val_losses)==0 or val_loss < min(val_losses):
#                 torch.save(model.state_dict(),"lm.pth")
#                 print("save to lm.pth")
#             else:
#                 # learning rate decay
#                 scheduler.step()
#             val_losses.append(val_loss)

epoch 0 iteration 0 loss 10.82917594909668
save to lm.pth
epoch 0 iteration 100 loss 7.412539005279541
epoch 0 iteration 200 loss 7.053678512573242
epoch 0 iteration 300 loss 7.3103928565979
epoch 0 iteration 400 loss 7.003200054168701
epoch 0 iteration 500 loss 6.818564414978027
epoch 0 iteration 600 loss 6.732349395751953
epoch 0 iteration 700 loss 6.72703742980957
epoch 0 iteration 800 loss 6.524006366729736
epoch 0 iteration 900 loss 6.771305561065674
epoch 0 iteration 1000 loss 6.825018405914307
epoch 0 iteration 1100 loss 6.512195587158203
epoch 0 iteration 1200 loss 6.399179458618164
epoch 0 iteration 1300 loss 6.504211902618408
epoch 0 iteration 1400 loss 6.193819046020508
epoch 0 iteration 1500 loss 6.32082462310791
epoch 0 iteration 1600 loss 6.163266181945801
epoch 0 iteration 1700 loss 6.407052993774414
epoch 0 iteration 1800 loss 6.464570999145508
epoch 0 iteration 1900 loss 6.444336414337158
epoch 0 iteration 2000 loss 6.4136881828308105
epoch 0 iteration 2100 loss 6.3824

epoch 1 iteration 8200 loss 5.2688751220703125
epoch 1 iteration 8300 loss 5.1887431144714355
epoch 1 iteration 8400 loss 5.69419002532959
epoch 1 iteration 8500 loss 5.4859418869018555
epoch 1 iteration 8600 loss 5.482663154602051
epoch 1 iteration 8700 loss 5.378697395324707
epoch 1 iteration 8800 loss 5.379619121551514
epoch 1 iteration 8900 loss 5.718132972717285
epoch 1 iteration 9000 loss 5.391956806182861
epoch 1 iteration 9100 loss 5.498380184173584
epoch 1 iteration 9200 loss 5.2552995681762695
epoch 1 iteration 9300 loss 5.188258171081543
epoch 1 iteration 9400 loss 5.246530055999756
epoch 1 iteration 9500 loss 5.611937046051025
save to lm.pth


以下操作相当于将 `batch.text.size()` 返回的元组拆开之后，再把其中的元素相乘，得到该 batch 中元素（token）的总数。

In [25]:
# Element count of the batch: unpack the two-entry size tuple and multiply its dimensions.
np.multiply(*batch.text.size())

672

### 将模型load回来

In [27]:
# Rebuild the architecture, then restore the weights saved during training.
# NOTE(review): this needs TEXT.vocab to exist; the recorded AttributeError
# presumably comes from running the cell after TEXT was rebound to a new Field.
best_model = LSTMModel(vocab_size=len(TEXT.vocab),
                 embed_size=EMBEDDING_SIZE,
                 hidden_size=HIDDEN_SIZE)
if USE_CUDA:
    best_model = best_model.to(device)
# map_location lets a checkpoint written during a GPU run load on a CPU-only machine
best_model.load_state_dict(torch.load("lm.pth",map_location=device))

AttributeError: 'Field' object has no attribute 'vocab'

![QQ截图20200227164812.png](.\image\QQ截图20200227164812.png)

![QQ截图20200227165000.png](.\image\QQ截图20200227165000.png)
![QQ截图20200228141648.png](.\image\QQ截图20200228141648.png)
### multinomial表示logits越大该词被sampling的概率越大，也可以将它改为argmax

![QQ截图20200228143548.png](.\image\QQ截图20200228143548.png)
![QQ截图20200228144055.png](.\image\QQ截图20200228144055.png)

In [24]:
import torch
from torchtext import data

# Seed for the sentiment-classification part of the notebook.
SEED = 1234

torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
# ask cuDNN for deterministic algorithms so GPU runs are repeatable
torch.backends.cudnn.deterministic = True

# NOTE: this rebinds TEXT, replacing the Field (and its vocabulary) used by the
# language-model cells earlier in the notebook.
TEXT = data.Field()
# TEXT = data.Field(tokenize='spacy')
LABEL = data.LabelField(dtype=torch.float)

![QQ截图20200228221206.png](.\image\QQ截图20200228221206.png)

In [34]:
from torchtext import datasets
# Load the IMDB train/test splits through the TEXT and LABEL fields.
# NOTE(review): the recorded output reports 0 examples, so presumably the dataset files were missing — verify the download.
train_data,test_data = datasets.IMDB.splits(TEXT,LABEL)

查看每个数据split有多少条数据

In [35]:
# Report how many examples each split contains.
print(f'Number of training examples:{len(train_data)}')
print(f'Number of testing examples:{len(test_data)}')

Number of training examples:0
Number of testing examples:0


查看一个example

In [28]:
# Show the attributes of the first training example (fails with IndexError when the split is empty).
print(vars(train_data.examples[0]))

IndexError: list index out of range

![QQ截图20200228223514.png](.\image\QQ截图20200228223514.png)
![QQ截图20200228223553.png](.\image\QQ截图20200228223553.png)
![QQ截图20200228224248.png](.\image\QQ截图20200228224248.png)

In [30]:
import random
# random.seed() returns None, so passing its result as random_state only worked
# through the seeding side effect. Seed first, then hand split() a real RNG state.
random.seed(SEED)
train_data,valid_data = train_data.split(random_state=random.getstate())

ValueError: not enough values to unpack (expected 2, got 0)

检查一下现在每个部分有多少条数据

In [31]:
# Report the size of each split after carving the validation set out of the training data.
print(f'Number of training examples:{len(train_data)}')
print(f'Number of validation examples:{len(valid_data)}')
print(f'Number of testing examples:{len(test_data)}')

Number of training examples:0


NameError: name 'valid_data' is not defined

![QQ截图20200228224506.png](.\image\QQ截图20200228224506.png)

In [32]:
#TEXT.build_vocab(train_data,max_size=25000)
#LABEL.build_vocab(train_data)
# Build the text vocabulary (at most 25000 words) from the training split and attach GloVe vectors to it.
TEXT.build_vocab(train_data,max_size=25000,vectors="glove.6B.100d",unk_init=torch.Tensor.normal_)# GloVe provides pretrained word vectors; starting from them makes training much faster
LABEL.build_vocab(train_data)

100%|██████████████████████████████████████████████████████████████████████▉| 399999/400000 [00:16<00:00, 24964.05it/s]


In [33]:
# Report the size of both vocabularies.
print(f"Unique tokens in TEXT vocabulary:{len(TEXT.vocab)}")
print(f"Unique tokens in LABEL vocabulary:{len(LABEL.vocab)}")

Unique tokens in TEXT vocabulary:2
Unique tokens in LABEL vocabulary:0


![QQ截图20200228224926.png](.\image\QQ截图20200228224926.png)
![QQ截图20200228225223.png](.\image\QQ截图20200228225223.png)
![QQ截图20200228225253.png](.\image\QQ截图20200228225253.png)
![QQ截图20200228225315.png](.\image\QQ截图20200228225315.png)

In [None]:
# batch size (this rebinds the BATCH_SIZE used by the language-model cells)
BATCH_SIZE=64
# train on the GPU when one is available, otherwise on the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Build the batch iterators; BucketIterator groups examples of similar length into the same batch
train_iterator,valid_iterator,test_iterator = data.BucketIterator.splits( 
    (train_data,valid_data,test_data),
    batch_size = BATCH_SIZE,
    device = device)
# batch.text is laid out as seq_len * batch_size

In [None]:
# Pull one validation batch for inspection.
batch = next(iter(valid_iterator))

![QQ截图20200229193455.png](.\image\QQ截图20200229193455.png)
![QQ截图20200229193939.png](.\image\QQ截图20200229193939.png)
![QQ截图20200229195200.png](.\image\QQ截图20200229195200.png)
![QQ截图20200229195306.png](.\image\QQ截图20200229195306.png)

In [3]:
import torch
import torch.nn as nn
import torch.nn.functional as F  # avg_pool2d lives in torch.nn.functional; torch.functional does not provide it

class WordAVGModel(nn.Module):
    """Bag-of-embeddings classifier: average the word vectors of a sentence, then apply a linear layer."""

    def __init__(self,vocab_size,embedding_size,output_size,pad_idx):
        """Create the layers.

        Args:
            vocab_size: number of rows in the embedding table.
            embedding_size: width of each word vector.
            output_size: number of output logits per sentence.
            pad_idx: index whose embedding is the padding vector (a zero vector).
        """
        super(WordAVGModel,self).__init__()
        self.embed = nn.Embedding(vocab_size,embedding_size,padding_idx=pad_idx)
        self.linear = nn.Linear(embedding_size,output_size)

    def forward(self,text):
        """Map token indices of shape [seq_len, batch_size] to logits of shape [batch_size, output_size]."""
        embedded = self.embed(text) # [seq_len, batch_size, embedding_size]
        embedded = embedded.permute(1,0,2) # [batch_size, seq_len, embedding_size]
        # A (seq_len, 1) window averages over the whole sentence and leaves the
        # embedding axis untouched, giving [batch_size, 1, embedding_size].
        # Only that singleton axis is squeezed, so a batch holding a single
        # sentence keeps its batch dimension.
        pooled = F.avg_pool2d(embedded,(embedded.shape[1],1)).squeeze(1)
        return self.linear(pooled)

In [44]:
# Sizes for the sentiment models (VOCAB_SIZE and EMBEDDING_SIZE are rebound here).
VOCAB_SIZE = len(TEXT.vocab)
EMBEDDING_SIZE = 100
OUTPUT_SIZE = 1  # one logit per sentence
PAD_IDX = TEXT.vocab.stoi[TEXT.pad_token]  # vocabulary index of the padding token

model = WordAVGModel(vocab_size=VOCAB_SIZE,
                    embedding_size=EMBEDDING_SIZE,
                    output_size=OUTPUT_SIZE,
                    pad_idx=PAD_IDX)

In [16]:
#model
def count_parameters(model):
    """Return the total number of scalar elements across the model's trainable parameters."""
    total = 0
    for param in model.parameters():
        # frozen parameters (requires_grad False) are left out of the count
        if param.requires_grad:
            total += param.numel()
    return total
# Number of trainable scalars in the current model.
count_parameters(model)

5000301

In [22]:
# Element count of the model's first parameter tensor alone.
next(model.parameters()).numel()

5000200

![QQ截图20200229212405.png](.\image\QQ截图20200229212405.png)
这里加入了词向量glove（斯坦福训练的优质词向量）之后模型的收敛会快很多

In [27]:
pretrained_embedding = TEXT.vocab.vectors # embedding matrix initialised from GloVe

# stoi is a mapping, so it is indexed; calling it raised a TypeError
UNK_IDX = TEXT.vocab.stoi[TEXT.unk_token]
# methods whose name ends in "_" (such as copy_) work in place and overwrite the existing data
model.embed.weight.data.copy_(pretrained_embedding)
# zero the rows of the unknown and padding tokens
model.embed.weight.data[UNK_IDX] = torch.zeros(EMBEDDING_SIZE)
model.embed.weight.data[PAD_IDX] = torch.zeros(EMBEDDING_SIZE)

### 训练模型
计算预测的准确率

In [None]:
# Adam with its default settings over all model parameters.
optimizer =torch.optim.Adam(model.parameters())
crit = nn.BCEWithLogitsLoss() # binary cross-entropy on logits: the model output has NOT been passed through a sigmoid (after a sigmoid it would be a probability, not a logit)

# move the model and the loss to the selected device
model = model.to(device)
crit = crit.to(device)

In [None]:
# Accuracy of binary predictions
def binary_accuracy(preds,y):
    """Return the fraction of entries where the rounded sigmoid of `preds` equals `y`.

    Args:
        preds: raw logits.
        y: target labels with the same shape as `preds`.

    Returns:
        A zero-dimensional tensor holding the accuracy.
    """
    predicted_labels = torch.sigmoid(preds).round()
    hits = predicted_labels.eq(y).float()
    return hits.sum() / len(hits)

In [29]:
def train(model,iterator,optimizer,crit):
    """Run one training epoch.

    Args:
        model: classifier returning logits of shape [batch_size, 1].
        iterator: iterable of batches with .text and .label.
        optimizer: optimizer stepping the model's parameters.
        crit: loss taking (logits, labels).

    Returns:
        (mean loss, mean accuracy), both weighted by the number of examples per batch.
    """
    epoch_loss,epoch_acc,total_len = 0.,0.,0.
    model.train()
    for batch in iterator:
        # Drop only the trailing size-1 axis: a bare squeeze() would also remove
        # the batch axis of a single-example batch and break the loss call.
        preds = model(batch.text).squeeze(1)
        loss = crit(preds,batch.label)
        acc = binary_accuracy(preds,batch.label)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # weight by batch size so a smaller final batch does not skew the averages
        n_examples = len(batch.label)
        epoch_loss += loss.item() * n_examples
        epoch_acc += acc.item() * n_examples
        total_len += n_examples

    return epoch_loss / total_len,epoch_acc / total_len

In [32]:
# evaluate does not need an optimizer
def evaluate(model,iterator,crit):
    """Score `model` on `iterator` without updating it.

    NOTE: this definition replaces the earlier evaluate(model, data) written
    for the language model.

    Args:
        model: classifier returning logits of shape [batch_size, 1].
        iterator: iterable of batches with .text and .label.
        crit: loss taking (logits, labels).

    Returns:
        (mean loss, mean accuracy), both weighted by the number of examples per batch.
    """
    epoch_loss,epoch_acc,total_len = 0.,0.,0.
    model.eval()
    # no gradients are needed for scoring, so skip building the autograd graph
    with torch.no_grad():
        for batch in iterator:
            # drop only the trailing size-1 axis so a single-example batch keeps shape [1]
            preds = model(batch.text).squeeze(1)
            loss = crit(preds,batch.label)
            acc = binary_accuracy(preds,batch.label)

            n_examples = len(batch.label)
            epoch_loss += loss.item() * n_examples
            epoch_acc += acc.item() * n_examples
            total_len += n_examples
    model.train()

    return epoch_loss / total_len, epoch_acc / total_len

In [None]:
# Train for N_EPOCHS epochs, validating after each one.
N_EPOCHS = 10
best_valid_acc = 0.
for epoch in range(N_EPOCHS):
    train_loss,train_acc = train(model,train_iterator,optimizer,crit)
    valid_loss,valid_acc = evaluate(model,valid_iterator,crit)
    
    # keep only the checkpoint with the best validation accuracy seen so far
    if valid_acc > best_valid_acc:
        best_valid_acc = valid_acc
        torch.save(model.state_dict(),"wordavg-model.pth")
    print("Epoch",epoch,"Train Loss",train_loss,"Train Acc",train_acc)
    print("Epoch",epoch,"Valid Loss",valid_loss,"Valid Acc",valid_acc)    

In [2]:
import spacy
# Bound to `nlp` because predict_sentiment() looks the pipeline up under that
# name; `slp` stays as an alias for any cell still using the old name.
nlp = spacy.load("en")
slp = nlp

![QQ截图20200301143004.png](.\image\QQ截图20200301143004.png)
![QQ截图20200301143153.png](.\image\QQ截图20200301143153.png)

In [49]:
class LSTMModel(nn.Module):
    """Two-layer bidirectional LSTM sentence classifier.

    NOTE: this definition replaces the earlier language-model LSTMModel, which
    has a different constructor signature.
    """

    def __init__(self,vocab_size,embedding_size,output_size,pad_idx,hidden_size,dropout):
        """Create the layers.

        Args:
            vocab_size: number of rows in the embedding table.
            embedding_size: width of each word vector.
            output_size: number of output logits per sentence.
            pad_idx: index whose embedding is the padding vector.
            hidden_size: width of the hidden state of each LSTM direction.
            dropout: drop probability applied to the embeddings and to the final state.
        """
        super(LSTMModel,self).__init__()
        # embedding layer
        self.embed = nn.Embedding(vocab_size,embedding_size,padding_idx=pad_idx)
        # bidirectional=True runs two LSTMs at once, one forward and one backward over the sequence
        self.lstm = nn.LSTM(embedding_size,hidden_size,num_layers=2,bidirectional=True)
        # The forward and backward final states are concatenated in forward(),
        # so the classifier receives 2 * hidden_size features.
        self.linear = nn.Linear(hidden_size * 2,output_size)
        # dropout randomly discards part of the signal to limit overfitting
        self.dropout = nn.Dropout(dropout)

    def forward(self,text):
        """Map token indices of shape [seq_len, batch_size] to logits of shape [batch_size, output_size]."""
        embedded = self.embed(text) # [seq_len, batch_size, embedding_size]
        embedded = self.dropout(embedded)
        # with no initial state given, the LSTM starts from all-zero states
        output,(hidden,cell) = self.lstm(embedded)

        # hidden: [num_layers * 2, batch_size, hidden_size]; the last two entries
        # are the top layer's forward and backward final states
        hidden = torch.cat([hidden[-1],hidden[-2]],dim=1) # [batch_size, 2 * hidden_size]
        # no squeeze here: it would remove the batch axis of a single-sentence batch
        hidden = self.dropout(hidden)

        return self.linear(hidden)

In [50]:
# The class defined in the previous cell is LSTMModel; `RNNModel` does not
# exist anywhere in the notebook and raised a NameError.
model = LSTMModel(vocab_size=VOCAB_SIZE,
                 embedding_size=EMBEDDING_SIZE,
                 output_size=OUTPUT_SIZE,
                 pad_idx=PAD_IDX,
                 hidden_size=HIDDEN_SIZE,
                 dropout=0.5)

In [None]:
# Rebuild the vocabulary with GloVe vectors attached, then load those vectors into the model's embedding layer.
TEXT.build_vocab(train_data,max_size=25000,vectors="glove.6B.100d",unk_init=torch.Tensor.normal_)# GloVe provides pretrained word vectors; starting from them makes training much faster
LABEL.build_vocab(train_data)

pretrained_embedding = TEXT.vocab.vectors
model.embed.weight.data.copy_(pretrained_embedding)

# zero the rows of the padding and unknown tokens
UNK_IDX = TEXT.vocab.stoi[TEXT.unk_token]
model.embed.weight.data[PAD_IDX] = torch.zeros(EMBEDDING_SIZE)
model.embed.weight.data[UNK_IDX] = torch.zeros(EMBEDDING_SIZE)

In [None]:
# Fresh optimizer and loss for the LSTM classifier.
optimizer =torch.optim.Adam(model.parameters())
crit = nn.BCEWithLogitsLoss() # binary cross-entropy on logits: the model output has NOT been passed through a sigmoid (after a sigmoid it would be a probability, not a logit)

# move the model and the loss to the selected device
model = model.to(device)
crit = crit.to(device)

In [None]:
# Train the LSTM classifier for N_EPOCH epochs, validating after each one.
N_EPOCH = 10
best_valid_acc = 0.
for epoch in range(N_EPOCH):
    train_loss,train_acc = train(model,train_iterator,optimizer,crit)
    valid_loss,valid_acc = evaluate(model,valid_iterator,crit)
    
    # keep only the checkpoint with the best validation accuracy seen so far
    if valid_acc > best_valid_acc:
        best_valid_acc = valid_acc
        torch.save(model.state_dict(),"lstm-model.pth")
    print("Epoch",epoch,"Train Loss",train_loss,"Train Acc",train_acc)
    print("Epoch",epoch,"Valid Loss",valid_loss,"Valid Acc",valid_acc)    

In [10]:
# INPUT_DIM = len(TEXT.vocab) #输入层大小
# EMBEDDING_DIM = 100 #词嵌入层大小
# HIDDEN_DIM = 256 #隐藏层大小
# OUTPUT_DIM = 1 #输出层大小
# N_LAYERS = 2 #隐藏层叠加层数
# BIDIRECTIONAL = True #设置双向LSTM
# DROPOUT = 0.5 #设置信息保留百分比，防止过拟合
# PAD_IDX = TEXT.vocab.stoi[TEXT.pad_token] #长度过短单词转换成对应下标

# model = LSTM(INPUT_DIM, EMBEDDING_DIM, HIDDEN_DIM, OUTPUT_DIM, 
#             N_LAYERS, BIDIRECTIONAL, DROPOUT, PAD_IDX) #模型的初始化

In [11]:
# pretrained_embeddings = TEXT.vocab.vectors #未经过训练的原始词向量
# model.embedding.weight.data.copy_(pretrained_embeddings) #模型词嵌入层初始化
# UNK_IDX = TEXT.vocab.stoi[TEXT.unk_token] #低频词语转换成对应下标

# #模型词嵌入层低频单词词向量初始化
# model.embedding.weight.data[UNK_IDX] = torch.zeros(EMBEDDING_DIM) 
# #模型词嵌入层长度过短单词词向量初始化
# model.embedding.weight.data[PAD_IDX] = torch.zeros(EMBEDDING_DIM)

In [None]:
# Sentence-level inference entry point
def predict_sentiment(sentence):
    """Return the sigmoid of the model's logit for one raw sentence, as a float.

    Relies on the notebook globals nlp (spaCy pipeline), TEXT, model and device.
    Leaves the model in eval mode.
    """
    # eval mode turns dropout off, so the same sentence always gets the same score
    model.eval()
    # split the sentence into tokens
    tokenized = [tok.text for tok in nlp.tokenizer(sentence)]
    # map every token to its index in the vocabulary
    indexed = [TEXT.vocab.stoi[t] for t in tokenized]
    # build the index tensor on the same device as the model
    tensor = torch.LongTensor(indexed).to(device)
    # add a batch axis of size 1: the shape becomes [len(tensor), 1]
    tensor = tensor.unsqueeze(1)
    # squash the logit into (0, 1); no gradients are needed for inference
    with torch.no_grad():
        prediction = torch.sigmoid(model(tensor))
    # return the predicted probability
    return prediction.item()

Test Loss:0.337 | Test Acc: 86.73%

![QQ截图20200301161515.png](.\image\QQ截图20200301161515.png)

### CNN模型

In [None]:
class CNN(nn.Module):
    """Sentence classifier with one convolution width followed by max-over-time pooling."""

    def __init__(self,vocab_size,embedding_size,output_size,pad_idx,dropout,num_filters,filter_size):
        """Create the layers.

        Args:
            vocab_size: number of rows in the embedding table.
            embedding_size: width of each word vector.
            output_size: number of output logits per sentence.
            pad_idx: index whose embedding is the padding vector.
            dropout: drop probability applied to the pooled features.
            num_filters: number of convolution filters.
            filter_size: number of consecutive tokens each filter covers.
        """
        super(CNN,self).__init__()
        self.embed = nn.Embedding(vocab_size,embedding_size,padding_idx=pad_idx)
        self.conv = nn.Conv2d(in_channels=1,out_channels=num_filters,kernel_size=(filter_size,embedding_size))
        self.linear = nn.Linear(num_filters,output_size)
        self.dropout = nn.Dropout(dropout)

    def forward(self,text):
        """Map token indices of shape [seq_len, batch_size] to logits of shape [batch_size, output_size]."""
        text = text.permute(1,0) # [batch_size, seq_len]
        embedded = self.embed(text) # [batch_size, seq_len, embedding_size]
        embedded = embedded.unsqueeze(1) # [batch_size, 1, seq_len, embedding_size]
        # Functional ops are reached through nn.functional so the class does not
        # depend on which module the notebook-level alias `F` is bound to.
        conved = nn.functional.relu(self.conv(embedded)) # [batch_size, num_filters, seq_len-filter_size+1, 1]
        conved = conved.squeeze(3) # [batch_size, num_filters, seq_len-filter_size+1]
        # max-over-time pooling (the function is max_pool1d; `max_pooled` does not exist)
        pooled = nn.functional.max_pool1d(conved,conved.shape[2]) # [batch_size, num_filters, 1]
        pooled = pooled.squeeze(2)
        pooled = self.dropout(pooled)

        return self.linear(pooled)

In [None]:
# Improved version: several convolutions of different widths run in parallel
class CNN(nn.Module):
    """Sentence classifier with parallel convolutions, max-over-time pooling and a linear output layer.

    NOTE: this definition replaces the single-filter CNN defined before it;
    the last constructor argument is now a collection, `filter_sizes`.
    """

    def __init__(self,vocab_size,embedding_size,output_size,pad_idx,dropout,num_filters,filter_sizes):
        """Create the layers.

        Args:
            vocab_size: number of rows in the embedding table.
            embedding_size: width of each word vector.
            output_size: number of output logits per sentence.
            pad_idx: index whose embedding is the padding vector.
            dropout: drop probability applied to the pooled features.
            num_filters: number of filters per convolution.
            filter_sizes: token-window width of each parallel convolution.
        """
        super(CNN,self).__init__()
        filter_sizes = list(filter_sizes)
        self.embed = nn.Embedding(vocab_size,embedding_size,padding_idx=pad_idx)
        self.convs = nn.ModuleList([
            nn.Conv2d(in_channels=1,out_channels=num_filters,
                     kernel_size=(fs,embedding_size))
            for fs in filter_sizes
        ]) # one convolution per filter size
        # every convolution contributes num_filters pooled features, and all of them are concatenated
        self.linear = nn.Linear(num_filters * len(filter_sizes),output_size)
        self.dropout = nn.Dropout(dropout)

    def forward(self,text):
        """Map token indices of shape [seq_len, batch_size] to logits of shape [batch_size, output_size]."""
        text = text.permute(1,0) # [batch_size, seq_len]
        embedded = self.embed(text) # [batch_size, seq_len, embedding_size]
        embedded = embedded.unsqueeze(1) # [batch_size, 1, seq_len, embedding_size]
        # Functional ops are reached through nn.functional so the class does not
        # depend on which module the notebook-level alias `F` is bound to.
        conved = [nn.functional.relu(conv(embedded)).squeeze(3) for conv in self.convs] # each [batch_size, num_filters, seq_len-fs+1]
        # max-over-time pooling; the trailing length-1 axis is dropped so the results can be concatenated as features
        pooled = [nn.functional.max_pool1d(c,c.shape[2]).squeeze(2) for c in conved] # each [batch_size, num_filters]
        pooled = torch.cat(pooled,dim=1) # [batch_size, len(filter_sizes) * num_filters]
        pooled = self.dropout(pooled)

        return self.linear(pooled)

In [None]:
# Build the multi-filter CNN classifier.
model = CNN(vocab_size=VOCAB_SIZE,
           embedding_size=EMBEDDING_SIZE,
           output_size=OUTPUT_SIZE,
           pad_idx=PAD_IDX,
           num_filters=100,
           filter_sizes=[3,4,5],
           dropout=0.5)

# Load the vocabulary's pretrained vectors into the embedding layer.
pretrained_embedding = TEXT.vocab.vectors
model.embed.weight.data.copy_(pretrained_embedding)

# zero the rows of the padding and unknown tokens
UNK_IDX = TEXT.vocab.stoi[TEXT.unk_token]
model.embed.weight.data[PAD_IDX] = torch.zeros(EMBEDDING_SIZE)
model.embed.weight.data[UNK_IDX] = torch.zeros(EMBEDDING_SIZE)

optimizer =torch.optim.Adam(model.parameters())
crit = nn.BCEWithLogitsLoss() # binary cross-entropy on logits: the model output has NOT been passed through a sigmoid (after a sigmoid it would be a probability, not a logit)

# move the model and the loss to the selected device
model = model.to(device)
crit = crit.to(device)

# Train for N_EPOCH epochs, validating after each one.
N_EPOCH = 10
best_valid_acc = 0.
for epoch in range(N_EPOCH):
    train_loss,train_acc = train(model,train_iterator,optimizer,crit)
    valid_loss,valid_acc = evaluate(model,valid_iterator,crit)
    
    # keep only the checkpoint with the best validation accuracy seen so far
    if valid_acc > best_valid_acc:
        best_valid_acc = valid_acc
        torch.save(model.state_dict(),"cnn-model.pth")
    print("Epoch",epoch,"Train Loss",train_loss,"Train Acc",train_acc)
    print("Epoch",epoch,"Valid Loss",valid_loss,"Valid Acc",valid_acc)

hierarchicalLSTM

In [None]:
# Restore the best CNN checkpoint and score it on the held-out test split.
# map_location lets a checkpoint written during a GPU run load on a CPU-only machine.
model.load_state_dict(torch.load("cnn-model.pth",map_location=device))
test_loss,test_acc = evaluate(model,test_iterator,crit)
# a comma was missing before test_acc, which made the whole cell a SyntaxError
print("cnn model test loss: ",test_loss,"accuracy: ",test_acc)

In [7]:
# Sample record whose 'text' is already split into a list of tokens.
{'text': ['it','is','brilliant'],'label':'pos'}

{'text': ['it', 'is', 'brilliant'], 'label': 'pos'}

In [8]:
# The same sample record with 'text' kept as a single raw string.
{'text': 'It is brilliant','label':'pos'}

{'text': 'It is brilliant', 'label': 'pos'}