In [0]:
import torch
from torchtext import data

In [0]:
SEED = 1234

torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

In [0]:
TEXT = data.Field(tokenize='spacy')
LABEL = data.LabelField(dtype=torch.float)


In [0]:
from torchtext import datasets

In [7]:
train_data, test_data = datasets.IMDB.splits(TEXT,LABEL)

aclImdb_v1.tar.gz:   0%|          | 164k/84.1M [00:00<00:57, 1.47MB/s]

downloading aclImdb_v1.tar.gz


aclImdb_v1.tar.gz: 100%|██████████| 84.1M/84.1M [00:01<00:00, 65.5MB/s]


In [8]:
print(len(train_data))
print(len(test_data))

25000
25000


In [9]:
print(vars(train_data.examples[0]))

{'text': ['This', 'movie', 'makes', 'a', 'statement', 'about', 'Joseph', 'Smith', ',', 'what', 'he', 'stood', 'for', ',', 'and', 'what', 'the', 'LDS', 'church', 'believes', '.', 'With', 'all', 'the', 'current', 'media', 'coverage', 'of', 'a', 'certain', 'fugitive', 'people', 'have', 'confused', 'the', 'LDS', 'church', 'with', 'the', 'FLDS', 'church', 'and', 'criminal', 'fugitive', 'Warren', 'Jeffs', '.', 'Jeffs', 'is', 'Not', 'associated', 'with', 'the', 'LDS', 'church', 'yet', 'media', 'groups', 'internationally', 'have', 'asked', 'for', 'comments', 'about', 'Jeffs', 'from', 'The', 'LDS', 'church', '.', 'Jeffs', 'is', 'not', 'mentioned', 'in', 'the', 'movie', 'at', 'all', 'but', 'I', 'think', 'that', 'it', 'is', 'ironic', 'that', 'this', 'movie', 'with', 'all', 'it', "'s", 'points', 'about', 'Joseph', 'also', 'point', 'away', 'from', 'the', 'fews', 'of', 'the', 'FLDS', 'church', 'and', 'their', 'leader', 'at', 'this', 'time', 'in', 'the', 'media', 'world', '.', 'This', 'is', 'a', 'mov

In [0]:
import random
train_data, valid_data = train_data.split(random_state=random.seed(SEED))

In [11]:
print(len(train_data))

17500


In [12]:
TEXT.build_vocab(train_data,max_size=25000,vectors='glove.6B.100d',unk_init=torch.Tensor.normal_)
LABEL.build_vocab(train_data)

.vector_cache/glove.6B.zip: 862MB [06:26, 2.23MB/s]                           
100%|█████████▉| 399630/400000 [00:14<00:00, 26631.42it/s]

In [13]:
print(len(TEXT.vocab))
print(len(LABEL.vocab))

25002
2


In [15]:
print(TEXT.vocab.freqs.most_common(10))

[('the', 203186), (',', 192259), ('.', 166049), ('and', 109479), ('a', 109166), ('of', 101051), ('to', 93837), ('is', 76139), ('in', 61352), ('I', 54641)]


In [16]:
print(TEXT.vocab.itos[:10])
print(LABEL.vocab.stoi)

['<unk>', '<pad>', 'the', ',', '.', 'and', 'a', 'of', 'to', 'is']
defaultdict(<function _default_unk_index at 0x7ff197be1268>, {'neg': 0, 'pos': 1})


In [0]:
BATCH_SIZE = 64

In [0]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [0]:
train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(
    (train_data,valid_data,test_data),
    batch_size=BATCH_SIZE,
    device=device
)

In [0]:
import torch.nn as nn

class RNN(nn.Module):
  def __init__(self, vocab_size, embedding_dim, padding_idx, hidden_dim, num_layers, bidirectional, dropout, output_dim):
    super(RNN,self).__init__()
    self.embedding = nn.Embedding(vocab_size,embedding_dim,padding_idx=padding_idx)
    self.rnn = nn.LSTM(embedding_dim,hidden_dim,num_layers=num_layers,bidirectional=bidirectional,dropout=dropout)
    self.dropout = nn.Dropout(dropout)
    self.fc = nn.Linear(hidden_dim*2,output_dim)
    self.hidden_dim = hidden_dim
    self.num_layers = num_layers

  def forward(self, text, hidden):
    # embedded=[sentence_len,batch_size,embedding_dim]
    embedded = self.dropout(self.embedding(text))
    # output=[sentence_len,batch_size,hidden_dim*num_direction]
    # hidden=[num_layers*num_direction,batch_size,hidden_dim]
    # cell=[num_layers*num_direction,batch_size,hidden_dim]
    oupput, (hidden, cell) = self.rnn(embedded, hidden)
    #pooled=[batch_size,hidden_dim*num_direction]
    pooled = self.dropout(torch.cat((hidden[-2,:,:],hidden[-1,:,:]),dim=1))
    return self.fc(pooled), (hidden, cell)

  def init_hidden(self,BATCH_SIZE,requires_grad=True):
    weight = next(self.parameters())
    return (weight.new_zeros((self.num_layers*2,BATCH_SIZE,self.hidden_dim),requires_grad=requires_grad),
        weight.new_zeros((self.num_layers*2,BATCH_SIZE,self.hidden_dim),requires_grad=requires_grad))

In [0]:
vocab_size = len(TEXT.vocab)
embedding_dim = 100
padding_idx = TEXT.vocab.stoi[TEXT.pad_token]
hidden_dim = 256
num_layers = 2
bidirectional = True
dropout = 0.5 
output_dim = 1

In [0]:
model = RNN(vocab_size, embedding_dim, padding_idx, hidden_dim, num_layers, bidirectional, dropout, output_dim)

In [64]:
pretrained_embeddings = TEXT.vocab.vectors
model.embedding.weight.data.copy_(pretrained_embeddings)

tensor([[-0.1117, -0.4966,  0.1631,  ...,  1.2647, -0.2753, -0.1325],
        [-0.8555, -0.7208,  1.3755,  ...,  0.0825, -1.1314,  0.3997],
        [-0.0382, -0.2449,  0.7281,  ..., -0.1459,  0.8278,  0.2706],
        ...,
        [-0.0259,  0.2626, -0.4150,  ...,  0.2496,  1.0473, -0.8566],
        [-0.3556, -0.2554, -0.1192,  ..., -0.1305,  0.1196, -0.4377],
        [ 0.1901, -0.2545, -0.3214,  ...,  0.1637,  0.2881, -0.1090]])

In [0]:
UNK_IDX = TEXT.vocab.stoi[TEXT.unk_token]
PAD_IDX = TEXT.vocab.stoi[TEXT.pad_token]

model.embedding.weight.data[UNK_IDX] = torch.zeros(embedding_dim)
model.embedding.weight.data[PAD_IDX] = torch.zeros(embedding_dim)

训练模型

In [0]:
import torch.optim as optim

optimizer = optim.Adam(model.parameters())
criterion = nn.BCEWithLogitsLoss()
model = model.to(device)
criterion =criterion.to(device)

In [0]:
def binary_accuracy(preds, y):
  rounded_preds = torch.round(torch.sigmoid(preds))
  correct = (rounded_preds == y).float()
  acc = correct.sum() / len(correct)
  return acc

In [0]:
# Remove this part
def repackage_hidden(h):
    """Wraps hidden states in new Tensors, to detach them from their history."""
    if isinstance(h, torch.Tensor):
        return h.detach()
    else:
        return tuple(repackage_hidden(v) for v in h)

In [0]:
def train(model, iterator, optimizer, criterion):
  epoch_loss = 0
  epoch_acc = 0
  model.train()
  hidden = model.init_hidden(BATCH_SIZE)

  for batch in iterator:
    if len(batch.text[1,:]) == 64:
      optimizer.zero_grad()
      hidden = repackage_hidden(hidden)
      predictions, hidden = model(batch.text,hidden)
      predictions = predictions.squeeze(1)
      loss = criterion(predictions, batch.label)
      acc = binary_accuracy(predictions, batch.label)
      loss.backward()
      optimizer.step()

      epoch_loss += loss.item()
      epoch_acc += acc.item()
    else:
      continue

  return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [0]:
def evaluate(model, iterator, criterion):
  epoch_loss = 0
  epoch_acc = 0
  model.eval()
  hidden = model.init_hidden(BATCH_SIZE)

  with torch.no_grad():
    for batch in iterator:
      if len(batch.text[1,:]) == 64:
        hidden = repackage_hidden(hidden)
        predictions, hidden = model(batch.text, hidden)
        predictions = predictions.squeeze(1)
        loss = criterion(predictions, batch.label)
        acc = binary_accuracy(predictions, batch.label)
        epoch_loss += loss.item()
        epoch_acc += acc.item()
      else:
        continue
  return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [0]:
N_EPOCHS = 5

In [0]:
best_valid_loss = float('inf')

In [68]:
for epoch in range(N_EPOCHS):
  train_loss,train_acc = train(model, train_iterator, optimizer, criterion)
  valid_loss,valid_acc = evaluate(model, valid_iterator, criterion)

  if valid_loss < best_valid_loss:
    best_valid_loss = valid_loss
    torch.save(model.state_dict(),'lstm-model.pt')

  print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
  print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%')
  print('......')

	Train Loss: 0.672 | Train Acc: 56.99%
	 Val. Loss: 0.677 |  Val. Acc: 57.27%
......
	Train Loss: 0.516 | Train Acc: 75.26%
	 Val. Loss: 0.610 |  Val. Acc: 69.86%
......
	Train Loss: 0.409 | Train Acc: 82.28%
	 Val. Loss: 0.506 |  Val. Acc: 72.10%
......
	Train Loss: 0.320 | Train Acc: 86.50%
	 Val. Loss: 0.485 |  Val. Acc: 74.62%
......


In [0]:
import spacy
nlp = spacy.load('en')

In [0]:
def predict_sentiment(sentence):
  hidden = model.init_hidden(1)
  tokenized = [tok.text for tok in nlp.tokenizer(sentence)]
  indexed = [TEXT.vocab.stoi[t] for t in tokenized]
  tensor = torch.LongTensor(indexed).to(device)
  tensor = tensor.unsqueeze(1)
  output, hidden = model(tensor,hidden)
  prediction = torch.sigmoid(output)
  return prediction.item()

In [84]:
predict_sentiment("This film is great")


0.9911580681800842

In [85]:
predict_sentiment("This film is terrible")

0.02054463140666485