In [1]:
# Load the Drive helper and mount
from google.colab import drive

# This will prompt for authorization.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [0]:
path =  "/content/drive/My Drive/Movie_Reviews/csv"

In [0]:
import pandas as pd
from sklearn.model_selection import train_test_split
import torch
from torch import nn, optim
import torchtext
import spacy
import time
from torchtext.data import Field, BucketIterator, TabularDataset, LabelField

In [4]:
torch.cuda.is_available()

True

In [0]:
SEED = 1994
torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

In [6]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda')

In [0]:
train_df = pd.read_csv(f"{path}/train.csv")
test_df = pd.read_csv(f"{path}/test.csv")

In [8]:
print(f'Number of training examples: {len(train_df)}')
print(f'Number of testing examples: {len(test_df)}')

Number of training examples: 25000
Number of testing examples: 25000


In [9]:
train_split, val_split = train_test_split(train_df, test_size=0.3, random_state=1994)
train_split.shape, val_split.shape

((17500, 2), (7500, 2))

In [0]:
train_split.to_csv(f"{path}/train_split.csv", index=False)
val_split.to_csv(f"{path}/val_split.csv", index=False)

In [0]:
en = spacy.load('en')
def tokenize(sentence):
  return [tok.text for tok in en.tokenizer(sentence)]
spacy_tokenizer = Field(tokenize=tokenize, include_lengths=True)
LABEL = LabelField(dtype = torch.float)

In [12]:
tokenize("i don't love you")

['i', 'do', "n't", 'love', 'you']

In [0]:
# associate the text in the 'English' column with the EN_TEXT field, #
data_fields = [('reviews', spacy_tokenizer), ('pos_or_neg', LABEL)]
train, val, test = TabularDataset.splits(path=path, train='train_split.csv', validation='val_split.csv',
                                         test='test.csv', format='csv', skip_header=True, fields=data_fields)

In [14]:
print(f'Number of training examples: {len(train)}')
print(f'Number of testing examples: {len(val)}')
print(f'Number of training examples: {len(test)}')

Number of training examples: 17500
Number of testing examples: 7500
Number of training examples: 25000


In [15]:
print(vars(train.examples[0])), print(vars(val.examples[0])), print(vars(test.examples[0]))

{'reviews': ['When', 'will', 'the', 'hurting', 'stop', '?', 'I', 'never', 'want', 'to', 'see', 'another', 'version', 'of', 'a', 'Christmas', 'Carol', 'again', '.', 'They', 'keep', 'on', 'making', 'movies', 'with', 'the', 'same', 'story', ',', 'falling', 'over', 'each', 'other', 'in', 'trying', 'to', 'make', 'the', 'movie', 'better', 'then', 'the', 'rest', ',', 'but', 'sadly', 'fail', 'to', 'do', 'so', ',', 'as', 'this', 'is', 'not', 'a', 'good', 'story', '.', 'Moralistic', ',', 'old', '-', 'fashioned', ',', 'conservative', 'happy', '-', 'thinking', '.', 'As', 'if', 'people', 'learn', '.', 'The', 'numerous', 'different', 'versions', 'of', 'this', 'film', 'prove', 'that', 'we', 'don´t', '.'], 'pos_or_neg': '0'}
{'reviews': ['Kol', ',', 'space', 'prisoner', 'on', 'space', 'death', 'row', ',', 'manages', 'to', 'hijack', 'a', 'space', 'shuttle', 'and', 'escape', 'to', 'the', 'woods', 'of', 'America', 'where', 'he', ',', 'along', 'with', 'some', 'new', 'found', 'friend', 'try', 'to', 'escape

(None, None, None)

In [0]:
MAX_VOCAB_SIZE = 25000

In [17]:
spacy_tokenizer.build_vocab(train, max_size=MAX_VOCAB_SIZE, vectors="glove.6B.100d", unk_init=torch.Tensor.normal_)
LABEL.build_vocab(train)
print(f"Unique tokens in TEXT vocabulary: {len(spacy_tokenizer.vocab)}")
print(f"Unique tokens in LABEL vocabulary: {len(LABEL.vocab)}")

Unique tokens in TEXT vocabulary: 25002
Unique tokens in LABEL vocabulary: 2


In [18]:
print(spacy_tokenizer.vocab.freqs.most_common(20))

[('the', 202228), (',', 191811), ('.', 166104), ('a', 109545), ('and', 109108), ('of', 100261), ('to', 93586), ('is', 76219), ('in', 61112), ('I', 53952), ('it', 53535), ('that', 49080), ('"', 44472), ("'s", 43144), ('this', 42114), ('-', 36966), ('/><br', 35439), ('was', 34960), ('as', 30261), ('with', 30017)]


In [19]:
print(spacy_tokenizer.vocab.itos[:10])

['<unk>', '<pad>', 'the', ',', '.', 'a', 'and', 'of', 'to', 'is']


In [20]:
print(LABEL.vocab.stoi)

defaultdict(<function _default_unk_index at 0x7f1810e9e840>, {'0': 0, '1': 1})


In [0]:
BATCH_SIZE = 64
train_iterator, valid_iterator, test_iterator = BucketIterator.splits((train, val, test),
                                      sort_key=lambda x: len(x.reviews), batch_size=BATCH_SIZE, device=device, sort_within_batch=True)

In [0]:
class CustomLSTM(nn.Module):
  def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, n_rnn_layers, bidirectional, dropout, pad_idx):
    super().__init__()
    self.embedding = nn.Embedding(vocab_size, embedding_dim, pad_idx)
    self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers = n_rnn_layers, bidirectional=bidirectional, dropout=dropout)
    if bidirectional:
      self.fc = nn.Linear(2*hidden_dim, output_dim)
    else:
      self.fc = nn.Linear(hidden_dim, output_dim)
    self.dropout = nn.Dropout(dropout)

  def forward(self, text, text_lengths):
    #text -> [Sentence Length, batch_size]
    embedded = self.dropout(self.embedding(text))
    #embedded will be [sentence length, batch_size, embedding_dim]
    #pack sequence
    packed_embedded = nn.utils.rnn.pack_padded_sequence(embedded, text_lengths)
    packed_output, (hidden , cell) = self.lstm(packed_embedded)

    #unpack sequence
    output, output_lengths = nn.utils.rnn.pad_packed_sequence(packed_output)
    #output -> [sentence length, batch size, hidden dimensions*num directions]
    #hidden = [num layers * num directions, batch size, hid dim]
    #cell = [num layers * num directions, batch size, hid dim]
        
    #concat the final forward (hidden[-2,:,:]) and backward (hidden[-1,:,:]) hidden layers and apply dropout
    hidden = self.dropout(torch.cat((hidden[-2,:,:], hidden[-1,:,:]), dim=1))

    #hidden = [batch size, hid dim * num directions]
            
    return self.fc(hidden)

  def predict(self, text, text_lengths):
    return self(text, text_lengths).squeeze(1)

In [0]:
INPUT_DIM = 25002
EMBEDDING_DIM = 100
HIDDEN_DIM = 256
OUTPUT_DIM = 1
N_LAYERS = 2
BIDIRECTIONAL = True
DROPOUT = 0.5
PAD_IDX = spacy_tokenizer.vocab.stoi[spacy_tokenizer.pad_token]
model = CustomLSTM(INPUT_DIM, EMBEDDING_DIM, HIDDEN_DIM, OUTPUT_DIM, N_LAYERS, BIDIRECTIONAL, DROPOUT, PAD_IDX)

In [73]:
def count_trainable_parameters(model):
  return sum([p.numel() for p in model.parameters() if p.requires_grad])
print(f"The model has {count_trainable_parameters(model)} trainable parameters")

The model has 4810857 trainable parameters


In [74]:
pretrained_embeddings = spacy_tokenizer.vocab.vectors
pretrained_embeddings.shape

torch.Size([25002, 100])

In [75]:
model.embedding.weight.data.copy_(pretrained_embeddings)

tensor([[ 0.8663, -0.1668, -0.3368,  ...,  0.3293, -0.4030, -0.7492],
        [-0.0711, -1.6034, -0.4121,  ...,  2.0934, -0.0031, -0.3947],
        [-0.0382, -0.2449,  0.7281,  ..., -0.1459,  0.8278,  0.2706],
        ...,
        [ 0.5610,  0.2996,  0.2498,  ..., -0.6200,  0.1011,  0.9162],
        [ 0.0701,  0.4941,  0.1676,  ...,  0.1009, -0.4823,  0.4224],
        [ 2.6203, -0.4006, -0.8117,  ..., -2.3390,  0.5394, -0.2738]])

In [76]:
UNK_IDX = spacy_tokenizer.vocab.stoi[spacy_tokenizer.unk_token]
model.embedding.weight.data[UNK_IDX] = torch.zeros(EMBEDDING_DIM)
model.embedding.weight.data[PAD_IDX] = torch.zeros(EMBEDDING_DIM)
print(model.embedding.weight.data)

tensor([[ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [-0.0382, -0.2449,  0.7281,  ..., -0.1459,  0.8278,  0.2706],
        ...,
        [ 0.5610,  0.2996,  0.2498,  ..., -0.6200,  0.1011,  0.9162],
        [ 0.0701,  0.4941,  0.1676,  ...,  0.1009, -0.4823,  0.4224],
        [ 2.6203, -0.4006, -0.8117,  ..., -2.3390,  0.5394, -0.2738]])


In [0]:
optimizer = optim.Adam(model.parameters())
loss_criterion = nn.BCEWithLogitsLoss()

In [0]:
model = model.to(device)
loss_criterion = loss_criterion.to(device)

In [0]:
def binary_accuracy(preds, target):
  """Returns accuracy per batch, i.e. if you get 8/10 right, this returns 0.8, NOT 8"""
  #rounds prediction to the closest integer.
  rounded_prediction = torch.round(torch.sigmoid(preds))
  correct = (rounded_prediction == target).float()
  acc = correct.sum()/len(correct)
  return acc

In [0]:
def train(model, iterator, optimizer, criterion):
  epoch_loss = 0
  epoch_acc = 0
  model.train()
  for batch in iterator:
    optimizer.zero_grad()
    reviews, review_length = batch.reviews
    predictions = model.predict(reviews, review_length)#.squeeze(1)
    loss = criterion(predictions, batch.pos_or_neg)
    acc = binary_accuracy(predictions, batch.pos_or_neg)
    loss.backward()
    optimizer.step()
    epoch_loss += loss.item()
    epoch_acc += acc.item()
  return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [0]:
def evaluate(model, iterator, criterion):
  epoch_loss = 0
  epoch_acc = 0
  model.eval()
  #with torch.no_grad():
  for batch in iterator:
    reviews, review_length = batch.reviews
    predictions = model.predict(reviews, review_length)#.squeeze(1)
    loss = criterion(predictions, batch.pos_or_neg)
    acc = binary_accuracy(predictions, batch.pos_or_neg)
    epoch_loss += loss.item()
    epoch_acc += acc.item()
  return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [0]:
def epoch_time(start_time, end_time):
  elapsed_time = end_time - start_time
  elapsed_mins = int(elapsed_time / 60)
  elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
  return elapsed_mins, elapsed_secs

In [83]:
num_epochs = 5
best_valid_loss = float('inf')
for epoch in range(num_epochs):
  start_time = time.time()
  train_loss, train_acc = train(model, train_iterator, optimizer, loss_criterion)
  valid_loss, valid_acc = evaluate(model, valid_iterator,  loss_criterion)
  end_time = time.time()
  epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    
  if  valid_loss < best_valid_loss:
    best_valid_loss = valid_loss
    torch.save(model.state_dict(), 'tut2-model.pt')
  
  print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
  print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
  print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%')

Epoch: 01 | Epoch Time: 0m 42s
	Train Loss: 0.657 | Train Acc: 60.41%
	 Val. Loss: 0.569 |  Val. Acc: 71.56%
Epoch: 02 | Epoch Time: 0m 42s
	Train Loss: 0.558 | Train Acc: 71.62%
	 Val. Loss: 0.405 |  Val. Acc: 82.65%
Epoch: 03 | Epoch Time: 0m 42s
	Train Loss: 0.435 | Train Acc: 80.56%
	 Val. Loss: 0.390 |  Val. Acc: 83.33%
Epoch: 04 | Epoch Time: 0m 42s
	Train Loss: 0.345 | Train Acc: 85.38%
	 Val. Loss: 0.399 |  Val. Acc: 84.20%
Epoch: 05 | Epoch Time: 0m 42s
	Train Loss: 0.302 | Train Acc: 87.67%
	 Val. Loss: 0.284 |  Val. Acc: 88.36%


In [84]:
model.load_state_dict(torch.load('tut2-model.pt'))
test_loss, test_acc = evaluate(model, valid_iterator, loss_criterion)
print(f'Test Loss: {test_loss:.3f} | Test Acc: {test_acc*100:.2f}%')

Test Loss: 0.284 | Test Acc: 88.36%


In [0]:
import spacy

def predict_sentiment(model, sentence):
  model.eval()
  tokenized = tokenize(sentence)
  indexed = [spacy_tokenizer.vocab.stoi[t] for t in tokenized]
  length = [len(indexed)]
  tensor = torch.LongTensor(indexed).to(device)
  tensor = tensor.unsqueeze(1)
  length_tensor = torch.LongTensor(length)
  prediction = torch.sigmoid(model.predict(tensor, length_tensor))
  return prediction.item()

In [104]:
#Negative Review
predict_sentiment(model, "This flim is terrible")

0.007143225520849228

In [105]:
#Positive Review
predict_sentiment(model, "This flim is great")

0.9603269696235657