In [39]:
# Load the Drive helper and mount
from google.colab import drive

# This will prompt for authorization.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [0]:
path =  "/content/drive/My Drive/Movie_Reviews/csv"

In [0]:
import pandas as pd
from sklearn.model_selection import train_test_split
import torch
from torch import nn, optim
import torchtext
import spacy
import time
from torchtext.data import Field, BucketIterator, TabularDataset, LabelField

In [5]:
torch.cuda.is_available()

True

In [6]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda')

In [0]:
train_df = pd.read_csv(f"{path}/train.csv")
test_df = pd.read_csv(f"{path}/test.csv")

In [8]:
print(f'Number of training examples: {len(train_df)}')
print(f'Number of testing examples: {len(test_df)}')

Number of training examples: 25000
Number of testing examples: 25000


In [9]:
train_split, val_split = train_test_split(train_df, test_size=0.3, random_state=1994)
train_split.shape, val_split.shape

((17500, 2), (7500, 2))

In [0]:
train_split.to_csv(f"{path}/train_split.csv", index=False)
val_split.to_csv(f"{path}/val_split.csv", index=False)

In [0]:
en = spacy.load('en')
def tokenize(sentence):
  return [tok.text for tok in en.tokenizer(sentence)]
spacy_tokenizer = Field(tokenize=tokenize)
LABEL = LabelField(dtype = torch.float)

In [12]:
tokenize("i don't love you")

['i', 'do', "n't", 'love', 'you']

In [0]:
# associate the text in the 'English' column with the EN_TEXT field, #
data_fields = [('reviews', spacy_tokenizer), ('pos_or_neg', LABEL)]
train, val, test = TabularDataset.splits(path=path, train='train_split.csv', validation='val_split.csv',
                                         test='test.csv', format='csv', skip_header=True, fields=data_fields)

In [14]:
print(f'Number of training examples: {len(train)}')
print(f'Number of testing examples: {len(val)}')
print(f'Number of training examples: {len(test)}')

Number of training examples: 17500
Number of testing examples: 7500
Number of training examples: 25000


In [15]:
print(vars(train.examples[0])), print(vars(val.examples[0])), print(vars(test.examples[0]))

{'reviews': ['When', 'will', 'the', 'hurting', 'stop', '?', 'I', 'never', 'want', 'to', 'see', 'another', 'version', 'of', 'a', 'Christmas', 'Carol', 'again', '.', 'They', 'keep', 'on', 'making', 'movies', 'with', 'the', 'same', 'story', ',', 'falling', 'over', 'each', 'other', 'in', 'trying', 'to', 'make', 'the', 'movie', 'better', 'then', 'the', 'rest', ',', 'but', 'sadly', 'fail', 'to', 'do', 'so', ',', 'as', 'this', 'is', 'not', 'a', 'good', 'story', '.', 'Moralistic', ',', 'old', '-', 'fashioned', ',', 'conservative', 'happy', '-', 'thinking', '.', 'As', 'if', 'people', 'learn', '.', 'The', 'numerous', 'different', 'versions', 'of', 'this', 'film', 'prove', 'that', 'we', 'don´t', '.'], 'pos_or_neg': '0'}
{'reviews': ['Kol', ',', 'space', 'prisoner', 'on', 'space', 'death', 'row', ',', 'manages', 'to', 'hijack', 'a', 'space', 'shuttle', 'and', 'escape', 'to', 'the', 'woods', 'of', 'America', 'where', 'he', ',', 'along', 'with', 'some', 'new', 'found', 'friend', 'try', 'to', 'escape

(None, None, None)

In [0]:
MAX_VOCAB_SIZE = 25000

In [17]:
spacy_tokenizer.build_vocab(train, max_size=MAX_VOCAB_SIZE, include_lengths=True)
LABEL.build_vocab(train)
print(f"Unique tokens in TEXT vocabulary: {len(spacy_tokenizer.vocab)}")
print(f"Unique tokens in LABEL vocabulary: {len(LABEL.vocab)}")

Unique tokens in TEXT vocabulary: 25002
Unique tokens in LABEL vocabulary: 2


In [18]:
print(spacy_tokenizer.vocab.freqs.most_common(20))

[('the', 202228), (',', 191811), ('.', 166104), ('a', 109545), ('and', 109108), ('of', 100261), ('to', 93586), ('is', 76219), ('in', 61112), ('I', 53952), ('it', 53535), ('that', 49080), ('"', 44472), ("'s", 43144), ('this', 42114), ('-', 36966), ('/><br', 35439), ('was', 34960), ('as', 30261), ('with', 30017)]


In [19]:
print(spacy_tokenizer.vocab.itos[:10])

['<unk>', '<pad>', 'the', ',', '.', 'a', 'and', 'of', 'to', 'is']


In [20]:
print(LABEL.vocab.stoi)

defaultdict(<function _default_unk_index at 0x7f79236e2840>, {'0': 0, '1': 1})


In [0]:
BATCH_SIZE = 512
train_iterator, valid_iterator, test_iterator = BucketIterator.splits((train, val, test), batch_size = BATCH_SIZE, device=device, sort=False)

In [0]:
class RNN(nn.Module):
  def __init__(self, input_dim, embedding_dim, hidden_dim, output_dim):
    super().__init__()
    self.embedding = nn.Embedding(input_dim, embedding_dim)
    self.rnn = nn.RNN(embedding_dim, hidden_dim)
    self.fc = nn.Linear(hidden_dim, output_dim)
        
  def forward(self, text):
    #text should be in the shape of Sentence Size, batch_length
    embedded = self.embedding(text)
    #embedding outputs  sentence size, batch length , embedding vector_dimensions
    output, hidden = self.rnn(embedded)
    #output is the concatenation of the hidden state from every time step(sent_len, batch_size, hidden_length), whereas hidden(1, batch_size, hidden_length)
    #is simply the final hidden state. We verify this using the assert statement. Note the squeeze method, which is used to remove a dimension of size 1.
    squeezed_hidden = hidden.squeeze(0)
    assert torch.equal(output[-1,:,:], squeezed_hidden)
    x = self.fc(squeezed_hidden)
    return x

  def predict(self, text):
    return self(text).squeeze(1)

In [0]:
INPUT_DIM = 25002
EMBEDDING_DIM = 100
HIDDEN_DIM = 256
OUTPUT_DIM = 1
model = RNN(INPUT_DIM, EMBEDDING_DIM, HIDDEN_DIM, OUTPUT_DIM)

In [24]:
def count_trainable_parameters(model):
  return sum([p.numel() for p in model.parameters() if p.requires_grad])
print(f"The model has {count_trainable_parameters(model)} trainable parameters")

The model has 2592105 trainable parameters


In [0]:
optimizer = optim.SGD(model.parameters(), lr=10e-3)
loss_criterion = nn.BCEWithLogitsLoss()

In [0]:
model = model.to(device)
loss_criterion = loss_criterion.to(device)

In [0]:
def binary_accuracy(preds, target):
  """Returns accuracy per batch, i.e. if you get 8/10 right, this returns 0.8, NOT 8"""
  #rounds prediction to the closest integer.
  rounded_prediction = torch.round(torch.sigmoid(preds))
  correct = (rounded_prediction == target).float()
  acc = correct.sum()/len(correct)
  return acc

In [0]:
def train(model, iterator, optimizer, criterion):
  epoch_loss = 0
  epoch_acc = 0
  model.train()
  for batch in iterator:
    optimizer.zero_grad()
    predictions = model.predict(batch.reviews) #.squeeze(1)
    loss = criterion(predictions, batch.pos_or_neg)
    acc = binary_accuracy(predictions, batch.pos_or_neg)
    loss.backward()
    optimizer.step()
    epoch_loss += loss.item()
    epoch_acc += acc.item()
  return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [0]:
def evaluate(model, iterator, criterion):
  epoch_loss = 0
  epoch_acc = 0
  model.eval()
  #with torch.no_grad():
  for batch in iterator:
    predictions = model.predict(batch.reviews)#.squeeze(1)
    loss = criterion(predictions, batch.pos_or_neg)
    acc = binary_accuracy(predictions, batch.pos_or_neg)
    epoch_loss += loss.item()
    epoch_acc += acc.item()
  return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [0]:
def epoch_time(start_time, end_time):
  elapsed_time = end_time - start_time
  elapsed_mins = int(elapsed_time / 60)
  elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
  return elapsed_mins, elapsed_secs

In [36]:
num_epochs = 5
best_valid_loss = float('-inf')
for epoch in range(num_epochs):
  start_time = time.time()
  train_loss, train_acc = train(model, train_iterator, optimizer, loss_criterion)
  valid_loss, valid_acc = evaluate(model, valid_iterator,  loss_criterion)
  end_time = time.time()
  epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    
  if  best_valid_loss < valid_loss:
    best_valid_loss = valid_loss
    torch.save(model.state_dict(), 'tut1-model.pt')
  
  print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
  print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
  print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%')

Epoch: 01 | Epoch Time: 0m 10s
	Train Loss: 0.693 | Train Acc: 50.35%
	 Val. Loss: 0.694 |  Val. Acc: 49.61%
Epoch: 02 | Epoch Time: 0m 10s
	Train Loss: 0.693 | Train Acc: 49.47%
	 Val. Loss: 0.693 |  Val. Acc: 50.46%
Epoch: 03 | Epoch Time: 0m 10s
	Train Loss: 0.693 | Train Acc: 49.62%
	 Val. Loss: 0.693 |  Val. Acc: 50.46%
Epoch: 04 | Epoch Time: 0m 10s
	Train Loss: 0.693 | Train Acc: 50.10%
	 Val. Loss: 0.693 |  Val. Acc: 50.46%
Epoch: 05 | Epoch Time: 0m 10s
	Train Loss: 0.693 | Train Acc: 49.97%
	 Val. Loss: 0.694 |  Val. Acc: 49.61%


In [38]:
model.load_state_dict(torch.load('tut1-model.pt'))
test_loss, test_acc = evaluate(model, test_iterator, loss_criterion)
print(f'Test Loss: {test_loss:.3f} | Test Acc: {test_acc*100:.2f}%')

Test Loss: 0.693 | Test Acc: 50.17%


In [33]:
ls

[0m[01;34mdrive[0m/  [01;34msample_data[0m/
