In [None]:
import csv
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torchtext
import numpy as np
import matplotlib.pyplot as plt

# Pre-trained GloVe "6B" word vectors (trained on Wikipedia 2014 and Gigaword 5).
# `glove.stoi` maps a word to its row index and `glove.vectors` holds the embedding table;
# both are used by later cells.
glove = torchtext.vocab.GloVe(name="6B", dim= 100)   # embedding size = 100


In [None]:
# Set up Google Drive: mount it at /content/drive so the project CSV can be read.
# This cell only works inside Google Colab (google.colab is not available elsewhere).
from google.colab import drive
drive.mount('/content/drive')

In [None]:
#load csv
DATA_CSV_PATH = '/content/drive/My Drive/APS360 - AI Fundamentals/Project/Validated Samples.csv'


def get_data(path=DATA_CSV_PATH):
  """Yield the rows of the samples CSV, one list of strings per row.

  Args:
    path: Location of the CSV file. Defaults to the project file on the
      mounted Google Drive.

  Yields:
    list[str]: the fields of each CSV row, in file order.

  The file is opened lazily on the first iteration and is closed as soon as
  iteration finishes (or when the generator is closed / garbage-collected
  after an early ``break``), so repeated calls no longer leak file handles.
  """
  # newline="" lets the csv module do its own newline handling, as the csv
  # documentation requires for files passed to csv.reader.
  with open(path, "rt", encoding="latin-1", newline="") as csv_file:
    yield from csv.reader(csv_file)

# Preview: print columns 1 and 2 of the first row whose column 1 is not the
# literal 'path' (presumably the header row -- confirm against the CSV).
for i, line in enumerate(get_data()):
    if line[1] == 'path':
        continue
    print(line[1], line[2])
    break

In [None]:
def split_phrase(phrase):
  """Lower-case a phrase and split it into word and punctuation tokens.

  The marks . , ; " ! are surrounded with spaces so that each becomes its own
  token; any other character stays attached to its neighbouring word.
  Returns a list of strings (empty for an empty or whitespace-only phrase).
  """
  spaced = phrase
  for mark in (".", ",", ";", "\"", "!"):
    # Pad the mark on both sides so str.split() isolates it.
    spaced = spaced.replace(mark, " " + mark + " ")
  return spaced.lower().split()

# Quick check: the comma and exclamation mark should come out as separate tokens.
split_phrase("test, this! out")

In [None]:
# For rows 0 through 10 of the CSV (11 rows in total), print how many tokens of
# the phrase in column 2 have a GloVe embedding.
for i, line in enumerate(get_data()):
    if i > 10:
        break
    print(sum(1 for w in split_phrase(line[2]) if w in glove.stoi))

In [14]:
def get_phrase_vectors(glove_vector):
  """Build train/validation/test lists of (word-index tensor, label) pairs.

  Every 59th row produced by get_data() is sampled. The phrase in column 2 is
  tokenized with split_phrase(), and tokens missing from ``glove_vector.stoi``
  are dropped; rows left with no known token are skipped.

  Args:
    glove_vector: object exposing ``stoi`` (word -> row index), e.g. the
      torchtext GloVe vectors.

  Returns:
    (train, validation, test): three lists of ``(idxs, label)`` tuples where
    ``idxs`` is a 1-D tensor of GloVe indices and ``label`` is a 0-dim long
    tensor. Rows are routed by ``row_index % 5``: 0-2 -> train, 4 ->
    validation, 3 -> test.
  """
  train, validation, test = [], [], []
  for i, line in enumerate(get_data()):
    # Subsample: only every 59th CSV row is used.
    if i % 59 != 0:
      continue
    # Blank or truncated rows have no phrase column; skip them instead of
    # raising IndexError.
    if len(line) < 3:
      continue
    # Tokenize the whole phrase. The previous code indexed the phrase string a
    # second time with [2], which reduced every phrase to its third character
    # (and crashed on phrases shorter than three characters).
    idxs = [glove_vector.stoi[w]
            for w in split_phrase(line[2])
            if w in glove_vector.stoi]
    if not idxs:
      continue
    idxs = torch.tensor(idxs)
    # Label is 1 unless column 1 equals the literal string 'path'.
    # NOTE(review): that looks like it is 0 only for the header row, which is
    # itself sampled at i == 0 -- confirm which column should carry the label.
    label = torch.tensor(int(line[1] != 'path'), dtype=torch.long)
    bucket = i % 5
    if bucket < 3:  # 60%
      train.append((idxs, label))
    elif bucket == 4:  # 20%
      validation.append((idxs, label))
    else:  # 20%
      test.append((idxs, label))
  return train, validation, test


In [16]:
train, validation, test = get_phrase_vectors(glove)


def pad_collate(batch):
  """Collate (idxs, label) pairs into one padded index tensor and one label tensor.

  Phrases have different numbers of tokens, and the DataLoader's default
  collate function can only stack tensors of identical shape. Shorter phrases
  are therefore right-padded with index 0 up to the longest phrase in the batch.

  Returns:
    (padded, labels): ``padded`` has shape (batch, longest_phrase) and
    ``labels`` has shape (batch,).
  """
  phrases, labels = zip(*batch)
  # NOTE(review): index 0 is presumably a real GloVe vocabulary entry rather
  # than a dedicated padding token -- confirm this is acceptable for training.
  padded = nn.utils.rnn.pad_sequence(list(phrases), batch_first=True)
  return padded, torch.stack(labels)


train_loader = torch.utils.data.DataLoader(train, batch_size=128, shuffle=True,
                                           collate_fn=pad_collate)
validation_loader = torch.utils.data.DataLoader(validation, batch_size=128, shuffle=True,
                                                collate_fn=pad_collate)
test_loader = torch.utils.data.DataLoader(test, batch_size=128, shuffle=True,
                                          collate_fn=pad_collate)

In [None]:
# Inspect the first training example: its label and its tensor of GloVe word indices.
# `phrase` is reused by the embedding lookup in the next cell.
phrase, label = train[0]
print(label, phrase)

In [None]:
# Standalone embedding layer initialised from the pre-trained GloVe table.
glove_emb = nn.Embedding.from_pretrained(glove.vectors)

# Look up one embedding vector per word index in `phrase`; the cell displays
# the shape of the result.
phrase_emb = glove_emb(phrase)
phrase_emb.shape

In [25]:
class PhraseRNN(nn.Module):
    """Phrase classifier: pre-trained embedding -> single-layer RNN -> linear layer.

    Args:
        input_size: Dimensionality of the word embeddings fed to the RNN.
        hidden_size: Number of features in the RNN hidden state.
        num_classes: Number of outputs produced by the final linear layer.
        embeddings: Optional 2-D float tensor (vocab_size, input_size) used as
            the embedding table. Defaults to the notebook-level
            ``glove.vectors``.

    Raises:
        ValueError: if the embedding table's width differs from ``input_size``.
    """

    def __init__(self, input_size, hidden_size, num_classes, embeddings=None):
        super().__init__()
        if embeddings is None:
            # Fall back to the GloVe table loaded at the top of the notebook.
            embeddings = glove.vectors
        if embeddings.size(1) != input_size:
            raise ValueError(
                "input_size (%d) does not match embedding dimension (%d)"
                % (input_size, embeddings.size(1)))
        # from_pretrained keeps the table frozen by default (freeze=True).
        self.emb = nn.Embedding.from_pretrained(embeddings)
        self.hidden_size = hidden_size
        # Honour the constructor arguments; these sizes used to be hard-coded
        # to 100, which broke any model built with a different hidden_size.
        self.rnn = nn.RNN(input_size=input_size, hidden_size=hidden_size,
                          batch_first=True)
        self.fc = nn.Linear(hidden_size, num_classes)

    def forward(self, x):
        """Map a (batch, seq_len) tensor of word indices to (batch, num_classes) scores."""
        # Look up the embedding
        x = self.emb(x)
        # Initial hidden state, created on the same device/dtype as the input
        # so the model also works after being moved to a GPU.
        h0 = torch.zeros(1, x.size(0), self.hidden_size,
                         device=x.device, dtype=x.dtype)
        # Forward propagate the RNN
        out, _ = self.rnn(x, h0)
        # Pass the output of the last time step to the classifier.
        # NOTE(review): for padded batches the last step of a short phrase is a
        # padding position -- confirm this is intended.
        out = self.fc(out[:, -1, :])
        return out

# input_size=100 matches the dim=100 GloVe vectors; hidden_size=100; num_classes=1.
model = PhraseRNN(100, 100, 1)

In [None]:
# Print the index-tensor shape of the first ten training phrases.
# Note: `i`, `phrase` and `label` stay defined in the kernel after this loop.
for i in range(10):
  phrase, label = train[i]
  print(phrase.shape)

In [None]:
# Pad the first ten training phrases to a common length so they can be fed to
# the model as one batch. pad_sequence fills the missing positions with index 0
# (its default padding value), which the embedding layer then looks up like any
# other word index.
from torch.nn.utils.rnn import pad_sequence

first_ten = [idxs for idxs, _ in train[:10]]
phrase_padded = pad_sequence(first_ten, batch_first=True)
print(phrase_padded.shape)
print(phrase_padded[:2])

# Run the untrained model on the padded batch and check the output shape.
out = model(phrase_padded)
print(out.shape)