In [None]:
import torchtext
from torchtext.data import get_tokenizer
# Tokenize a sample sentence with the tokenizer that torchtext registers
# under the name "basic_english", then show the resulting token list.
sentence = "You can now install TorchText using pip!"
tokenizer = get_tokenizer("basic_english")
tokens = tokenizer(sentence)
print(tokens)


In [None]:
from torchtext.models import T5Transform
# Configuration for the T5 text transform.
padding_idx = 0     # index passed to the transform as the padding id
eos_idx = 1         # index passed to the transform as the end-of-sequence id
max_seq_len = 512   # maximum sequence length passed to the transform
# Location of the pre-trained T5 SentencePiece tokenizer model. The original
# value ("[^1^][1]") was a leftover citation marker, not a usable path/URL.
t5_sp_model_path = "https://download.pytorch.org/models/text/t5_tokenizer_base.model"
transform = T5Transform(
    sp_model_path=t5_sp_model_path,
    max_seq_len=max_seq_len,
    eos_idx=eos_idx,
    padding_idx=padding_idx,
)


In [None]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
# Fetch the NLTK resources used below (downloads are skipped when the data is
# already present locally).
nltk.download('stopwords')
nltk.download('punkt')
# NOTE(review): recent NLTK releases load the tokenizer data from 'punkt_tab'
# instead of 'punkt'; on older releases this call only reports that the
# package is unknown and returns False.
nltk.download('punkt_tab')
example_sent = "This is a sample sentence, showing off the stop words filtration."
tokens = word_tokenize(example_sent)
# Load the stop-word list once and keep it in a set: the original rebuilt the
# list for every token and searched it linearly.
english_stopwords = set(stopwords.words('english'))
# Comparison is case-insensitive, but the surviving tokens keep their casing.
filtered_tokens = [word for word in tokens if word.lower() not in english_stopwords]
print(filtered_tokens)
# Output: ['sample', 'sentence', ',', 'showing', 'stop', 'words', 'filtration', '.']


In [None]:
from nltk.stem import WordNetLemmatizer
import nltk

# WordNetLemmatizer looks words up in the WordNet corpus, which is not fetched
# anywhere else in this notebook; without it the first lemmatize() call raises
# LookupError on a fresh environment.
nltk.download('wordnet')
# NOTE(review): some NLTK releases also require the 'omw-1.4' resource when
# loading WordNet; downloading it is harmless where it is not needed.
nltk.download('omw-1.4')
lemmatizer = WordNetLemmatizer()
# lemmatize() is called without a part-of-speech argument, so its default is used.
lemmatized_tokens = [lemmatizer.lemmatize(word) for word in filtered_tokens]
print(lemmatized_tokens)
# Output: ['sample', 'sentence', ',', 'showing', 'stop', 'word', 'filtration', '.']


In [None]:
from nltk.stem import PorterStemmer
# Reduce every stop-word-filtered token to its Porter stem.
stemmer = PorterStemmer()
stemmed_tokens = list(map(stemmer.stem, filtered_tokens))
print(stemmed_tokens)
# Output: ['sampl', 'sentenc', ',', 'show', 'stop', 'word', 'filtrat', '.']


In [None]:
import torchtext
from torchtext import data, datasets
import random
import torch
import torch.nn as nn
import torch.nn.functional as F
import matplotlib.pyplot as plt
%matplotlib inline
# Get datasets
# NOTE(review): data.Field / datasets.TREC.splits look like the legacy
# torchtext API -- confirm the installed torchtext version still provides them.
text_field = data.Field(lower=True, batch_first=True, tokenize='spacy')
label_field = data.Field(sequential=False, unk_token = None)
train, test = datasets.TREC.splits(text_field, label_field)
print('Train length:',str(len(train)))
print('Test length:',str(len(test)))
# Show some examples
for i in range(10):
    # randrange excludes its upper bound; the original random.randint(0, len(train))
    # could return len(train) itself and raise IndexError.
    random_index = random.randrange(len(train))
    example = train.examples[random_index]
    print(' '.join(example.text), example.label)
class RNN(nn.Module):
    """LSTM sequence classifier: embedding -> LSTM -> linear layer.

    Args:
        num_tokens: Number of rows in the embedding table.
        embedding_dim: Size of each token embedding (LSTM input size).
        rnn_dim: Hidden size of the LSTM (input size of the linear layer).
        num_layers: Number of stacked LSTM layers.
        num_classes: Number of logits produced per sequence.
        padding_idx: Optional index of the padding token. When given, that
            embedding row is kept at zero and each sequence is classified
            from its last non-padding time step (batches are assumed to be
            padded on the right). The default ``None`` keeps the original
            behaviour of always reading the final time step.
    """

    def __init__(self, num_tokens, embedding_dim, rnn_dim, num_layers, num_classes,
                 padding_idx=None):
        super(RNN, self).__init__()
        self.padding_idx = padding_idx
        self.embeddings = nn.Embedding(num_tokens, embedding_dim, padding_idx=padding_idx)
        self.rnn = nn.LSTM(input_size = embedding_dim,
                           hidden_size = rnn_dim,
                           num_layers = num_layers,
                           batch_first = True)
        self.linear = nn.Linear(rnn_dim, num_classes)

    def forward(self, x):
        """Return class logits for a batch of token-index sequences.

        Args:
            x: Integer tensor of token indices, indexed as (batch, time).

        Returns:
            Tensor of shape (batch, num_classes).
        """
        emb = self.embeddings(x)
        rnn_output, _ = self.rnn(emb)
        if self.padding_idx is None:
            # Original behaviour: use the output at the final time step.
            last_step = rnn_output[:, -1, :]
        else:
            # Pick, per sequence, the output at its last real token so that
            # trailing padding does not influence the prediction.
            lengths = (x != self.padding_idx).sum(dim=1)
            last_index = (lengths - 1).clamp(min=0)  # all-padding rows fall back to step 0
            batch_index = torch.arange(x.size(0), device=x.device)
            last_step = rnn_output[batch_index, last_index]
        output = self.linear(last_step)
        return output

# Build the vocabularies from the training split and derive the model sizes
# from them. word_count / classes_count were previously used without ever
# being defined, so this cell failed with NameError on a fresh kernel.
text_field.build_vocab(train)
label_field.build_vocab(train)
word_count = len(text_field.vocab)
classes_count = len(label_field.vocab)

n_hidden = 128  # LSTM hidden size
model = RNN(word_count, embedding_dim=128, rnn_dim=n_hidden, num_layers=1, num_classes=classes_count)
model



In [None]:
# Objective, optimiser and learning-rate schedule.
loss_function = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
# Scheduled learning rate that decays exponentially (factor gamma per scheduler
# step). This could potentially help arrive at a lower minimum.
lr_decay = torch.optim.lr_scheduler.ExponentialLR(optimizer, gamma=0.99)
# Another option could be:
#lr_decay = torch.optim.lr_scheduler.ReduceLROnPlateau( optimizer, factor=0.95, patience=10 )

batch_size, num_epochs = 32, 5

# Both iterators share every option except the batch size.
iterator_options = dict(sort_within_batch=True, shuffle=True, repeat=False)
train_iter = data.BucketIterator(train, batch_size=batch_size, **iterator_options)
test_iter = data.BucketIterator(test, batch_size=30, **iterator_options)

# Reset the bookkeeping variables
accuracies, losses_train, losses_test = [], [], []
train_accuracy = step_count = max_accuracy = 0
# Training loop: one optimiser step per batch; every 50 steps the model is
# evaluated on the whole test iterator and the accuracy is recorded.
for i in range(num_epochs):
    print('Training epoch ',i)
    train_iter.init_epoch()
    for batch in train_iter:
        x_train = batch.text
        y_train = batch.label
        # Forward pass
        y_model = model(x_train)
        # Loss function
        loss = loss_function(y_model, y_train)
        losses_train.append(loss.item())
        # Backward pass (clear the gradients left over from the previous step first)
        optimizer.zero_grad()
        loss.backward()
        # Update parameters
        optimizer.step()
        # Evaluation in test set
        if step_count%50 == 0:
            # Calculate model in test set by pieces
            model.eval() # Set model to eval (if there is dropout, set it to zero)
            y_model_test_list = []
            y_test_list = []
            # No gradients are needed for evaluation; without no_grad() every
            # test batch kept its autograd graph alive until the lists were
            # overwritten, wasting memory and time.
            with torch.no_grad():
                for test_batch in test_iter:
                    y_model_test_list.append(model(test_batch.text))
                    y_test_list.append(test_batch.label)
            model.train() # Set model to train (if there is dropout, not be zero )
            test_iter.init_epoch()
            # Calculate accuracy: fraction of test examples whose highest-scoring
            # class matches the label
            predictions = torch.cat(y_model_test_list).argmax(dim=1)
            accuracy = float( (predictions == torch.cat(y_test_list)).float().mean() )
            print('Step: ', step_count, 'Accuracy in test set:', accuracy)
            accuracies.append(accuracy)
        # NOTE(review): the scheduler is stepped once per batch, so the learning
        # rate shrinks by gamma every step rather than every epoch -- confirm
        # this is intended.
        lr_decay.step()
        step_count += 1
