# Sentiment Analyzer

In [1]:
# Load IPython's autoreload extension in mode 2 so that edited modules are
# picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

## Read the data:

In [2]:
import allennlp
from allennlp.data.dataset_readers import StanfordSentimentTreeBankDatasetReader 
import os
import torch
import pdb
from typing import Dict
import torch.optim as optim
from allennlp.training.trainer import Trainer



In [3]:
# Directory holding the treebank files; the loading cell reads train.txt and
# dev.txt from it. Set the SST_DATA_DIR environment variable to point at a
# different location instead of editing the notebook; the fallback keeps the
# original hard-coded path so existing setups continue to work.
input_dir = os.environ.get('SST_DATA_DIR', '/Users/Max/Downloads/trees/')

# Input size handed to the LSTM; the token embedding width has to equal it.
EMBEDDING_DIM = 300
# Hidden size handed to the LSTM.
HIDDEN_DIM = 200

In [4]:
# One reader instance parses both splits; train.txt is read first, then dev.txt.
reader = StanfordSentimentTreeBankDatasetReader()
train_ds, dev_ds = (
    reader.read(os.path.join(input_dir, split_file))
    for split_file in ('train.txt', 'dev.txt')
)

8544it [00:01, 6507.61it/s]
1101it [00:00, 4411.34it/s]


## Initialize the vocabulary and embeddings:

In [5]:
from allennlp.data.vocabulary import Vocabulary 
# Build the vocabulary from the concatenation of both splits, with a
# min_count of 3 for the 'tokens' namespace.
# NOTE(review): the dev split contributes to the vocabulary here — confirm
# that this is intended rather than building it from train_ds alone.
all_instances = train_ds + dev_ds
vocab = Vocabulary.from_instances(all_instances, min_count={'tokens': 3})

100%|██████████| 9645/9645 [00:00<00:00, 79603.42it/s]


In [6]:
from allennlp.modules.token_embedders.embedding import Embedding
from allennlp.modules.text_field_embedders.basic_text_field_embedder import BasicTextFieldEmbedder
# One embedding row per entry in the 'tokens' namespace. The width comes from
# the shared EMBEDDING_DIM constant (rather than a repeated literal 300) because
# the LSTM is constructed with EMBEDDING_DIM as its input size; keeping a single
# source of truth prevents a shape mismatch when the constant is tuned.
token_embedding = Embedding(
    num_embeddings=vocab.get_vocab_size('tokens'),
    embedding_dim=EMBEDDING_DIM,
)

# Register the embedding under the 'tokens' key of the text-field embedder.
word_embedder = BasicTextFieldEmbedder({'tokens': token_embedding})

## Define the model

In [7]:
from allennlp.models import Model
from allennlp.modules.seq2vec_encoders import Seq2VecEncoder, PytorchSeq2VecWrapper
from allennlp.training.metrics.categorical_accuracy import CategoricalAccuracy
from allennlp.nn.util import get_text_field_mask

#@Model.register("lstm_classifier")
class LstmClassifier(Model):
    """Sequence classifier: embed tokens, encode them to one vector, project to label logits.

    Parameters
    ----------
    embedder : BasicTextFieldEmbedder
        Applied to the ``tokens`` input to produce the embedded sequence.
    encoder : Seq2VecEncoder
        Called with the embedded sequence and its mask; its ``get_output_dim()``
        sets the input width of the final linear layer.
    vocab : Vocabulary
        Its ``labels`` namespace size sets the number of output logits.
    """

    def __init__(self,
                 embedder: BasicTextFieldEmbedder,
                 encoder: Seq2VecEncoder,
                 vocab: Vocabulary) -> None:
        super().__init__(vocab)

        self.word_embedder = embedder
        self.vocab = vocab
        self.encoder = encoder

        # Linear projection from the encoder output to one logit per label.
        self.hidden_to_tag = torch.nn.Linear(in_features=self.encoder.get_output_dim(),
                                             out_features=self.vocab.get_vocab_size('labels'))
        self.loss_function = torch.nn.CrossEntropyLoss()
        self.accuracy = CategoricalAccuracy()

    def forward(self,
                tokens: Dict[str, torch.Tensor],
                label: torch.Tensor = None) -> Dict[str, torch.Tensor]:
        """Compute label logits for a batch and, when labels are given, the loss.

        Parameters
        ----------
        tokens : Dict[str, torch.Tensor]
            Text-field tensors; used both to derive the mask and as embedder input.
        label : torch.Tensor, optional
            Gold labels. When provided, the accuracy metric is updated and a
            cross-entropy loss is added to the output.

        Returns
        -------
        Dict[str, torch.Tensor]
            Always contains ``'logits'``; also contains ``'loss'`` when ``label``
            is not ``None``.
        """
        mask = get_text_field_mask(tokens)
        embedded = self.word_embedder(tokens)
        encoded = self.encoder(embedded, mask)
        logits = self.hidden_to_tag(encoded)

        output = {'logits': logits}
        if label is not None:
            # Update the running accuracy before computing the loss for this batch.
            self.accuracy(logits, label)
            output['loss'] = self.loss_function(logits, label)
        return output

    def get_metrics(self, reset: bool = False) -> Dict[str, float]:
        """Return the accumulated accuracy under the key ``'accuracy'``.

        ``reset`` is forwarded unchanged to the metric's ``get_metric``.
        """
        return {'accuracy': self.accuracy.get_metric(reset)}
        

## Initialize the model

In [8]:
# Build the batch-first LSTM (EMBEDDING_DIM in, HIDDEN_DIM hidden), wrap it as
# a seq2vec encoder, and assemble the classifier from the parts defined above.
lstm = torch.nn.LSTM(input_size=EMBEDDING_DIM, hidden_size=HIDDEN_DIM, batch_first=True)
lstm_encoder = PytorchSeq2VecWrapper(lstm)
model = LstmClassifier(embedder=word_embedder, encoder=lstm_encoder, vocab=vocab)

## Initialize the trainer:

In [9]:
# Adam over every model parameter, with learning rate 1e-4 and weight decay 1e-5.
optimizer = optim.Adam(
    params=model.parameters(),
    lr=1e-4,
    weight_decay=1e-5,
)

In [10]:
from allennlp.data.iterators import BucketIterator
# Batches of 32, using ("tokens", "num_tokens") as the sorting key.
iterator = BucketIterator(
    sorting_keys=[("tokens", "num_tokens")],
    batch_size=32,
)
# Attach the vocabulary to the iterator before it is handed to the trainer.
iterator.index_with(vocab)

In [11]:
# Wire the model, optimizer, batching iterator and both data splits into a
# trainer; at most 20 epochs, with patience set to 10.
trainer = Trainer(
    model=model,
    optimizer=optimizer,
    iterator=iterator,
    train_dataset=train_ds,
    validation_dataset=dev_ds,
    num_epochs=20,
    patience=10,
)

In [12]:
# Run training and show the metrics object it returns as the cell output.
training_metrics = trainer.train()
training_metrics

accuracy: 0.2666, loss: 1.5767 ||: 100%|██████████| 267/267 [00:22<00:00, 11.73it/s]
accuracy: 0.2579, loss: 1.5715 ||: 100%|██████████| 35/35 [00:00<00:00, 65.49it/s]
accuracy: 0.2782, loss: 1.5632 ||: 100%|██████████| 267/267 [00:22<00:00, 11.81it/s]
accuracy: 0.2616, loss: 1.5686 ||: 100%|██████████| 35/35 [00:00<00:00, 73.22it/s]
accuracy: 0.3164, loss: 1.5293 ||: 100%|██████████| 267/267 [00:21<00:00, 12.39it/s]
accuracy: 0.3406, loss: 1.4824 ||: 100%|██████████| 35/35 [00:00<00:00, 70.10it/s]
accuracy: 0.4002, loss: 1.3745 ||: 100%|██████████| 267/267 [00:22<00:00, 12.13it/s]
accuracy: 0.3678, loss: 1.4006 ||: 100%|██████████| 35/35 [00:00<00:00, 58.45it/s]
accuracy: 0.4930, loss: 1.1875 ||: 100%|██████████| 267/267 [00:22<00:00, 12.05it/s]
accuracy: 0.3869, loss: 1.4110 ||: 100%|██████████| 35/35 [00:00<00:00, 51.34it/s]
accuracy: 0.5679, loss: 1.0363 ||: 100%|██████████| 267/267 [00:23<00:00, 11.55it/s]
accuracy: 0.3806, loss: 1.4598 ||: 100%|██████████| 35/35 [00:00<00:00, 70.

{'best_epoch': 3,
 'peak_cpu_memory_MB': 325.926912,
 'training_duration': '0:04:58.924841',
 'training_start_epoch': 0,
 'training_epochs': 12,
 'epoch': 12,
 'training_accuracy': 0.8181179775280899,
 'training_loss': 0.4988881808318449,
 'training_cpu_memory_MB': 325.926912,
 'validation_accuracy': 0.36966394187102636,
 'validation_loss': 2.4844228676387243,
 'best_validation_accuracy': 0.3678474114441417,
 'best_validation_loss': 1.4005519798823765}