# <span style="color:turquoise">Text classification with pytorch</span>


An example of using natural language processing for sentiment analysis. <br> Building a binary classifier of movie reviews that will predict if a review is positive or negative.




__Dataset:__ IMDB movie reviews from Kaggle<br>
__Model:__ LSTM


### <span style="color:teal">Todo:</span>

- ~~Read dataset~~
- ~~Preprocess text~~
- ~~Split into train, validation, and test sets~~
- ~~Convert text to indices and add paddings~~
- ~~Make model~~
- ~~Make training function~~
- ~~Make evaluation function~~
- ~~Train~~
- Evaluate on test set
- Run inference

In [1]:
import csv
import random
import numpy as np

from nltk.corpus import stopwords
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer
import re

import torch
import torch.nn as nn

import time

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

  return torch._C._cuda_getDeviceCount() > 0


## <span style="color:teal">Read the data and split it into training, cross-validation, and test sets</span>

In [2]:
class Reviews():
    
    def __init__(self):
        self.train = {}
        self.val = {}
        self.test = {}
        self.LABELS = {"positive":1, "negative": 0}
        self.COUNT = {"positive": 0, "negative": 0}
    
    
    def read_data(self):
        
        dataset = []
        
        with open ("IMDB_Dataset.csv", newline='') as f:
            datareader = csv.reader(f, delimiter=',')
            next(datareader, None)

            for row in datareader:
                dataset.append([row[0], self.LABELS[row[1]]])
                self.COUNT[row[1]] += 1
            
            random.shuffle(dataset)
                
        return dataset




    def split_dataset(self,
                      dataset,
                      split=[int(50000*0.6), int(50000*0.2), int(50000*0.2)]):
        
        train, val, test = torch.utils.data.random_split(dataset,
                                               split,
                                               generator=torch.Generator().manual_seed(43))
          
            
        return train, val, test

In [3]:
rev = Reviews()
data = rev.read_data()
pos_count = rev.COUNT["positive"]
neg_count = rev.COUNT["negative"]


In [4]:
print(data[10])

['Hungary can\'t make any good movies. Fact. This is a great example of that. Someone please explain it to me, why critics say this movie is a masterpiece. Calling this an "Art" isn\'t gonna make it better. Sorry Mundruczo, but you failed. Live with it. Even tho you probably won\'t care about my or any other guys opinion scarifying your "child".', 0]


In [5]:
train, val, test = rev.split_dataset(data)
print(train[10])

['Horror-genius Dario Argento is one of my personal favorite directors, and his films "Suspiria", "Phenomena" and "Profondo Rosso" range high on my personal all-time favorite list. "Opera" of 1987 is yet another tantalizing and brilliant film that no Horror lover can afford to miss, and that will keep you on the edge of your chair from the beginning to the end. This stunning and ultra-violent Giallo could well be described as the master\'s nastiest film, which is quite something considering that Argento\'s films are not exactly known for the tameness of their violence. The violence is extreme and very stylized in a brilliant way that makes Opera a film censor\'s nightmare. As usual for Argento\'s films, the violence is extremely graphic and very stylized. "Opera" truly is a brutal film, and what a stylish and atmospheric film it is. This film is absolutely tantalizing and pure suspense from the beginning to the end. The performances are entirely very good, especially Christina Marsilla

In [6]:
print(len(train), len(val), len(test))

30000 10000 10000


In [7]:
def split_x_and_y(data):
    x = []
    y = []
    for review, label in data:
        x.append(review)
        y.append(label)
    return x, np.array(y)

In [8]:
train_x_raw, train_y = split_x_and_y(train)
val_x_raw, val_y = split_x_and_y(val)
test_x_raw, test_y = split_x_and_y(test)


print(len(train_x_raw), len(train_y))
print(train_x_raw[50], train_y[50])

30000 30000
Look no further, this is it, the worst movie ever made. There may be others that are tied, but there are none worse. There can't be. The guidelines ask that you "focus on the content and context". I can't. There isn't enough content to focus on, and that's exactly my point. Sometimes bad is just bad, and this movie would have to be much better than it is to aspire to being only that. 0


## <span style="color:teal">Preprocess text</span>

In [9]:
def preprocess(review,
               remove_stopwords=False, 
               remove_html=True, 
               remove_punct=False, 
               lowercase=False, 
               lemmatize=False,
               maxlen=128):
    
    review = re.sub(r"\'", "'", review)
    review = re.sub(r"\x96", "-", review)
    
    if remove_html:
        review = re.sub(r'<.*>', ' ', review)
    
    review = word_tokenize(review)
        
    if remove_stopwords:
        stop_words = set(stopwords.words("english"))
        review = [w for w in review if w not in stop_words]
        
    if remove_punct:
        contractions = ["'ll", "'s", "n't", "'d", "'m", "'ve", "'re"]
        review = [w for w in review if w.isalnum() or w in contractions]
    
    if lowercase:
        review = [w.lower() for w in review]
        
    if lemmatize:
        lemmatizer = WordNetLemmatizer()
        review = [lemmatizer.lemmatize(w) for w in review]
    
    
    return review[:maxlen]
    


In [10]:
train_words = [preprocess(review, 
                      lowercase=True, 
                      remove_punct=True,
                      remove_stopwords=True
                     ) 
           for review in train_x_raw]

val_words = [preprocess(review, 
                    lowercase=True, 
                    remove_punct=True,
                    remove_stopwords=True
                   ) 
         for review in val_x_raw]

In [11]:
print(train_words[9592], '\n', val_words[3029])
print(len(train_words[9592]), '\n', len(val_words[3029]))

['michael', 'jackson', 'would', 'claimed', 'spot', 'character', 'the', 'golden', 'child', 'loves', 'kids', 'that', "n't", 'work', 'instead', 'eddie', 'murphy', 'save', 'world', 'rescuing', 'kid', 'midas', 'i', 'would', 'strongly', 'suggest', 'future', 'scriptwriters', 'please', 'thoroughly', 'study', 'actor', "'s", 'inane', 'dialogue', 'quirky', 'fantasy', 'adventure', 'comedy', "'s", 'step', 'closer', 'ishtar', 'whatever', 'murphy', 'says', 'best', 'liked', "n't", 'get', 'wrong', 'exquisite', 'comical', 'talent', "n't", 'belong', 'movie', 'went', 'dolittle', 'the', 'violence', 'visuals', 'combined', 'reasons', 'stamp', 'cult', 'camp', 'classic', 'would', "n't", 'made', 'sense', 'hollywood', 'movie', 'fanatics', 'kept', 'cashing', 'guy', 'speaking', 'visuals', 'pulled', 'amazingly', 'well', 'time', 'ronald', 'reagan', "'s", 'presidential', 'fame', 'murphy', 'far', 'better', 'coming', 'to', 'america', '48', 'hrs', 'stale', 'movie', "n't", 'touch', 'golden', 'honey', 'sweet', 'crunchy', 

## <span style="color:teal">Convert text to indices and add paddings</span>

In [12]:
def make_vocabulary_dicts(preprocessed_data, pad_token='<PAD>', unk_token='<UNK>'):
    vocab = set()
    
    for review in preprocessed_data:
        for word in review:
            vocab.add(word)
    
            
    vocab_sorted = sorted(vocab)
    word2ind = {word : i+2 for i, word in enumerate(vocab_sorted)}
    ind2word = {i+2 : word for i, word in enumerate(vocab_sorted)}
    
    # Prepend the pad token
    word2ind[pad_token] = 0
    ind2word[0] = pad_token
    
    # Prepend the 'unknown' token
    word2ind[unk_token] = 1
    ind2word[1] = unk_token
    
    assert len(word2ind) == len(ind2word)

    
    return word2ind, ind2word

In [13]:
del train_x_raw, val_x_raw

In [14]:
word2ind, ind2word = make_vocabulary_dicts(train_words)

print(len(word2ind), len(ind2word))
print(word2ind['never'], word2ind['awful'])
print(ind2word[6700], ind2word[10582])

61449 61449
37522 4173
bondarchuk cloakroom


In [15]:
print(np.max([len(x) for x in train_words]))
print(np.mean([len(x) for x in train_words]))

print(np.max([len(x) for x in val_words]))
print(np.mean([len(x) for x in val_words]))

128
71.141
128
71.329


In [29]:
def make_padded_inputs(preprocessed_data, 
                       vocab, 
                       padded_length=128,
                       pad_token='<PAD>',
                       unk_token='<UNK>'
                      ):
    
    num_lines = len(preprocessed_data)
    pad = vocab[pad_token]
    
    inputs = np.full((num_lines, padded_length), pad)
    
    for i, review in enumerate(preprocessed_data):
        start_position = padded_length - len(review)
        for j, word in enumerate(review):
            inputs[i, (start_position + j)] = vocab.get(word, vocab[unk_token])
            
    return inputs
            

In [37]:
train_x = make_padded_inputs(train_words, word2ind)
val_x = make_padded_inputs(val_words, word2ind)


print(f"""Training example at indices 5 and 6:\n{train_words[5:7]}\n
    Converted to indices:\n{train_x[5:7, :]}\n""")

assert len(train_words[5]) == np.count_nonzero(train_x[5])

Training example at indices 5 and 6:
[['i', 'think', 'consensus', 'pretty', 'unanimous', 'recent', 'tv', 'miniseries', "'s", 'okay', "'s", 'far', 'cry', 'lonesome', 'dove', 'it', 'gets', 'compared', 'latter', 'simply', 'prequel', 'famous', 'story', 'note', 'the', 'title', 'page', 'says', '360', 'minutes', 'that', 'must', 'included', 'tv', 'commercials', 'the', 'dvd', 'version', 'i', 'saw', '4', 'hours', '40', 'minutes'], ['the', 'late', 'great', 'robert', 'bloch', 'author', 'psycho', "n't", 'paying', 'attention', 'scripted', 'tale', 'terror', 'absolutely', 'one', 'scariest', 'movies', 'i', 'ever', 'saw', 'kid', 'i', 'walk', 'miles', 'see', 'movie', 'usually', 'dark', 'i', 'emerged', 'theater', 'seeing', 'horror', 'movie', 'always', 'unnerving', 'particularly', 'one', 'when', 'i', 'opportunity', 'see', 'one', 'several', 'years', 'ago', 'videotape', 'always', 'last', 'resort', 'i', 'surprised', 'well', 'held', 'take', 'terror', 'test', 'watch', 'night', 'alone', 'then', 'tell', "'s", 'sc

## <span style="color:teal">Load data into torch</span>

In [38]:
train_dataset = torch.utils.data.TensorDataset(torch.from_numpy(train_x), torch.from_numpy(train_y))
val_dataset = torch.utils.data.TensorDataset(torch.from_numpy(val_x), torch.from_numpy(val_y))

In [39]:
batch_size = 128

train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size)
val_loader = torch.utils.data.DataLoader(val_dataset, batch_size=batch_size)

In [40]:
class SentimentClassifier(nn.Module):
    
    def __init__(self, 
                 vocab_size, 
                 d_feature, 
                 num_layers, 
                 hidden_size,
                 n_outputs,
                 bidirectional=False,
                 dropout_rate=0.2):
        
        super().__init__()
        
        self.embedding = nn.Embedding(vocab_size, d_feature)
        self.dropout = nn.Dropout(p=dropout_rate)
        self.lstm = nn.LSTM(input_size=d_feature,
                           hidden_size=hidden_size,
                           num_layers=num_layers,
                           bidirectional=bidirectional,
                           batch_first=True)
        self.fc = nn.Linear(hidden_size, n_outputs)
        self.sigmoid = nn.Sigmoid()
        
        
    def forward(self, input_data): 
        
        embedded = self.dropout(self.embedding(input_data))
        lstm_out, _ = self.lstm(embedded)
        fc = self.fc(lstm_out[:,-1,:])
        sigmoid = self.sigmoid(fc)
   
        return sigmoid
    

In [41]:
vocab_size = len(word2ind)
d_feature = 128
hidden_size = 128
n_outputs = 1
num_layers = 1

model = SentimentClassifier(
                            vocab_size=vocab_size, 
                            d_feature=d_feature,  
                            num_layers=num_layers, 
                            hidden_size=hidden_size, 
                            n_outputs=n_outputs).to(device)

print(model)

SentimentClassifier(
  (embedding): Embedding(61449, 128)
  (dropout): Dropout(p=0.2, inplace=False)
  (lstm): LSTM(128, 128, batch_first=True)
  (fc): Linear(in_features=128, out_features=1, bias=True)
  (sigmoid): Sigmoid()
)


## <span style="color:teal">Train model</span>

In [42]:
def train_model(train_loader=train_loader,
             val_loader=val_loader,
             model=model,
             optimizer=torch.optim.Adam(model.parameters(), lr=0.005),
             criterion=nn.BCELoss(),
             n_epochs=6):
    
    start_time = time.time()
    
    
    for epoch in range(n_epochs):
        model.train()
        for inputs, labels in train_loader:  
            model.zero_grad()
            output = model(inputs)
            loss = criterion(output.squeeze(), labels.float())
            nn.utils.clip_grad_norm_(model.parameters(), 5)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

           
        model.eval()
            
        val_losses = []
        
        
        for val_inputs, val_labels in val_loader:

            val_output = model(val_inputs)
            val_loss = criterion(val_output.squeeze(), val_labels.float())
            val_losses.append(val_loss.item())

        
        print(f"Epoch: {epoch+1}/{ n_epochs}".format(),
              f"Time taken: {((time.time() - start_time) / 60):.2f} min",
              f"Training Loss: {loss.item():.4f}",
              f"Validation Loss: {np.mean(val_losses):.4f}")
            
    print(f"Training completed in {(time.time() - start_time) / 60} min.")
    print(f"Final loss: {loss}\nValidation loss: {val_loss}")
    
    return loss, val_loss
    

In [None]:
train_model(optimizer=torch.optim.Adam(model.parameters(), lr=0.001),
             n_epochs=20)