# <span style="color:turquoise">Text classification with pytorch</span>


An example of using natural language processing for sentiment analysis. <br> Building a binary classifier of movie reviews that will predict if a review is positive or negative.




__Dataset:__ IMDB movie reviews from Kaggle<br>
__Model:__ LSTM (?)


### <span style="color:teal">Todo:</span>

- ~~Read dataset~~
- ~~Preprocess text~~
- ~~Split into train, validation, and test sets~~
- ~~Convert text to indices and add paddings~~
- ~~Make model~~
- Make training function
- Make evaluation function
- Train
- Evaluate

In [1]:
import csv
import random
import numpy as np

from nltk.corpus import stopwords
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer
import re

import torch
import torch.nn as nn

import time

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

  return torch._C._cuda_getDeviceCount() > 0


## <span style="color:teal">Read the data and split it into training, cross-validation, and test sets</span>

In [2]:
class Reviews():
    
    def __init__(self):
        self.train = {}
        self.val = {}
        self.test = {}
        self.LABELS = {"positive":1, "negative": 0}
        self.COUNT = {"positive": 0, "negative": 0}
    
    
    def read_data(self):
        
        dataset = []
        
        with open ("IMDB_Dataset.csv", newline='') as f:
            datareader = csv.reader(f, delimiter=',')
            next(datareader, None)

            for row in datareader:
                dataset.append([row[0], self.LABELS[row[1]]])
                self.COUNT[row[1]] += 1
            
            random.shuffle(dataset)
                
        return dataset




    def split_dataset(self,
                      dataset,
                      split=[int(50000*0.6), int(50000*0.2), int(50000*0.2)]):
        
        train, val, test = torch.utils.data.random_split(dataset,
                                               split,
                                               generator=torch.Generator().manual_seed(43))
          
            
        return train, val, test

In [3]:
rev = Reviews()
data = rev.read_data()
pos_count = rev.COUNT["positive"]
neg_count = rev.COUNT["negative"]


In [4]:
print(data[10])

["Well since seeing part's 1 through 3 I can honestly say that they should have NEVER made part 4. Everything from the tacky, and I DO mean tacky score to the really bad acting, I dare anyone to watch this and not be bored out of their minds. I mean parts 1 to 3 kept the vibe strong on the plot of Damion, but without him around in this one it's just not the same. Sure by the end of part 3 I was getting a little tired of the continued story line's, but it was a good closure at the end of the third one. Again there was no reason for a part 4. Even if there was they could have done a MUCH better job than this sh*t I had to sit through, lol. There goes an hour and a half of my life i'll never see again.", 0]


In [5]:
train, val, test = rev.split_dataset(data)
print(train[10])

["Okay, I've tried and I've tried, but I STILL DON'T GET this Guy Maddin thing. Tales From the Gimli Hospital left me cold, that movie about the Austrian villagers and the one about the Ice Nymph were pretty to look but lacking in the story department...and this nudie movie about abortion and hockey is just boring. I'm glad Maddin has an appreciation for silent film, but I dislike his films for the same reason I dislike the films of Quentin Tarantino: they're empty homages to better, more imaginative films--films that advanced the art form or broke new ground--and are all style and no substance. No amount of jump cuts and odd camera angles can disguise the fact that Maddin is an unoriginal David Lynch wannabe, though he DOES have one advantage over Tarantino: he generally doesn't write embarrassing dialogue, because most of his films rely on intertitles. The bottom line is, Maddin's schtick is clever clever film-making for aspiring film majors.", 0]


In [6]:
print(len(train), len(val), len(test))

30000 10000 10000


In [7]:
def split_x_and_y(data):
    x = []
    y = []
    for review, label in data:
        x.append(review)
        y.append(label)
    return x, np.array(y)

In [8]:
train_x_raw, train_y = split_x_and_y(train)
val_x_raw, val_y = split_x_and_y(val)
test_x_raw, test_y = split_x_and_y(test)


print(len(train_x_raw), len(train_y))
print(train_x_raw[50], train_y[50])

30000 30000
I like to think of myself as a bad movie connoisseur. I like to think that the films most people label as the worst of all time I can easily withstand. So...from one bad movie fan to another...let this collect dust on the shelf...grab Up From the Depths or The Great Alligator instead to satisfy your need for something evil lurking in the water. 0


## <span style="color:teal">Preprocess text</span>

In [9]:
def preprocess(review,
               remove_stopwords=False, 
               remove_html=True, 
               remove_punct=False, 
               lowercase=False, 
               lemmatize=False,
               maxlen=128):
    
    review = re.sub(r"\'", "'", review)
    review = re.sub(r"\x96", "-", review)
    
    if remove_html:
        review = re.sub(r'<.*>', ' ', review)
    
    review = word_tokenize(review)
        
    if remove_stopwords:
        stop_words = set(stopwords.words("english"))
        review = [w for w in review if w not in stop_words]
        
    if remove_punct:
        contractions = ["'ll", "'s", "n't", "'d", "'m", "'ve", "'re"]
        review = [w for w in review if w.isalnum() or w in contractions]
    
    if lowercase:
        review = [w.lower() for w in review]
        
    if lemmatize:
        lemmatizer = WordNetLemmatizer()
        review = [lemmatizer.lemmatize(w) for w in review]
    
    
    return review[:maxlen]
    


In [10]:
train_words = [preprocess(review, 
                      lowercase=True, 
                      remove_punct=True,
                      remove_stopwords=True
                     ) 
           for review in train_x_raw]

val_words = [preprocess(review, 
                    lowercase=True, 
                    remove_punct=True,
                    remove_stopwords=True
                   ) 
         for review in val_x_raw]

In [11]:
print(train_words[9592], '\n', val_words[3029])
print(len(train_words[9592]), '\n', len(val_words[3029]))

['my', 'comment', 'mainly', 'comment', 'first', 'commentator', 'extra', 'film', 'unhappy', 'assessment', 'film', 'i', 'think', 'perspective', 'indicates', 'extra', 'extra', 'director', 'director', 'the', 'film', 'sweet', 'acting', 'sufficient', 'experience', 'watching', 'nice', 'diversion', 'busy', 'work', 'week', 'it', "n't", 'the', 'hours', 'acting', 'the', 'matrix', 'special', 'effects', 'even', 'the', 'color', 'purple', 'direction', 'most', 'movies', 'wo', "n't", 'but', 'also', "n't", 'crap', 'fest', 'vinny', 'would', 'lead', 'believe', 'sorry', 'guy', '2', 'cents', 'as', 'movie', 'end', 'gay', 'affirming', '1', 'it', 'showed', 'world', 'full', 'diverse', 'less', 'perfect', 'people', 'know', 'like', '2', 'it', 'opened', 'door', 'one', 'culture', 'without', 'excluding', 'cultures', '3', 'and', 'i', 'liked', 'music', '4'] 
 ['in', 'bygone', 'days', 'catholic', 'church', 'individual', 'ritual', 'would', 'take', 'sins', 'dying', 'person', 'upon', 'often', 'people', 'excommunicate', 'si

## <span style="color:teal">Convert text to indices and add paddings</span>

In [12]:
def make_vocabulary_dicts(preprocessed_data, pad_token='<PAD>', unk_token='<UNK>'):
    vocab = set()
    
    for review in preprocessed_data:
        for word in review:
            vocab.add(word)
    
            
    vocab_sorted = sorted(vocab)
    word2ind = {word : i+2 for i, word in enumerate(vocab_sorted)}
    ind2word = {i+2 : word for i, word in enumerate(vocab_sorted)}
    
    # Prepend the pad token
    word2ind[pad_token] = 0
    ind2word[0] = pad_token
    
    # Prepend the 'unknown' token
    word2ind[unk_token] = 1
    ind2word[1] = unk_token
    
    assert len(word2ind) == len(ind2word)

    
    return word2ind, ind2word

In [13]:
del train_x_raw, val_x_raw

In [14]:
word2ind, ind2word = make_vocabulary_dicts(train_words)

print(len(word2ind), len(ind2word))
print(word2ind['never'], word2ind['awful'])
print(ind2word[6700], ind2word[10582])

61370 61370
37288 4133
booked clutches


In [15]:
print(np.max([len(x) for x in train_words]))
print(np.mean([len(x) for x in train_words]))

print(np.max([len(x) for x in val_words]))
print(np.mean([len(x) for x in val_words]))

128
71.2575
128
70.8972


In [16]:
def make_padded_inputs(preprocessed_data, 
                       vocab, 
                       padded_length=128,
                       pad_token='<PAD>',
                       unk_token='<UNK>'
                      ):
    
    num_lines = len(preprocessed_data)
    pad = vocab[pad_token]
    
    unpadded_lengths = np.zeros(num_lines, dtype='int64')
    
    inputs = np.full((num_lines, padded_length), pad)
    
    for i, review in enumerate(preprocessed_data):
        for j, word in enumerate(review):    
            inputs[i, j] = vocab.get(word, vocab[unk_token])
        unpadded_lengths[i] = j+1
            
    return inputs, unpadded_lengths
            

In [17]:
train_x, train_lengths = make_padded_inputs(train_words, word2ind)
val_x, val_lengths = make_padded_inputs(val_words, word2ind)


print(f"""Training example at index 10:\n{train_words[10]}\n
    Converted to indices:\n{train_x[10, :]}\n 
    Unpadded length of the example:\n{train_lengths[10]}""")

Training example at index 10:
['okay', 'i', "'ve", 'tried', 'i', "'ve", 'tried', 'i', 'still', 'do', 'get', 'guy', 'maddin', 'thing', 'tales', 'from', 'gimli', 'hospital', 'left', 'cold', 'movie', 'austrian', 'villagers', 'one', 'ice', 'nymph', 'pretty', 'look', 'lacking', 'story', 'department', 'nudie', 'movie', 'abortion', 'hockey', 'boring', 'i', "'m", 'glad', 'maddin', 'appreciation', 'silent', 'film', 'i', 'dislike', 'films', 'reason', 'i', 'dislike', 'films', 'quentin', 'tarantino', "'re", 'empty', 'homages', 'better', 'imaginative', 'films', 'films', 'advanced', 'art', 'form', 'broke', 'new', 'ground', 'style', 'substance', 'no', 'amount', 'jump', 'cuts', 'odd', 'camera', 'angles', 'disguise', 'fact', 'maddin', 'unoriginal', 'david', 'lynch', 'wannabe', 'though', 'does', 'one', 'advantage', 'tarantino', 'generally', "n't", 'write', 'embarrassing', 'dialogue', 'films', 'rely', 'intertitles', 'the', 'bottom', 'line', 'maddin', "'s", 'schtick', 'clever', 'clever', 'aspiring', 'film

## <span style="color:teal">Load data into torch</span>

In [18]:
train_dataset = torch.utils.data.TensorDataset(torch.from_numpy(train_x), torch.from_numpy(train_y))
val_dataset = torch.utils.data.TensorDataset(torch.from_numpy(val_x), torch.from_numpy(val_y))

In [23]:
batch_size = 128

train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size)
val_loader = torch.utils.data.DataLoader(val_dataset, batch_size=batch_size)

## <span style="color:teal">Create model</span>

In [24]:
class SentimentClassifier(nn.Module):
    
    def __init__(self, 
                 vocab_size, 
                 d_feature, 
                 num_layers, 
                 hidden_size,
                 n_outputs,
                 bidirectional=False,
                 dropout_rate=0.2):
        
        super().__init__()
        
        self.embedding = nn.Embedding(vocab_size, d_feature)
        self.dropout = nn.Dropout(p=dropout_rate)
        self.lstm = nn.LSTM(input_size=d_feature,
                           hidden_size=hidden_size,
                           num_layers=num_layers,
                           bidirectional=bidirectional,
                           batch_first=True)
        self.fc = nn.Linear(hidden_size, n_outputs)
        self.sigmoid = nn.Sigmoid()
        
        
    def forward(self, input_data): 
        
        embedded = self.dropout(self.embedding(input_data))
        #print(f'{embedded.shape=}')
        lstm_out, _ = self.lstm(embedded)
        #print(f'{lstm_out.shape=}')
        
        fc = self.fc(lstm_out[:,-1,:])
        #print(f'{fc.shape=}')
        sigmoid = self.sigmoid(fc)
        #print(f'{sigmoid.shape=}')
   
        return sigmoid
    

In [31]:
vocab_size = len(word2ind)
d_feature = 128
hidden_size = 128
n_outputs = 1
num_layers = 1

model = SentimentClassifier(
                            vocab_size=vocab_size, 
                            d_feature=d_feature,  
                            num_layers=num_layers, 
                            hidden_size=hidden_size, 
                            n_outputs=n_outputs).to(device)

print(model)

SentimentClassifier(
  (embedding): Embedding(61370, 128)
  (dropout): Dropout(p=0.2, inplace=False)
  (lstm): LSTM(128, 128, batch_first=True)
  (fc): Linear(in_features=128, out_features=1, bias=True)
  (sigmoid): Sigmoid()
)


In [36]:
def train_model(train_loader=train_loader,
             val_loader=val_loader,
             model=model,
             train_lengths=train_lengths,
             val_lengths=val_lengths,
             optimizer=torch.optim.Adam(model.parameters(), lr=0.01),
             criterion=nn.BCELoss(),
             n_epochs=5,
             print_every=22):
    
    train_step = 0
    val_step = 0 
    
    
    
    for epoch in range(n_epochs):
        start_time = time.time()
        model.train()
        for inputs, labels in train_loader:  
            model.zero_grad()
            #lengths = train_lengths[batch_size*step : batch_size*(step+1)]
            output = model(inputs)
            #print(labels.shape)
            loss = criterion(output.squeeze(), labels.float())
            train_step +=1
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

           
        model.eval()
            
        val_losses = []
        
        
        for val_inputs, val_labels in val_loader:

            #v_lengths = val_lengths[batch_size*step : batch_size*(step+1)]
            val_output = model(val_inputs)
            #print(f'val_labels shape:{val_labels.shape}')
            val_loss = criterion(val_output.squeeze(), val_labels.float())
            val_losses.append(val_loss.item())

        if (val_step % print_every) == 0:
            print("Epoch: {}/{}".format((epoch+1), n_epochs),
                  "Time taken: {:.2f} min".format((time.time() - start_time) // 60),
                  "Step: {}".format(val_step),
                  "Training Loss: {:.4f}".format(loss.item()),
                  "Validation Loss: {:.4f}".format(np.mean(val_losses)))
            val_step +=1
    
    

In [None]:
train_model()