# PyTorch LSTM Text Classification

### Import Required Libraries

In [1]:
import torch
from torch import nn
import torch.nn.functional as F
from torchtext.vocab import Vocab
from torch.utils.data import Dataset, DataLoader
from torchtext.datasets import AG_NEWS
from collections import Counter
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import pandas as pd
import numpy as np
import spacy
import re
import string

In [2]:
#!spacy download en_core_web_sm

In [3]:
train_iter = AG_NEWS(split='train')
texts = []
labels = []
for (label, text) in train_iter:
    labels.append(label)
    texts.append(text)
df_train = pd.DataFrame({'label': labels, 'text': texts})
df_train.head(5)

Unnamed: 0,label,text
0,3,Wall St. Bears Claw Back Into the Black (Reute...
1,3,Carlyle Looks Toward Commercial Aerospace (Reu...
2,3,Oil and Economy Cloud Stocks' Outlook (Reuters...
3,3,Iraq Halts Oil Exports from Main Southern Pipe...
4,3,"Oil prices soar to all-time record, posing new..."


In [4]:
test_iter = AG_NEWS(split='test')
texts = []
labels = []
for (label, text) in test_iter:
    labels.append(label)
    texts.append(text)
df_test = pd.DataFrame({'label': labels, 'text': texts})
df_test.head(5)

Unnamed: 0,label,text
0,3,Fears for T N pension after talks Unions repre...
1,4,The Race is On: Second Private Team Sets Launc...
2,4,Ky. Company Wins Grant to Study Peptides (AP) ...
3,4,Prediction Unit Helps Forecast Wildfires (AP) ...
4,4,Calif. Aims to Limit Farm-Related Smog (AP) AP...


#### Preprocessing

As mentioned earlier, we need to convert our text into a numerical form that can be fed to our model as input. I’ve used spacy for tokenization after removing punctuation, special characters, and lower casing the text:

In [5]:
# #tokenization
# tok = spacy.load('en_core_web_sm')
# def tokenize(text):
#     #text = re.sub(r"[^\x00-\x7F]+", " ", text)
#     #regex = re.compile('[' + re.escape(string.punctuation) + '0-9\\r\\t\\n]') # remove punctuation and numbers
#     #nopunct = regex.sub(" ", text.lower())
#     return [token.text for token in tok.tokenizer(text)]

In [6]:
from torchtext.data.utils import get_tokenizer
tokenize = get_tokenizer('basic_english')

#### Vocabulary to index mapping
We count the number of occurrences of each token in our corpus and get rid of the ones that don’t occur too frequently.

We then create a vocabulary to index mapping and encode our review text using this mapping. I’ve chosen the maximum length of any review to be 70 words because the average length of reviews was around 60. 

In [7]:
#count number of occurences of each word
counts = Counter()
for index, row in df_train.iterrows():
    counts.update(tokenize(row['text']))

In [8]:
vocab = Vocab(counts, min_freq=1)

In [9]:
[vocab[token] for token in ['here', 'is', 'an', 'example']]

[476, 22, 31, 5298]

In [10]:
def encode_sentence(text, N=70):
    tokenized = tokenize(text)
    encoded = np.zeros(N, dtype=int)
    enc1 = np.array([vocab[word] for word in tokenized])
    length = min(N, len(enc1))
    encoded[:length] = enc1[:length]
    return encoded, length

In [11]:
encode_sentence('here is the an example')

(array([ 476,   22,    3,   31, 5298,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0]),
 5)

In [12]:
X_train = df_train['text'].apply(lambda x: np.array(encode_sentence(x))).values

  """Entry point for launching an IPython kernel.


In [13]:
X_train[0]

array([array([  432,   426,     2,  1606, 14839,   114,    67,     3,   849,
          14,    28,    15,    28,    16, 50726,     4,   432,   375,
          17,    10, 67508,     7, 52259,     4,    43,  4010,   784,
         326,     2,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0]),
       29], dtype=object)

In [14]:
df_train['label'].unique()

array([3, 4, 2, 1])

In [15]:
y_train = df_train['label'].apply(lambda x: int(x) - 1).values

In [16]:
y_train[0]

2

In [17]:
X_test = df_test['text'].apply(lambda x: np.array(encode_sentence(x))).values

  """Entry point for launching an IPython kernel.


In [18]:
X_test[0]

array([array([  870,    12,    84,   138,  1482,    35,   174,  1753,  4059,
         401,    21,  6558, 38435,   234,    68,    43,    17,  4478,
          17,    35,   174,    19, 11302,  2448,   321,   195,  9840,
           2,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0]),
       28], dtype=object)

In [19]:
df_test['label'].unique()

array([3, 4, 2, 1])

In [20]:
y_test = df_test['label'].apply(lambda x: int(x) - 1).values

#### Pytorch Dataset and DataLoader

The dataset is quite straightforward because we’ve already stored our encodings in the input dataframe. We also output the length of the input sequence in each case, because we can have LSTMs that take variable-length sequences.

DataLoader uses dataset to interate through batches. 

In [21]:
class NewsDataset(Dataset):
    def __init__(self, X, Y):
        self.X = X
        self.y = Y
        
    def __len__(self):
        return len(self.y)
    
    def __getitem__(self, idx):
        return torch.from_numpy(self.X[idx][0].astype(np.int32)), self.y[idx], self.X[idx][1]

In [22]:
train_ds = NewsDataset(X_train, y_train)
valid_ds = NewsDataset(X_test, y_test)

In [23]:
train_dl = DataLoader(train_ds, batch_size=5000, shuffle=True, num_workers=0)
valid_dl = DataLoader(valid_ds, batch_size=5000, shuffle=True, num_workers=0)

### Pytorch training loop

The training loop is pretty standard. I’ve used Adam optimizer and cross-entropy loss.

In [24]:
def train_model(model, epochs=10, lr=0.001):
    parameters = filter(lambda p: p.requires_grad, model.parameters())
    optimizer = torch.optim.Adam(parameters, lr=lr)
    model.train()
    for epoch in range(epochs):
        sum_loss = 0.0
        total = 0
        for i, (x, y, l) in enumerate(train_dl):
            x = x.long()
            y = y.long()
            y_pred = model(x, l)
            optimizer.zero_grad()
            loss = F.cross_entropy(y_pred, y)
            loss.backward()
            optimizer.step()
            sum_loss += loss.item()*y.shape[0]
            total += y.shape[0]
            val_loss, val_acc, val_rmse = validation_metrics(model, valid_dl)
            if i % 5 == 1:
                print("train loss %.3f, val loss %.3f, val accuracy %.3f, and val rmse %.3f" % (sum_loss/total, val_loss, val_acc, val_rmse))
                
def validation_metrics (model, valid_dl):
    model.eval()
    correct = 0
    total = 0
    sum_loss = 0.0
    sum_rmse = 0.0
    for x, y, l in valid_dl:
        X = x.long()
        y = y.long()
        y_hat = model(x, l)
        loss = F.cross_entropy(y_hat, y)
        pred = torch.max(y_hat, 1)[1]
        correct += (pred == y).float().sum()
        total += y.shape[0]
        sum_loss += loss.item()*y.shape[0]
        sum_rmse += np.sqrt(mean_squared_error(pred, y.unsqueeze(-1)))*y.shape[0]
    return sum_loss/total, correct/total, sum_rmse/total

In [25]:
class LSTM_fixed_len(nn.Module) :
    def __init__(self, vocab_size, embedding_dim, hidden_dim) :
        super().__init__()
        self.embeddings = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        self.linear = nn.Linear(hidden_dim, 5)
        self.dropout = nn.Dropout(0.5)
        
    def forward(self, x, l):
        x = self.embeddings(x)
        x = self.dropout(x)
        lstm_out, (hn, cn) = self.lstm(x)
        return self.linear(hn[-1])

In [26]:
vocab_size = len(vocab)
model_fixed =  LSTM_fixed_len(vocab_size, embedding_dim=100, hidden_dim=100) 

In [27]:
train_model(model_fixed, epochs=30, lr=0.01)

train loss 1.602, val loss 1.524, val accuracy 0.251, and val rmse 1.855
train loss 1.475, val loss 1.395, val accuracy 0.249, and val rmse 1.231
train loss 1.440, val loss 1.389, val accuracy 0.250, and val rmse 1.236
train loss 1.425, val loss 1.389, val accuracy 0.252, and val rmse 1.867
train loss 1.416, val loss 1.387, val accuracy 0.254, and val rmse 1.859
train loss 1.387, val loss 1.385, val accuracy 0.253, and val rmse 1.860
train loss 1.385, val loss 1.383, val accuracy 0.258, and val rmse 1.226
train loss 1.384, val loss 1.382, val accuracy 0.259, and val rmse 1.850
train loss 1.379, val loss 1.356, val accuracy 0.298, and val rmse 1.840
train loss 1.366, val loss 1.272, val accuracy 0.412, and val rmse 1.187
train loss 1.209, val loss 1.181, val accuracy 0.437, and val rmse 1.390
train loss 1.155, val loss 1.113, val accuracy 0.465, and val rmse 1.119
train loss 1.120, val loss 1.070, val accuracy 0.489, and val rmse 1.092
train loss 1.095, val loss 0.999, val accuracy 0.53

In [28]:
# train_model(model_fixed, epochs=30, lr=0.01)

In [29]:
# train_model(model_fixed, epochs=30, lr=0.01)

Reference:
https://towardsdatascience.com/multiclass-text-classification-using-lstm-in-pytorch-eac56baed8df