#Binary Text Classification using RNN and LSTM
##Data Preprocessing

In [0]:
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [6]:
data = pd.read_csv('spam.csv', encoding='latin-1')
data.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [0]:
data = data.drop(columns=['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], axis=1)

In [9]:
data = data.rename(index = str, columns = {'v1' : 'labels', 'v2' : 'text'})
data.head()

Unnamed: 0,labels,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [0]:
train, test = train_test_split(data, test_size=0.2, random_state=40)

In [12]:
train.reset_index(drop=True), test.reset_index(drop=True)

(     labels                                               text
 0      spam  XXXMobileMovieClub: To use your credit, click ...
 1       ham               I tot u reach liao. He said t-shirt.
 2       ham           K..k...from tomorrow onwards started ah?
 3       ham  My uncles in Atlanta. Wish you guys a great se...
 4       ham      I love your ass! Do you enjoy doggy style? :)
 ...     ...                                                ...
 4452    ham                   HELLO PEACH! MY CAKE TASTS LUSH!
 4453    ham                Still i have not checked it da. . .
 4454    ham      Oh yeah! And my diet just flew out the window
 4455    ham                        Sounds good, keep me posted
 4456    ham  Yeah we wouldn't leave for an hour at least, h...
 
 [4457 rows x 2 columns],
      labels                                               text
 0       ham  C movie is juz last minute decision mah. Juz w...
 1      spam  Show ur colours! Euro 2004 2-4-1 Offer! Get an...
 2       ham

In [0]:
train.to_csv('train.csv', index=False)
test.to_csv('test.csv', index=False)

#Using torchtext to process the data 

In [0]:
import numpy as np
import torch
import torchtext as tt
from torchtext.data import Field, BucketIterator, TabularDataset

Now, using `nltk` 'punkt' tokenizer, the words are tokenized.

In [16]:
import nltk
nltk.download('punkt')

from nltk import word_tokenize

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [0]:
TEXT = tt.data.Field(tokenize=word_tokenize)
LABEL = tt.data.LabelField(dtype=torch.float)

In [0]:
datafields = [("labels", LABEL), ("text", TEXT)]

In [0]:
trn, tst = tt.data.TabularDataset.splits(path = '/content',
                                         train = 'train.csv',
                                         test = 'test.csv',
                                         format = 'csv',
                                         skip_header = True,
                                         fields = datafields)

Illustrating that the tabular csv data has now been transformed into objects.

In [21]:
trn[:5]

[<torchtext.data.example.Example at 0x7f32423d0f98>,
 <torchtext.data.example.Example at 0x7f32423d1be0>,
 <torchtext.data.example.Example at 0x7f32422c2550>,
 <torchtext.data.example.Example at 0x7f32422c26a0>,
 <torchtext.data.example.Example at 0x7f32422c2b38>]

In [22]:
trn[0].__dict__.keys()

dict_keys(['labels', 'text'])

The text has now been tokenized into individual words.

In [23]:
trn[0].text

['XXXMobileMovieClub',
 ':',
 'To',
 'use',
 'your',
 'credit',
 ',',
 'click',
 'the',
 'WAP',
 'link',
 'in',
 'the',
 'next',
 'txt',
 'message',
 'or',
 'click',
 'here',
 '>',
 '>',
 'http',
 ':',
 '//wap',
 '.',
 'xxxmobilemovieclub.com',
 '?',
 'n=QJKGIGHJJGCBL']

In [24]:
print(vars(trn.examples[0]))

{'labels': 'spam', 'text': ['XXXMobileMovieClub', ':', 'To', 'use', 'your', 'credit', ',', 'click', 'the', 'WAP', 'link', 'in', 'the', 'next', 'txt', 'message', 'or', 'click', 'here', '>', '>', 'http', ':', '//wap', '.', 'xxxmobilemovieclub.com', '?', 'n=QJKGIGHJJGCBL']}


Now building a vocabulary using one-hot encoding, limiting the size of our feature vectors to 15000.

In [0]:
TEXT.build_vocab(trn, max_size = 15000)

In [0]:
LABEL.build_vocab(trn)

In [27]:
print(f"Unique tokens in TEXT & LABEL vocabs are as follows respectively: {len(TEXT.vocab)} & {len(LABEL.vocab)}")

Unique tokens in TEXT & LABEL vocabs are as follows respectively: 10615 & 2


The number of unique tokens for TEXT are less than 15000, which shows that there are only 10613 unique tokens in `trn` dataset. Why 10613? The rest two are unk and pad.

In [0]:
batch_size = 64
train_iterator, test_iterator = torchtext.data.BucketIterator.splits(
    (trn, tst),
    batch_size = batch_size,
    sort_key = lambda x : len(x.text),
    sort_within_batch = False
)

#RNN

In [0]:
import torch.nn as nn

In [0]:
class RNN(nn.Module):
  def __init__(self, inp, embed, hid, out):
    super().__init__()
    self.embedding = nn.Embedding(inp, embed)
    self.rnn = nn.RNN(embed, hid)
    self.fc = nn.Linear(hid, out)
  def forward(self, text):
    embedded = self.embedding(text)
    output, hidden = self.rnn(embedded)
    hidden_1D = hidden.squeeze(0)
    assert torch.equal(output[-1, :, :], hidden_1D)
    return self.fc(hidden_1D)

##Training the RNN

In [0]:
inp = len(TEXT.vocab)
embed = 100
hid = 256
out = 1

In [0]:
model = RNN(inp, embed, hid, out)

In [0]:
import torch.optim as optim

optimizer = optim.Adam(model.parameters(), lr = 1e-6)

In [0]:
criterion = nn.BCEWithLogitsLoss()

In [0]:
def train(model, iterator, optimizer, criterion):
  epoch_loss = 0
  epoch_acc = 0

  model.train()

  for batch in iterator:
    optimizer.zero_grad()
    predictions = model(batch.text).squeeze(1)
    loss = criterion(predictions, batch.labels)

    rounded_preds = torch.round(torch.sigmoid(predictions))
    correct = (rounded_preds == batch.labels).float()

    acc  = correct.sum() / len(correct)

    loss.backward()

    optimizer.step()

    epoch_loss += loss.item()
    epoch_acc += acc.item()

  return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [50]:
num_epochs = 5
for epoch in range(num_epochs):
  train_loss, train_acc = train(model, train_iterator, optimizer, criterion)

  print(f'| epoch: {epoch+1:02} | train loss: {train_loss: .3f} | train acc: {train_acc*100:.2f}%')

| epoch: 01 | train loss:  0.541 | train acc: 86.05%
| epoch: 02 | train loss:  0.524 | train acc: 86.03%
| epoch: 03 | train loss:  0.507 | train acc: 86.15%
| epoch: 04 | train loss:  0.493 | train acc: 86.15%
| epoch: 05 | train loss:  0.480 | train acc: 86.16%


In [0]:
epoch_loss = 0
epoch_acc = 0

In [52]:
model.eval()

RNN(
  (embedding): Embedding(10615, 100)
  (rnn): RNN(100, 256)
  (fc): Linear(in_features=256, out_features=1, bias=True)
)

In [53]:
with torch.no_grad():

  for batch in test_iterator:
  
    predictions = model(batch.text).squeeze(1)
  
    loss = criterion(predictions, batch.labels)

    rounded_preds = torch.round(torch.sigmoid(predictions))
    correct = (rounded_preds == batch.labels).float()

    acc  = correct.sum() / len(correct)

    epoch_loss += loss.item()
    epoch_acc += acc.item()


test_loss = epoch_loss / len(test_iterator)
test_acc = epoch_acc / len(test_iterator)
print(f'| test loss: {test_loss: .3f} | test acc: {test_acc * 100: .2f}% |')

| test loss:  0.590 | test acc:  78.91% |


#LSTM

In [0]:
class RNN(nn.Module):
  def __init__(self, inp, embed, hid, out):
    super().__init__()
    self.embedding = nn.Embedding(inp, embed)
    self.rnn = nn.LSTM(embed, hid)
    self.fc = nn.Linear(hid, out)
    self.dropout = nn.Dropout(0.3)
  def forward(self, text):
    embedded = self.embedding(text)
    embedded_dropout = self.dropout(embedded)
    output, (hidden, _) = self.rnn(embedded_dropout)
    hidden_1D = hidden.squeeze(0)
    assert torch.equal(output[-1, :, :], hidden_1D)
    return self.fc(hidden_1D)

In [0]:
inp = len(TEXT.vocab)
embed = 100
hid = 256
out = 1

In [0]:
model = RNN(inp, embed, hid, out)

In [0]:
optimizer = optim.Adam(model.parameters(), lr = 1e-6)

In [0]:
criterion = nn.BCEWithLogitsLoss()

In [0]:
def train(model, iterator, optimizer, criterion):
  epoch_loss = 0
  epoch_acc = 0

  model.train()

  for batch in iterator:
    optimizer.zero_grad()
    predictions = model(batch.text).squeeze(1)
    loss = criterion(predictions, batch.labels)

    rounded_preds = torch.round(torch.sigmoid(predictions))
    correct = (rounded_preds == batch.labels).float()

    acc  = correct.sum() / len(correct)

    loss.backward()

    optimizer.step()

    epoch_loss += loss.item()
    epoch_acc += acc.item()

  return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [62]:
num_epochs = 5
for epoch in range(num_epochs):
  train_loss, train_acc = train(model, train_iterator, optimizer, criterion)

  print(f'| epoch: {epoch+1:02} | train loss: {train_loss: .3f} | train acc: {train_acc*100:.2f}%')

| epoch: 01 | train loss:  0.670 | train acc: 85.56%
| epoch: 02 | train loss:  0.658 | train acc: 86.40%
| epoch: 03 | train loss:  0.646 | train acc: 86.33%
| epoch: 04 | train loss:  0.636 | train acc: 86.31%
| epoch: 05 | train loss:  0.625 | train acc: 86.28%


In [0]:
epoch_loss = 0
epoch_acc = 0

In [64]:
model.eval()

RNN(
  (embedding): Embedding(10615, 100)
  (rnn): LSTM(100, 256)
  (fc): Linear(in_features=256, out_features=1, bias=True)
  (dropout): Dropout(p=0.3, inplace=False)
)

In [65]:
with torch.no_grad():

  for batch in test_iterator:
  
    predictions = model(batch.text).squeeze(1)
  
    loss = criterion(predictions, batch.labels)

    rounded_preds = torch.round(torch.sigmoid(predictions))
    correct = (rounded_preds == batch.labels).float()

    acc  = correct.sum() / len(correct)

    epoch_loss += loss.item()
    epoch_acc += acc.item()


test_loss = epoch_loss / len(test_iterator)
test_acc = epoch_acc / len(test_iterator)
print(f'| test loss: {test_loss: .3f} | test acc: {test_acc * 100: .2f}% |')

| test loss:  0.659 | test acc:  81.13% |


The LSTM implementation helped increase model accuracy, some of which was compensated by accuracy decrese due to the dropout used to prevent over-fitting.