<a href="https://colab.research.google.com/github/m-wallner/nlp-document-classification-simple-nn/blob/main/nlp_document_classification_simple_nn.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Natural language processing: Document classification using a simple neural network

## 1 Imports and data loading


In [None]:
!pip install torchtext==0.8.1
!nvidia-smi

In [None]:
import os
import time
import math
import pickle

import torch
import torch.optim as optim
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader
from torch.utils.data.dataset import random_split
from torch.nn.utils.rnn  import pad_sequence, pack_padded_sequence, pad_packed_sequence

import torchtext
from torchtext.datasets import text_classification
from torchtext.data import Field, Dataset, Example, BucketIterator

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import spacy

from IPython.display import clear_output

In [None]:
# Mount Google Drive so the dataset files stored there can be read
from google.colab import drive
drive.mount('/content/gdrive')

# Define data paths. The label file lives inside the dataset directory, so it is
# derived from data_path: changing data_path relocates every input at once.
data_path = '/content/gdrive/My Drive/Colab Notebooks/data/NLP/A1'
labels_path = os.path.join(data_path, 'thedeep/thedeep.labels.txt')

Mounted at /content/gdrive


### 1.1 Loading thedeep dataset

In [None]:
def load_thedeep_split(split):
  """Load one split of thedeep (medium) dataset into a Pandas dataframe.

  Args:
    split: Name of the split as it appears in the file name, i.e. one of
      'train', 'validation' or 'test'.

  Returns:
    DataFrame indexed by 'sentence_id' with the columns 'text' and 'label'.
  """
  return pd.read_csv(
    os.path.join(data_path, f'thedeep/thedeep.medium.{split}.txt'),
    sep=',',
    names=['sentence_id', 'text', 'label'],
    index_col=0,
    skiprows=[0]  # skip the first line of the file; column names are set via `names`
  )

# Load thedeep training, validation and test datasets
thedeep_df_train = load_thedeep_split('train')
thedeep_df_valid = load_thedeep_split('validation')
thedeep_df_test = load_thedeep_split('test')

# Show structure of thedeep dataset
thedeep_df_train.head()

Unnamed: 0_level_0,text,label
sentence_id,Unnamed: 1_level_1,Unnamed: 2_level_1
28291,The primary reported needs for IDPs across the...,4
9695,Some 602 000 IDPs are now spread across the co...,3
7781,South Sudanese soldiers accused of raping at l...,9
31382,"Since the beginning of 2017, 18 882 suspected/...",11
19919,The number of new suspected cholera cases in 2...,11


### 1.2 Basic information about thedeep

In [None]:
# Load label captions: every line of the label file is "<number>,<caption>"
with open(labels_path) as label_file:
  label_rows = [line.strip().split(',') for line in label_file]

# Map caption -> numeric label
labelcaptions = {row[1]: int(row[0]) for row in label_rows}

# Show labels and corresponding numbers
labelcaptions

{'Agriculture': 0,
 'Cross': 1,
 'Education': 2,
 'Food': 3,
 'Health': 4,
 'Livelihood': 5,
 'Logistic': 6,
 'NFI': 7,
 'Nutrition': 8,
 'Protection': 9,
 'Shelter': 10,
 'WASH': 11}

In [None]:
# Show number of training samples per label. The index holds the numeric label
# (labelcaptions maps caption -> number); the most frequent label is listed first.
thedeep_df_train['label'].value_counts()

4     5419
9     4618
3     4341
10    2553
11    2178
5     1712
2     1278
8     1207
1     1066
7     1054
0      743
6      430
Name: label, dtype: int64

In [None]:
# Show number of validation samples per label. The index holds the numeric label
# (labelcaptions maps caption -> number); the most frequent label is listed first.
thedeep_df_valid['label'].value_counts()

4     1196
9      960
3      954
10     474
11     463
5      378
2      300
8      264
1      232
7      229
0      168
6       81
Name: label, dtype: int64

In [None]:
# Show number of test samples per label. The index holds the numeric label
# (labelcaptions maps caption -> number); the most frequent label is listed first.
thedeep_df_test['label'].value_counts()

4     1181
9      957
3      944
10     509
11     484
5      382
2      283
8      272
1      223
7      193
0      177
6       94
Name: label, dtype: int64

## 2 Data preprocessing, word embedding and saving

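This section only needs to be executed once: its results are pickled in section 2.4 and reloaded at the start of section 3.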

### 2.1 Define torchtext.Field and apply preprocessing steps to thedeep dataset

In [None]:
# Define torchtext.Field objects for Tensor representation of data.
# text_field tokenizes with spaCy and lowercases; label_field passes the numeric
# labels through unchanged (no vocabulary, not sequential).
text_field = Field(tokenize='spacy', lower=True, batch_first=True)
label_field = Field(sequential=False, use_vocab=False, batch_first=True)

# Apply preprocessing to training, validation and test set
text_train_pre = thedeep_df_train['text'].apply(text_field.preprocess)
text_valid_pre = thedeep_df_valid['text'].apply(text_field.preprocess)
text_test_pre = thedeep_df_test['text'].apply(text_field.preprocess)

# Show the tokenized training texts
text_train_pre



sentence_id
28291    [the, primary, reported, needs, for, idps, acr...
9695     [some, 602,  , 000, idps, are, now, spread, ac...
7781     [south, sudanese, soldiers, accused, of, rapin...
31382    [since, the, beginning, of, 2017, ,, 18, 882, ...
19919    [the, number, of, new, suspected, cholera, cas...
                               ...                        
36292    [cholera, continues, to, spread, in, yemen, ,,...
5566     [an, estimated, 165,000, children, are, expect...
19676    [on, 3, march, 2017, ,, tropical, storm, enawo...
29831    [the, presence, of, uxo, was, reported, in, 15...
27747    [as, at, week, 27, (, july, 1, -, 7, ,, 2017, ...
Name: text, Length: 26599, dtype: object

### 2.2 Load GloVe.6B.300d word embeddings, create dictionary and word embeddings


In [None]:
# Load GloVe.6B.300d word embeddings (the download takes a long time) and build
# one GloVe-backed vocabulary over all three datasets.
# NOTE: Field.build_vocab replaces text_field.vocab on every call, so all splits
# must go into a single call. Calling it once per split keeps only the
# vocabulary of the split passed last.
text_field.build_vocab(
  text_train_pre,
  text_valid_pre,
  text_test_pre,
  vectors='glove.6B.300d'
)

100%|█████████▉| 399998/400000 [00:37<00:00, 10858.42it/s]

In [None]:
# Count how often every token occurs across all preprocessed datasets; the
# number of keys is the number of different words in the corpus.
text_pre = [text_train_pre, text_valid_pre, text_test_pre]
dictionary = {}
for split in text_pre:
  for tokens in split:
    for token in tokens:
      dictionary[token] = dictionary.get(token, 0) + 1

print(f'Length of dictionary: {len(dictionary)} words')

Length of dictionary: 48817 words


### 2.3 Initialize words not found in vocabulary with random values from a normal distribution

In [None]:
# Show the embedding vectors attached to the vocabulary of text_field.
# The tensor holds one row per vocabulary entry (not the corpus itself);
# all-zero rows are handled in the next cell.
text_field.vocab.vectors

tensor([[ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0466,  0.2132, -0.0074,  ...,  0.0091, -0.2099,  0.0539],
        ...,
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000]])

In [None]:
# Turn all words which were not contained in the GloVe vocabulary from zero
# vectors into random vectors drawn from a standard normal distribution (std = 1).
# Every all-zero row is replaced, whatever vocabulary entry it belongs to.
vectors = text_field.vocab.vectors

# Boolean mask with one entry per vocabulary row: True where the row is all zeros
zero_rows = (vectors == 0).all(dim=1)
counter = int(zero_rows.sum())

# Replace all zero rows in one vectorized assignment instead of a Python loop
vectors[zero_rows] = torch.randn(counter, vectors.size(1), dtype=vectors.dtype)

print(f'{counter} new words initialized randomly with normally distributed values \n')

# Show updated tensor without zero-vectors
text_field.vocab.vectors

4538 new words initialized randomly with normally distributed values 



tensor([[-1.3601, -0.4453,  1.6286,  ..., -0.9567,  0.9818,  0.1765],
        [-2.4346, -0.3123, -0.9448,  ..., -2.1006, -0.7532,  0.8999],
        [ 0.0466,  0.2132, -0.0074,  ...,  0.0091, -0.2099,  0.0539],
        ...,
        [ 1.0273,  0.0028, -0.3037,  ..., -2.4012, -0.5784, -0.6563],
        [ 0.3459,  0.4757,  0.1960,  ...,  0.8434,  2.1771,  0.0535],
        [-0.3181, -1.0090,  0.6965,  ..., -0.9082,  0.0988, -1.5894]])

### 2.4 Save preprocessed data

In [None]:
# Pickle preprocessed and embedded data: each Field object goes into its own
# file inside data_path.
for pickle_name, field_obj in (
    ('text_field.pickle', text_field),
    ('label_field.pickle', label_field),
):
    with open(os.path.join(data_path, pickle_name), 'wb') as f:
        pickle.dump(field_obj, f)

## 3 Load preprocessed and embedded data and construct Dataset object from pandas dataframe

In [None]:
def load_pickled_field(filename):
    """Unpickle and return the object stored as `filename` inside data_path.

    NOTE: pickle.load executes code embedded in the file, so only load files
    that were written by this notebook (section 2.4).
    """
    with open(os.path.join(data_path, filename), 'rb') as f:
        return pickle.load(f)

# Load preprocessed and embedded data
text_field = load_pickled_field('text_field.pickle')
label_field = load_pickled_field('label_field.pickle')

# Define torchtext Dataset class to load pandas DataFrame
class DataFrameDataset(Dataset):
    """torchtext Dataset built from the rows of a pandas DataFrame.

    Args:
        df: DataFrame whose columns (the index is ignored) are given in the
            same order as `fields`.
        fields: Sequence of (name, Field) pairs, one per DataFrame column.
    """
    def __init__(self, df:pd.DataFrame, fields:list):
        # itertuples avoids building a pandas Series per row (as iterrows does)
        # and keeps the original dtype of every column.
        examples = [
            Example.fromlist(list(row), fields)
            for row in df.itertuples(index=False, name=None)
        ]
        super(DataFrameDataset, self).__init__(examples, fields)

# Construct DataFrameDataset for all datasets; the field order matches the
# DataFrame column order (text, label).
fields = (('text', text_field), ('label', label_field))
train_dataset, valid_dataset, test_dataset = (
    DataFrameDataset(df=split_df, fields=fields)
    for split_df in (thedeep_df_train, thedeep_df_valid, thedeep_df_test)
)



In [None]:
# Example sentence in torchtext.data.Example object: the `text` attribute holds
# the token list of the first training sample.
train_dataset[0].text

['the',
 'primary',
 'reported',
 'needs',
 'for',
 'idps',
 'across',
 'the',
 'whole',
 'of',
 'libya',
 'were',
 'access',
 'to',
 'food',
 ',',
 'health',
 'services',
 'and',
 'shelter',
 '.',
 'the',
 'main',
 'issues',
 'related',
 'to',
 'the',
 'above',
 '-',
 'mentioned',
 'needs',
 'are',
 'that',
 'goods',
 'are',
 'too',
 'expensive',
 'and',
 'therefore',
 'idps',
 'have',
 'limit',
 'access',
 '.',
 'other',
 'issues',
 'cited',
 'for',
 'access',
 'to',
 'health',
 'included',
 'irregular',
 'supply',
 'of',
 'medicines',
 'and',
 'low',
 'quality',
 'of',
 'available',
 'health',
 'services',
 'due',
 'to',
 'overcrowded',
 'facilities',
 ',',
 'lack',
 'of',
 'medical',
 'staff',
 'and',
 'a',
 'diminished',
 'availability',
 'of',
 'female',
 'doctors',
 '.']

## 4 Definition of a simple neural network and training loop

### 4.1 Simple fully connected model

In [None]:
# Define model architecture
class ClassificationAverageModel(nn.Module):
  """Bag-of-embeddings classifier: mean word vector followed by a linear layer.

  Args:
    vocab_size: Number of rows of the embedding matrix.
    embed_dim: Dimensionality of every word vector.
    num_class: Number of output classes (size of the logit vector).
    pretrained_embeddings: Optional tensor of shape (vocab_size, embed_dim)
      whose values are copied into the embedding matrix, e.g. GloVe vectors.
      If None (default), the embeddings keep their random initialization.
  """
  def __init__(self, vocab_size, embed_dim, num_class, pretrained_embeddings=None):
    super().__init__()
    # Calculate document representation as mean of word vectors
    self.embedding = nn.EmbeddingBag(vocab_size, embed_dim, sparse=False, mode='mean')
    self.fc = nn.Linear(embed_dim, num_class)
    self.init_weights()
    if pretrained_embeddings is not None:
      self.load_pretrained_embeddings(pretrained_embeddings)

  # Initiating weights method
  def init_weights(self):
    """Draw embedding and fc weights from U(-1, 1) and zero the fc bias."""
    initrange = 1
    nn.init.uniform_(self.embedding.weight, -initrange, initrange)
    nn.init.uniform_(self.fc.weight, -initrange, initrange)
    nn.init.zeros_(self.fc.bias)

  def load_pretrained_embeddings(self, vectors):
    """Copy `vectors` into the embedding matrix.

    Raises:
      ValueError: If `vectors` does not have shape (vocab_size, embed_dim).
    """
    expected_shape = tuple(self.embedding.weight.shape)
    if tuple(vectors.shape) != expected_shape:
      raise ValueError(
        f'Expected pretrained embeddings of shape {expected_shape}, '
        f'got {tuple(vectors.shape)}'
      )
    # Copy without recording the operation in the autograd graph
    with torch.no_grad():
      self.embedding.weight.copy_(vectors)

  # Define forward method
  def forward(self, text, offsets):
    """Return the class logits for a batch of concatenated documents.

    Args:
      text: 1-D tensor with the word indices of all documents concatenated.
      offsets: 1-D tensor with the start position of every document in `text`.
    """
    embedded = self.embedding(text, offsets)
    return self.fc(embedded)

In [None]:
# Hyperparameters and run configuration
SEED = 42
BATCH_SIZE = 64
# Take both embedding dimensions from the loaded vector matrix so they cannot
# drift apart from the embeddings that were actually pickled.
VOCAB_SIZE, EMBED_DIM = text_field.vocab.vectors.shape
N_CLASSES = len(labelcaptions)
N_EPOCHS = 10

# Seed PyTorch so weight initialization and batch shuffling are reproducible
torch.manual_seed(SEED)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda')

### 4.2 collate_fn function for PyTorch DataLoader

In [None]:
# Data batching
# Text entries have different lengths, so a batch cannot be stacked into one
# rectangular tensor. generate_batch concatenates all documents of a batch and
# records where each one starts; it is passed as collate_fn to the DataLoader.

# Get vocabulary from text_field (maps a word to its index)
vocabulary = text_field.vocab.stoi

def generate_batch(batch):
  """Collate a list of examples into (text, offsets, label) tensors.

  Args:
    batch: List of examples with a `text` token list and a `label` attribute.

  Returns:
    text: 1-D tensor with the word indices of all documents concatenated.
    offsets: 1-D tensor with the start position of every document in `text`.
    label: 1-D tensor with one integer label per document.
  """
  labels = torch.tensor([int(example.label) for example in batch])

  token_ids = [
    torch.LongTensor([vocabulary[token] for token in example.text])
    for example in batch
  ]

  # A document starts where all previous documents end: cumulative sum of the
  # lengths of the preceding documents, with 0 for the first one.
  preceding_lengths = [0] + [len(ids) for ids in token_ids[:-1]]
  start_offsets = torch.tensor(preceding_lengths).cumsum(dim=0)

  return torch.cat(token_ids), start_offsets, labels

### 4.3 Train and test functions

In [None]:
# Define an instance of the model class
model = ClassificationAverageModel(VOCAB_SIZE, EMBED_DIM, N_CLASSES)

# Initialize the embedding layer with the GloVe-based vectors prepared in
# section 2; otherwise the model would train randomly initialized embeddings
# and the pretrained vectors would never be used.
with torch.no_grad():
  model.embedding.weight.copy_(text_field.vocab.vectors)

model = model.to(device)

def train(sub_train_):
  """Train the global `model` for one epoch on `sub_train_`.

  Uses the global `criterion`, `optimizer` and `scheduler`; the learning rate
  scheduler is stepped once at the end of the epoch.

  Args:
    sub_train_: Dataset of examples that generate_batch can collate.

  Returns:
    Tuple (mean loss per sample, accuracy) over the whole epoch.
  """
  model.train()
  train_loss, train_acc = 0.0, 0
  data = DataLoader(sub_train_, batch_size=BATCH_SIZE, shuffle=True, collate_fn=generate_batch)

  for text, offsets, cls in data:
    optimizer.zero_grad()

    # All documents of a batch are concatenated into the flat `text` tensor;
    # `offsets` holds the start position of every document so that the
    # embedding layer can average the word vectors per document.
    text, offsets, cls = text.to(device), offsets.to(device), cls.to(device)
    output = model(text, offsets)
    loss = criterion(output, cls)
    loss.backward()
    optimizer.step()

    # criterion is assumed to return the mean loss of the batch (default of
    # CrossEntropyLoss); weight it by the batch size so that dividing by the
    # number of samples below yields the mean loss per sample.
    train_loss += loss.item() * cls.size(0)
    train_acc += (output.argmax(1) == cls).sum().item()

  # Adjust learning rate
  scheduler.step()

  n_samples = len(sub_train_)
  return train_loss / n_samples, train_acc / n_samples


def test(data_):
  """Evaluate the global `model` on `data_` without updating any weights.

  Args:
    data_: Dataset of examples that generate_batch can collate.

  Returns:
    Tuple (mean loss per sample, accuracy), both plain Python floats.
  """
  # Separate accumulator: reusing one name for both the running total and the
  # batch loss would overwrite the total on every batch.
  total_loss, acc = 0.0, 0
  data = DataLoader(data_, batch_size=BATCH_SIZE, collate_fn=generate_batch)

  # Evaluate in eval mode and restore the previous mode afterwards
  was_training = model.training
  model.eval()

  with torch.no_grad(): # we do not want to update any weights here
    for text, offsets, cls in data:
      # `offsets` holds the start position of every document inside the flat
      # `text` tensor.
      text, offsets, cls = text.to(device), offsets.to(device), cls.to(device)
      output = model(text, offsets)
      # criterion is assumed to return the mean loss of the batch; weight it by
      # the batch size to obtain the mean loss per sample after the division.
      total_loss += criterion(output, cls).item() * cls.size(0)
      acc += (output.argmax(1) == cls).sum().item()

  model.train(was_training)

  n_samples = len(data_)
  return total_loss / n_samples, acc / n_samples

## 5 Model training and evaluation

Cross-entropy loss (`nn.CrossEntropyLoss`) is used instead of negative log-likelihood loss (`nn.NLLLoss`): it applies `LogSoftmax` internally, so the model can output raw logits and does not need an additional `LogSoftmax` layer. See the PyTorch documentation: https://pytorch.org/docs/stable/generated/torch.nn.NLLLoss.html

In [None]:
# Train the model with early stopping and evaluate the best checkpoint.
# Training uses the full training set; validation uses the dedicated
# validation set, so no additional split of the training data is needed.

PATIENCE = 5  # number of epochs without a new best validation loss before stopping
CHECKPOINT_PATH = 'model.pt'

# Set Loss function and optimizer
criterion = torch.nn.CrossEntropyLoss().to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=4.0) # Extremely high lr for testing purposes
# Multiply the learning rate by 0.9 after every epoch
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1, gamma=0.9)

# Implement early stopping
best_val_loss = float('inf')
counter = 0

for epoch in range(N_EPOCHS):

    # stop time, accuracies and losses
    start_time = time.time()
    train_loss, train_acc = train(train_dataset)
    valid_loss, valid_acc = test(valid_dataset)
    valid_loss = float(valid_loss)  # plain float, also if test() returns a tensor

    mins, secs = divmod(int(time.time() - start_time), 60)

    print('Epoch: %d' %(epoch + 1), " | time in %d minutes, %d seconds" %(mins, secs))
    print(f'\tLoss: {train_loss:.4f}(train)\t|\tAcc: {train_acc * 100:.1f}%(train)')
    print(f'\tLoss: {valid_loss:.4f}(valid)\t|\tAcc: {valid_acc * 100:.1f}%(valid)')

    # early stopping
    if valid_loss < best_val_loss:
      # Save only the parameters (state_dict) of the best model so far
      torch.save(model.state_dict(), CHECKPOINT_PATH)
      best_val_loss = valid_loss
      counter = 0 # reset counter if a new best validation loss was found
    else:
      counter += 1

    if counter >= PATIENCE:
      # break for loop and stop training
      print(f'\tEarly stopping triggered - model stopped training process after epoch: {epoch + 1}')
      break

# after training the model and the saving of the best performing model
# we load the best performing model and evaluate it on the test set
if math.isfinite(best_val_loss):
  model.load_state_dict(torch.load(CHECKPOINT_PATH, map_location=device))
else:
  # No epoch produced a finite validation loss, so no checkpoint was written
  print('\tNo checkpoint was saved - evaluating the model in its current state')

test_loss, test_acc = test(test_dataset)
print(f'\tLoss: {float(test_loss):.4f}(test)\t|\tAcc: {test_acc * 100:.1f}%(test)')

Epoch: 1  | time in 0 minutes, 1 seconds
	Loss: 31.9914(train)	|	Acc: 46.9%(train)
	Loss: 0.5146(valid)	|	Acc: 49.4%(valid)
Epoch: 2  | time in 0 minutes, 1 seconds
	Loss: 48.4257(train)	|	Acc: 57.6%(train)
	Loss: 0.3751(valid)	|	Acc: 55.2%(valid)
Epoch: 3  | time in 0 minutes, 1 seconds
	Loss: 44.0875(train)	|	Acc: 63.4%(train)
	Loss: 0.0919(valid)	|	Acc: 55.1%(valid)
Epoch: 4  | time in 0 minutes, 1 seconds
	Loss: 35.3688(train)	|	Acc: 67.1%(train)
	Loss: 1.2207(valid)	|	Acc: 56.1%(valid)
Epoch: 5  | time in 0 minutes, 1 seconds
	Loss: 30.3734(train)	|	Acc: 69.5%(train)
	Loss: 0.3380(valid)	|	Acc: 57.2%(valid)
Epoch: 6  | time in 0 minutes, 1 seconds
	Loss: 25.1339(train)	|	Acc: 71.8%(train)
	Loss: 1.0767(valid)	|	Acc: 57.2%(valid)
Epoch: 7  | time in 0 minutes, 1 seconds
	Loss: 21.4813(train)	|	Acc: 73.2%(train)
	Loss: 0.0354(valid)	|	Acc: 57.2%(valid)
Epoch: 8  | time in 0 minutes, 1 seconds
	Loss: 19.0923(train)	|	Acc: 74.2%(train)
	Loss: 0.0000(valid)	|	Acc: 57.5%(valid)
Epoch: 9

**Main sources used:**

PyTorch tutorial - text classification with EmbeddingBag: https://pytorch.org/tutorials/beginner/text_sentiment_ngrams_tutorial.html

Glove and Pytorch: https://towardsdatascience.com/deep-learning-for-nlp-with-pytorch-and-torchtext-4f92d69052f