# Transer Learning Word Embeddings

In [2]:
import numpy as np
from collections import Counter
import string
import torch
from torch.utils.data import Dataset
import pandas as pd
import torch.nn as nn
from torch.utils.data import DataLoader
from argparse import Namespace
import torch.optim as optim
from tqdm.notebook import tqdm
import torch.nn.functional as F
import re

## Sequence Vocabulary


In [3]:
class SequenceVocabulary(object):
    """Class to extract and process vocabularies for mapping"""
    
    def __init__(self, token_to_idx=None, mask_token="<MASK>", add_unk=True, unk_token="<UNK>"):
        if token_to_idx is None:
            token_to_idx = {}
        self._token_to_idx = token_to_idx
        
        self._idx_to_token = {
            idx: token for token, idx in self._token_to_idx.items()
        }
        
        self._add_unk = add_unk
        self._unk_token = unk_token
        self._mask_token = mask_token
        
        # add begin and end sequence token
        self._begin_of_seq_token = "<BEGIN-OF-SEQUENCE>"
        self._end_of_seq_token = "<END-OF-SEQUENCE>"
        
        self.begin_seq_index = self.add_token(self._begin_of_seq_token)
        self.end_seq_index = self.add_token(self._end_of_seq_token)

        self.mask_index = self.add_token(mask_token)
        self.unk_index = -1
        if add_unk:
            self.unk_index = self.add_token(unk_token)
            
    def to_serializeable(self):
        """return a serializeable dictionary"""
        return {
            'token_to_idx': self._token_to_idx,
            'mask_token': self._mask_token,
            'add_unk': self._add_unk,
            'unk_token': self._unk_token
        }
    
    @classmethod
    def from_serializeable(cls, contents):
        """create vocabulary object from serialize dictionary"""
        return cls(**contents)
    
    def add_token(self, token):
        """Add a token and return it's index"""
        if token in self._token_to_idx:
            index = self._token_to_idx[token]
        else:
            index = len(self._token_to_idx)
            self._token_to_idx[token] = index
            self._idx_to_token[index] = token
        return index
    
    def lookup_token(self, token):
        """get the index of a token 
        if not exist returns the unk_index"""
        if self.unk_index >= 0:
            return self._token_to_idx.get(token, self.unk_index)
        else:
            return self._token_to_idx[token]
        
    def lookup_index(self, index):
        if index not in self._idx_to_token:
            raise KeyError("the index %d is not in the vocabulary" % index)
        return self._idx_to_token[index]
    
    def __str__(self):
        return "<Vocabulary(size=%d)>" % len(self)
    
    def __len__(self):
        return len(self._token_to_idx)

## Vocabulary

In [4]:
# create vocabulary class
class Vocabulary(object):
    """Class to extract and process vocabularies for mapping"""
    
    def __init__(self, token_to_idx=None):
        if token_to_idx is None:
            token_to_idx = {}
        self._token_to_idx = token_to_idx
        
        self._idx_to_token = {
            idx: token for token, idx in self._token_to_idx.items()
        }

    def to_serializeable(self):
        """return a serializeable dictionary"""
        return {
            'token_to_idx': self._token_to_idx
        }
    
    @classmethod
    def from_serializeable(cls, contents):
        """create vocabulary object from serialize dictionary"""
        return cls(**contents)
    
    def add_token(self, token):
        """Add a token and return it's index"""
        if token in self._token_to_idx:
            index = self._token_to_idx[token]
        else:
            index = len(self._token_to_idx)
            self._token_to_idx[token] = index
            self._idx_to_token[index] = token
        return index
    
    def lookup_token(self, token):
        """get the index of a token 
        if not exist returns the unk_index"""
        return self._token_to_idx[token]
        
    def lookup_index(self, index):
        if index not in self._idx_to_token:
            raise KeyError("the index %d is not in the vocabulary" % index)
        return self._idx_to_token[index]
    
    def __str__(self):
        return "<Vocabulary(size=%d)>" % len(self)
    
    def __len__(self):
        return len(self._token_to_idx)

## Vectorizer

In [5]:
class NewsVectorizer(object):
    def __init__(self, title_vocab, category_vocab):
        self.title_vocab = title_vocab
        self.category_vocab = category_vocab

    def vectorize(self, title, vector_length=-1):
        indices = [self.title_vocab.begin_seq_index]
        indices.extend(self.title_vocab.lookup_token(token)
                    for token in title.split(" "))
        indices.append(self.title_vocab.end_seq_index)

        if vector_length < 0:
            vector_length = len(indices)

        # create vector representation
        out_vector = np.zeros(vector_length, dtype=np.int64)
        out_vector[:len(indices)] = indices
        out_vector[len(indices):] = self.title_vocab.mask_index

        return out_vector

    @classmethod
    def from_dataframe(cls, news_df, cutoff=25):
        category_vocab = Vocabulary()
        
        # add the categories to vocabulary
        for category in sorted(set(news_df.category)):
            category_vocab.add_token(category)

        # count the frequency of each word in title
        word_counter = Counter()
        for title in news_df.title:
            for word in title.split(" "):
                if word not in string.punctuation:
                    word_counter[word] += 1
        
        # add word as token when it's more than cutoff
        title_vocab = SequenceVocabulary()
        for token, word_count in word_counter.items():
            if word_count > cutoff:
                title_vocab.add_token(token)

        return cls(title_vocab, category_vocab)

## Dataset

In [6]:
class NewsDataset(Dataset):
    def __init__(self, news_df, vectorizer):
        self.news_df = news_df
        self._vectorizer = vectorizer

        # calculate how long the sequence could be
        # max_seq_length is added by 2 since using
        # begin and end seq token
        measure_len = lambda context: len(context.split(" "))
        self._max_seq_length = max(map(measure_len, news_df.title)) + 2
        
        self.train_df = self.news_df[self.news_df.split == 'train']
        self.train_size = len(self.train_df)
        
        self.val_df = self.news_df[self.news_df.split == 'val']
        self.val_size = len(self.val_df)
        
        self.test_df = self.news_df[self.news_df.split == 'test']
        self.test_size = len(self.test_df)
        
        self._lookup_dict = {'train': (self.train_df, self.train_size),
                            'val': (self.val_df, self.val_size),
                            'test': (self.test_df, self.test_size)}
        
        self.set_split('train')

        # Class weights
        class_counts = news_df.category.value_counts().to_dict()
        def sort_key(item):
            return self._vectorizer.category_vocab.lookup_token(item[0])
        sorted_counts = sorted(class_counts.items(), key=sort_key)
        frequencies = [count for _, count in sorted_counts]
        self.class_weights = 1.0 / torch.tensor(frequencies, dtype=torch.float32)
        
    @classmethod
    def load_dataset_and_make_vectorizer(cls, news_csv, cuda=False):
        """Load dataset from csv and returns the dataset object
        and vectorizer"""
        news_df = pd.read_csv(news_csv)
        train_news_df = news_df[news_df.split == 'train']
        return cls(news_df,
                   NewsVectorizer.from_dataframe(train_news_df))
    
    def get_vectorizer(self):
        """Get vectorizer"""
        return self._vectorizer
    
    def set_split(self, split='train'):
        """Set the split from data"""
        self._target_split = split
        self._target_df, self._target_size = self._lookup_dict[split]
        
    def __len__(self):
        return self._target_size
    
    def __getitem__(self, index):
        """the primary entry point method for PyTorch datasets
        Args:
            index (int): the index to the data point
        Returns:
            a dict of the data point's features (x_data) and label (y_target)
        """
        row = self._target_df.iloc[index]
        
        title_vector = self._vectorizer.vectorize(row.title,
                                                    self._max_seq_length)
        category_index = self._vectorizer.category_vocab.lookup_token(row.category)
        
        return {
            'x_data' : title_vector,
            'y_target' : category_index
        }
    
    def get_num_batches(self, batch_size):
        """Given the batch size return the number of batches in the dataset"""
        return len(self) // batch_size

## Frakenstein Classification CNN Class

In [7]:
class NewsClassifier(nn.Module):
    def __init__(self, embedding_size, num_embeddings,
                 num_channels, hidden_dim, num_classes,
                 dropout_p, pretrained_embeddings= None,
                 padding_idx=0):
        super(NewsClassifier, self).__init__()

        if pretrained_embeddings is None:
            self.emb = nn.Embedding(embedding_dim= embedding_size,
                                    num_embeddings= num_embeddings,
                                    padding_idx= padding_idx)
        else:
            pretrained_embeddings = torch.from_numpy(pretrained_embeddings).float()
            self.emb = nn.Embedding(embedding_dim= embedding_size,
                                    num_embeddings= num_embeddings,
                                    padding_idx= padding_idx,
                                    _weight= pretrained_embeddings)
            
        # create the network
        self.convnet = nn.Sequential(
            nn.Conv1d(in_channels=embedding_size, 
                   out_channels=num_channels, kernel_size=3),
            nn.ELU(),
            nn.Conv1d(in_channels=num_channels, out_channels=num_channels, 
                   kernel_size=3, stride=2),
            nn.ELU(),
            nn.Conv1d(in_channels=num_channels, out_channels=num_channels, 
                   kernel_size=3, stride=2),
            nn.ELU(),
            nn.Conv1d(in_channels=num_channels, out_channels=num_channels, 
                   kernel_size=3),
            nn.ELU()
        )

        self._dropout_p = dropout_p
        self.fc1 = nn.Linear(num_channels, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, num_classes)

    def forward(self, x_in, apply_softmax=False):
        # embed and permute so features are channels
        x_embedded = self.emb(x_in).permute(0, 2, 1)

        features = self.convnet(x_embedded)

        # average and remove the extra dimension
        remaining_size = features.size(dim=2)
        features = F.avg_pool1d(features, remaining_size).squeeze(dim=2)
        features = F.dropout(features, p=self._dropout_p)
        
        # mlp classifier
        intermediate_vector = F.relu(F.dropout(self.fc1(features), p=self._dropout_p))
        prediction_vector = self.fc2(intermediate_vector)

        if apply_softmax:
            prediction_vector = F.softmax(prediction_vector, dim=1)

        return prediction_vector

## Helper Functions

In [8]:
def load_glove_from_file(file_path):
    word_to_index = {}
    embeddings = []

    with open(file_path) as fp:
        for index, line in enumerate(fp):
            line = line.split(" ")
            word = line[0]
            embedding = [float(var) for var in line[1:]]
            embeddings.append(embedding)
    
    return word_to_index, np.stack(embeddings)

def make_embedding_matrix(file_path, words):
    word_to_index, embeddings = load_glove_from_file(file_path)
    embedding_size = embeddings.shape[1]
    final_embeddings = np.zeros(len(words), embedding_size)

    for i, word in enumerate(words):
        if word in word_to_index:
            final_embeddings[i, :] = embeddings[word_to_index[word]]
        else:
            embedding_i = torch.ones(1, embedding_size)
            torch.nn.init.xavier_uniform_(embedding_i)
            final_embeddings[i, :] = embedding_i

    return final_embeddings


## Training Routine


In [9]:
args = Namespace(
    # Data information
    frequency_cutoff = 25,
    model_state_file = '/content/drive/My Drive/Colab Notebooks/Data/model.pth',
    news_csv = '/content/drive/My Drive/Colab Notebooks/Data/news_with_splits.csv',
    save_dir = '/content/drive/My Drive/Colab Notebooks/Data',
    vectorizer_file = '/content/drive/My Drive/Colab Notebooks/Data/vectorizer.json',
    # Model HyperParameters
    embedding_size=300,
    glove_file = '/content/drive/My Drive/Colab Notebooks/Data/Glove/glove.6B.100d.txt',
    use_glove = False,
    hidden_dim = 100,
    num_channels = 100,
    # Training HyperParameters
    batch_size = 128,
    early_stopping_criteria=5,
    learning_rate=0.001,
    momentum=0.1,
    num_epochs=100,
    seed=1337,
    cuda=True,
    dropout=0.1
)

def generate_batches(dataset, batch_size, shuffle=True,
                     drop_last=True, device="cpu"):
    """
    A generator function which wraps the PyTorch DataLoader. It will
    ensure each tensor is on the write device location.
    """
    dataloader = DataLoader(dataset=dataset, batch_size=batch_size,
                            shuffle=shuffle, drop_last=drop_last)

    for data_dict in dataloader:
        out_data_dict = {}
        for name, tensor in data_dict.items():
            out_data_dict[name] = data_dict[name].to(device)
        yield out_data_dict

In [10]:
# create variables to record
# the training loop
def make_train_state(args):
    return {
        'epoch_index':0,
        'train_loss':[],
        'train_acc':[],
        'val_loss': [],
        'val_acc': [],
        'test_loss': -1,
        'test_acc': -1,
    }

def compute_accuracy(y_pred, y_target):
    _, y_pred_indices = y_pred.max(dim=1)
    n_correct = torch.eq(y_pred_indices, y_target).sum().item()
    return n_correct / len(y_pred_indices) * 100

train_state = make_train_state(args)

if torch.cuda.is_available() and args.cuda:
  args.cuda = True
else:
  args.cuda = False
args.device = torch.device("cuda" if args.cuda else "cpu")
print("Device available ", args.device)

# dataset object
dataset = NewsDataset.load_dataset_and_make_vectorizer(args.news_csv)
dataset.class_weights = dataset.class_weights.to(args.device)

# vectorizer
vectorizer = dataset.get_vectorizer()

# Use GloVe or randomly initialized embeddings
if args.use_glove:
    words = vectorizer.title_vocab._token_to_idx.keys()
    embeddings = make_embedding_matrix(glove_filepath=args.glove_filepath, 
                                       words=words)
    print("Using pre-trained embeddings")
else:
    print("Not using pre-trained embeddings")
    embeddings = None

# classifier
classifier = NewsClassifier(embedding_size=args.embedding_size, 
                            num_embeddings=len(vectorizer.title_vocab),
                            num_channels=args.num_channels,
                            hidden_dim=args.hidden_dim, 
                            num_classes=len(vectorizer.category_vocab), 
                            dropout_p=args.dropout,
                            pretrained_embeddings=embeddings,
                            padding_idx=0)
classifier = classifier.to(args.device)

# loss function and optimizer
loss_func = nn.CrossEntropyLoss(dataset.class_weights)
optimizer = optim.Adam(classifier.parameters(), lr=args.learning_rate)

print("Embedding dim ", args.embedding_size)

Device available  cuda
Not using pre-trained embeddings
Embedding dim  300


In [None]:
# Create training loop
epoch_bar = tqdm(desc='training routine', 
                          total=args.num_epochs,
                          position=0)

dataset.set_split('train')
train_bar = tqdm(desc='split=train',
                          total=dataset.get_num_batches(args.batch_size)-1, 
                          position=1, 
                          leave=True)

dataset.set_split('val')
val_bar = tqdm(desc='split=val',
                        total=dataset.get_num_batches(args.batch_size)-1, 
                        position=1, 
                        leave=True)

try:
    for epoch_index in range(args.num_epochs):
        train_state['epoch_index'] = epoch_index
        # setup batch generator
        # set loss and train mode on
        dataset.set_split('train')
        batch_generator = generate_batches(dataset=dataset,
                                        batch_size=args.batch_size,
                                        device=args.device)
      
        running_loss = 0.0
        running_acc = 0.0
        classifier.train()
      
        for batch_index, batch_dict in enumerate(batch_generator):
            # step 1 zero the gradients
            optimizer.zero_grad()
          
            # step 2 compute the output
            y_pred = classifier(x_in=batch_dict['x_data'])
          
            # step 3 compute the loss
            loss = loss_func(y_pred, batch_dict['y_target'])
            loss_batch = loss.item()
            running_loss += (loss_batch - running_loss) / (batch_index + 1)
          
            # step 4 use loss to produce gradients
            loss.backward()
          
            # step 5 use optimizer to take the gradient step
            optimizer.step()
          
            # step 6 compute the acccuracy
            acc_batch = compute_accuracy(y_pred, batch_dict['y_target'])
            running_acc += (acc_batch - running_acc) / (batch_index + 1)

            # update bar
            train_bar.set_postfix(loss=running_loss, 
                                acc=running_acc, 
                                epoch=epoch_index)
            train_bar.update()
          
        train_state['train_loss'].append(running_loss)
        train_state['train_acc'].append(running_acc)
      
        # Iterate over val dataset
        # setup: batch generator, set loss and acc to 0, set eval mode on
        dataset.set_split('val')
        batch_generator = generate_batches(dataset,
                                        batch_size=args.batch_size,
                                        device=args.device)
      
        running_loss = 0.
        running_acc = 0.
        classifier.eval()
      
        for batch_index, batch_dict in enumerate(batch_generator):
            # step 1. compute the output
            y_pred = classifier(x_in=batch_dict['x_data'])
          
            # step 2. compute the loss
            loss = loss_func(y_pred, batch_dict['y_target'])
            loss_batch = loss.item()
            running_loss += (loss_batch - running_loss) / (batch_index + 1)
          
            # step 3. compute the accuracy
            acc_batch = compute_accuracy(y_pred, batch_dict['y_target'])
            running_acc += (acc_batch - running_acc) / (batch_index + 1)
            train_state['val_loss'].append(running_loss)
            train_state['val_acc'].append(running_acc)
            
            val_bar.set_postfix(loss=running_loss, acc=running_acc, 
                            epoch=epoch_index)
            val_bar.update()
          
        train_state['val_loss'].append(running_loss)
        train_state['val_acc'].append(running_acc)

        train_bar.n = 0
        val_bar.n = 0
        epoch_bar.update()
except KeyboardInterrupt:
    print("Exiting loop") 

HBox(children=(FloatProgress(value=0.0, description='training routine', style=ProgressStyle(description_width=…

HBox(children=(FloatProgress(value=0.0, description='split=train', max=655.0, style=ProgressStyle(description_…

HBox(children=(FloatProgress(value=0.0, description='split=val', max=139.0, style=ProgressStyle(description_wi…

In [None]:
# evaluate the model
dataset.set_split('test')
batch_generator = generate_batches(dataset,
                                   args.batch_size,
                                   device=args.device)

running_loss = 0.
running_acc = 0.
classifier.eval()

for batch_index, batch_dict in enumerate(batch_generator):
  # compute the output
  y_pred = classifier(x_in=batch_dict['x_data'])

  # compute the loss
  loss = loss_func(y_pred, batch_dict['y_target'])
  loss_batch = loss.item()
  running_loss += (loss_batch - running_loss) / (batch_index + 1)

  # compute the accuracy
  acc_batch = compute_accuracy(y_pred, batch_dict['y_target'])
  running_acc += (acc_batch - running_acc) / (batch_index + 1)

train_state['test_loss'] = running_loss
train_state['test_acc'] = running_acc

print("Test loss : {:.3f}".format(train_state['test_loss']))
print("Test acc : {:.3f}".format(train_state['test_acc']))

Test loss : 3.563
Test acc : 76.897


In [None]:
# inference mode
# Preprocess the reviews
def preprocess_text(text):
    text = ' '.join(word.lower() for word in text.split(" "))
    text = re.sub(r"([.,!?])", r" \1 ", text)
    text = re.sub(r"[^a-zA-Z.,!?]+", r" ", text)
    return text

def predict_category(title, classifier, vectorizer, max_length):
    """Predict a News category for a new title
    
    Args:
        title (str): a raw title string
        classifier (NewsClassifier): an instance of the trained classifier
        vectorizer (NewsVectorizer): the corresponding vectorizer
        max_length (int): the max sequence length
            Note: CNNs are sensitive to the input data tensor size. 
                  This ensures to keep it the same size as the training data
    """
    title = preprocess_text(title)
    vectorized_title = \
        torch.tensor(vectorizer.vectorize(title, vector_length=max_length))
    result = classifier(vectorized_title.unsqueeze(0), apply_softmax=True)
    probability_values, indices = result.max(dim=1)
    predicted_category = vectorizer.category_vocab.lookup_index(indices.item())

    return {'category': predicted_category, 
            'probability': probability_values.item()}


def get_samples():
    samples = {}
    for cat in dataset.val_df.category.unique():
        samples[cat] = dataset.val_df.title[dataset.val_df.category==cat].tolist()[:5]
    return samples

val_samples = get_samples()

#title = input("Enter a news title to classify: ")
classifier = classifier.to("cpu")

for truth, sample_group in val_samples.items():
    print(f"True Category: {truth}")
    print("="*30)
    for sample in sample_group:
        prediction = predict_category(sample, classifier, 
                                      vectorizer, dataset._max_seq_length + 1)
        print("Prediction: {} (p={:0.2f})".format(prediction['category'],
                                                  prediction['probability']))
        print("\t + Sample: {}".format(sample))
    print("-"*30 + "\n")

True Category: Business
Prediction: Business (p=1.00)
	 + Sample: AZ suspends marketing of cancer drug
Prediction: Business (p=1.00)
	 + Sample: Business world has mixed reaction to Perez move
Prediction: Sports (p=1.00)
	 + Sample: Betting Against Bombay
Prediction: Sports (p=1.00)
	 + Sample: Malpractice Insurers Face a Tough Market
Prediction: Sports (p=0.51)
	 + Sample: NVIDIA Is Vindicated
------------------------------

True Category: Sci/Tech
Prediction: Sci/Tech (p=0.99)
	 + Sample: Spies prize webcam #39;s eyes
Prediction: Sci/Tech (p=1.00)
	 + Sample: Sober worm causes headaches
Prediction: World (p=0.96)
	 + Sample: Local Search: Missing Pieces Falling into Place
Prediction: Sci/Tech (p=1.00)
	 + Sample: Hackers baiting Internet users with Beckham pix
Prediction: Sci/Tech (p=0.98)
	 + Sample: Nokia adds BlackBerry support to Series 80 handsets
------------------------------

True Category: Sports
Prediction: Sci/Tech (p=1.00)
	 + Sample: Is Meyer the man to get Irish up?
Pre

In [None]:
categories = vectorizer.category_vocab._token_to_idx.keys()
print("Available categories ", categories)

Available categories  dict_keys(['Business', 'Sci/Tech', 'Sports', 'World'])


In [None]:
title = input("enter news title :")
prediction = predict_category(title, classifier, 
                                      vectorizer, dataset._max_seq_length + 1)
print("Prediction: {} (p={:0.2f})".format(prediction['category'],
                                            prediction['probability']))

enter news title :What the world's most overtouristed destinations look like now
Prediction: Sci/Tech (p=1.00)


## Evaluate

In [1]:
def get_source_sentence(vectorizer, batch_dict, index):
    indices = batch_dict['x_source'][index].cpu().data.numpy()
    vocab = vectorizer.source_vocab
    return sentence_from_indices(indices, vocab)

def get_true_sentence(vectorizer, batch_dict, index):
    return sentence_from_indices(batch_dict['y_target'].cpu().data.numpy()[index], vectorizer.target_vocab)
    
def get_sampled_sentence(vectorizer, batch_dict, index):
    y_pred = model(x_source=batch_dict['x_source'], 
                   x_source_lengths=batch_dict['x_source_length'], 
                   target_sequence=batch_dict['x_target'])
    return sentence_from_indices(torch.max(y_pred, dim=2)[1].cpu().data.numpy()[index], vectorizer.target_vocab)

def get_all_sentences(vectorizer, batch_dict, index):
    return {"source": get_source_sentence(vectorizer, batch_dict, index), 
            "truth": get_true_sentence(vectorizer, batch_dict, index), 
            "sampled": get_sampled_sentence(vectorizer, batch_dict, index)}
    
def sentence_from_indices(indices, vocab, strict=True):
    ignore_indices = set([vocab.mask_index, vocab.begin_seq_index, vocab.end_seq_index])
    out = []
    for index in indices:
        if index == vocab.begin_seq_index and strict:
            continue
        elif index == vocab.end_seq_index and strict:
            return " ".join(out)
        else:
            out.append(vocab.lookup_index(index))
    return " ".join(out)




dataset.set_split('val')
batch_generator = generate_nmt_batches(dataset, 
                                       batch_size=args.batch_size, 
                                       device=args.device)
batch_dict = next(batch_generator)
results = get_all_sentences(vectorizer, batch_dict, 1)
results

NameError: ignored