## Simple CNN Model

Roughly the same implementation as Yoon Kim (2015)

In [1]:
import numpy as np
import random
import sys
import time
import torch
import torch.nn as nn
import torch.nn.functional as F

from collections import defaultdict
from torch.nn.utils.rnn import pad_sequence
from torch.utils import data
from torchsummary import summary

In [2]:
# Check CUDA
use_cuda = torch.cuda.is_available()
device = torch.device("cuda" if use_cuda else "cpu")
print(device)

cuda


### Define Program Constants

In [3]:
SOURCE_PREFIX = "google_news_negative300"
#SOURCE_PREFIX = "glove_42b"
#SOURCE_PREFIX = "numberbatch_en1908"
#SOURCE_PREFIX = "freebase_skipgram1000"
VECTOR_SOURCE = "{}_vectors.npy".format(SOURCE_PREFIX)
VOCAB_SOURCE = "{}_words.txt".format(SOURCE_PREFIX)
VECTOR_SIZE = 300

RAW_SOURCE_FOLDER = "topicclass"

TRAIN_FILE = "topicclass_train.txt"
VALID_FILE = "topicclass_valid.txt"
TEST_FILE = "topicclass_test.txt"

### Hard-coded Labels

In [4]:
# Hard code a list of labels so we can map each label to an integer index
hardcoded_labels = ["Agriculture, food and drink",
                    "Art and architecture",
                    "Engineering and technology",
                    "Geography and places",
                    "History",
                    "Language and literature",
                    "Mathematics",
                    "Media and drama",
                    "Miscellaneous",
                    "Music",
                    "Natural sciences",
                    "Philosophy and religion",
                    "Social sciences and society",
                    "Sports and recreation",
                    "Video games",
                    "Warfare"]
hardcoded_labels_dict = {l:i for i, l in enumerate(hardcoded_labels)}
CLASSES = len(hardcoded_labels_dict)
hardcoded_labels_dict["UNK"] = -1
print("Number of CLASSES = {}".format(CLASSES))

Number of CLASSES = 16


### Define a Function to Read Data

In [5]:
def read_dataset(filename):
    
    labels = []
    sentences = []
    
    with open(filename, "r", encoding="utf8") as f:
        
        for line in f:
            label, words = line.strip().split(" ||| ")
            labels.append(label)
            sentences.append(words)
    
    return sentences, labels

### Load Pre-trained Vector Model

In [6]:
# We now load from Numpy arrays instead
pretrained_model = np.load(VECTOR_SOURCE)
vocab = []
with open(VOCAB_SOURCE, "r") as f:
    for line in f:
        vocab.append(line.strip())
print("Loaded vectors of shape: {}".format(pretrained_model.shape))
print("Loaded vocab of length: {}".format(len(vocab)))
vocab_dict = {w:i for i, w in enumerate(vocab)}


Loaded vectors of shape: (3000000, 300)
Loaded vocab of length: 3000000


### Define a Dataset Class

In [7]:
# MyDataset stores the raw text and labels as integers
# When __getitem__ is called, it converts the requested item to a Tensor on the fly

class MyDataset(data.Dataset):
    
    def __init__(self, sentences, labels, vocab_to_int, labels_to_int):
        
        self.sentences = [list(map(lambda x: vocab_to_int[x], sent.split())) for sent in sentences]
        self.labels = [labels_to_int[l] for l in labels]  # map strings to ints
        
    def __len__(self):
        return len(self.labels)
    
    def __getitem__(self, index):
        return torch.LongTensor(self.sentences[index]), torch.LongTensor([self.labels[index]])
        
    

### Define Dataset and DataLoaders

In [8]:
# Create collate function for padding
# Adapted from tutorial at: https://suzyahyah.github.io/pytorch/2019/07/01/DataLoader-Pad-Pack-Sequence.html
def pad_collate_fn(batch):
    (xx, yy) = zip(*batch)
    x_lens = torch.LongTensor([len(x) for x in xx])
    xx_pad = pad_sequence(xx, batch_first=True, padding_value=0)
    yy = torch.cat(yy)
    
    return xx_pad, x_lens, yy

In [9]:
LOADER_PARAMS = {'batch_size': 64,
                 'shuffle': True,
                 'num_workers': 4,
                 'collate_fn': pad_collate_fn}

In [10]:
train_sent, train_labels = read_dataset("{}/{}".format(RAW_SOURCE_FOLDER, TRAIN_FILE))
valid_sent, valid_labels = read_dataset("{}/{}".format(RAW_SOURCE_FOLDER, VALID_FILE))
test_sent, test_labels = read_dataset("{}/{}".format(RAW_SOURCE_FOLDER, TEST_FILE))

# Generate vocab list
data_vocab = set()
for source in [train_sent, valid_sent, test_sent]:
    for sent in source:
        for token in sent.split():
            data_vocab.add(token)
data_vocab = list(data_vocab)
vocab_to_index = {v:i for i, v in enumerate(data_vocab)}

# Define data loaders
train_loader = data.DataLoader(MyDataset(train_sent, train_labels, vocab_to_index, hardcoded_labels_dict), **LOADER_PARAMS)
valid_loader = data.DataLoader(MyDataset(valid_sent, valid_labels, vocab_to_index, hardcoded_labels_dict), **LOADER_PARAMS)
test_loader = data.DataLoader(MyDataset(test_sent, test_labels, vocab_to_index, hardcoded_labels_dict), **LOADER_PARAMS)


### Construct Embedding Matrix

In [11]:
# Create the weight matrix for the embedding layer in the network,
# taking into account unseen words

embedding_matrix = []
for i, word in enumerate(data_vocab):
    if word in vocab_dict.keys():
        embedding_matrix.append(pretrained_model[vocab_dict[word]])
    else:
        embedding_matrix.append(np.random.uniform(-0.1, 0.1, VECTOR_SIZE).astype("float32"))
        
embedding_matrix.append(np.random.uniform(-0.1, 0.1, VECTOR_SIZE).astype("float32")) # UNK
embedding_matrix.append(np.zeros(VECTOR_SIZE).astype("float32")) # Padding
embedding_matrix = np.array(embedding_matrix)
print(embedding_matrix.shape)


(138380, 300)


### The Model

In [12]:
class MyCNN(torch.nn.Module):
    
    def __init__(self, **kwargs):
        super(MyCNN, self).__init__()
        
        self.MODEL = kwargs["MODEL"]
        self.VOCAB_SIZE = kwargs["VOCAB_SIZE"]
        self.CLASSES = kwargs["CLASSES"]
        self.WEIGHT_MATRIX = kwargs["WEIGHT_MATRIX"]
        self.DROPOUT = kwargs["DROPOUT"]
        
        # Create embedding layer, copy weights from pretrained model if any
        self.embedding = nn.Embedding(self.VOCAB_SIZE + 2, VECTOR_SIZE, padding_idx=self.VOCAB_SIZE + 1)
        self.embedding.weight.requires_grad=False # do not train
        self.embedding.weight.data.copy_(torch.from_numpy(self.WEIGHT_MATRIX)) # copy pre-trained weights
        
        # Create conv layers
        self.filter_sizes = [3, 4, 5]
        self.filter_numbers = [150, 150, 150]
        self.reductions = [i - 1 for i in self.filter_sizes]
        self.convs = nn.ModuleList([nn.Conv2d(1, n, (s, kwargs["DIMENSIONS"]), stride=1) for s, n in zip(self.filter_sizes, self.filter_numbers)])
        
        # Create final classfier
        self.fc = nn.Linear(sum(self.filter_numbers), self.CLASSES)
        
    def forward(self, input, input_lens, device, debug):
        
        b, max_len = input.shape
        input = input.long()
        
        # Embed
        x = self.embedding(input).unsqueeze(1) # add one channel
        
        # Create mask
        mask = torch.arange(max_len).expand(len(input_lens), max_len).to(device) < input_lens.unsqueeze(1)

        # DEBUG: MASK CHECK
#         for i in range(len(mask)):
#             nonzeros_mask = (mask[i] == 1).sum()
#             nonzeros_len = input_lens[i]
#             print("input_lens[{}]: {}, mask[{}]: {}".format(i, nonzeros_len, i, nonzeros_mask))
                    
        # Apply conv layers
        conved = [self.convs[i](x).squeeze().permute(1, 0, 2) for i in range(len(self.convs))]
            
        # Zero-out masked elements
        masked_conved = [(conved[i] * mask[:,:-self.reductions[i]]).permute(1, 0, 2) for i in range(len(conved))]

        # Max pool
        pooled = [F.max_pool1d(F.relu(conv), kernel_size=conv.shape[-1]).squeeze() for conv in masked_conved]
            
        # Cat
        cat = torch.cat(pooled, 1)
        cat = F.dropout(cat, p=self.DROPOUT, training=self.training)
        
        # Fully connected
        out = self.fc(cat)
        
        return out
        

### Define Train and Test Functions

In [13]:
def train(network, optimizer, criterion, network_params, train_params):
    
    # Set mode
    network.train()
    
    # Load model if necessary
    if train_params["LOAD_MODEL"]:
        pass  # Implement
    
    for epoch in range(train_params["START_EPOCH"], train_params["END_EPOCH"]):
        
        avg_loss = 0.0
        for batch_num, (data, data_lens, labels) in enumerate(train_params['TRAIN_DATALOADER']):
            
            # Push data and labels to the GPU if available
            data = data.to(train_params["DEVICE"])
            data_lens = data_lens.to(train_params["DEVICE"])
            labels = labels.to(train_params["DEVICE"])
            
            # Zero the optimizer
            optimizer.zero_grad()
            
            # Generate the output
            output = network(data, data_lens, train_params["DEVICE"], train_params["DEBUG_PRINT"])
            
            # Compute the loss and propagate
            loss = criterion(output, labels.long())            
            loss.backward()
            avg_loss += loss.item() # Only for logging purposes
            
            # Clip gradients (apparently it should be here)
            nn.utils.clip_grad_norm_(network.parameters(), train_params["GRADIENT_CLIP"])
            
            # Step the optimizer
            optimizer.step()             
            
            # Print every modulo PRINT_FREQ
            if batch_num % train_params["PRINT_FREQ"] == train_params["PRINT_FREQ"] - 1:
                print('Epoch: {}\tBatch: {}\tAvg-Loss: {:.6f}'.format(epoch + 1, batch_num + 1, avg_loss / train_params["PRINT_FREQ"]))
                avg_loss = 0.0                
                
            # Save memory
            torch.cuda.empty_cache()
            del data
            del labels
            del loss
        
        # Save model if specified
        if train_params["SAVE_MODEL"] and train_params['SAVE_MODEL_PREFIX'] and epoch == train_params['END_EPOCH'] - 1:
            print("Saving model at epoch {}...".format(epoch + 1))
            torch.save(network.state_dict(), "{}-model-{}.pt".format(train_params['SAVE_MODEL_PREFIX'], epoch + 1))
            torch.save(optimizer.state_dict(), "{}-optim-{}.pt".format(train_params['SAVE_MODEL_PREFIX'], epoch + 1))
        
        # Validate data at the end of one epoch
        val_loss, val_acc = evaluate(network, train_params, mode="valid")
        train_loss, train_acc = evaluate(network, train_params, mode="train")
        print('Train Loss: {:.6f}\tTrain Accuracy: {:.6f}\tVal Loss: {:.6f}\tVal Accuracy: {:.6f}'.
              format(train_loss, train_acc, val_loss, val_acc))
        print("\n")
    
def evaluate(network, train_params, mode="test"):
    
    # Set mode
    network.eval()
    
    # Stats
    test_loss = []
    accuracy = 0
    total = 0
    
    # Switch dataset depending on mode
    loader = train_params["TRAIN_DATALOADER"]
    if mode == "test":
        loader = train_params["TEST_DATALOADER"]
    elif mode == "valid":
        loader = train_params["VALID_DATALOADER"]
    
    for batch_num, (data, data_lens, labels) in enumerate(loader):
    
        # Push data and labels to the GPU if available
        data = data.to(train_params["DEVICE"])
        data_lens = data_lens.to(train_params["DEVICE"])
        labels = labels.to(train_params["DEVICE"])
        output = network(data, data_lens, train_params["DEVICE"], train_params["DEBUG_PRINT"])
        
        # Convert output to labels
        _, pred_labels = torch.max(F.softmax(output, dim=1), 1)
        pred_labels = pred_labels.view(-1)
        
        # Compute loss for logging
        loss = criterion(output, labels.long())
        
        accuracy += torch.sum(torch.eq(pred_labels, labels)).item()
        total += len(labels)
        test_loss.extend([loss.item()] * data.size()[0])
        
        # Save memory
        del data
        del labels
        
    return np.mean(test_loss), accuracy / total

    
    

### Weight Initialization

In [14]:
def init_weights(m):
    if type(m) == nn.Conv1d or type(m) == nn.Linear:
        torch.nn.init.xavier_normal_(m.weight.data)

### Define Network and Hyperparameters

In [17]:
NETWORK_PARAMS = {}

NETWORK_PARAMS['MODEL'] = pretrained_model
NETWORK_PARAMS['DIMENSIONS'] = 300
NETWORK_PARAMS['VOCAB_SIZE'] = len(data_vocab)
NETWORK_PARAMS['CLASSES'] = CLASSES
NETWORK_PARAMS['DROPOUT'] = 0.5
NETWORK_PARAMS['WEIGHT_MATRIX'] = embedding_matrix

TRAIN_PARAMS = {}

TRAIN_PARAMS['TRAIN_DATALOADER'] = train_loader
TRAIN_PARAMS['VALID_DATALOADER'] = valid_loader
TRAIN_PARAMS['TEST_DATALOADER'] = test_loader
TRAIN_PARAMS['DEVICE'] = device
TRAIN_PARAMS['START_EPOCH'] = 0
TRAIN_PARAMS['END_EPOCH'] = 5
TRAIN_PARAMS["PRINT_FREQ"] = 1000
TRAIN_PARAMS['LOAD_MODEL'] = False
TRAIN_PARAMS['LOAD_MODEL_SOURCE'] = ""
TRAIN_PARAMS["ADAMW_LEARNING_RATE"] = 2e-3
TRAIN_PARAMS["ADAMW_BETAS"] = (0.9, 0.999)
TRAIN_PARAMS["ADAMW_EPS"] = 1e-08
TRAIN_PARAMS["ADAMW_WEIGHT_DECAY"] = 0
TRAIN_PARAMS["GRADIENT_CLIP"] = 3
TRAIN_PARAMS['VECTOR_MODEL'] = pretrained_model
TRAIN_PARAMS['SAVE_MODEL'] = True
TRAIN_PARAMS['SAVE_MODEL_PREFIX'] = "word2vec_150_static"
TRAIN_PARAMS['DEBUG_PRINT'] = True




In [18]:
# Initialize network and weights
network = MyCNN(**NETWORK_PARAMS)
network.apply(init_weights)

MyCNN(
  (embedding): Embedding(138380, 300, padding_idx=138379)
  (convs): ModuleList(
    (0): Conv2d(1, 150, kernel_size=(3, 300), stride=(1, 1))
    (1): Conv2d(1, 150, kernel_size=(4, 300), stride=(1, 1))
    (2): Conv2d(1, 150, kernel_size=(5, 300), stride=(1, 1))
  )
  (fc): Linear(in_features=450, out_features=16, bias=True)
)

In [19]:
# Initialize optimizer and criterion
optimizer = torch.optim.AdamW(network.parameters(),
                              lr=TRAIN_PARAMS["ADAMW_LEARNING_RATE"],
                              betas=TRAIN_PARAMS["ADAMW_BETAS"],
                              eps=TRAIN_PARAMS["ADAMW_EPS"],
                              weight_decay=TRAIN_PARAMS["ADAMW_WEIGHT_DECAY"])
criterion = nn.CrossEntropyLoss()

### Actually Run

In [20]:
network.to(TRAIN_PARAMS['DEVICE'])
train(network, optimizer, criterion, NETWORK_PARAMS, TRAIN_PARAMS)

Epoch: 1	Batch: 1000	Avg-Loss: 0.996696
Epoch: 1	Batch: 2000	Avg-Loss: 0.885159
Epoch: 1	Batch: 3000	Avg-Loss: 0.860190
Train Loss: 0.686035	Train Accuracy: 0.785336	Val Loss: 0.635776	Val Accuracy: 0.808709


Epoch: 2	Batch: 1000	Avg-Loss: 0.653246
Epoch: 2	Batch: 2000	Avg-Loss: 0.667188
Epoch: 2	Batch: 3000	Avg-Loss: 0.676326
Train Loss: 0.498449	Train Accuracy: 0.844362	Val Loss: 0.569024	Val Accuracy: 0.813375


Epoch: 3	Batch: 1000	Avg-Loss: 0.499115
Epoch: 3	Batch: 2000	Avg-Loss: 0.535241
Epoch: 3	Batch: 3000	Avg-Loss: 0.547110
Train Loss: 0.366544	Train Accuracy: 0.883253	Val Loss: 0.664535	Val Accuracy: 0.804044


Epoch: 4	Batch: 1000	Avg-Loss: 0.348082
Epoch: 4	Batch: 2000	Avg-Loss: 0.393928
Epoch: 4	Batch: 3000	Avg-Loss: 0.431382
Train Loss: 0.250029	Train Accuracy: 0.923914	Val Loss: 0.766785	Val Accuracy: 0.794712


Epoch: 5	Batch: 1000	Avg-Loss: 0.245151
Epoch: 5	Batch: 2000	Avg-Loss: 0.292813
Epoch: 5	Batch: 3000	Avg-Loss: 0.324987
Saving model at epoch 5...
Train Loss: 0

### Generate Output

In [88]:
SOURCE_MODEL = "word2vec_baseline_static-model-1.pt"

network = MyCNN(**NETWORK_PARAMS)
network.load_state_dict(torch.load(SOURCE_MODEL))

<All keys matched successfully>

In [94]:
LABEL_LOADER_PARAMS = {'batch_size': 64,
                       'shuffle': False,
                       'num_workers': 4,
                       'collate_fn': pad_collate_fn}

In [95]:
valid_loader = data.DataLoader(MyDataset(valid_sent, valid_labels, vocab_to_index, hardcoded_labels_dict), **LABEL_LOADER_PARAMS)
test_loader = data.DataLoader(MyDataset(test_sent, test_labels, vocab_to_index, hardcoded_labels_dict), **LABEL_LOADER_PARAMS)

In [105]:
def label(network, train_params, loader):
    
    # Set mode
    network.eval()
    
    # Stats
    test_loss = []
    accuracy = 0
    total = 0
    labels_output = []

    for batch_num, (data, data_lens, labels) in enumerate(loader):
    
        # Push data and labels to the GPU if available
        data = data.to(train_params["DEVICE"])
        data_lens = data_lens.to(train_params["DEVICE"])
        labels = labels.to(train_params["DEVICE"])
        output = network(data, data_lens, train_params["DEVICE"], train_params["DEBUG_PRINT"])
        
        # Convert output to labels
        _, pred_labels = torch.max(F.softmax(output, dim=1), 1)
        pred_labels = pred_labels.view(-1)
        
        #print(pred_labels)
        for i in range(len(pred_labels)):
            labels_output.append(hardcoded_labels[pred_labels[i]])
        
        # Save memory
        del data
        del labels
        
    return labels_output

In [107]:
network.to(TRAIN_PARAMS['DEVICE'])
output_labels = label(network, TRAIN_PARAMS, test_loader)
print("Labelled {} lines".format(len(output_labels)))

Labelled 697 lines


In [108]:
LABELS_FILE = "my_dev_labels_errors.txt"
with open(LABELS_FILE, "w") as l:
    for word in output_labels:
        l.write(word)
        l.write("\n")