In [1]:
%matplotlib inline

import gc
import math
import os
import random
import time
from collections import Counter, defaultdict

import gensim.models.keyedvectors as word2vec
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from nltk.tokenize import WordPunctTokenizer
from sklearn.metrics import roc_auc_score

SEED = 41

# Seed every random number generator the notebook draws from, so that a fresh
# "Restart & Run All" reproduces the same example order and initial weights.
random.seed(SEED)              # Python's RNG, used by random.shuffle in the training loop
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)   # safe to call without a GPU; ignored in that case

In [2]:
# Vocabulary tables used while reading the corpus.
# Looking up a key that is not present yet inserts it with the table's current
# size as its value, so ids are handed out as 0, 1, 2, ... in first-seen order.
w2i = defaultdict(lambda: len(w2i))  # word -> integer id
t2i = defaultdict(lambda: len(t2i))  # tag  -> integer id
# "<unk>" is the first word inserted into the empty table, so UNK is 0.
UNK = w2i["<unk>"]

In [3]:
def read_dataset(filename, word_index=None, tag_index=None):
    """Yield ``(word_ids, tag_id)`` pairs parsed from a ``tag ||| words`` file.

    Each non-blank line is lower-cased and stripped, then split on the literal
    separator ``" ||| "`` into a tag and a space-separated sentence. Words and
    tags are converted to integer ids by indexing the two tables.

    Args:
        filename: path of the text file to read (decoded as UTF-8).
        word_index: mapping from word to id. Defaults to the module-level
            ``w2i`` table.
        tag_index: mapping from tag to id. Defaults to the module-level
            ``t2i`` table.

    Yields:
        Tuples ``(list_of_word_ids, tag_id)``, one per non-blank line.

    Raises:
        ValueError: if a non-blank line does not consist of exactly one tag
            and one sentence separated by ``" ||| "``.
    """
    # Fall back to the notebook's global tables. They are looked up when
    # iteration starts, so a later rebinding of the global ``w2i`` (done to
    # freeze the vocabulary) is picked up by subsequent calls.
    if word_index is None:
        word_index = w2i
    if tag_index is None:
        tag_index = t2i

    # Explicit encoding: the platform default is not UTF-8 everywhere.
    with open(filename, "r", encoding="utf-8") as f:
        for line_no, line in enumerate(f, start=1):
            line = line.lower().strip()
            if not line:
                # Tolerate blank lines (e.g. a trailing newline at end of file).
                continue
            fields = line.split(" ||| ")
            if len(fields) != 2:
                raise ValueError(
                    "%s:%d: expected 'tag ||| words', got %r" % (filename, line_no, line))
            tag, words = fields
            yield ([word_index[x] for x in words.split(" ")], tag_index[tag])

In [4]:
# Read in the data
# The training split is read first, while `w2i` still assigns a fresh id to
# every previously unseen word.
train = list(read_dataset("../data/raw/classes/train.txt"))
# Freeze the vocabulary: copy the existing entries into a defaultdict whose
# factory returns UNK, so words not seen in training map to the <unk> id.
w2i = defaultdict(lambda: UNK, w2i)
# Second split, read from test.txt with the frozen vocabulary.
dev = list(read_dataset("../data/raw/classes/test.txt"))
# NOTE: a defaultdict lookup of a missing key inserts that key (here with the
# value UNK), so len(w2i) measured at this point also counts words that were
# first seen while reading `dev`.
nwords = len(w2i)
ntags = len(t2i)

In [9]:
# Model hyperparameters (passed to CNNclass when the model is built)
EMB_SIZE    = 64  # size of each word embedding vector
WIN_SIZE    = 3  # kernel size; shorter sentences are padded up to this length
FILTER_SIZE = 64  # number of convolution filters (the num_filters argument)

In [11]:
class CNNclass(nn.Module):
    """Single-layer 1-D CNN text classifier.

    Word ids are embedded, passed through one ``nn.Conv1d`` layer, max-pooled
    over the sequence dimension, passed through a ReLU and projected to one
    score per tag.

    Args:
        nwords: number of rows in the embedding table.
        emb_size: size of each embedding vector.
        num_filters: number of convolution output channels.
        window_size: convolution kernel size, in words.
        ntags: number of output scores.
    """

    def __init__(self, nwords, emb_size, num_filters, window_size, ntags):
        super(CNNclass, self).__init__()

        # Layers
        self.embedding = nn.Embedding(nwords, emb_size)
        # uniform initialization
        torch.nn.init.uniform_(self.embedding.weight, -0.25, 0.25)
        # Conv 1d
        self.conv_1d = nn.Conv1d(in_channels=emb_size, out_channels=num_filters, kernel_size=window_size,
                                 stride=1, padding=0, dilation=1, groups=1, bias=True)
        self.relu = nn.ReLU()
        self.projection_layer = nn.Linear(in_features=num_filters, out_features=ntags, bias=True)
        # Initializing the projection layer
        torch.nn.init.xavier_uniform_(self.projection_layer.weight)

    def forward(self, words):
        """Score one sentence or a batch of equal-length sentences.

        Args:
            words: integer tensor of word ids, either 1-D (``nwords``) for a
                single sentence or 2-D (``batch x nwords``).

        Returns:
            Tensor of shape ``batch x ntags`` (``1 x ntags`` for 1-D input).

        Raises:
            ValueError: if ``words`` is neither 1-D nor 2-D.
        """
        if words.dim() == 1:
            words = words.unsqueeze(0)              # 1 x nwords
        elif words.dim() != 2:
            raise ValueError(
                "expected a 1-D or 2-D tensor of word ids, got %d dimensions" % words.dim())

        # The convolution has no padding of its own, so it needs at least
        # kernel_size positions. Right-pad shorter inputs with id 0, the same
        # padding the notebook's training and evaluation loops apply.
        missing = self.conv_1d.kernel_size[0] - words.size(1)
        if missing > 0:
            filler = words.new_zeros((words.size(0), missing))
            words = torch.cat([words, filler], dim=1)

        emb = self.embedding(words)                 # batch x nwords x emb_size
        emb = emb.permute(0, 2, 1)                  # batch x emb_size x nwords
        h = self.conv_1d(emb)                       # batch x num_filters x (nwords - window_size + 1)
        # Do max pooling over the sequence positions
        h = h.max(dim=2)[0]                         # batch x num_filters
        h = self.relu(h)
        out = self.projection_layer(h)              # batch x ntags
        return out

In [12]:
# Build the classifier together with its loss function and optimizer
model = CNNclass(
    nwords=nwords,
    emb_size=EMB_SIZE,
    num_filters=FILTER_SIZE,
    window_size=WIN_SIZE,
    ntags=ntags,
)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters())

# CPU defaults. `type` (which shadows the builtin of the same name) is the
# tensor type that later cells cast their inputs to.
use_cuda = torch.cuda.is_available()
type = torch.LongTensor

In [13]:
if use_cuda:
    # Cast inputs to CUDA tensors from here on and move the model to the GPU
    type = torch.cuda.LongTensor
    model.cuda()
    # The optimizer was created while the parameters were still on the CPU.
    # The torch.optim documentation advises constructing optimizers only after
    # the model has been moved, so rebuild it (no step has been taken yet).
    optimizer = optim.Adam(model.parameters())

In [15]:
# Run the model on a single instance
words, tag = train[0]

# Sentences shorter than the kernel are right-padded with id 0. Build a new
# list instead of using `+=`, which would modify the example stored in `train`.
if len(words) < WIN_SIZE:
    words = words + [0] * (WIN_SIZE - len(words))

words_tensor = torch.tensor(words).type(type)
tag_tensor   = torch.tensor([tag]).type(type)
scores       = model(words_tensor)

In [17]:
# Display the id tensor built from the first training example
words_tensor

tensor([ 1,  2,  3,  4,  5,  6,  1,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,  9,
        17,  5, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33],
       device='cuda:0')

In [None]:
def _encode_words(word_ids):
    """Return ``word_ids`` as an id tensor, right-padded with 0 to at least WIN_SIZE entries.

    Padding is applied to a new list, so the example stored in ``train`` or
    ``dev`` is left unchanged. (Padding could be done in the conv layer as well.)
    """
    missing = WIN_SIZE - len(word_ids)
    if missing > 0:
        word_ids = word_ids + [0] * missing
    return torch.tensor(word_ids).type(type)


for ITER in range(1):
    # Perform training: one optimizer step per sentence, in shuffled order
    model.train()
    random.shuffle(train)
    train_loss = 0.0
    train_correct = 0.0
    start = time.time()

    for words, tag in train:
        words_tensor = _encode_words(words)
        tag_tensor   = torch.tensor([tag]).type(type)
        scores  = model(words_tensor)
        predict = scores[0].argmax().item()
        if predict == tag:
            train_correct += 1

        my_loss = criterion(scores, tag_tensor)
        train_loss += my_loss.item()
        # Do back-prop
        optimizer.zero_grad()
        my_loss.backward()
        optimizer.step()
    print("iter %r: train loss/sent=%.4f, acc=%.4f, time=%.2fs" % (
        ITER, train_loss / len(train), train_correct / len(train), time.time() - start))

    # Perform testing
    model.eval()
    test_correct = 0.0
    # No gradients are needed for scoring, so skip building the autograd graph
    with torch.no_grad():
        for words, tag in dev:
            words_tensor = _encode_words(words)
            scores = model(words_tensor)[0]
            predict = scores.argmax().item()
            if predict == tag:
                test_correct += 1
    print("iter %r: test acc=%.4f" % (ITER, test_correct / len(dev)))