In [143]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import torch
from torch.autograd import Variable
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torchtext
from torchtext.vocab import Vectors, GloVe

In [144]:
TEXT = torchtext.data.Field()
LABEL = torchtext.data.Field(sequential=False)
train, val, test = torchtext.datasets.SST.splits(
    TEXT, LABEL,
    filter_pred=lambda ex: ex.label != 'neutral')

TEXT.build_vocab(train)
LABEL.build_vocab(train)
n_vocab = len(TEXT.vocab)

url = 'https://s3-us-west-1.amazonaws.com/fasttext-vectors/wiki.simple.vec'
TEXT.vocab.load_vectors(vectors=Vectors('wiki.simple.vec', url=url))
n_comps = TEXT.vocab.vectors.size(1)

BATCH_SIZE = 50
train_iter, val_iter, test_iter = torchtext.data.BucketIterator.splits(
    (train, val, test), batch_size=BATCH_SIZE, device=-1, repeat=False)

In [158]:
class ConvNetClassifier(nn.Module):
    
    def __init__(self, vecs, dropout_rate=0.5):
        super(ConvNetClassifier, self).__init__()
        self.vecs = vecs
        self.conv3 = nn.Conv2d(in_channels=1, out_channels=100, kernel_size=(3, n_comps))
        self.conv4 = nn.Conv2d(in_channels=1, out_channels=100, kernel_size=(4, n_comps))
        self.conv5 = nn.Conv2d(in_channels=1, out_channels=100, kernel_size=(5, n_comps))
        self.dropout_rate = dropout_rate
        self.dropout = nn.Dropout2d(p=dropout_rate)
        self.linear = nn.Linear(300, 1)
    
    def forward(self, text, training=False):
        while text.size(0) < 5:
            text = torch.cat([text, torch.ones((1, text.size(1))).long()], 0)
        sent_length, batch_size = text.size()
        X = self.vecs[text.data.view(-1,)].view(sent_length, batch_size, n_comps)
        X = X.permute(1, 0, 2)
        X = X.data.unsqueeze_(1)
        X = Variable(X)
        
        # Extract and pool convolutional features
        X3 = F.relu(self.conv3(X))
        X3 = F.max_pool2d(X3, (X3.size(2), 1))
        X4 = F.relu(self.conv4(X))
        X4 = F.max_pool2d(X4, (X4.size(2), 1))
        X5 = F.relu(self.conv5(X))
        X5 = F.max_pool2d(X5, (X5.size(2), 1))
        
        # Dropout for regularization
        if training:
            X3 = self.dropout(X3)
            X4 = self.dropout(X4)
            X5 = self.dropout(X5) 
        
        # Final layer
        X = torch.cat([X3, X4, X5], 1).squeeze()
        probs = F.sigmoid(self.linear(X))
        return torch.cat([probs, 1-probs], 1)

In [58]:
class NaiveBayesClassifier:
    def __init__(self, alpha, beta, n_features):
        # 1 x C vector; dirichlet prior for class distr.
        # C = 2 for binary classification.
        self.alpha = alpha
        self.alpha0 = sum(alpha)

        # 1 x K vector; dirichlet prior for class conditional distr.
        # K = 2 for binary features, otherwise K = max(occurences_of_word_in_text)
        self.beta = beta
        self.beta0 = sum(beta)

        # dimensions of data
        self.C = len(self.alpha) # num classes
        self.K = len(self.beta)  # num possible values for each feature (count)
        self.D = n_features      # num features (size of vocabulary)

        # counts
        self.N = 0
        self.N_c = np.zeros(self.C, dtype=int)
        self.N_cj = np.zeros((self.C, self.D), dtype=int)
        self.N_ckj = np.zeros((self.C, self.K, self.D), dtype=int)

        self.flushed = False

    def fit(self, X, y):
        X = X.astype(int)
        N, _D = X.shape
        self.N += N

        # print("Fitting model")
        for c in range(self.C):
            msk = y == c
            self.N_c[c] += np.sum(msk)
            self.N_cj[c] += np.sum(X[msk], dtype=int, axis=0)
            self.N_ckj[c] += np.apply_along_axis(np.bincount, 0, X[msk], minlength=self.K)

        self.flushed = False

    def predict(self, X):
        X = X.astype(int)

        if not self.flushed:
            # print("Flushing")
            self.pi = np.array([ # class distribution
                np.log(self.N_c[c] + self.alpha[c]) - np.log(self.N + self.alpha0)
                for c in range(self.C)])
            self.mu = np.fromfunction( # log prob of each (class, count, word) tuple
                lambda c, j, k: np.log(self.N_ckj[c, k, j] + self.beta[c]) - np.log(self.N_c[c] + self.beta0),
                (self.C, self.D, self.K), dtype=int)
            self.flushed = True

        # print("Predicting labels")
        p_for_x = lambda x: [ # calculate log probability for x of class c
            self.pi[c] + np.sum([self.mu[c, j, x[j]] for j in range(len(x))])
            for c in range(self.C)]
        ps = np.apply_along_axis(p_for_x, 1, X)
        return ps # get predictions

In [69]:
def bag_of_words(batch, TEXT):
    """
    returns bag of words representation (Variable) of a batch.
    each bag of words has dimension [batch_size, vocab_size].
    """
    V = len(TEXT.vocab)
    X = torch.zeros(batch.text.size(0), V)
    ones = torch.ones(batch.text.size(1))
    for b in range(batch.text.size(0)):
        X[b].index_add_(0, batch.text.data[b], ones)
        X[b][TEXT.vocab.stoi['<pad>']] = 0
    X = Variable(X, requires_grad=False)
    return X

In [118]:
def softmax(x):
    """Compute softmax values for each sets of scores in x."""
    e_x = np.exp(x - np.max(x))
    return e_x / e_x.sum()

In [70]:
alpha = a * np.ones(C)
beta = b * np.ones(K)
n_features = len(TEXT.vocab)
nb = NaiveBayesClassifier(alpha, beta, n_features)

In [71]:
X = bag_of_words(batch, TEXT).data.numpy()

In [72]:
for i, batch in enumerate(train_iter):
    batch.text = batch.text.transpose(1, 0)
    X = bag_of_words(batch, TEXT).data.numpy()
    if binary:
        X = X > 0
    y = batch.label.data.numpy() - 1
    nb.fit(X, y)

In [164]:
n, n_corr = 0, 0
for i, batch in enumerate(test_iter):
    probs = cn(batch.text).data.numpy()
    y_pred = probs.argmax(1)
    batch.text = batch.text.transpose(1, 0)
    X = bag_of_words(batch, TEXT).data.numpy()
    if binary:
        X = X > 0
    probs2 = softmax(nb.predict(X))
    y_pred = (probs + probs2).argmax(1)
    y = batch.label.data.numpy() - 1

    n += len(y)
    n_corr += sum(y_pred == y)

In [165]:
n_corr / n

0.79406919275123555

In [160]:
vecs = Variable(TEXT.vocab.vectors, requires_grad=True)
cn = ConvNetClassifier(vecs)
loss_function = nn.NLLLoss()
optimizer = optim.Adam(cn.parameters(), lr=0.0003)
optimizer2 = optim.Adam([cn.vecs], lr=0.0001)
#optimizer = optim.SGD(cn.parameters(), lr=0.03, weight_decay=0.01)
#optimizer = optim.Adadelta(cn.parameters(), lr=0.1)
max_vec_size = 3

for i in range(20):
    train_iter.init_epoch()
    for batch in train_iter:
        cn.zero_grad()
        probs = cn(batch.text, training=True)
        log_probs = torch.log(probs)
        y = batch.label - 1
        loss = loss_function(log_probs, y)
        loss.backward()
        optimizer.step()
#         optimizer2.step()
        
        # Regularization
#         for w in cn.parameters():
#             w_2norm = w.data.norm(2)
#             if w_2norm > max_vec_size:
#                 w.data = max_vec_size / w_2norm * w.data
    print('Iteration #{}: {}'.format(i, loss.data.numpy()[0]))
cn.linear.weight.data *= 0.5

Iteration #0: 0.6453072428703308
Iteration #1: 0.5488080382347107
Iteration #2: 0.5789662599563599
Iteration #3: 0.5029851198196411
Iteration #4: 0.4807746112346649
Iteration #5: 0.4014658033847809
Iteration #6: 0.3421747088432312
Iteration #7: 0.29946184158325195
Iteration #8: 0.41238003969192505
Iteration #9: 0.3111019432544708
Iteration #10: 0.24287666380405426
Iteration #11: 0.19000859558582306
Iteration #12: 0.3005876839160919
Iteration #13: 0.16436639428138733
Iteration #14: 0.1309184730052948
Iteration #15: 0.15655812621116638
Iteration #16: 0.18515485525131226
Iteration #17: 0.27649080753326416
Iteration #18: 0.14792419970035553
Iteration #19: 0.10025647282600403


In [101]:
lengths = []
train_iter.init_epoch()
for batch in train_iter:
    lengths.append(batch.text.size(0))

In [105]:
np.max(lengths)

52

In [93]:
def evaluate(model, data_iter):
    data_iter.init_epoch()
    N = len(data_iter.data())
    n_correct = 0
    data_iter.init_epoch()
    for batch in data_iter:
        probs = model(batch.text)
        _, y_predicted = probs.max(1)
        y_true = batch.label - 1
        n_correct += (y_true == y_predicted).sum().float()
    return (n_correct / N).data.numpy()[0]

In [161]:
evaluate(cn, train_iter)

0.99436414

In [163]:
evaluate(cn, test_iter)

0.79187262

In [229]:
cn.linear.weight.data.norm(2)

2.999999962180317

In [239]:
test_linear.weight.data = test_linear.weight.data * 0.5

In [248]:
cn.linear.weight.data.norm(2)

1.4999999810901585

In [241]:
test_linear.weight

Parameter containing:

Columns 0 to 9 
 0.0919  0.0995 -0.0591 -0.0713 -0.0962 -0.0985 -0.0943 -0.0954  0.0753 -0.0675

Columns 10 to 19 
-0.0731 -0.0407 -0.0846  0.0974 -0.0862 -0.0913  0.0820 -0.0800  0.0970  0.0920

Columns 20 to 29 
 0.0867  0.0855 -0.0874  0.1046  0.0522  0.1021 -0.0905 -0.0797  0.0556 -0.0818

Columns 30 to 39 
-0.0948 -0.0248  0.0984  0.0363 -0.0818 -0.0781 -0.0681  0.0765 -0.0971  0.0563

Columns 40 to 49 
-0.0753 -0.1091 -0.0936 -0.0554 -0.0742 -0.0950 -0.1096  0.0906 -0.0903 -0.0933

Columns 50 to 59 
-0.0866 -0.0796 -0.0287 -0.0763 -0.1002 -0.0852 -0.0730 -0.0770 -0.1007  0.0883

Columns 60 to 69 
-0.0883 -0.1000 -0.0548 -0.0860  0.0809 -0.0816  0.0910 -0.0962  0.0917  0.0973

Columns 70 to 79 
-0.0786  0.1067  0.0942  0.0807 -0.0860 -0.0528  0.0952  0.0870  0.0752 -0.0895

Columns 80 to 89 
-0.1118  0.0762  0.0893 -0.0818  0.0850 -0.1047 -0.1015  0.0950  0.0759 -0.0389

Columns 90 to 99 
-0.0879 -0.0937  0.0425 -0.0923 -0.0821 -0.0937  0.0584 -0.0809  0.072

In [125]:
f = cn.linear.weight * 0.5

In [142]:
max_vec_size = 3
w_2norm = cn.linear.weight.data.norm(2)
cn.linear.weight.data = max_vec_size / w_2norm * cn.linear.weight.data

In [129]:
cn

TypeError: cannot assign 'torch.autograd.variable.Variable' as parameter 'weight' (torch.nn.Parameter or None expected)

In [123]:
g = f.weight * 0.5

In [97]:
test_iter.init_epoch()
N = len(test_iter.data())
n_correct = 0
test_iter.init_epoch()
for batch in test_iter:
    probs = cn(batch.text)
    _, y_predicted = probs.max(1)
    y_true = batch.label - 1
    if (y_true == y_predicted).sum() != 50:
        raise ValueError()
    n_correct += (y_true == y_predicted).sum().float()

RuntimeError: bool value of Variable objects containing non-empty torch.ByteTensor is ambiguous

In [115]:
probs[:, 0][~(y_true == y_predicted).data]

Variable containing:
 0.3969
 0.4813
 0.5928
 0.4208
 0.4645
 0.2857
[torch.FloatTensor of size 6]