In [47]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import torchtext
from torchtext import data
from torchtext import datasets
import collections
import math
import numpy as np

In [48]:
import re
import spacy

# spaCy English pipeline; only its tokenizer component is used by `tokenizer`.
# NOTE(review): the 'en' shortcut name presumably needs a spaCy 2.x install with a
# linked English model -- confirm against the environment this notebook runs in.
nlp = spacy.load('en')
# Maximum number of characters of a cleaned comment that is passed to spaCy.
MAX_CHARS = 20000
def tokenizer(comment):
    """Normalise a raw comment and split it into spaCy tokens.

    The comment is lower-cased, a fixed set of punctuation characters and
    newlines is replaced by spaces, runs of spaces and runs of commas are
    collapsed, and the result is truncated to ``MAX_CHARS`` characters before
    being tokenised.

    Args:
        comment: The raw text. Any non-string value is converted with ``str``.

    Returns:
        A list of token strings; tokens that are exactly a single space are
        dropped.
    """
    # Coerce before calling .lower(): the original coerced afterwards, so a
    # non-string value failed on .lower() before str() could help.
    comment = str(comment).lower()
    comment = re.sub(r"[\*\"“”\n\\…\+\-\/\=\(\)‘•:\[\]\|’;#]", " ", comment)
    comment = re.sub(r"[ ]+", " ", comment)
    comment = re.sub(r"\,+", ",", comment)
    # Slicing is a no-op for strings already within the limit.
    comment = comment[:MAX_CHARS]
    return [x.text for x in nlp.tokenizer(comment) if x.text != " "]

In [49]:
# Field definitions shared by the training and test CSV readers.
TEXT = data.Field(tokenize=tokenizer)
RATING = data.Field(sequential=False, use_vocab=False, dtype=torch.float)
SENTIMENT = data.LabelField()

# Training CSV columns, in order: id (ignored), rating, text, sentiment.
train_data = data.TabularDataset(
    path='data/csv/train/boots.csv',
    format='csv',
    skip_header=True,
    fields=[
        ('id', None),
        ('rating', RATING),
        ('text', TEXT),
        ('sentiment', SENTIMENT),
    ],
)

POLARITY = data.LabelField()

# Test CSV columns, in order: id (ignored), text, label, rating (ignored).
test_polarity_data = data.TabularDataset(
    path='data/csv/test/polarities/boots-tst.csv',
    format='csv',
    skip_header=True,
    fields=[
        ('id', None),
        ('text', TEXT),
        ('label', POLARITY),
        ('rating', None),
    ],
)



In [50]:
# Cap on the number of distinct tokens kept in the text vocabulary.
MAX_VOCAB_SIZE = 40000

# Pre-trained word vectors read from a local text file.
embedding = torchtext.vocab.Vectors('boots.200d.txt')

# The text vocabulary comes from the training set only; tokens without a
# pre-trained vector are initialised through torch.Tensor.normal_.
TEXT.build_vocab(
    train_data,
    max_size=MAX_VOCAB_SIZE,
    vectors=embedding,
    unk_init=torch.Tensor.normal_,
)

# Label vocabularies: sentiment from the training set, polarity from the test set.
SENTIMENT.build_vocab(train_data)
POLARITY.build_vocab(test_polarity_data)


In [51]:
type(TEXT.vocab.stoi)

collections.defaultdict

In [52]:
import json
with open('./model/sentiment_label_vocab-boots.json', 'w') as outfile:
    json.dump(POLARITY.vocab.itos, outfile)

In [53]:
with open('./model/sentiment_text_vocab-boots.json', 'w') as outfile:
    json.dump(TEXT.vocab.stoi, outfile)

In [54]:
print(SENTIMENT.vocab.itos)

['positive', 'negative']


In [55]:
SENTIMENT.vocab.stoi

defaultdict(None, {'positive': 0, 'negative': 1})

In [56]:
SENTIMENT_stoi = {'positive': 0, 'negative': 1}

In [57]:
print(POLARITY.vocab.itos)

['positive', 'negative']


In [58]:
BATCH_SIZE = 128
# Index of the GPU to use when the machine has more than one.
GPU_INDEX = 1

if torch.cuda.is_available():
    # The original always selected GPU 1, which fails with an invalid device
    # ordinal on single-GPU machines; fall back to GPU 0 in that case.
    torch.cuda.set_device(GPU_INDEX if GPU_INDEX < torch.cuda.device_count() else 0)
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

# Mini-batches over the training set, placed on the selected device.
train_iterator = data.BucketIterator(
    train_data,
    batch_size = BATCH_SIZE,
    device = device,
    sort=False)

# The whole test set as a single batch.
test_polarity_iterator = data.BucketIterator(
    test_polarity_data,
    batch_size = len(test_polarity_data),
    device = device,
    sort=False)

In [59]:
def init_weights(m):
    """Initialise an ``nn.Linear`` layer in place; leave every other module alone.

    Intended to be passed to ``nn.Module.apply``. The weight gets Xavier-uniform
    values and the bias, when the layer has one, is filled with 0.01.

    Args:
        m: Any module. Only modules whose exact type is ``nn.Linear`` are touched.
    """
    if type(m) is nn.Linear:
        nn.init.xavier_uniform_(m.weight)
        # nn.Linear(..., bias=False) stores bias as None; the original code
        # raised AttributeError on such layers.
        if m.bias is not None:
            nn.init.constant_(m.bias, 0.01)

In [60]:
class PseudoLabel(nn.Module):
    """Seed-vector scorer that produces soft class assignments for a batch.

    Each class is represented by one ``Conv2d`` filter of size
    ``(1, embedding_dim)`` whose weight is set from ``seed_words``. The filters
    first score every token, the best score per token re-weights the token
    embeddings, and the mean of the re-weighted embeddings is scored again to
    obtain a per-class distribution.

    Args:
        vocab_size: Number of rows in the embedding table.
        embedding_dim: Width of each embedding vector.
        n_filters: Number of seed filters (one per class).
        output_dim: Accepted for signature compatibility; not used.
        pad_idx: Index passed to ``nn.Embedding`` as ``padding_idx``.
        seed_words: Tensor indexed by filter; ``seed_words[i].unsqueeze(0)``
            becomes the weight of filter ``i``.
    """

    def __init__(self, vocab_size, embedding_dim, n_filters, output_dim, pad_idx, seed_words):
        super().__init__()

        # Frozen embedding table.
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=pad_idx)
        self.embedding.weight.requires_grad = False

        # One single-channel filter per class, spanning one token at a time.
        self.convs = nn.ModuleList([
            nn.Conv2d(in_channels=1, out_channels=1, kernel_size=(1, embedding_dim))
            for _ in range(n_filters)
        ])

        # Replace the randomly initialised filter weights with the seed vectors.
        for idx, conv in enumerate(self.convs):
            conv.weight = nn.Parameter(seed_words[idx].unsqueeze(0))

    def _seed_scores(self, inputs):
        """Return one ReLU-activated score map per seed filter, last axis squeezed."""
        return [F.relu(conv(inputs)).squeeze(3) for conv in self.convs]

    def forward(self, text):
        """Compute the sharpened targets ``p`` and raw distribution ``q``.

        Args:
            text: Token ids shaped [sent len, batch size].

        Returns:
            ``(p, q)``, both [batch size, n_filters]. ``q`` is the softmax over
            the seed scores of the weighted sentence embedding. ``p`` is ``q``
            squared, divided by the per-class sum of ``q`` over the batch, then
            renormalised per row.
        """
        tokens = text.permute(1, 0)                      # [batch, sent len]
        word_vecs = self.embedding(tokens).unsqueeze(1)  # [batch, 1, sent len, emb]

        # Per-token score of every seed filter: [batch, sent len, n_filters].
        token_scores = torch.cat(
            [score.permute(0, 2, 1) for score in self._seed_scores(word_vecs)],
            dim=2,
        )

        # Best seed score per token: [batch, sent len, 1].
        weights = F.max_pool1d(token_scores, token_scores.shape[2])

        # Weight each token embedding and average over the sentence.
        weighted = word_vecs.squeeze(1) * weights        # [batch, sent len, emb]
        sentence = weighted.mean(dim=1)                  # [batch, emb]
        sentence = sentence.unsqueeze(1).unsqueeze(1)    # [batch, 1, 1, emb]

        # Score the sentence embedding against each seed: [batch, n_filters].
        sentence_scores = torch.cat(self._seed_scores(sentence), dim=2).squeeze(1)
        q = F.softmax(sentence_scores, dim=1)

        class_mass = q.sum(dim=0)                        # [n_filters]
        sharpened = (q * q) / class_mass                 # [batch, n_filters]
        row_total = sharpened.sum(dim=1).unsqueeze(1)    # [batch, 1]
        p = sharpened / row_total

        return p, q

In [61]:
class PseudoPolarity(nn.Module):
    """Seed-vector scorer that returns a signed polarity per sentence.

    Each class is represented by one ``Conv2d`` filter of size
    ``(1, embedding_dim)`` whose weight is set from ``seed_words``. The filters
    score every token, the best score per token re-weights the token
    embeddings, and the mean of the re-weighted embeddings is scored again. The
    polarity is the softmax probability of filter 0 minus that of filter 1.

    Args:
        vocab_size: Number of rows in the embedding table.
        embedding_dim: Width of each embedding vector.
        n_filters: Number of seed filters; ``forward`` reads columns 0 and 1.
        output_dim: Accepted for signature compatibility; not used.
        pad_idx: Index passed to ``nn.Embedding`` as ``padding_idx``.
        seed_words: Tensor indexed by filter; ``seed_words[i].unsqueeze(0)``
            becomes the weight of filter ``i``.
    """

    def __init__(self, vocab_size, embedding_dim, n_filters, output_dim, pad_idx, seed_words):

        super().__init__()

        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx = pad_idx)

        # The embedding table is never trained.
        for param in self.embedding.parameters():
            param.requires_grad = False

        # One single-channel filter per class, spanning one token at a time.
        self.convs = nn.ModuleList([
                                    nn.Conv2d(in_channels = 1,
                                              out_channels = 1,
                                              kernel_size = (1, embedding_dim))
                                    for _ in range(n_filters)
                                    ])

        # Replace the randomly initialised filter weights with the seed vectors.
        for i in range(len(self.convs)):
            self.convs[i].weight = torch.nn.Parameter(seed_words[i].unsqueeze(0))

    def forward(self, text):
        """Return the polarity of every sentence in the batch.

        Args:
            text: Token ids shaped [sent len, batch size].

        Returns:
            Tensor of shape [batch size] with values in [-1, 1]:
            ``q[:, 0] - q[:, 1]`` where ``q`` is the softmax over seed scores.
        """
        #text = [sent len, batch size]
        text = text.permute(1, 0)

        #embedded = [batch size, 1, sent len, emb dim]
        embedded = self.embedding(text).unsqueeze(1)

        #each element = [batch size, sent len, 1]
        conved = [F.relu(conv(embedded)).squeeze(3).permute(0, 2, 1) for conv in self.convs]

        #cat = [batch size, sent len, n_filters]
        cat = torch.cat(conved, dim=2)

        #weights = [batch size, sent len, 1] (best seed score per token)
        weights = F.max_pool1d(cat, cat.shape[2])

        #scaled_text = [batch size, sent len, emb dim]
        scaled_text = torch.mul(embedded.squeeze(1), weights)

        #sen_embedded = [batch size, 1, 1, emb dim]
        sen_embedded = torch.mean(scaled_text, dim=1).unsqueeze(1).unsqueeze(1)

        #each element = [batch size, 1, 1]
        conved = [F.relu(conv(sen_embedded)).squeeze(3) for conv in self.convs]

        #q = [batch size, n_filters]
        q = F.softmax(torch.cat(conved, dim=2).squeeze(1), dim=1)

        # The original also derived a sharpened distribution p from q here but
        # never used it; that dead computation has been removed.
        polarity = q[:,0] - q[:,1]

        return polarity

In [62]:
class CNN(nn.Module):
    """Convolutional sentence encoder over frozen word embeddings.

    One ``Conv2d`` bank per filter size slides over the token axis; each bank is
    ReLU-activated and max-pooled over time, and the pooled features of all
    banks are concatenated and passed through dropout.

    Args:
        vocab_size: Number of rows in the embedding table.
        embedding_dim: Width of each embedding vector.
        n_filters: Output channels of every convolution bank.
        filter_sizes: Iterable of kernel heights (tokens per window).
        output_dim: Accepted for signature compatibility; not used.
        dropout: Dropout probability applied to the concatenated features.
        pad_idx: Index passed to ``nn.Embedding`` as ``padding_idx``.
    """

    def __init__(self, vocab_size, embedding_dim, n_filters, filter_sizes, output_dim,
                 dropout, pad_idx):
        super().__init__()

        # Frozen embedding table.
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=pad_idx)
        self.embedding.weight.requires_grad = False

        banks = []
        for height in filter_sizes:
            banks.append(
                nn.Conv2d(in_channels=1,
                          out_channels=n_filters,
                          kernel_size=(height, embedding_dim))
            )
        self.convs = nn.ModuleList(banks)

        self.dropout = nn.Dropout(dropout)

    def forward(self, text):
        """Encode a batch of sentences.

        Args:
            text: Token ids shaped [sent len, batch size].

        Returns:
            Tensor of shape [batch size, n_filters * len(filter_sizes)].
        """
        batch_first = text.permute(1, 0)                         # [batch, sent len]
        feature_map = self.embedding(batch_first).unsqueeze(1)   # [batch, 1, sent len, emb]

        pooled = []
        for conv in self.convs:
            # [batch, n_filters, sent len - height + 1]
            activated = F.relu(conv(feature_map)).squeeze(3)
            # Max over time -> [batch, n_filters]
            pooled.append(F.max_pool1d(activated, activated.shape[2]).squeeze(2))

        return self.dropout(torch.cat(pooled, dim=1))

In [63]:
class CombineNet(nn.Module):
    """CNN encoder followed by a single linear unit squashed with tanh.

    Args:
        vocab_size, embedding_dim, n_filters, filter_sizes, dropout, pad_idx:
            Forwarded to ``CNN``.
        output_dim: Accepted for signature compatibility; not used.
    """

    def __init__(self, vocab_size, embedding_dim, n_filters, filter_sizes, output_dim,
                 dropout, pad_idx):
        super().__init__()

        self.cnn = CNN(vocab_size, embedding_dim, n_filters, filter_sizes, 1,
                       dropout, pad_idx)

        # Width of the encoder output: one n_filters block per filter size.
        n_features = len(filter_sizes) * n_filters
        self.fc1 = nn.Linear(n_features, 1)

    def forward(self, text):
        """Return a polarity in (-1, 1) per sentence, shaped [batch size, 1]."""
        features = self.cnn(text)
        return torch.tanh(self.fc1(features))

In [64]:
class CombineNet2(nn.Module):
    """CNN encoder with a scalar polarity head and a classifier stacked on it.

    ``fc1`` maps the encoder features to one score; ``fc2`` maps that score to
    ``output_dim`` class logits. All ``nn.Linear`` layers are re-initialised
    through ``init_weights``.

    Args:
        vocab_size, embedding_dim, n_filters, filter_sizes, dropout, pad_idx:
            Forwarded to ``CNN``.
        output_dim: Number of classes produced by ``fc2``.
    """

    def __init__(self, vocab_size, embedding_dim, n_filters, filter_sizes, output_dim,
                 dropout, pad_idx):
        super().__init__()

        self.cnn = CNN(vocab_size, embedding_dim, n_filters, filter_sizes, 1,
                       dropout, pad_idx)

        n_features = len(filter_sizes) * n_filters
        self.fc1 = nn.Linear(n_features, 1)
        self.fc2 = nn.Linear(1, output_dim)

        self.apply(init_weights)

    def forward(self, text):
        """Return ``(polarity, probs, classes)`` for a batch of sentences.

        ``polarity`` is tanh of the scalar score ([batch size, 1]), ``probs`` is
        the softmax over class logits and ``classes`` the index of the largest
        logit per row.
        """
        score = self.fc1(self.cnn(text))
        class_logits = self.fc2(score)

        probs = F.softmax(class_logits, dim=1)
        _, classes = torch.max(class_logits, 1)
        polarity = torch.tanh(score)

        return polarity, probs, classes

In [65]:
from sklearn import metrics

def train_metric(preds, label):
    """Return the accuracy of the arg-max predictions against ``label``.

    Args:
        preds: Tensor of per-class scores, one row per example.
        label: Tensor of true class indices.
    """
    y_true = label.cpu().numpy()
    y_pred = preds.argmax(dim=1).cpu().numpy()
    return metrics.accuracy_score(y_true, y_pred)

def train(model, pseudolabel, iterator, optimizer):
    """Run one training epoch and return the mean batch loss.

    For every batch the loss is the sum of two mean-squared errors: model
    output vs. the pseudo-labeller's polarity, and model output vs. the
    batch's ``rating`` field.

    Args:
        model: Module mapping ``batch.text`` to a [batch size, 1] tensor.
        pseudolabel: Module mapping ``batch.text`` to a [batch size] tensor;
            it is put in eval mode and receives no gradients.
        iterator: Sized iterable of batches exposing ``text`` and ``rating``.
        optimizer: Optimizer over the trainable parameters of ``model``.

    Returns:
        The summed batch losses divided by ``len(iterator)``.
    """
    criterion = nn.MSELoss()

    epoch_loss = 0

    model.train()
    pseudolabel.eval()

    for batch in iterator:

        optimizer.zero_grad()

        pols = model(batch.text).squeeze(1)  # [batch size]

        # The pseudo-labeller only supplies targets, so skip building an
        # autograd graph for it instead of detaching afterwards.
        with torch.no_grad():
            polarity = pseudolabel(batch.text)

        loss = criterion(pols, polarity) + criterion(pols, batch.rating)

        loss.backward()

        optimizer.step()

        epoch_loss += loss.item()

    return epoch_loss / len(iterator)


def evaluate(model, eval_data):
    """Score ``model`` on a labelled dataset.

    Every example whose predicted polarity is strictly positive is labelled
    'positive', all others 'negative'.

    Args:
        model: Polarity model accepted by ``evaluate_pol``.
        eval_data: Dataset whose ``examples`` expose ``text`` and ``label``.

    Returns:
        ``(accuracy, weighted F1)``.
    """
    examples = eval_data.examples
    preds = [
        'positive' if evaluate_pol(model, example.text) > 0 else 'negative'
        for example in examples
    ]
    labels = [example.label for example in examples]

    f1 = metrics.f1_score(labels, preds, average='weighted')
    acc = metrics.accuracy_score(labels, preds)
    return acc, f1


def predict(model, sentence, min_len = 5):
    """Return the arg-max class index the model assigns to a token list.

    Args:
        model: Module taking a [sent len, 1] LongTensor and returning a
            tensor of per-class scores with classes on dimension 1.
        sentence: Sequence of already-tokenised strings; it is not modified.
        min_len: Shorter inputs are right-padded with '<pad>' to this length.

    Returns:
        The index of the highest score as a Python int.
    """
    model.eval()
    # Pad a copy: the original used `+=`, which extended the caller's list.
    tokens = list(sentence)
    if len(tokens) < min_len:
        tokens += ['<pad>'] * (min_len - len(tokens))
    indexed = [TEXT.vocab.stoi[t] for t in tokens]
    tensor = torch.LongTensor(indexed).to(device)
    tensor = tensor.unsqueeze(1)
    with torch.no_grad():
        pols = model(tensor)
    # The original read an undefined local `preds` here instead of the
    # model output.
    max_preds = pols.argmax(dim = 1)
    return max_preds.item()

def predict_pol(model, sentence, min_len = 5):
    """Return the model's polarity for a raw sentence string.

    Args:
        model: Module taking a [sent len, 1] LongTensor and returning a
            single-element tensor.
        sentence: Raw text; it is tokenised with ``tokenizer``.
        min_len: Shorter token lists are right-padded with '<pad>' to this length.

    Returns:
        The polarity as a Python float.
    """
    model.eval()
    tokenized = tokenizer(sentence)
    if len(tokenized) < min_len:
        tokenized += ['<pad>'] * (min_len - len(tokenized))
    indexed = [TEXT.vocab.stoi[t] for t in tokenized]
    tensor = torch.LongTensor(indexed).to(device)
    tensor = tensor.unsqueeze(1)
    # Inference only: no autograd graph is needed.
    with torch.no_grad():
        pols = model(tensor)
    return pols.item()

def evaluate_pol(model, sentence, min_len = 5):
    """Return the model's polarity for an already-tokenised sentence.

    Args:
        model: Module taking a [sent len, 1] LongTensor and returning a
            single-element tensor.
        sentence: Sequence of token strings; it is not modified.
        min_len: Shorter inputs are right-padded with '<pad>' to this length.

    Returns:
        The polarity as a Python float.
    """
    model.eval()
    # Pad a copy: the original used `+=`, which permanently appended '<pad>'
    # tokens to the caller's list (e.g. a dataset example's `text`).
    tokens = list(sentence)
    if len(tokens) < min_len:
        tokens += ['<pad>'] * (min_len - len(tokens))
    indexed = [TEXT.vocab.stoi[t] for t in tokens]
    tensor = torch.LongTensor(indexed).to(device)
    tensor = tensor.unsqueeze(1)
    # Inference only: no autograd graph is needed.
    with torch.no_grad():
        pols = model(tensor)
    return pols.item()

In [66]:
# Domain-independent sentiment seed words.
general_pos_seeds = [
    'nice', 'good', 'perfect',
    'great', 'recommend', 'well',
]
general_neg_seeds = [
    'awful', 'difficult', 'bad',
    'disappointed', 'waste', 'problem',
]

In [67]:
# Alternative seed lists; not referenced by the other cells visible in this notebook.
frequent_pos_seeds = [
    'nice', 'good', 'well',
    'product', 'just', 'use',
]
frequent_neg_seeds = [
    'difficult', 'broke', 'awful',
    'sadly', 'fell', 'cracked',
]

In [68]:
# Seed words specific to this product category.
specific_pos_seeds = [
    'highly', 'love', 'perfect', 'great',
    'perfectly', 'fits', 'recommend', 'loves',
    'plenty', 'room', 'best',
]
specific_neg_seeds = [
    'return', 'however', 'returning', 'stars',
    'broke', "n't", 'disappointed', 'bottom',
    'unfortunately', 'star', 'corners',
]

In [69]:
# Final seed lists: general seeds followed by the category-specific ones.
pos_seeds = [*general_pos_seeds, *specific_pos_seeds]
neg_seeds = [*general_neg_seeds, *specific_neg_seeds]

In [70]:
import collections
# Map each sentiment label to its de-duplicated set of seed words.
senti_seed_words_d = collections.defaultdict(set)
senti_seed_words_d['positive'] = set(pos_seeds)
senti_seed_words_d['negative'] = set(neg_seeds)

# Order the (label, seeds) pairs by the label index in SENTIMENT_stoi so that
# row i of the seed tensor below corresponds to label index i.
senti_seed_words = sorted(senti_seed_words_d.items(), key=lambda x:SENTIMENT_stoi[x[0]])
print(senti_seed_words)

# Build one seed vector per label: the mean of the vocabulary vectors of its seed words.
# NOTE(review): TEXT.vocab.stoi is a defaultdict, so a seed word missing from the
# vocabulary presumably resolves to the unknown-token index and contributes that
# vector to the mean -- confirm every seed word is in the vocabulary.
SENTI_SEED_WORDS = []
for w, lst in senti_seed_words:
    temp = []
    for e in lst:
        temp.append(TEXT.vocab.vectors[TEXT.vocab.stoi[e]].unsqueeze(0))
    embeds = torch.cat(temp)
    embed = torch.mean(embeds,dim=0)
    SENTI_SEED_WORDS.append(embed.unsqueeze(0))
# Stack to [n_labels, emb dim], then add two singleton axes so that indexing one
# label and unsqueezing once more matches a Conv2d weight of shape (1, 1, 1, emb dim).
SENTI_SEED_WORDS = torch.cat(SENTI_SEED_WORDS)
SENTI_SEED_WORDS = SENTI_SEED_WORDS.unsqueeze(1)
SENTI_SEED_WORDS = SENTI_SEED_WORDS.unsqueeze(1)
print(SENTI_SEED_WORDS.shape)

# Model hyper-parameters.
INPUT_DIM = len(TEXT.vocab)
EMBEDDING_DIM = 200
N_FILTERS = 50
FILTER_SIZES = [2,3,4]
KOUTPUT_DIM = len(POLARITY.vocab)
DROPOUT = 0.5
PAD_IDX = TEXT.vocab.stoi[TEXT.pad_token]

# Trainable polarity regressor.
k_senti_model = CombineNet(INPUT_DIM, EMBEDDING_DIM, N_FILTERS, FILTER_SIZES, KOUTPUT_DIM, DROPOUT, PAD_IDX)
k_senti_model = k_senti_model.to(device)

# Seed-based pseudo-labeller; it gets one filter per polarity label and is kept in eval mode.
k_senti_pseudolabel = PseudoPolarity(INPUT_DIM, EMBEDDING_DIM, KOUTPUT_DIM, KOUTPUT_DIM, PAD_IDX, SENTI_SEED_WORDS)
k_senti_pseudolabel.eval()
k_senti_pseudolabel = k_senti_pseudolabel.to(device)

pretrained_embeddings = TEXT.vocab.vectors

# Load the vocabulary vectors into both embedding tables.
k_senti_model.cnn.embedding.weight.data.copy_(pretrained_embeddings)
k_senti_pseudolabel.embedding.weight.data.copy_(pretrained_embeddings)

UNK_IDX = TEXT.vocab.stoi[TEXT.unk_token]

# Zero the unknown-token and padding rows in both tables (done after the copy above,
# which would otherwise overwrite them).
k_senti_model.cnn.embedding.weight.data[UNK_IDX] = torch.zeros(EMBEDDING_DIM)
k_senti_model.cnn.embedding.weight.data[PAD_IDX] = torch.zeros(EMBEDDING_DIM)

k_senti_pseudolabel.embedding.weight.data[UNK_IDX] = torch.zeros(EMBEDDING_DIM)
k_senti_pseudolabel.embedding.weight.data[PAD_IDX] = torch.zeros(EMBEDDING_DIM)

# Optimise only the parameters that still require gradients (the embedding is frozen).
k_senti_model_optimizer = optim.Adam(filter(lambda p: p.requires_grad, k_senti_model.parameters()))


[('positive', {'nice', 'plenty', 'great', 'best', 'perfectly', 'recommend', 'highly', 'well', 'perfect', 'loves', 'room', 'love', 'fits', 'good'}), ('negative', {'awful', 'disappointed', 'return', 'unfortunately', 'broke', 'however', "n't", 'difficult', 'star', 'waste', 'stars', 'returning', 'problem', 'corners', 'bottom', 'bad'})]
torch.Size([2, 1, 1, 200])


In [71]:
len(TEXT.vocab)

34138

In [72]:
torch.cuda.empty_cache()

In [73]:
N_EPOCHS = 5

# Each epoch: one pass over the training iterator, then a full evaluation on
# the polarity test set (reported under the name "validation").
for epoch in range(N_EPOCHS):
    print("epoch: ", epoch + 1)

    train_loss = train(
        k_senti_model,
        k_senti_pseudolabel,
        train_iterator,
        k_senti_model_optimizer,
    )
    print("training loss: ", train_loss)

    valid_acc, valid_f1 = evaluate(k_senti_model, test_polarity_data)
    print("validation accuracy: ", valid_acc)
    print('validation F1:', valid_f1)

torch.cuda.empty_cache()

epoch:  1
training loss:  0.8957118015281438
validation accuracy:  0.6282051282051282
validation F1: 0.6084535256410257
epoch:  2
training loss:  0.6474154520742964
validation accuracy:  0.7115384615384616
validation F1: 0.7107892107892108
epoch:  3
training loss:  0.45386832475465516
validation accuracy:  0.7564102564102564
validation F1: 0.7559285364163413
epoch:  4
training loss:  0.4268621430046881
validation accuracy:  0.7692307692307693
validation F1: 0.7664629223849614
epoch:  5
training loss:  0.4168741622872085
validation accuracy:  0.7564102564102564
validation F1: 0.7541443053070961


In [74]:
# Label every test example by the sign of its predicted polarity.
preds, labels = [], []
for e in test_polarity_data.examples:
    pol = evaluate_pol(k_senti_model, e.text)
    preds.append('positive' if pol > 0 else 'negative')
    labels.append(e.label)

# Weighted F1, accuracy, then the confusion matrix of true vs. predicted labels.
print(metrics.f1_score(labels, preds, average='weighted'))
print(metrics.accuracy_score(labels, preds))

from sklearn.metrics import confusion_matrix
m = confusion_matrix(labels, preds)
print(m)

0.7541443053070961
0.7564102564102564
[[51 26]
 [12 67]]


In [75]:
predict_pol(k_senti_model, "They are cozy warm and comfortable")

0.9387118220329285

In [76]:
predict_pol(k_senti_model, "just like the Fit Flops I wear all summer long .")

0.6828398704528809

In [77]:
predict_pol(k_senti_model, "they would be great for a cruise")

0.8613943457603455

In [78]:
predict_pol(k_senti_model, "and they should be cooler than my Nikes .")

0.2210138887166977

In [79]:
predict_pol(k_senti_model, "I have a blister after a quick two mile walk")

-0.20635566115379333

In [80]:
torch.save(k_senti_model.state_dict(), './model/sentiment-boots.pt')