## Decomposing SST Reviews into subphrases of opposite sentiments

In [333]:
from torchtext import data, datasets
import torch
import os
from collections import Counter
import torch.nn.functional as F
import torch.autograd as autograd
import torch.nn as nn
import numpy as np

## Extract Data

In [362]:
# Text field for the review tokens. Lowercasing is enabled so tokens line up
# with the (lowercase) GloVe 6B vocabulary loaded below. The previous value,
# the string 'preserve-case', was merely truthy and therefore also enabled
# lowercasing, despite reading as if it did the opposite.
inputs = data.Field(lower=True)
# Label field: one non-sequential label per example, no <unk> entry. The labels
# are strings (compared against 'positive' / 'negative' in parseTrees).
answers = data.Field(sequential=False, unk_token=None)
# Coarse (non-fine-grained) SST; the training split also yields every subtree
# of each sentence as its own example, which parseTrees relies on.
train, dev, test = datasets.SST.splits(inputs, answers, fine_grained = False, train_subtrees = True)
inputs.build_vocab(train, dev, test)
inputs.vocab.load_vectors('glove.6B.300d')
answers.build_vocab(train)

## Parse the reviews into positive and negative reviews, each paired with its subsentences of opposing sentiment

In [445]:
def parseTrees(train):
    """Pair sentences with their subphrases of the opposite sentiment.

    Walks ``train`` in order. An example whose tokens are NOT all contained in
    the current sentence starts a new current sentence; any other example is
    treated as a subphrase of the current sentence. A subphrase is kept when
    its length is between 1/3 and 2/3 (inclusive) of the sentence length and
    its label is the opposite of the sentence label.

    Note: containment is tested on token *sets*, so it ignores word order and
    repetition. It relies on each full sentence appearing in ``train`` before
    its subphrases.

    Args:
        train: iterable of examples exposing ``.text`` (a sequence of tokens)
            and ``.label`` (compared against 'positive' / 'negative').

    Returns:
        ``[pls, nls]`` where ``pls`` holds ``(sentence, [subphrases])`` tuples
        for positive sentences with negative subphrases, and ``nls`` the same
        for negative sentences with positive subphrases. Sentences without any
        qualifying subphrase are dropped.
    """
    positivels = []
    negativels = []
    # Initialise all per-sentence state up front: an example with empty text
    # seen before any sentence is a subset of the empty set, and previously hit
    # unbound locals (cur_length / cur_sentiment) on that path.
    cur_words = set()
    cur_length = 0
    cur_sentiment = None
    for sub in train:
        # check if sub is subsentence of the current sentence
        if set(sub.text).issubset(cur_words):
            l = len(sub.text)
            # Check if length of subtree is between 1/3 and 2/3
            if cur_length / 3.0 <= l <= cur_length * 2 / 3.0:
                sentiment = sub.label
                # Only opposing-sentiment subphrases are wanted. The current
                # sentence is always the last entry of its own list, because it
                # was appended there when it became the current sentence.
                if sentiment == 'positive' and cur_sentiment == 'negative':
                    negativels[-1][1].append(sub)
                elif sentiment == 'negative' and cur_sentiment == 'positive':
                    positivels[-1][1].append(sub)
        else:
            # New sentence: register it (other labels are tracked but never
            # stored) and cache its token set once instead of per iteration.
            cur_sentiment = sub.label
            if cur_sentiment == 'negative':
                negativels.append((sub, []))
            elif cur_sentiment == 'positive':
                positivels.append((sub, []))
            cur_words = set(sub.text)
            cur_length = len(sub.text)

    # remove all sentences with an empty subphrase list
    pls = [e for e in positivels if e[1]]
    nls = [e for e in negativels if e[1]]

    return [pls, nls]

## Format the lists for the CD

In [447]:
def rolling_window(phrase, sub):
    """Find every position where ``sub`` occurs as a contiguous run in ``phrase``.

    Args:
        phrase: numpy array that is sliced along its first axis.
        sub: array compared against each window of ``len(sub)`` rows.

    Returns:
        List of ``(start, end)`` index pairs, with ``end`` inclusive, one per
        window of ``phrase`` that equals ``sub`` under ``np.array_equal``.
    """
    width = len(sub)
    total = phrase.shape[0]
    # A window must fit entirely inside phrase; the start is additionally
    # capped at ``total`` so there is never more than one start per row.
    starts = range(min(total, total - width + 1))
    return [(start, start + width - 1)
            for start in starts
            if np.array_equal(phrase[start:start + width], sub)]

In [471]:
def format_indices(ls):
    """Turn (sentence, subphrases) pairs into (token ids, index spans) pairs.

    Args:
        ls: sequence of entries whose item 0 is a sentence example and whose
            item 1 is a list of subphrase examples; each example exposes
            ``.text``.

    Returns:
        List of ``(phrase, idx_tups)``: ``phrase`` is the sentence as returned
        by ``inputs.numericalize`` and ``idx_tups`` collects the inclusive
        ``(start, end)`` spans at which each subphrase's ids occur in it.
    """
    def to_ids(example):
        # Numericalize a single example as a batch of one.
        return inputs.numericalize([example.text], device=-1, train=False)

    result = []
    for entry in ls:
        phrase = to_ids(entry[0])
        sub_ids = [to_ids(example) for example in entry[1]]
        phrase_array = phrase.data.numpy()
        spans = []
        for ids in sub_ids:
            spans.extend(rolling_window(phrase_array, ids.data.numpy()))
        result.append((phrase, spans))
    return result

In [472]:
# Split the training examples into positive and negative sentences, each paired
# with its subphrases of the opposite sentiment.
pls, nls = parseTrees(train)
# Convert the positive group to (numericalized sentence, index spans) pairs.
# Only pls is formatted in this cell; nls is left as returned by parseTrees.
pls2 = format_indices(pls)

In [478]:
# To Do
# Loop over all

[(Variable containing:
      23
    1745
      47
       5
       2
      55
     447
      15
    1425
     602
      36
   14916
       5
   13364
    2810
      15
       4
   12788
       5
    1744
      17
      97
    2707
       7
  [torch.LongTensor of size 24x1], [(1, 10)]), (Variable containing:
     57
     69
   2940
     70
    536
     11
    779
     12
    109
     53
     17
     15
     18
    186
      7
  [torch.LongTensor of size 15x1], [(1, 6), (2, 6)]), (Variable containing:
      65
   16067
    3172
      39
      78
    3960
       7
  [torch.LongTensor of size 7x1], [(0, 2)]), (Variable containing:
     69
   6547
   2432
     70
    137
    224
     15
      2
   1917
     69
     17
    444
     16
   2290
     80
    416
     70
  [torch.LongTensor of size 17x1], [(4, 14)]), (Variable containing:
     52
    676
     46
   2389
     94
    116
   7033
   2424
      4
    666
    417
   1006
    168
     66
     24
    108
     86
     10
     29
     28
 

## Examples of parser

In [204]:
# NOTE(review): `sentencels` is not defined in any cell shown in this notebook
# export; presumably it comes from an earlier version of the parser whose
# entries look like (sentence, [positive_subs, negative_subs]) -- confirm.
print("Main sentence: ")
print(sentencels[3][0].text)
print("Positive subsentence: ")
print(sentencels[3][1][0][0].text)
print ("Negative subsentence: ")
print(sentencels[3][1][1][0].text)

Main sentence: 
[u'have', u'had', u'enough', u'of', u'plucky', u'british', u'eccentrics', u'with', u'hearts', u'of', u'gold']
Positive subsentence: 
[u'with', u'hearts', u'of', u'gold']
Negative subsentence: 
[u'enough', u'of', u'plucky', u'british', u'eccentrics']


In [181]:
# NOTE(review): `sentencels` is not defined in any cell shown in this notebook
# export -- confirm where it comes from before re-running.
# NOTE(review): both subsentences below are read from index [1][1], which the
# sentencels[3] example cell labels "Negative subsentence", yet they are
# printed here as "Positive" -- verify which label is intended.
print("Main sentence: ")
print(sentencels[-1][0].text)
print("Positive subsentence: ")
print(sentencels[-1][1][1][0].text)
print ("Positive subsentence: ")
print(sentencels[-1][1][1][1].text)

Main sentence: 
[u'in', u'this', u'case', u'zero', u'.']
Positive subsentence: 
[u'case', u'zero', u'.']
Positive subsentence: 
[u'zero', u'.']


In [197]:
# NOTE(review): `sentencels` is not defined in any cell shown in this notebook
# export -- confirm where it comes from before re-running.
# Prints entry 30's sentence and the first item of its index-[1][0] list.
print("Main sentence: ")
print(sentencels[30][0].text)
print("Positive subsentence: ")
print(sentencels[30][1][0][0].text)

Main sentence: 
[u'next', u'pretty', u'good', u'thing']
Positive subsentence: 
[u'good', u'thing']
