In [1]:
import spacy
from spacy import displacy

In [2]:
import torch
import time
import json
import numpy as np
import math
import random
import xml.etree.ElementTree as ET
from subprocess import check_output
from subprocess import CalledProcessError

In [3]:
# Seed every random number generator the notebook relies on (Python, NumPy, PyTorch)
# with the same value so repeated runs draw the same random numbers.
SEED = 1337
random.seed(SEED)
np.random.seed(SEED)
# Kept last on purpose: its return value (the torch Generator) is the cell's displayed output.
torch.manual_seed(SEED)

<torch._C.Generator at 0x10ce83cb0>

In [4]:
class Model(torch.nn.Module):
    """Dual-embedding CNN token tagger with an optional CRF output layer.

    Every token id is looked up in two frozen embedding tables (general and
    domain-specific); the two vectors are concatenated, passed through a stack
    of 1-D convolutions with ReLU and dropout, and projected to per-token class
    logits by a linear layer.

    Args:
        gen_emb: 2-D numpy array (vocab, dim) holding the general embeddings.
        domain_emb: 2-D numpy array (vocab, dim) holding the domain embeddings.
        num_classes: number of output tags per token.
        dropout: dropout probability used between layers.
        crf: when True, decode/score with an AllenNLP ConditionalRandomField
            (imported lazily so allennlp is only needed in that case).
    """

    def __init__(self, gen_emb, domain_emb, num_classes=3, dropout=0.5, crf=False):
        super(Model, self).__init__()
        # The Embedding modules are created first and their weights replaced afterwards
        # (rather than using Embedding.from_pretrained) so attribute names, construction
        # order and random-number consumption stay exactly as before.
        self.gen_embedding = torch.nn.Embedding(gen_emb.shape[0], gen_emb.shape[1])
        self.gen_embedding.weight = torch.nn.Parameter(torch.from_numpy(gen_emb), requires_grad=False)
        self.domain_embedding = torch.nn.Embedding(domain_emb.shape[0], domain_emb.shape[1])
        self.domain_embedding.weight = torch.nn.Parameter(torch.from_numpy(domain_emb), requires_grad=False)

        # Two parallel convolutions (kernel 5 and kernel 3) over the concatenated
        # embeddings; their 128-channel outputs are concatenated to 256 channels.
        emb_dim = gen_emb.shape[1] + domain_emb.shape[1]
        self.conv1 = torch.nn.Conv1d(emb_dim, 128, 5, padding=2)
        self.conv2 = torch.nn.Conv1d(emb_dim, 128, 3, padding=1)
        self.dropout = torch.nn.Dropout(dropout)

        self.conv3 = torch.nn.Conv1d(256, 256, 5, padding=2)
        self.conv4 = torch.nn.Conv1d(256, 256, 5, padding=2)
        self.conv5 = torch.nn.Conv1d(256, 256, 5, padding=2)
        self.linear_ae = torch.nn.Linear(256, num_classes)
        self.crf_flag = crf
        if self.crf_flag:
            from allennlp.modules import ConditionalRandomField
            self.crf = ConditionalRandomField(num_classes)

    def forward(self, x, x_len, x_mask, x_tag=None, testing=False):
        """Score a batch of token-id sequences.

        Args:
            x: LongTensor of token ids, indexed as (batch, position).
            x_len: per-sequence lengths, in the order required by
                ``pack_padded_sequence`` (used only for the non-CRF training loss).
            x_mask: mask handed to the CRF layer (unused without CRF).
            x_tag: gold tags; a padded tensor for the CRF loss, or an object
                exposing ``.data`` (e.g. a PackedSequence) for the NLL loss.
            testing: when True return predictions instead of a loss.

        Returns:
            testing with CRF: the result of ``crf.viterbi_tags``.
            testing without CRF: log-probabilities of shape (batch, position, num_classes).
            training with CRF: negated CRF log-likelihood.
            training without CRF: scalar NLL loss over the non-padded tokens.
        """
        relu = torch.nn.functional.relu
        log_softmax = torch.nn.functional.log_softmax

        x_emb = torch.cat((self.gen_embedding(x), self.domain_embedding(x)), dim=2)
        # Conv1d expects channels before positions, hence the transpose.
        x_emb = self.dropout(x_emb).transpose(1, 2)
        x_conv = relu(torch.cat((self.conv1(x_emb), self.conv2(x_emb)), dim=1))
        x_conv = self.dropout(x_conv)
        x_conv = relu(self.conv3(x_conv))
        x_conv = self.dropout(x_conv)
        x_conv = relu(self.conv4(x_conv))
        x_conv = self.dropout(x_conv)
        x_conv = relu(self.conv5(x_conv))
        x_conv = x_conv.transpose(1, 2)
        x_logit = self.linear_ae(x_conv)
        if testing:
            if self.crf_flag:
                score = self.crf.viterbi_tags(x_logit, x_mask)
            else:
                # Normalise over the class axis explicitly. The previous code relied on
                # the deprecated implicit-dim behaviour of log_softmax and transposed the
                # tensor back and forth to make that implicit choice hit the class axis.
                score = log_softmax(x_logit, dim=-1)
        else:
            if self.crf_flag:
                score = -self.crf(x_logit, x_tag, x_mask)
            else:
                # Packing drops the padded positions so they do not contribute to the loss.
                x_logit = torch.nn.utils.rnn.pack_padded_sequence(x_logit, x_len, batch_first=True)
                score = torch.nn.functional.nll_loss(log_softmax(x_logit.data, dim=1), x_tag.data)
        return score

In [5]:
# Load spaCy's small English pipeline; it is used below only for the displaCy demo sentence.
nlp = spacy.load('en_core_web_sm')

In [6]:
# Sanity check: run the spaCy pipeline on a sample sentence and render its entities inline.
doc = nlp(u"But Google is starting from behind.")
displacy.render(doc, style="ent", jupyter=True)

In [7]:
# Hand-built input for displaCy's manual mode: the text plus character-offset
# spans with arbitrary labels, to check that custom entities render as expected.
sample_text = "But Google is starting from behind."
sample_ents = [
    {"start": 0, "end": 3, "label": "food"},
    {"start": 4, "end": 10, "label": "ambience"},
    {"start": 23, "end": 27, "label": "drinks"},
]
x = [{"text": sample_text, "ents": sample_ents, "title": "Review"}]
html = displacy.render(x, style="ent", manual=True, jupyter=True)

In [8]:
# read reviews
# get BIO encoding
# make html 

In [9]:
# Run configuration: where the preprocessed data and saved models live, and which review domain to use.
# NOTE(review): both directories are absolute, machine-specific paths — adjust them before running elsewhere.
# `runs` is not read by any cell visible in this notebook section.
runs = 1
data_dir = "/sem8/COL772-NLP/DE-CNN/data/prep_data/"
model_dir = "/sem8/COL772-NLP/DE-CNN/script/model/"
domain = "restaurant"

In [10]:
# Command line for the Java evaluation tool (A.jar): it is given a prediction file (-prd pred.xml)
# and a gold file (-gld ...xml.gold). The inference cell later splits this string into an argv list.
# NOTE(review): every path here is absolute and machine-specific.
command="java --add-modules java.xml.bind -cp /sem8/COL772-NLP/DE-CNN/script/A.jar absa16.Do Eval -prd /sem8/COL772-NLP/DE-CNN/data/official_data/pred.xml -gld /sem8/COL772-NLP/DE-CNN/data/official_data/EN_REST_SB1_TEST.xml.gold -evs 2 -phs A -sbt SB1"
# XML file path; presumably the unlabeled template passed to label_rest_xml as `fn` — verify against the caller.
template="/sem8/COL772-NLP/DE-CNN/data/official_data/EN_REST_SB1_TEST.xml.A"

In [11]:
# Load the preprocessed arrays for the chosen domain (an .npz archive) and the
# raw test sentences (JSON) from data_dir.
ae_data=np.load(data_dir+domain+".npz")
with open(data_dir+domain+"_raw_test.json") as f:
    raw_X=json.load(f)

In [12]:
# Inference settings.
r=0                          # suffix appended to the model file name when loading (model_dir+domain+str(r))
test_X = ae_data['test_X']   # test inputs; the inference cell treats 0 entries as padding
batch_size=128
crf = False                  # selects the CRF decoding branch in the inference cell when True

In [13]:
# Load the saved model object for this domain/run.
# NOTE(review): torch.load unpickles the file, so only load checkpoints from a trusted source;
# presumably the Model class defined above must be in scope for unpickling — confirm.
model=torch.load(model_dir+domain+str(r))

  "type " + container_type.__name__ + ". It won't be checked "


In [14]:
# Switch to evaluation mode (disables dropout); the returned module's repr is the cell output.
model.eval()

Model(
  (gen_embedding): Embedding(8518, 300)
  (domain_embedding): Embedding(8518, 100)
  (conv1): Conv1d(400, 128, kernel_size=(5,), stride=(1,), padding=(2,))
  (conv2): Conv1d(400, 128, kernel_size=(3,), stride=(1,), padding=(1,))
  (dropout): Dropout(p=0.55)
  (conv3): Conv1d(256, 256, kernel_size=(5,), stride=(1,), padding=(2,))
  (conv4): Conv1d(256, 256, kernel_size=(5,), stride=(1,), padding=(2,))
  (conv5): Conv1d(256, 256, kernel_size=(5,), stride=(1,), padding=(2,))
  (linear_ae): Linear(in_features=256, out_features=3, bias=True)
)

In [15]:
# Run the tagger over the whole test set in mini-batches and collect one
# predicted tag id per token position.
#
# Reads:  model, test_X, batch_size, crf, command, domain (set in earlier cells).
# Writes: pred_y (int16, same shape as test_X); command is re-bound to its argv list.

# Width comes from the data instead of a hard-coded 83.
max_len = test_X.shape[1]
pred_y = np.zeros((test_X.shape[0], max_len), np.int16)

# Remember the current mode and restore it afterwards, instead of unconditionally
# switching the model to training mode once inference is done.
was_training = model.training
model.eval()
with torch.no_grad():  # inference only: no autograd graph needed
    for offset in range(0, test_X.shape[0], batch_size):
        batch = test_X[offset:offset+batch_size]
        # Sequence length = number of non-zero (non-padding) ids per row.
        batch_test_X_len = np.sum(batch != 0, axis=1)
        # Sort the batch by decreasing length, as the model's packing code expects.
        batch_idx = batch_test_X_len.argsort()[::-1]
        batch_test_X_len = batch_test_X_len[batch_idx]
        batch_test_X_mask = (batch != 0)[batch_idx].astype(np.uint8)
        batch_test_X = batch[batch_idx]
        batch_test_X_mask = torch.from_numpy(batch_test_X_mask).long()
        batch_test_X = torch.from_numpy(batch_test_X).long()
        batch_pred_y = model(batch_test_X, batch_test_X_len, batch_test_X_mask, testing=True)
        # Inverse permutation: puts predictions back into the original row order.
        r_idx = batch_idx.argsort()
        if crf:
            batch_pred_y = [batch_pred_y[idx] for idx in r_idx]
            for ix in range(len(batch_pred_y)):
                for jx in range(len(batch_pred_y[ix])):
                    pred_y[offset+ix, jx] = batch_pred_y[ix][jx]
        else:
            batch_pred_y = batch_pred_y.cpu().numpy().argmax(axis=2)[r_idx]
            pred_y[offset:offset+batch_size, :batch_pred_y.shape[1]] = batch_pred_y
            print(batch_pred_y)
model.train(was_training)
assert len(pred_y) == len(test_X)

# Split only while `command` is still a string, so re-running this cell does not
# fail with "'list' object has no attribute 'split'".
if isinstance(command, str):
    command = command.split()
print("COMMAND: ")
print(command)
if domain == 'restaurant':
    print(pred_y)



[[0 0 0 ... 2 2 2]
 [1 0 0 ... 2 2 2]
 [0 0 0 ... 2 2 2]
 ...
 [0 1 0 ... 2 2 2]
 [0 1 0 ... 2 2 2]
 [0 1 0 ... 2 2 2]]
[[0 1 0 ... 2 2 2]
 [0 0 0 ... 2 2 2]
 [0 1 2 ... 2 2 2]
 ...
 [0 0 0 ... 2 2 2]
 [0 1 0 ... 2 2 2]
 [0 1 0 ... 2 2 2]]
[[0 0 1 ... 2 2 2]
 [0 1 0 ... 2 2 2]
 [0 0 0 ... 2 2 2]
 ...
 [0 0 0 ... 2 2 2]
 [0 0 0 ... 2 2 2]
 [0 0 0 ... 2 2 2]]
[[0 0 0 ... 2 2 2]
 [0 0 0 ... 2 2 2]
 [0 0 0 ... 2 2 2]
 ...
 [0 1 0 ... 2 2 2]
 [0 0 0 ... 2 2 2]
 [0 0 0 ... 2 2 2]]
[[0 1 2 ... 2 2 2]
 [0 0 0 ... 2 2 2]
 [0 0 0 ... 2 2 2]
 ...
 [0 0 0 ... 2 2 2]
 [0 1 0 ... 2 2 2]
 [0 1 1 ... 2 2 2]]
[[0 1 0 ... 2 2 2]
 [0 1 0 ... 2 2 2]
 [0 1 0 ... 2 2 2]
 ...
 [0 0 0 ... 2 2 2]
 [0 0 0 ... 2 2 2]
 [0 0 0 ... 2 2 2]]
COMMAND: 
['java', '--add-modules', 'java.xml.bind', '-cp', '/sem8/COL772-NLP/DE-CNN/script/A.jar', 'absa16.Do', 'Eval', '-prd', '/sem8/COL772-NLP/DE-CNN/data/official_data/pred.xml', '-gld', '/sem8/COL772-NLP/DE-CNN/data/official_data/EN_REST_SB1_TEST.xml.gold', '-evs', '2', '-p

In [16]:
def _append_opinion(opins, text, start, end):
    """Append an ``<Opinion>`` element describing ``text[start:end]`` to ``opins``."""
    opin = ET.Element("Opinion")
    opin.attrib['target'] = text[start:end]
    opin.attrib['from'] = str(start)
    opin.attrib['to'] = str(end)
    opins.append(opin)


def label_rest_xml(fn, output_fn, corpus, label):
    """Write predicted aspect terms into a review XML file.

    For every ``<sentence>`` element of the XML document ``fn`` (in document
    order), the characters of its ``<text>`` child are walked in lock-step with
    the sentence's tokens so that token-level tags can be turned into character
    offsets. An ``<Opinions>`` element with one ``<Opinion target= from= to=>``
    child per predicted span is appended to the sentence, and the document is
    written to ``output_fn``.

    Args:
        fn: path of the input XML file.
        output_fn: path the labelled XML is written to.
        corpus: per-sentence token lists; ``corpus[k]`` belongs to the k-th sentence.
        label: per-sentence tag sequences aligned with ``corpus``. Tag 1 starts a
            new span, tag 2 continues the open span (or starts one if none is
            open), tag 0 closes the open span.

    Characters that lie beyond the last token (text longer than its tokenisation)
    close any open span and are then skipped; previously they raised IndexError.
    """
    dom = ET.parse(fn)
    root = dom.getroot()
    for zx, sent in enumerate(root.iter("sentence")):
        tokens = corpus[zx]
        lb = label[zx]
        # An empty <text/> element yields None; treat it as an empty string.
        text = sent.find('text').text or ""
        opins = ET.Element("Opinions")
        # token_idx: current token; pt: how many characters of it are consumed;
        # tag_on: whether a span is currently open (its first character is `start`).
        token_idx, pt, tag_on = 0, 0, False
        start = -1
        for ix, c in enumerate(text):
            # Current token fully consumed -> move on to the next one.
            if token_idx < len(tokens) and pt >= len(tokens[token_idx]):
                pt = 0
                token_idx += 1

            in_tokens = token_idx < len(tokens)
            tag = lb[token_idx] if in_tokens else None
            at_token_start = in_tokens and pt == 0

            if at_token_start and tag == 1 and c != ' ':
                # A new span begins here; close an adjacent open one first.
                if tag_on:
                    _append_opinion(opins, text, start, ix)
                start = ix
                tag_on = True
            elif at_token_start and tag == 2 and c != ' ' and not tag_on:
                # Continuation tag without an open span: treat it as a span start.
                start = ix
                tag_on = True
            elif at_token_start and (tag == 0 or tag == 1) and tag_on:
                # The open span ends right before this token (or the space preceding it).
                _append_opinion(opins, text, start, ix)
                tag_on = False
            elif not in_tokens and tag_on:
                # Ran out of tokens while a span was open.
                _append_opinion(opins, text, start, ix)
                tag_on = False

            # Spaces never consume token characters; text past the last token has
            # no token to consume from (indexing tokens[token_idx] here used to crash).
            if c == ' ' or not in_tokens:
                continue
            # Tokens `` and '' advance by two token characters per text character
            # (presumably quote tokens that stand for a single quote character in the text).
            if tokens[token_idx][pt:pt+2] in ('``', "''"):
                pt += 2
            else:
                pt += 1
        # A span still open at the end of the text runs to the end of the sentence.
        if tag_on:
            _append_opinion(opins, text, start, len(text))
        sent.append(opins)
    dom.write(output_fn)

In [17]:
# Spot check: raw tokens, predicted tags and encoded input ids of the first test sentence.
raw_X[0], pred_y[0], test_X[0]

(['Yum', '!'],
 array([0, 0, 0, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2], dtype=int16),
 array([7886,  123,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0], dtype=int32))

In [18]:
# Spot check: predicted tags of the second test sentence.
pred_y[1]

array([1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2], dtype=int16)

In [19]:
def aspect_spans(tokens, tags):
    """Join tokens with spaces and turn token-level tags into character spans.

    Args:
        tokens: the words of one sentence.
        tags: tag ids aligned with ``tokens`` (extra trailing entries are ignored).
            Tag 1 starts a span, tag 2 continues the open span (or starts one if
            none is open), any other tag closes the open span.

    Returns:
        ``(text, spans)`` where ``text`` is every token followed by one space and
        ``spans`` is a list of ``(start, end, marker)`` tuples with character
        offsets into ``text`` (``end`` exclusive). ``marker`` is the tag of the
        span's last token formatted as ``" 1 "`` or ``" 2 "``, matching the
        tuples produced by the previous version of this cell.
    """
    text = ""
    spans = []
    span_start = None  # character offset where the open span began, None if no span is open
    last_tag = 0
    for j, word in enumerate(tokens):
        tag = int(tags[j])
        offset = len(text)  # character offset at which this word starts
        # Anything other than a continuation ends the open span just before
        # the space that precedes this word.
        if span_start is not None and tag != 2:
            spans.append((span_start, offset - 1, " %d " % last_tag))
            span_start = None
        if tag == 1 or (tag == 2 and span_start is None):
            span_start = offset
        text += word + " "
        last_tag = tag
    # A span that reaches the last token must still be emitted.
    if span_start is not None:
        spans.append((span_start, len(text) - 1, " %d " % last_tag))
    return text, spans


# Build, for every test sentence, the display text and the predicted aspect spans.
# All state is local to aspect_spans, so nothing carries over from one sentence to
# the next (the old module-level `previous_y` did, producing bogus spans).
texts = []
labels = []
for i, sent in enumerate(raw_X):
    text, ents = aspect_spans(sent, pred_y[i])
    texts.append(text)
    labels.append(ents)

In [20]:
# Accumulator for the per-sentence lists of displaCy entity dicts (filled in the next cell).
ls = []

In [21]:
# Convert each sentence's (start, end, marker) spans into the entity dicts used by
# displaCy's manual mode. `ls` is rebuilt from scratch here, so re-running this cell
# can no longer append a second copy of every sentence to an existing list.
# Every span gets the fixed label "food": the spans carry no category information.
ls = [
    [{'start': ent[0], 'end': ent[1], 'label': "food"} for ent in ents]
    for ents in labels
]

In [22]:
# Reference example of the displaCy manual-mode record layout (text / ents / title).
# It repeats the sample assigned to `x` in an earlier cell.
x = [{"text": "But Google is starting from behind.",
       "ents": [{"start":0, "end":3, "label":"food"}, {"start": 4, "end": 10, "label": "ambience"}, {"start": 23, "end": 27, "label": "drinks"} ],
       "title": "Review"}]

In [23]:
# Assemble one displaCy manual-mode record per review: the sentence text,
# its entity dicts from `ls`, and a fixed title.
data_encoding = [
    {'text': review_text, 'ents': ls[idx], 'title': "Review"}
    for idx, review_text in enumerate(texts)
]

In [24]:
# Preview the first ten records.
data_encoding[:10]

[{'text': 'Yum ! ', 'ents': [], 'title': 'Review'},
 {'text': 'Serves really good sushi . ',
  'ents': [{'start': 0, 'end': 6, 'label': 'food'},
   {'start': 19, 'end': 24, 'label': 'food'}],
  'title': 'Review'},
 {'text': 'Not the biggest portions but adequate . ',
  'ents': [{'start': 16, 'end': 24, 'label': 'food'}],
  'title': 'Review'},
 {'text': 'Green Tea creme brulee is a must ! ',
  'ents': [{'start': 0, 'end': 22, 'label': 'food'}],
  'title': 'Review'},
 {'text': "Do n't leave the restaurant without it . ",
  'ents': [],
  'title': 'Review'},
 {'text': 'No Comparison ', 'ents': [], 'title': 'Review'},
 {'text': "– I ca n't say enough about this place . ",
  'ents': [{'start': 33, 'end': 38, 'label': 'food'}],
  'title': 'Review'},
 {'text': 'It has great sushi and even better service . ',
  'ents': [{'start': 13, 'end': 18, 'label': 'food'},
   {'start': 35, 'end': 42, 'label': 'food'}],
  'title': 'Review'},
 {'text': 'The entire staff was extremely accomodating and tended

In [25]:
# Render the first ten reviews inline with displaCy's manual entity mode.
# NOTE(review): the loop variable re-binds `x`, replacing the sample list assigned to `x` earlier.
for x in data_encoding[:10]:
    html = displacy.render(x, style="ent", manual=True, jupyter=True)