In [45]:
import torch
import time
import json
import numpy as np
import math
import random
import xml.etree.ElementTree as ET
import subprocess
from subprocess import check_output
from subprocess import CalledProcessError
from bs4 import BeautifulSoup
import nltk
import re
import pandas as pd

In [2]:
np.random.seed(1337)
random.seed(1337)
torch.manual_seed(1337)
torch.cuda.manual_seed(1337)

In [26]:
def generate_corpus(filename):
    corpus = []
    aspects = []
    soup = BeautifulSoup(open(filename, "r"), "lxml")
    for r in soup.find_all('review'):
        doc = ""
        asp = []
        for sentence in r.find_all('text'):
            corpus.append(sentence.text)
    return corpus

In [49]:
def parse_input(filename):
    rows = []
    soup = BeautifulSoup(open(filename, "r"), "lxml")
    for r in soup.find_all('review'):
#         doc = ""
        for sentence in r.find_all('sentence'):
            aspect_words = []
            aspect_categories = []
            polarities = []
            sent = sentence.text.strip()
#             doc += " " + sent
            for opinion in sentence.find_all('opinion'):
                print(opinion)
                aspect_words.append(opinion.get('target'))
                aspect_categories.append(opinion.get('category')[:opinion.get('category').index('#')])
                polarities.append(opinion.get('polarity'))
            rows.append((sent, aspect_words, aspect_categories, polarities))
            break
        break
                                
    return  pd.DataFrame(rows, columns=['text', 'aspect_words', 'aspect_categories', 'polarities'])

In [50]:
df = parse_input('../data/official_data/ABSA16_Restaurants_Train_SB1_v2.xml')
df.head()

<opinion category="RESTAURANT#GENERAL" from="51" polarity="negative" target="place" to="56"></opinion>


Unnamed: 0,text,aspect_words,aspect_categories,polarities
0,Judging from previous posts this used to be a ...,[place],[RESTAURANT],[negative]


In [27]:
def preprocess(corpus, word2idx, max_len = 83):
    pp_corpus = []
    for i,sent in enumerate(corpus):
#         print(i,sent)
        pp_sent = [word2idx[token] for token in nltk.word_tokenize(sent) if token in word2idx.keys()]
        if len(pp_sent)<83:
            pp_sent+=list(np.zeros((83-len(pp_sent),),dtype=np.int8))
        pp_corpus.append(pp_sent)
    return pp_corpus

In [28]:
corpus = generate_corpus('../data/official_data/ABSA16_Restaurants_Train_SB1_v2.xml')
word_idx_fn = "../data/prep_data/word_idx.json"
with open(word_idx_fn) as f:
    word2idx = json.load(f)
pp_corpus = preprocess(corpus, word2idx)

In [32]:
list(zip(nltk.word_tokenize(corpus[0]), pp_corpus[0]))

[('Judging', 5586),
 ('from', 60),
 ('previous', 1163),
 ('posts', 5360),
 ('this', 230),
 ('used', 336),
 ('to', 45),
 ('be', 456),
 ('a', 21),
 ('good', 16),
 ('place', 239),
 (',', 54),
 ('but', 166),
 ('not', 41),
 ('any', 241),
 ('longer', 998),
 ('.', 19)]