In [373]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import os
import sys
import re
import json
import string
import codecs
import jieba

import torch
from pytorch_transformers import *

from zhon import hanzi
from collections import Counter

import xml.etree.ElementTree as ET, getopt, logging, sys, random, re, copy
from xml.sax.saxutils import escape

In [457]:
BERT_PATH ='../data/embeddings/BERT/bert-base-chinese'
BERT_VOCAB ='bert-base-chinese-vocab.txt'
tokenizer = BertTokenizer.from_pretrained(os.path.join(BERT_PATH, BERT_VOCAB))
model = BertModel.from_pretrained(BERT_PATH)


INFO:pytorch_transformers.tokenization_utils:Model name '../data/embeddings/BERT/bert-base-chinese/bert-base-chinese-vocab.txt' not found in model shortcut name list (bert-base-uncased, bert-large-uncased, bert-base-cased, bert-large-cased, bert-base-multilingual-uncased, bert-base-multilingual-cased, bert-base-chinese, bert-base-german-cased, bert-large-uncased-whole-word-masking, bert-large-cased-whole-word-masking, bert-large-uncased-whole-word-masking-finetuned-squad, bert-large-cased-whole-word-masking-finetuned-squad, bert-base-cased-finetuned-mrpc). Assuming '../data/embeddings/BERT/bert-base-chinese/bert-base-chinese-vocab.txt' is a path or url to a directory containing tokenizer files.
INFO:pytorch_transformers.tokenization_utils:loading file ../data/embeddings/BERT/bert-base-chinese/bert-base-chinese-vocab.txt
INFO:pytorch_transformers.tokenization_utils:loading file ../data/embeddings/BERT/bert-base-chinese/bert-base-chinese-vocab.txt
INFO:pytorch_transformers.tokenization_u

JSONDecodeError: Expecting value: line 1 column 2 (char 1)

In [451]:
input_ids = torch.tensor([tokenizer.encode("Here is some text to encode")])

In [452]:
input_ids

tensor([[10816,  8311, 13049, 10540,  8229,  9448, 10803]])

In [453]:
with torch.no_grad():
    last_hidden_states = model(input_ids)[0]

tensor([[[ 0.7559, -0.8967,  0.4671,  ...,  0.8768, -0.3557,  0.0315],
         [ 0.0102, -0.9331, -0.1013,  ..., -0.0526, -0.3079,  0.1158],
         [ 0.5664,  0.0461,  0.1432,  ...,  0.2052, -0.4931,  0.0084],
         ...,
         [-0.1225, -0.4007,  0.1629,  ..., -0.3553, -0.5007, -0.0392],
         [ 0.0384, -0.2626, -0.0595,  ..., -0.2261, -0.2866,  0.2040],
         [ 0.5811, -0.1267, -0.2250,  ..., -0.1337, -0.3257,  0.3426]]])

In [416]:
DATA_DIR_FROM = "../data/data_orign/SemEval2015"
DATA_DIR_TO = "../data/data_processed/SemEval2015"

STOPWORDS = codecs.open("Frequent_Stopwords_ZH.txt", encoding="utf-8")\
                  .read()\
                  .replace(" ", "")\
                  .split(",")
PUNCTUATIONS = set(hanzi.punctuation + string.punctuation)

In [417]:
SENTIMENT_IDX = {'negative': 0, 'positive': 1}
IDX_SENTIMENT = {0: 'negative', 1: 'positive'}

In [5]:
class Category:
    '''Category objects contain the term and polarity (i.e., pos, neg, neu, conflict) of the category (e.g., food, price, etc.) of a sentence.'''

    def __init__(self, term='', polarity=''):
        self.term = term
        self.polarity = polarity

    def create(self, element):
        self.term = element.attrib['category']
        self.polarity = element.attrib['polarity']
        return self

    def update(self, term='', polarity=''):
        self.term = term
        self.polarity = polarity


class Aspect:
    '''Aspect objects contain the term (e.g., battery life) and polarity (i.e., pos, neg, neu, conflict) of an aspect.'''

    def __init__(self, term, polarity, offsets):
        self.term = term
        self.polarity = polarity
        self.offsets = offsets

    def create(self, element):
        self.term = element.attrib['term']
        self.polarity = element.attrib['polarity']
        self.offsets = {'from': str(element.attrib['from']), 'to': str(element.attrib['to'])}
        return self

    def update(self, term='', polarity=''):
        self.term = term
        self.polarity = polarity


class Instance:
    '''An instance is a sentence, modeled out of XML (pre-specified format, based on the 4th task of SemEval 2014).
    It contains the text, the aspect terms, and any aspect categories.'''

    def __init__(self, element):
        self.text = element.find('text').text
        self.id = element.get('id')
        self.aspect_terms = [Aspect('', '', offsets={'from': '', 'to': ''}).create(e) for es in
                             element.findall('aspectTerms') for e in es if
                             es is not None]
        self.aspect_categories = [Category(term='', polarity='').create(e) for es in element.findall('aspectCategories')
                                  for e in es if
                                  es is not None]

    def get_aspect_terms(self):
        return [a.term.lower() for a in self.aspect_terms]

    def get_aspect_categories(self):
        return [c.term.lower() for c in self.aspect_categories]

    def add_aspect_term(self, term, polarity='', offsets={'from': '', 'to': ''}):
        a = Aspect(term, polarity, offsets)
        self.aspect_terms.append(a)

    def add_aspect_category(self, term, polarity=''):
        c = Category(term, polarity)
        self.aspect_categories.append(c)


class Corpus:
    '''A corpus contains instances, and is useful for training algorithms or splitting to train/test files.'''

    def __init__(self, elements):
        self.corpus = [Instance(e) for e in elements]
        self.size = len(self.corpus)
        self.aspect_terms_fd = fd([a for i in self.corpus for a in i.get_aspect_terms()])
        self.top_aspect_terms = freq_rank(self.aspect_terms_fd)
        self.texts = [t.text for t in self.corpus]

    def echo(self):
        print '%d instances\n%d distinct aspect terms' % (len(self.corpus), len(self.top_aspect_terms))
        print 'Top aspect terms: %s' % (', '.join(self.top_aspect_terms[:10]))

    def clean_tags(self):
        for i in range(len(self.corpus)):
            self.corpus[i].aspect_terms = []

    def split(self, threshold=0.8, shuffle=False):
        '''Split to train/test, based on a threshold. Turn on shuffling for randomizing the elements beforehand.'''
        clone = copy.deepcopy(self.corpus)
        if shuffle: random.shuffle(clone)
        train = clone[:int(threshold * self.size)]
        test = clone[int(threshold * self.size):]
        return train, test

    def write_out(self, filename, instances, short=True):
        with open(filename, 'w') as o:
            o.write('<sentences>\n')
            for i in instances:
                o.write('\t<sentence id="%s">\n' % (i.id))
                o.write('\t\t<text>%s</text>\n' % fix(i.text))
                o.write('\t\t<aspectTerms>\n')
                if not short:
                    for a in i.aspect_terms:
                        o.write('\t\t\t<aspectTerm term="%s" polarity="%s" from="%s" to="%s"/>\n' % (
                            fix(a.term), a.polarity, a.offsets['from'], a.offsets['to']))
                o.write('\t\t</aspectTerms>\n')
                o.write('\t\t<aspectCategories>\n')
                if not short:
                    for c in i.aspect_categories:
                        o.write('\t\t\t<aspectCategory category="%s" polarity="%s"/>\n' % (fix(c.term), c.polarity))
                o.write('\t\t</aspectCategories>\n')
                o.write('\t</sentence>\n')
            o.write('</sentences>')

    def write_out_json(self, filename, instances):
        fr_to = open(filename, 'w')
        ans = []
        for i in instances:
            aspect_term = []
            aspect_category = []
            for a in i.aspect_terms:
                aspect_term.append({
                    "term": a.term,
                    "polarity": a.polarity,
                    "from": a.offsets["from"],
                    "to": a.offsets["to"]
                })
                # print(a.term, i.text[int(a.offsets['from']): int(a.offsets['to'])])
                # if a.term != i.text[int(a.offsets['from']): int(a.offsets['to'])]:
                #     print a.term
            for c in i.aspect_categories:
                aspect_category.append({
                    "category": c.term,
                    "polarity": c.polarity
                })
            opinions = {"aspect_term": aspect_term, "aspect_category": aspect_category}
            ans.append({
                "id": i.id,
                "text": i.text,
                "opinions": opinions
            })
        import json
        json.dump(ans, fr_to)

SyntaxError: invalid syntax (<ipython-input-5-a04fdb8683c2>, line 126)