# Config

In [140]:
import os
import pandas as pd
import xmltodict

import spacy
# Load spaCy's English pipeline; `nlp` is used further down to tokenize each sentence.
# NOTE(review): the 'en' shortcut name presumably needs spaCy 2.x with a linked model —
# confirm the pinned spaCy version (newer releases expect a full package name).
nlp = spacy.load('en')

# Folder holding the laptop-review XML files, relative to this notebook.
DATA_DIR = "../data"
# Full paths of the training split and the gold-annotated test split.
TRAINING_DATA, TESTING_DATA = (
    os.path.join(DATA_DIR, xml_name)
    for xml_name in ("Laptops_Train_v2.xml", "Laptops_Test_Gold.xml")
)

# Load training data

In [48]:
!head -n 20 ../data/Laptops_Train_v2.xml

<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
<sentences>
    <sentence id="2339">
        <text>I charge it at night and skip taking the cord with me because of the good battery life.</text>
        <aspectTerms>
            <aspectTerm term="cord" polarity="neutral" from="41" to="45"/>
            <aspectTerm term="battery life" polarity="positive" from="74" to="86"/>
        </aspectTerms>
    </sentence>
    <sentence id="812">
        <text>I bought a HP Pavilion DV4-1222nr laptop and have had so many problems with the computer.</text>
    </sentence>
    <sentence id="1316">
        <text>The tech guy then said the service center does not do 1-to-1 exchange and I have to direct my concern to the "sales" team, which is the retail shop which I bought my netbook from.</text>
        <aspectTerms>
            <aspectTerm term="service center" polarity="negative" from="27" to="41"/>
            <aspectTerm term="&quot;sales&quot; team" polarity="negative" fro

In [124]:
# Decode explicitly as UTF-8 (the encoding stated in the file's XML declaration) instead of
# relying on the platform default: a wrong decode would corrupt the text and shift the
# character offsets stored in the from/to attributes.
# strip_whitespace=False asks xmltodict not to trim whitespace from the parsed text.
with open(TRAINING_DATA, encoding="utf-8") as f:
    training_dict = xmltodict.parse(f.read(), strip_whitespace=False)

In [144]:
def get_bio(tokens, aspect_term_info, verbose=False):
    """Tag every token with "B", "I" or "O" from character-offset aspect spans.

    Args:
        tokens: iterable of token objects exposing ``idx`` (used as the character
            offset where the token starts) and ``text``.
        aspect_term_info: list of ``(from, to, term)`` tuples, where ``from``/``to``
            are character offsets (``to`` exclusive) and ``term`` is only used for
            debugging output. The list is sorted into a copy; the caller's list is
            not modified.
        verbose: when true, print one tab-separated debug row per token.

    Returns:
        A list with one label per token: "B" for the first token overlapping an
        aspect span, "I" for further tokens overlapping the same span, "O" otherwise.

    Raises:
        ValueError: if aspect terms are still queued after the last token was seen.
    """
    # Sentinel span that can never overlap a token; it is "completed" by any token.
    no_aspect = (float("inf"), -1, "None")

    pending = sorted(aspect_term_info)
    aspect_from, aspect_to, aspect_term = pending.pop(0) if pending else no_aspect
    aspect_started = False  # has the current span already received its "B"?
    labels = []

    for t in tokens:
        t_from = t.idx
        t_to = t_from + len(t.text)

        # Label by span overlap rather than by comparing start offsets only: a span
        # that begins inside a token (or in the gap before one) still gets a "B",
        # and a span that was already passed never yields a stray "I".
        if t_from < aspect_to and t_to > aspect_from:
            curr_label = "I" if aspect_started else "B"
            aspect_started = True
        else:
            curr_label = "O"
        labels.append(curr_label)

        if verbose:
            print("{}\t{}\t{}\t{}\t{}\t{}\t{}".format(t_from, t_to, aspect_from, aspect_to, t.text, curr_label, aspect_term))

        # complete the current aspect term, loading the next one
        if t_to >= aspect_to:
            aspect_from, aspect_to, aspect_term = pending.pop(0) if pending else no_aspect
            aspect_started = False

    if pending:
        # The original message had two placeholders for three arguments, so the
        # labels were silently dropped; include them.
        raise ValueError("Missing some aspect terms \n\t{} \n\n\t{} \n\n\t{}".format(pending, tokens, labels))
    return labels

In [145]:
# Column-oriented accumulator: every list receives one entry per token, so the
# sentence-level fields (id, text, all_aspects) are repeated for each of its tokens.
training_raw_df = {column: [] for column in ("id", "text", "all_aspects", "token", "label")}

for sentence in training_dict["sentences"]["sentence"]:
    sentence_id = sentence["@id"]
    sentence_text = sentence["text"]

    # Sentences without annotations have no "aspectTerms" entry; a lone aspect term
    # is not wrapped in a list, so normalise to a list before iterating.
    all_terms = sentence.get("aspectTerms", {}).get("aspectTerm", [])
    if type(all_terms) is not list:
        all_terms = [all_terms]

    # (from, to, term) tuples in the shape get_bio expects.
    aspect_info = [
        (int(aspect_term["@from"]), int(aspect_term["@to"]), aspect_term["@term"])
        for aspect_term in all_terms
    ]

    sentence_tokens = list(nlp(sentence_text))
    labels = get_bio(sentence_tokens, aspect_info)

    # Emit one record per token, paired with its label by position.
    for n, t in enumerate(sentence_tokens):
        record = (
            ("id", sentence_id),
            ("text", sentence_text),
            ("all_aspects", [a[-1] for a in aspect_info]),
            ("token", t),
            ("label", labels[n]),
        )
        for column, value in record:
            training_raw_df[column].append(value)

In [149]:
# Build the token-level frame, then put the columns in reading order:
# identifiers and labels first, sentence-level context last.
column_order = ["id", "token", "label", "all_aspects", "text"]
token_frame = pd.DataFrame(training_raw_df)
training_df = token_frame[column_order]

In [152]:
# Spot-check a positional slice of the token-level frame (rows 180 through 249).
training_df.iloc[180:250]

Unnamed: 0,id,token,label,all_aspects,text
180,425,.,O,[],Sad very SAD.
181,76,I,O,"[features, iChat, Photobooth, garage band]","I even got my teenage son one, because of the ..."
182,76,even,O,"[features, iChat, Photobooth, garage band]","I even got my teenage son one, because of the ..."
183,76,got,O,"[features, iChat, Photobooth, garage band]","I even got my teenage son one, because of the ..."
184,76,my,O,"[features, iChat, Photobooth, garage band]","I even got my teenage son one, because of the ..."
185,76,teenage,O,"[features, iChat, Photobooth, garage band]","I even got my teenage son one, because of the ..."
186,76,son,O,"[features, iChat, Photobooth, garage band]","I even got my teenage son one, because of the ..."
187,76,one,O,"[features, iChat, Photobooth, garage band]","I even got my teenage son one, because of the ..."
188,76,",",O,"[features, iChat, Photobooth, garage band]","I even got my teenage son one, because of the ..."
189,76,because,O,"[features, iChat, Photobooth, garage band]","I even got my teenage son one, because of the ..."
