In [1]:
from bs4 import BeautifulSoup
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk import word_tokenize

In [2]:
all_stopwords = stopwords.words("english")
def text_process(text):
    text = " ".join([w for w in text if w not in all_stopwords])
    return text.strip('.').strip()


### SemEval 2014

train

In [3]:
with open(r"../data/raw/SemEval-2014-Task-4-REST/SemEval'14-ABSA-TrainData_v2 & AnnotationGuidelines/Restaurants_Train_v2.xml") as f:
    xml = f.read()
    
soup = BeautifulSoup(xml, 'html.parser')
sentences = soup.find_all("sentence")
IDs = []
texts = []
cats = []
pols = []

for s in sentences:
    items = s.find_all("aspectcategory")
    for i in items:
        IDs.append(s.attrs['id'])
        texts.append(s.find("text").text)
        cats.append(i.attrs['category'])
        pols.append(i.attrs['polarity'])

In [4]:
rest_14_train = pd.DataFrame({"ID":IDs, "Text":texts, "Aspect":cats, "Polarity":pols})
rest_14_train.shape

(3713, 4)

test

In [5]:
with open(r"../data/raw/SemEval-2014-Task-4-REST/ABSA_Gold_TestData/Restaurants_Test_Gold.xml") as f:
    xml = f.read()

soup = BeautifulSoup(xml, 'html.parser')
sentences = soup.find_all("sentence")
IDs = []
texts = []
cats = []
pols = []

for s in sentences:
    items = s.find_all("aspectcategory")
    for i in items:
        IDs.append(s.attrs['id'])
        texts.append(s.find("text").text)
        cats.append(i.attrs['category'])
        pols.append(i.attrs['polarity'])

In [6]:
rest_14_test = pd.DataFrame({"ID":IDs, "Text":texts, "Aspect":cats, "Polarity":pols})
rest_14_test.shape

(1025, 4)

In [7]:
# convert data to 1 vs multi, one sentence for multiple aspects即一个句子对应多个aspects
def f(x):
    if x == 'positive':
        return 1
    elif x == 'negative':
        return -1
    else:
        return 0
    

def get_info(df):
    IDs = df['ID'].unique()
    big = []
    for ID in IDs:
        result = [-2, -2, -2, -2, -2]
        order = ['service', 'food', 'anecdotes/miscellaneous', 'price', 'ambience']
        item = df[df['ID'] == ID]
        for index, row in item.iterrows():
            result[order.index(row['Aspect'])] = f(row['Polarity'])
        result += [ID, item['Text'].values[0]]
        big.append(result)
    res = pd.DataFrame(big)
    res.columns = order + ['ID', 'Text']
    
    return res


rest_14_train = get_info(rest_14_train)
rest_14_test = get_info(rest_14_test)

In [8]:
# generate overall rating
# most samples only has one aspect
# we use majority score here

def get_overall(df):
    overall = []
    for item in df.values:
        item = item[:5]
        v = sum([i for i in item if i != -2])
        if v >= 1:
            overall.append(2)
        elif v == 0:
            overall.append(1)
        else:
            overall.append(0)
    
    return overall

rest_14_train['Overall'] = get_overall(rest_14_train)
rest_14_test['Overall'] = get_overall(rest_14_test)

In [9]:
rest_14_train['process_review'] = rest_14_train['Text'].str.lower().replace("""[;:)"’!.?,‘”“(><_'-+/]""", "", regex=True).apply(word_tokenize).apply(text_process)
rest_14_test['process_review'] = rest_14_test['Text'].str.lower().replace("""[;:)"’!.?,‘”“(><_'-+/]""", "", regex=True).apply(word_tokenize).apply(text_process)

In [16]:
rest_14_train['n_of_tokens'] = rest_14_train['process_review'].str.split().apply(lambda x:len(x))
rest_14_test['n_of_tokens'] = rest_14_test['process_review'].str.split().apply(lambda x:len(x))

rest_14_train = rest_14_train[rest_14_train['n_of_tokens']>0]
rest_14_test = rest_14_test[rest_14_test['n_of_tokens']>0]

In [20]:
# Get dev set
rest_14_dev = rest_14_train.sample(frac=0.1, random_state=1)
rest_14_train.drop(rest_14_dev.index, axis=0, inplace=True)

rest_14_train.shape, rest_14_dev.shape, rest_14_test.shape

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


((2734, 10), (304, 10), (800, 10))

In [21]:
# Generate corpus for ABAE
all_reviews = rest_14_train['process_review'].values.tolist()
with open("../data/processed/rest_14_comments.txt", 'w', encoding='utf-8') as f:
    for r in all_reviews:
        f.write(str(r))
        f.write('\n')
        
rest_14_train.to_csv(r"../data/processed/rest_14_train.csv", index=False)
rest_14_dev.to_csv(r"../data/processed/rest_14_dev.csv", index=False)
rest_14_test.to_csv(r"../data/processed/rest_14_test.csv", index=False)

### SemEval 2015

train

In [22]:
with open(r"../data/raw/SemEval-2015-Task-12-REST/ABSA15_RestaurantsTrain/ABSA-15_Restaurants_Train_Final.xml") as f:
    xml = f.read()

soup = BeautifulSoup(xml, 'html.parser')
sentences = soup.find_all("sentence")
IDs = []
texts = []
cats = []
pols = []

for s in sentences:
    items = s.find_all("opinion")
    for i in items:
        IDs.append(s.attrs['id'])
        texts.append(s.find("text").text)
        cats.append(i.attrs['category'])
        pols.append(i.attrs['polarity'])

In [23]:
rest_15_train = pd.DataFrame({"ID":IDs, "Text":texts, "Aspect":cats, "Polarity":pols})
rest_15_train.shape

(1654, 4)

test

In [24]:
with open(r"../data/raw/SemEval-2015-Task-12-REST/ABSA15_Restaurants_Test.xml", encoding='utf-8') as f:
    xml = f.read()

soup = BeautifulSoup(xml, 'html.parser')
sentences = soup.find_all("sentence")
IDs = []
texts = []
cats = []
pols = []

for s in sentences:
    items = s.find_all("opinion")
    for i in items:
        IDs.append(s.attrs['id'])
        texts.append(s.find("text").text)
        cats.append(i.attrs['category'])
        pols.append(i.attrs['polarity'])

In [25]:
rest_15_test = pd.DataFrame({"ID":IDs, "Text":texts, "Aspect":cats, "Polarity":pols})
rest_15_test.shape

(845, 4)

In [26]:
# rest 14直接是aspect，但15，16出现了category和aspect，所以需多一个处理
def f(x):
    return x.split("#")[0]

rest_15_train['Aspect_Category'] = rest_15_train['Aspect'].apply(lambda x:f(x))
rest_15_test['Aspect_Category'] = rest_15_test['Aspect'].apply(lambda x:f(x))

In [27]:
# 扭转数据集至1对多
# 即一个句子对应多个aspect（如果有多个的话）
def f(x):
    if x == 'positive':
        return 1
    elif x == 'negative':
        return -1
    else:
        return 0
    

def get_info(df):
    IDs = df['ID'].unique()
    big = []
    for ID in IDs:
        result = [-2, -2, -2, -2, -2, -2]
        order = ['RESTAURANT', 'SERVICE', 'FOOD', 'DRINKS', 'AMBIENCE', 'LOCATION']
        item = df[df['ID'] == ID]
        for index, row in item.iterrows():
            result[order.index(row['Aspect_Category'])] = f(row['Polarity'])
        result += [ID, item['Text'].values[0]]
        big.append(result)
    res = pd.DataFrame(big)
    res.columns = order + ['ID', 'Text']
    
    return res


rest_15_train = get_info(rest_15_train)
rest_15_test = get_info(rest_15_test)

In [28]:
# 生成overall rating
# 大部分都仅有一个aspect，这里我们就把非-2的项加起来作为overall
def get_overall(df):
    overall = []
    for item in df.values:
        item = item[:6]
        v = sum([i for i in item if i != -2])
        if v >= 1:
            overall.append(2)
        elif v == 0:
            overall.append(1)
        else:
            overall.append(0)
    
    return overall

rest_15_train['Overall'] = get_overall(rest_15_train)
rest_15_test['Overall'] = get_overall(rest_15_test)

In [29]:
# 文本处理
rest_15_train['process_review'] = rest_15_train['Text'].str.lower().replace("""[;:)"’!.?,‘”“(><_'-+/]""", "", regex=True).apply(word_tokenize).apply(text_process)
rest_15_test['process_review'] = rest_15_test['Text'].str.lower().replace("""[;:)"’!.?,‘”“(><_'-+/]""", "", regex=True).apply(word_tokenize).apply(text_process)

In [30]:
# 把文本处理后为0的直接删除
rest_15_train['n_of_tokens'] = rest_15_train['process_review'].str.split().apply(lambda x:len(x))
rest_15_test['n_of_tokens'] = rest_15_test['process_review'].str.split().apply(lambda x:len(x))

rest_15_train = rest_15_train[rest_15_train['n_of_tokens']>0]
rest_15_test = rest_15_test[rest_15_test['n_of_tokens']>0]

In [31]:
# 分出dev set
rest_15_dev = rest_15_train.sample(frac=0.1, random_state=1)
rest_15_train.drop(rest_15_dev.index, axis=0, inplace=True)

rest_15_train.shape, rest_15_dev.shape, rest_15_test.shape

((1007, 11), (112, 11), (582, 11))

In [32]:
# 生成ABAE需要的语料
all_reviews = rest_15_train['process_review'].values.tolist()
with open("../data/processed/rest_15_comments.txt", 'w', encoding='utf-8') as f:
    for r in all_reviews:
        f.write(str(r))
        f.write('\n')
        
rest_15_train.to_csv(r"../data/processed/rest_15_train.csv", index=False)
rest_15_dev.to_csv(r"../data/processed/rest_15_dev.csv", index=False)
rest_15_test.to_csv(r"../data/processed/rest_15_test.csv", index=False)

### SemEval 2016

train

In [33]:
with open(r"../data/raw/SemEval-2016-Task-5-REST-SB1/ABSA16_Restaurants_Train_SB1_v2.xml", encoding='utf-8') as f:
    xml = f.read()

soup = BeautifulSoup(xml, 'html.parser')
sentences = soup.find_all("sentence")
IDs = []
texts = []
cats = []
pols = []

for s in sentences:
    items = s.find_all("opinion")
    for i in items:
        IDs.append(s.attrs['id'])
        texts.append(s.find("text").text)
        cats.append(i.attrs['category'])
        pols.append(i.attrs['polarity'])

In [34]:
rest_16_train = pd.DataFrame({"ID":IDs, "Text":texts, "Aspect":cats, "Polarity":pols})
rest_16_train.shape

(2507, 4)

test

In [35]:
with open(r"../data/raw/SemEval-2016-Task-5-REST-SB1/EN_REST_SB1_TEST.xml", encoding='utf-8') as f:
    xml = f.read()

soup = BeautifulSoup(xml, 'html.parser')
sentences = soup.find_all("sentence")
IDs = []
texts = []
cats = []
pols = []

for s in sentences:
    items = s.find_all("opinion")
    for i in items:
        IDs.append(s.attrs['id'])
        texts.append(s.find("text").text)
        cats.append(i.attrs['category'])
        pols.append(i.attrs['polarity'])

In [36]:
rest_16_test = pd.DataFrame({"ID":IDs, "Text":texts, "Aspect":cats, "Polarity":pols})
rest_16_test.shape

(859, 4)

In [37]:
# rest 14直接是aspect，但15，16出现了category和aspect，所以需多一个处理
def f(x):
    return x.split("#")[0]

rest_16_train['Aspect_Category'] = rest_16_train['Aspect'].apply(lambda x:f(x))
rest_16_test['Aspect_Category'] = rest_16_test['Aspect'].apply(lambda x:f(x))

In [38]:
# 扭转数据集至1对多
# 即一个句子对应多个aspect（如果有多个的话）
def f(x):
    if x == 'positive':
        return 1
    elif x == 'negative':
        return -1
    else:
        return 0
    

def get_info(df):
    IDs = df['ID'].unique()
    big = []
    for ID in IDs:
        result = [-2, -2, -2, -2, -2, -2]
        order = ['RESTAURANT', 'SERVICE', 'FOOD', 'DRINKS', 'AMBIENCE', 'LOCATION']
        item = df[df['ID'] == ID]
        for index, row in item.iterrows():
            result[order.index(row['Aspect_Category'])] = f(row['Polarity'])
        result += [ID, item['Text'].values[0]]
        big.append(result)
    res = pd.DataFrame(big)
    res.columns = order + ['ID', 'Text']
    
    return res


rest_16_train = get_info(rest_16_train)
rest_16_test = get_info(rest_16_test)

In [39]:
# 生成overall rating
# 大部分都仅有一个aspect，这里我们就把非-2的项加起来作为overall
def get_overall(df):
    overall = []
    for item in df.values:
        item = item[:6]
        v = sum([i for i in item if i != -2])
        if v >= 1:
            overall.append(2)
        elif v == 0:
            overall.append(1)
        else:
            overall.append(0)
    
    return overall

rest_16_train['Overall'] = get_overall(rest_16_train)
rest_16_test['Overall'] = get_overall(rest_16_test)

In [40]:
# 文本处理
rest_16_train['process_review'] = rest_16_train['Text'].str.lower().replace("""[;:)"’!.?,‘”“(><_'-+/]""", "", regex=True).apply(word_tokenize).apply(text_process)
rest_16_test['process_review'] = rest_16_test['Text'].str.lower().replace("""[;:)"’!.?,‘”“(><_'-+/]""", "", regex=True).apply(word_tokenize).apply(text_process)

In [41]:
# 把文本处理后为0的直接删除
rest_16_train['n_of_tokens'] = rest_16_train['process_review'].str.split().apply(lambda x:len(x))
rest_16_test['n_of_tokens'] = rest_16_test['process_review'].str.split().apply(lambda x:len(x))

rest_16_train = rest_16_train[rest_16_train['n_of_tokens']>0]
rest_16_test = rest_16_test[rest_16_test['n_of_tokens']>0]

In [42]:
# 分出dev set
rest_16_dev = rest_16_train.sample(frac=0.1, random_state=1)
rest_16_train.drop(rest_16_dev.index, axis=0, inplace=True)

rest_16_train.shape, rest_16_dev.shape, rest_16_test.shape

((1535, 11), (171, 11), (587, 11))

In [43]:
# 生成ABAE需要的语料
all_reviews = rest_16_train['process_review'].values.tolist()
with open("../data/processed/rest_16_comments.txt", 'w', encoding='utf-8') as f:
    for r in all_reviews:
        f.write(str(r))
        f.write('\n')
        
rest_16_train.to_csv(r"../data/processed/rest_16_train.csv", index=False)
rest_16_dev.to_csv(r"../data/processed/rest_16_dev.csv", index=False)
rest_16_test.to_csv(r"../data/processed/rest_16_test.csv", index=False)

### MAMS

In [44]:
def read(path):
    with open(path) as f:
        xml = f.read()
    soup = BeautifulSoup(xml, 'xml')
    return soup
        
train_soup = read("../data/raw/MAMS/train.xml")
val_soup = read("../data/raw/MAMS/val.xml")
test_soup = read("../data/raw/MAMS/test.xml")

In [45]:
# 将xml变为dataframe
def process(soup, split='train'):
    def pol_2_sco(pol):
        if pol == 'neutral':
            return 0
        elif pol == 'negative':
            return -1
        else:
            return 1
    
    values = []
    for item in soup.find_all("sentence"):
        dic = {'ambience':-2, 'food':-2, 'menu':-2, 'miscellaneous':-2, 'place':-2, 'price':-2, 'service':-2, 'staff':-2}
        groups = item.find_all("aspectCategory")
        for g in groups:
            dic[g.attrs['category']] = pol_2_sco(g.attrs['polarity'])

        lst = [item.find("text").text] + [split] + list(dic.values())
        values.append(lst)
    
    return values

In [46]:
train_values = process(train_soup, split='train')
val_values = process(val_soup, split='val')
test_values = process(test_soup, split='test')

len(train_values), len(val_values), len(test_values)

(3149, 400, 400)

In [47]:
values = train_values + val_values + test_values
df = pd.DataFrame(values)
df.columns = ['text', 'split', 'ambience', 'food', 'menu', 'miscellaneous', 'place', 'price', 'service', 'staff']

In [48]:
# mams本来就长相很好，不需要扭转
# 生成overall rating
# 虽然不像rest，mams大部分都有两个aspect，但这里处理方法还是一样。把非-2的项加起来作为overall
def get_overall(df):
    overall = []
    for item in df.values:
        item = item[2:]
        v = sum([i for i in item if i != -2])
        if v >= 1:
            overall.append(2)
        elif v == 0:
            overall.append(1)
        else:
            overall.append(0)
    
    return overall

df['Overall'] = get_overall(df)

In [49]:
# 文本处理
df['process_review'] = df['text'].str.lower().replace("""[;:)"’!.?,‘”“(><_'-+/]""", "", regex=True).apply(word_tokenize).apply(text_process)

In [51]:
# 把文本处理后为0的直接删除
df['n_of_tokens'] = df['process_review'].str.split().apply(lambda x:len(x))
df = df[df['n_of_tokens']>0]

In [53]:
# 生成ABAE需要的语料
all_reviews = df[df['split'] == 'train']['process_review'].values.tolist()
with open("../data/processed/mams_comments.txt", 'w', encoding='utf-8') as f:
    for r in all_reviews:
        f.write(str(r))
        f.write('\n')
        

df.to_csv(r"../data/processed/mams.csv", index=False)