### ASAP

In [1]:
import pandas as pd
import jieba
import re
from tqdm import tqdm
import gensim
import codecs
import zhconv

In [7]:
train_df = pd.read_csv(r"../data/raw/ASAP/train.csv")
dev_df = pd.read_csv(r"../data/raw/ASAP/dev.csv")
test_df = pd.read_csv(r"../data/raw/ASAP/test.csv")

train_df['star'].value_counts().sort_index()

1.0     1219
2.0     1258
3.0     5241
4.0    13362
5.0    15770
Name: star, dtype: int64

In [8]:
# Convert 5-star to three polarities
def f(x):
    if x < 3.:
        return 0
    elif x > 3.:
        return 2
    else:
        return 1

train_df['sentiment'] = train_df['star'].apply(lambda x:f(x))
dev_df['sentiment'] = dev_df['star'].apply(lambda x:f(x))
test_df['sentiment'] = test_df['star'].apply(lambda x:f(x))

train_df['sentiment'].value_counts()

2    29132
1     5241
0     2477
Name: sentiment, dtype: int64

Convert coarse to fine aspects and polarities

In [11]:
def process_df(df):
    def coarse_2_fine(lst):
        # 将该lst内的细粒度的sentiment变为粗粒度的
        lst = list(set([i for i in lst if i!=-2]))
        if len(lst) == 0:
            return -2
        elif len(lst) == 1:
            return lst[0]
        else: # 此时len(lst)只可能为2or3
            lst = list(set([i for i in lst if i!=0])) # 排除掉0
            if len(lst) == 1:
                return lst[0]
            else:
                return 0
    data = []
    for i in df.values:
        item = i[3:21]
        Location = item[:3]
        Service = item[3:7]
        Price = item[7:10]
        Ambience = item[10:14]
        Food = item[14:18]
        res = [coarse_2_fine(i) for i in [Location, Service, Price, Ambience, Food]]

        data.append(res)
    
    sup = pd.DataFrame(data)
    sup.columns = ['Location', 'Service', 'Price', 'Ambience', 'Food']
    
    sup = sup + 1
    
    return sup

In [12]:
train_df = pd.concat([train_df, process_df(train_df)], axis=1)
dev_df = pd.concat([dev_df, process_df(dev_df)], axis=1)
test_df = pd.concat([test_df, process_df(test_df)], axis=1)

In [15]:
train_df.to_csv(r"../data/processed/ASAP_train.csv", index=False)
dev_df.to_csv(r"../data/processed/ASAP_dev.csv", index=False)
test_df.to_csv(r"../data/processed/ASAP_test.csv", index=False)

In [16]:
train_df.head(100).to_csv(r"../data/processed/ASAP_sample.csv", index=False)

### TripDMS

In [2]:
import re
import pandas as pd
import tqdm
import nltk
from nltk.corpus import stopwords
from nltk.sentiment.vader import SentimentIntensityAnalyzer

In [3]:
def f(filename):
    with open(r"../data/raw/TripDMS/" + filename, 'r') as f:
        text = [i.strip('\n') for i in f.readlines()]
    
    data = []
    for i in text:
        values = i.split('\t')[0].split(" ")
        values = [int(i) for i in values]
        review = i.split('\t')[2].replace('<ssssss>', '')
        values.append(review)
        data.append(values)
    
    df = pd.DataFrame(data)
    df.columns = ['Overall', 'value', 'room', 'location', 'cleanliness', 'checkin', 'service', 'business', 'review']
    
    return df


train_df = f('train') 
dev_df = f('dev')
test_df = f('test')
train_df.shape, dev_df.shape, test_df.shape

((23515, 9), (2939, 9), (2939, 9))

In [4]:
# 5-star to three sentiment polarities
def f_overall(x):
    if x == -1:
        return -1
    elif x < 3.:
        return 0
    elif x > 3.:
        return 2
    else:
        return 1

def f(x):
    if x == -1:
        return -1
    elif x < 3.:
        return 0
    elif x > 3.:
        return 2
    else:
        return 1    
    
c = 'Overall'
train_df[c] = train_df[c].apply(f_overall)
dev_df[c] = dev_df[c].apply(f_overall)
test_df[c] = test_df[c].apply(f_overall)
    

for o in ['value', 'room', 'location', 'cleanliness', 'checkin', 'service', 'business']:
    train_df[o] = train_df[o].apply(f)
    dev_df[o] = dev_df[o].apply(f)
    test_df[o] = test_df[o].apply(f)

In [5]:
train_df.to_csv(r"../data/processed/TripDMS_train.csv", index=False)
dev_df.to_csv(r"../data/processed/TripDMS_dev.csv", index=False)
test_df.to_csv(r"../data/processed/TripDMS_test.csv", index=False)

In [6]:
train_df.head(100).to_csv(r"../data/processed/TripDMS_sample.csv", index=False)

### Generate data for supervised ACSA models

In [1]:
import pandas as pd

In [3]:
train_df = pd.read_excel(r"../data/processed/TripDMS_train.xlsx")
dev_df = pd.read_excel(r"../data/processed/TripDMS_dev.xlsx")
test_df = pd.read_excel(r"../data/processed/TripDMS_test.xlsx")

In [4]:
def f(df):
    data = []
    for index, row in df.iterrows():
        aspects = [row['value'], row['room'], row['location'], row['clean'], row['check in'], row['service'], row['business']]
        for i in range(7):
            if aspects[i] != -2:
                v = aspects[i] + 2
            else:
                v = 0
            data.append([row['review'], row['process_review'], i, v])
    
    new_df = pd.DataFrame(data)
    new_df.columns = ['review', 'process_review', 'asp', 'asp_senti']
    
    return new_df

In [6]:
new_train_df = f(train_df)
new_dev_df = f(dev_df)
new_test_df = f(test_df)

In [10]:
# ACSA don't need 0
new2_df = new_df[new_df['asp_senti'] != 0]
new2_df.shape

(178137, 5)

In [13]:
new2_df['asp_senti'] = new2_df['asp_senti'] - 1

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [15]:
new2_df['asp_senti'].value_counts()

2    81160
0    54177
1    42800
Name: asp_senti, dtype: int64

In [17]:
new2_df.to_csv(r"./data/processed/TripDMS_for_super.csv", index=False)

### Sample data for low-resource scenario

In [1]:
import pandas as pd

ASAP

In [11]:
asap_train = pd.read_csv(r"../data/processed/ASAP_train.csv")
asap_train_sample = asap_train.sample(frac=0.2)

In [12]:
def f(df):
    data = []
    for index, row in df.iterrows():
        aspects = [row['Location'], row['Service'], row['Price'], row['Ambience'], row['Food']]
        for i in range(5):
            if aspects[i] != -2:
                v = aspects[i] + 1
            else:
                v = -1
            data.append([row['process_review'], i, v])
    
    new_df = pd.DataFrame(data)
    new_df.columns = ['process_review', 'asp', 'asp_senti']
    
    return new_df

In [13]:
asap_super_train_sample = f(asap_train_sample)

In [14]:
asap_super_train_sample = asap_super_train_sample[asap_super_train_sample['asp_senti'] != -1]
asap_super_train_sample.shape

(24273, 3)

In [17]:
asap_train_sample.to_csv(r"../data/processed/ASAP_train_sample.csv", index=False)
asap_super_train_sample.to_csv(r"../data/processed/ASAP_super_train_sample.csv", index=False)

TripDMS

In [18]:
trip_train = pd.read_excel(r"../data/processed/TripDMS_train.xlsx")
trip_train_sample = trip_train.sample(frac=1/7)

In [19]:
def f(df):
    data = []
    for index, row in df.iterrows():
        aspects = [row['value'], row['room'], row['location'], row['clean'], row['check in'], row['service'], row['business']]
        for i in range(7):
            if aspects[i] != -2:
                v = aspects[i] + 1
            else:
                v = -1
            data.append([row['process_review'], i, v])
    
    new_df = pd.DataFrame(data)
    new_df.columns = ['process_review', 'asp', 'asp_senti']
    
    return new_df

In [20]:
trip_super_train_sample = f(trip_train_sample)

In [21]:
trip_super_train_sample = trip_super_train_sample[trip_super_train_sample['asp_senti'] != -1]
trip_super_train_sample.shape

(20327, 3)

In [23]:
trip_train_sample.to_excel(r"../data/processed/TripDMS_train_sample.xlsx", index=False)
trip_super_train_sample.to_excel(r"../data/processed/TripDMS_super_train_sample.xlsx", index=False)