In [1]:
import pandas as pd

# Data

In [2]:
# Read the three preprocessed splits (UTF-8 CSV files) in a fixed order:
# training, development, test.
train, dev, test = (
    pd.read_csv(csv_path, encoding="utf-8")
    for csv_path in (
        "data/preprocessed_training_set.csv",
        "data/preprocessed_dev_set.csv",
        "data/preprocessed_test_set.csv",
    )
)

# Preprocessing

In [3]:
from ast import literal_eval

# For every split: the "Sentence" column is the model input, and the
# "Polarity" column is parsed from text into Python objects with literal_eval
# (presumably a list of {aspect: polarity} dicts -- see create_true_label).
X_train, y_train = train["Sentence"], train["Polarity"].apply(literal_eval)
X_dev, y_dev = dev["Sentence"], dev["Polarity"].apply(literal_eval)
X_test, y_test = test["Sentence"], test["Polarity"].apply(literal_eval)

In [4]:
def get_aspect_polarity(item):
    """Format one aspect/polarity mapping as an ``"aspect:polarity"`` tag.

    Parameters
    ----------
    item : dict
        Mapping from an aspect name to its polarity. If it contains more than
        one pair, the last pair in iteration order is used.

    Returns
    -------
    str
        The tag ``f"{aspect}:{polarity}"``.

    Raises
    ------
    ValueError
        If ``item`` is empty, since there is no pair to build a tag from.
    """
    pairs = list(item.items())
    if not pairs:
        # Fail loudly with a clear message instead of an UnboundLocalError
        # raised from the f-string below.
        raise ValueError("cannot build an 'aspect:polarity' tag from an empty mapping")
    aspect, polarity = pairs[-1]
    return f"{aspect}:{polarity}"


def create_true_label(label):
    """Convert one sample's annotations into a list of tag strings.

    Every element of ``label`` is passed to ``get_aspect_polarity``; the
    resulting strings are returned in the same order as the input.
    """
    tags = []
    for entry in label:
        tags.append(get_aspect_polarity(entry))
    return tags

In [5]:
# Replace each sample's parsed annotations with its list of
# "aspect:polarity" tag strings.
# NOTE: the y_* names are rebound here, so this cell is not re-runnable on its
# own -- re-run the cell that parses "Polarity" first.
y_train, y_dev, y_test = (
    y_train.apply(create_true_label),
    y_dev.apply(create_true_label),
    y_test.apply(create_true_label),
)

## Label encoding

In [6]:
from sklearn.preprocessing import MultiLabelBinarizer

# The label vocabulary is learned from the training tags only; the same fitted
# binarizer then encodes every split as a 0/1 indicator row per sample.
mlb = MultiLabelBinarizer().fit(y_train)

# NOTE: the y_* names are rebound from tag lists to indicator matrices here.
y_train, y_dev, y_test = (
    mlb.transform(tag_lists) for tag_lists in (y_train, y_dev, y_test)
)

In [7]:
# Get a sense of what the encoded tags look like: the last test row as an
# indicator vector, the same row decoded back to tags, and the full class list.
print(
    y_test[-1],
    mlb.inverse_transform(y_test[-1].reshape(1, -1)),
    mlb.classes_,
    sep="\n",
)

[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0]
[('FOOD#STYLE&OPTIONS:positive', 'RESTAURANT#PRICES:neutral')]
['AMBIENCE#GENERAL:negative' 'AMBIENCE#GENERAL:neutral'
 'AMBIENCE#GENERAL:positive' 'DRINKS#PRICES:negative'
 'DRINKS#PRICES:neutral' 'DRINKS#PRICES:positive'
 'DRINKS#QUALITY:negative' 'DRINKS#QUALITY:neutral'
 'DRINKS#QUALITY:positive' 'DRINKS#STYLE&OPTIONS:negative'
 'DRINKS#STYLE&OPTIONS:neutral' 'DRINKS#STYLE&OPTIONS:positive'
 'FOOD#PRICES:negative' 'FOOD#PRICES:neutral' 'FOOD#PRICES:positive'
 'FOOD#QUALITY:negative' 'FOOD#QUALITY:neutral' 'FOOD#QUALITY:positive'
 'FOOD#STYLE&OPTIONS:negative' 'FOOD#STYLE&OPTIONS:neutral'
 'FOOD#STYLE&OPTIONS:positive' 'LOCATION#GENERAL:negative'
 'LOCATION#GENERAL:neutral' 'LOCATION#GENERAL:positive'
 'RESTAURANT#GENERAL:negative' 'RESTAURANT#GENERAL:neutral'
 'RESTAURANT#GENERAL:positive' 'RESTAURANT#MISCELLANEOUS:negative'
 'RESTAURANT#MISCELLANEOUS:neutral' 'RESTAURANT#MISCELLANEOUS:positive'
 'RESTAURANT#

# Model

In [8]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.svm import LinearSVC
from sklearn.multiclass import OneVsRestClassifier

In [9]:
# Steps of the text classifier, in execution order.
pipeline = [
    # Bag of unigrams and bigrams, with document-frequency cut-offs.
    (
        "CountVectorizer",
        CountVectorizer(ngram_range=(1, 2), max_df=0.5, min_df=5),
    ),
    # NOTE(review): use_idf=False, so despite the step name no IDF weighting
    # is applied and smooth_idf presumably has no effect -- confirm intent.
    (
        "tfidf",
        TfidfTransformer(
            use_idf=False,
            sublinear_tf=True,
            norm="l2",
            smooth_idf=True,
        ),
    ),
    # One linear SVM per label column.
    ("classifier", OneVsRestClassifier(LinearSVC())),
]
clf = Pipeline(pipeline)

# Training

In [10]:
# Fit the whole pipeline on the training split, then predict the indicator
# matrix for the test sentences.
y_pred = clf.fit(X_train, y_train).predict(X_test)

# Evaluation

In [11]:
from sklearn.metrics import f1_score, classification_report, accuracy_score

In [12]:
# NOTE(review): with multilabel indicator input this is presumably subset
# (exact-match) accuracy, i.e. every label of a sample must be right -- confirm.
print(accuracy_score(y_true=y_test, y_pred=y_pred))

0.31320949432404543


In [13]:
# Micro-averaged F1 over all label columns of the test split.
print(f1_score(y_true=y_test, y_pred=y_pred, average="micro"))

0.5212633673215619


In [14]:
# Per-label precision / recall / F1, with rows named after the binarizer's
# classes; zero_division=0 reports 0 for labels where a metric is undefined.
print(classification_report(
    y_true=y_test,
    y_pred=y_pred,
    target_names=mlb.classes_,
    zero_division=0,
))

                                   precision    recall  f1-score   support

        AMBIENCE#GENERAL:negative       0.83      0.27      0.41        37
         AMBIENCE#GENERAL:neutral       0.40      0.13      0.20        30
        AMBIENCE#GENERAL:positive       0.87      0.66      0.75       160
           DRINKS#PRICES:negative       0.00      0.00      0.00         5
            DRINKS#PRICES:neutral       0.50      0.06      0.11        32
           DRINKS#PRICES:positive       0.00      0.00      0.00        10
          DRINKS#QUALITY:negative       0.60      0.12      0.19        26
           DRINKS#QUALITY:neutral       0.75      0.06      0.12        48
          DRINKS#QUALITY:positive       0.63      0.49      0.55       129
    DRINKS#STYLE&OPTIONS:negative       0.00      0.00      0.00        10
     DRINKS#STYLE&OPTIONS:neutral       0.66      0.34      0.45        62
    DRINKS#STYLE&OPTIONS:positive       0.88      0.12      0.22        57
             FOOD#PRICES

In [15]:
import numpy as np

# Decode predicted and true indicator rows back into tuples of tag strings.
pred, act = mlb.inverse_transform(np.array(y_pred)), mlb.inverse_transform(y_test)

# Side-by-side view of sentence, gold tags and predicted tags.
df = pd.DataFrame(
    {
        "Body": X_test,
        "Actual": act,
        "Predicted": pred,
    }
)
# Show ten random rows (unseeded, so the selection changes between runs).
df.sample(n=10)

Unnamed: 0,Body,Actual,Predicted
982,"tuy_nhiên bù lại không_gian khá lạ , có nhiều ...","(AMBIENCE#GENERAL:positive, RESTAURANT#MISCELL...",()
535,nhà_hàng ko có được cái chỗ giữ xe cho ra_hồn .,"(RESTAURANT#MISCELLANEOUS:negative,)","(RESTAURANT#MISCELLANEOUS:negative,)"
1517,"giá khá mắc nhưng chấp_nhận đc , phục_vụ thân_...","(RESTAURANT#PRICES:negative, SERVICE#GENERAL:p...","(SERVICE#GENERAL:positive,)"
1613,"bánh ship tới nhanh , nóng_hổi vừa ăn .","(FOOD#QUALITY:positive, SERVICE#GENERAL:positive)","(FOOD#QUALITY:positive, SERVICE#GENERAL:positive)"
1499,dở khủng_khiếp trong khi phải xếp_hàng dài gần...,"(RESTAURANT#GENERAL:negative, SERVICE#GENERAL:...","(FOOD#QUALITY:negative,)"
488,"à , ngoài_ra còn có vài món nhậu bình_dân , ch...","(FOOD#STYLE&OPTIONS:neutral,)",()
833,"vị_trí đẹp , thoáng_đãng .","(LOCATION#GENERAL:positive,)","(AMBIENCE#GENERAL:positive, LOCATION#GENERAL:p..."
1847,"tuy_nhiên cục giò mỡ quá , thịt còn hôi mùi bò...","(FOOD#QUALITY:negative, FOOD#STYLE&OPTIONS:pos...","(FOOD#QUALITY:negative,)"
1036,ở đây ngay tầng một vào là thấy rồi các bạn có...,"(RESTAURANT#GENERAL:positive,)",()
157,"khu này đông_đúc và nhiều món ăn phết ấy , lại...","(FOOD#PRICES:positive, FOOD#STYLE&OPTIONS:posi...","(FOOD#STYLE&OPTIONS:positive,)"
