In [1]:
import pandas as pd

# Data

In [2]:
# Read the three preprocessed splits (training / dev / test) in one pass;
# every file is decoded as UTF-8.
train, dev, test = (
    pd.read_csv(csv_path, encoding="utf-8")
    for csv_path in (
        "data/preprocessed_training_set.csv",
        "data/preprocessed_dev_set.csv",
        "data/preprocessed_test_set.csv",
    )
)

# Preprocessing

In [3]:
from ast import literal_eval

# For each split: the "Sentence" column is the model input, and every cell of
# the "Aspect" column is parsed from its string form with literal_eval
# (presumably a list/tuple of aspect labels per sentence -- verify against the CSVs).
X_train, y_train = train["Sentence"], train["Aspect"].apply(literal_eval)
X_dev, y_dev = dev["Sentence"], dev["Aspect"].apply(literal_eval)
X_test, y_test = test["Sentence"], test["Aspect"].apply(literal_eval)

## Label encoding

In [4]:
from sklearn.preprocessing import MultiLabelBinarizer

# Learn the label vocabulary from the training split only, then encode every
# split as a binary indicator matrix with one column per known label.
mlb = MultiLabelBinarizer().fit(y_train)

# NOTE: this rebinds y_train / y_dev / y_test to their encoded form, so the
# cell is meant to run once per kernel session (re-running it would fit the
# binarizer on the already-encoded matrix).
y_train, y_dev, y_test = (
    mlb.transform(labels) for labels in (y_train, y_dev, y_test)
)

In [5]:
# Sanity check of the encoded labels: show the last test row as a binary
# vector, decode it back to label names, and list the learned classes.
print(y_test[-1])
print(mlb.inverse_transform(y_test[-1:]))  # [-1:] keeps the row 2-D: shape (1, n_classes)
print(mlb.classes_)

[0 0 0 0 0 0 1 0 0 0 1 0]
[('FOOD#STYLE&OPTIONS', 'RESTAURANT#PRICES')]
['AMBIENCE#GENERAL' 'DRINKS#PRICES' 'DRINKS#QUALITY'
 'DRINKS#STYLE&OPTIONS' 'FOOD#PRICES' 'FOOD#QUALITY' 'FOOD#STYLE&OPTIONS'
 'LOCATION#GENERAL' 'RESTAURANT#GENERAL' 'RESTAURANT#MISCELLANEOUS'
 'RESTAURANT#PRICES' 'SERVICE#GENERAL']


# Model

In [6]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.svm import LinearSVC
from sklearn.multiclass import OneVsRestClassifier

In [7]:
# Text-classification pipeline:
#   1. CountVectorizer  - unigram + bigram counts; terms appearing in more than
#                         50% of the documents or in fewer than 5 documents are dropped.
#   2. TfidfTransformer - IDF is disabled, so this step only applies sublinear
#                         term-frequency scaling and L2 row normalisation
#                         (smooth_idf has no effect while use_idf=False).
#   3. OneVsRestClassifier(LinearSVC) - one binary linear SVM per label.
#
# LinearSVC is given a fixed random_state so that fitting -- and therefore the
# metrics reported below -- is reproducible across kernel restarts.
pipeline = [
    ("CountVectorizer",
     CountVectorizer(ngram_range=(1, 2), max_df=0.5, min_df=5)),
    ("tfidf",
     TfidfTransformer(use_idf=False,
                      sublinear_tf=True,
                      norm="l2",
                      smooth_idf=True)),
    ("classifier", OneVsRestClassifier(LinearSVC(random_state=42))),
]
clf = Pipeline(pipeline)

# Training

In [8]:
# Fit the whole pipeline on the training split and predict the label
# indicator matrix for the test split in one chained call.
# NOTE(review): the dev split is not used in the cells shown here.
y_pred = clf.fit(X_train, y_train).predict(X_test)

# Evaluation

In [9]:
from sklearn.metrics import f1_score, classification_report, accuracy_score

In [10]:
# With multilabel indicator input, accuracy_score is subset accuracy: a
# sentence counts as correct only if its whole predicted label set matches.
print(accuracy_score(y_true=y_test, y_pred=y_pred))

0.5252837977296182


In [11]:
# Micro-averaged F1: true/false positives and negatives are pooled over all
# labels before computing the score.
print(f1_score(y_true=y_test, y_pred=y_pred, average="micro"))

0.7342304457527333


In [12]:
# Per-label precision / recall / F1 with readable label names;
# zero_division=0 reports 0 (without a warning) for undefined metrics.
print(classification_report(
    y_true=y_test,
    y_pred=y_pred,
    target_names=mlb.classes_,
    zero_division=0,
))

                          precision    recall  f1-score   support

        AMBIENCE#GENERAL       0.94      0.77      0.85       227
           DRINKS#PRICES       0.50      0.13      0.20        47
          DRINKS#QUALITY       0.78      0.72      0.75       203
    DRINKS#STYLE&OPTIONS       0.77      0.56      0.65       129
             FOOD#PRICES       0.56      0.17      0.26       112
            FOOD#QUALITY       0.80      0.79      0.79       554
      FOOD#STYLE&OPTIONS       0.78      0.68      0.72       437
        LOCATION#GENERAL       0.96      0.68      0.80       104
      RESTAURANT#GENERAL       0.75      0.53      0.62       251
RESTAURANT#MISCELLANEOUS       0.86      0.50      0.63       145
       RESTAURANT#PRICES       0.85      0.65      0.74       117
         SERVICE#GENERAL       0.92      0.81      0.86       303

               micro avg       0.82      0.66      0.73      2629
               macro avg       0.79      0.58      0.66      2629
        

In [13]:
import numpy as np

# Decode the binary indicator rows back into tuples of label names so the
# predictions can be read side by side with the gold annotations.
pred = mlb.inverse_transform(np.asarray(y_pred))
act = mlb.inverse_transform(y_test)

df = pd.DataFrame({"Body": X_test, "Actual": act, "Predicted": pred})

# Show a small random sample of rows. The seed keeps the displayed rows
# identical across re-runs, and the size is capped so that a frame with
# fewer than 10 rows does not raise.
df.sample(min(10, len(df)), random_state=42)

Unnamed: 0,Body,Actual,Predicted
1698,hình_như nhiệt_độ không chuẩn lắm gọi medium r...,"(FOOD#QUALITY,)","(FOOD#QUALITY, FOOD#STYLE&OPTIONS)"
646,giá cũng phải_chăng so với chất_lượng nhu vậy .,"(RESTAURANT#PRICES,)","(RESTAURANT#PRICES,)"
388,cửa_hàng ở trong ngõ nhưng khá dễ tìm .,"(LOCATION#GENERAL,)","(LOCATION#GENERAL,)"
1446,ăn khá ngon và mình rất thích .,"(FOOD#QUALITY,)","(FOOD#QUALITY,)"
1126,ở đây nếu không dùng buffet cũng có_thể gọi mó...,"(RESTAURANT#GENERAL,)",()
84,"ngày_xưa quán bán ở ngã tư cô giang , đề thám .","(LOCATION#GENERAL,)","(RESTAURANT#GENERAL,)"
1752,"nằm ngay trên vỉa_hè hàng lược , đây là món ăn...","(LOCATION#GENERAL,)","(LOCATION#GENERAL,)"
715,salad ngon đặc_biệt có bò nướng tái trên bề_mặt .,"(FOOD#QUALITY, FOOD#STYLE&OPTIONS)","(FOOD#QUALITY,)"
1001,đồ uống không có gì đặc_sắc cả .,"(DRINKS#QUALITY,)","(DRINKS#QUALITY,)"
688,phải nói là hôm đấy cực_kỳ đông .,"(RESTAURANT#GENERAL,)","(RESTAURANT#GENERAL,)"
