In [1]:
!pip install --quiet "pyvi" "gensim" # for kaggle only

[0m

In [2]:
# Fetch the project repository; later cells import the `vabsa` package and read the datasets from it.
!git clone https://github.com/minhngt62/nlp-vabsa.git # for kaggle only

Cloning into 'nlp-vabsa'...
remote: Enumerating objects: 109, done.[K
remote: Counting objects: 100% (109/109), done.[K
remote: Compressing objects: 100% (88/88), done.[K
remote: Total 109 (delta 42), reused 79 (delta 16), pack-reused 0[K
Receiving objects: 100% (109/109), 15.35 MiB | 13.90 MiB/s, done.
Resolving deltas: 100% (42/42), done.


In [3]:
%cd "/kaggle/working/nlp-vabsa" # for kaggle only

/kaggle/working/nlp-vabsa


In [4]:
import gensim
from gensim.corpora import Dictionary
from gensim.models import CoherenceModel
import pandas as pd
from vabsa.ml.vectorizers import DenseTfidfVectorizer, LDA
from vabsa.ml.utils import multioutput_f1
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.multioutput import MultiOutputClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
import numpy as np
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
import os
from sklearn.preprocessing import StandardScaler
from scipy.stats import randint, uniform
import joblib
from sklearn.metrics import make_scorer
from sklearn.ensemble import RandomForestClassifier

# 1. Load the Data

In [5]:
#@title Load preprocessed data
def load_csv(path):
    """Read a CSV file and return it without its first column.

    The first column of the file (presumably an index written when the
    data was saved) is discarded; every other column is kept as read.
    """
    frame = pd.read_csv(path)
    first_column = frame.columns[0]
    return frame.drop(columns=[first_column])

# Resolve the datasets relative to the working directory (the repository
# root) instead of an absolute /kaggle/working/... path, so the notebook
# also runs when the repository is checked out somewhere else.
DATA_DIR = os.path.join("datasets", "preprocessed")

df_train = load_csv(os.path.join(DATA_DIR, "train.csv"))
df_dev = load_csv(os.path.join(DATA_DIR, "dev.csv"))
df_test = load_csv(os.path.join(DATA_DIR, "test.csv"))

df_train.head(2)

Unnamed: 0,content,RESTAURANT#PRICES,FOOD#STYLE&OPTIONS,DRINKS#PRICES,RESTAURANT#MISCELLANEOUS,RESTAURANT#GENERAL,FOOD#PRICES,FOOD#QUALITY,AMBIENCE#GENERAL,DRINKS#QUALITY,DRINKS#STYLE&OPTIONS,LOCATION#GENERAL,SERVICE#GENERAL
0,ảnh chụp từ hôm qua đi chơi với gia đình và nh...,0,3,0,0,0,0,3,0,0,0,0,0
1,hương vị thơm ngon ăn cay cay rất thích nêm nế...,3,2,0,0,2,3,2,2,0,0,0,1


In [6]:
# Concatenate train and dev data.
# ignore_index=True gives the combined frame a fresh 0..n-1 index instead of
# repeating the row labels of both inputs.
# NOTE: this rebinds df_train, so re-running only this cell appends the dev
# rows a second time; re-run the loading cell first.
df_train = pd.concat([df_train, df_dev], ignore_index=True)
df_train.shape

(4251, 13)

In [7]:
#@title Inspect the fields
# Show the column names: the `content` text column followed by the label columns.
df_train.columns

Index(['content', 'RESTAURANT#PRICES', 'FOOD#STYLE&OPTIONS', 'DRINKS#PRICES',
       'RESTAURANT#MISCELLANEOUS', 'RESTAURANT#GENERAL', 'FOOD#PRICES',
       'FOOD#QUALITY', 'AMBIENCE#GENERAL', 'DRINKS#QUALITY',
       'DRINKS#STYLE&OPTIONS', 'LOCATION#GENERAL', 'SERVICE#GENERAL'],
      dtype='object')

# 2. Preprocess the Data

## 2.1. Label extraction

In [8]:
#@title Extract the labels
def labeling(df):
    """Return the label matrix of ``df`` as a NumPy array.

    Every column except the first one is treated as a label column; the
    result has one row per row of ``df``.
    """
    return df.iloc[:, 1:].to_numpy()

# Build the label matrices for the training and the test split.
y_train, y_test = labeling(df_train), labeling(df_test)

y_train.shape

(4251, 12)

## 2.2. Text processing

In [9]:
# Re-fit the topic model on the training texts with the best parameters
# found earlier, then display its score.
best_lda = LDA(alpha=1.1, eta=0.6, iterations=1000, num_topics=48)
best_lda.fit(df_train["content"])
best_lda.score()

0.4098865782430356

In [10]:
#@title Transform the data
# Scaler shared by both splits: fitted on the training features and reused for the test features below.
std_scaler = StandardScaler()

def preprocess_text(df, extractor):
    """Turn every text in ``df["content"]`` into a feature vector.

    ``extractor.predict`` is called once per text, in row order, and the
    results are collected in a list that is returned.
    """
    features = []
    for text in df["content"]:
        features.append(extractor.predict(text))
    return features

# Fit the scaler on the training features only, then apply the same
# scaling to the test features.
train_features = preprocess_text(df_train, best_lda)
X_train = std_scaler.fit_transform(train_features)

test_features = preprocess_text(df_test, best_lda)
X_test = std_scaler.transform(test_features)

X_train[0].shape

(48,)

# 3. Evaluate the Model on the Test Set

## 3.1. Aspect-Polarity

In [11]:
#@title Random Forest
# Random forest wrapped for multi-output prediction, configured with the
# tuned hyper-parameters, then scored on the test split.
rnd_forest = RandomForestClassifier(
    criterion="entropy",
    max_depth=75,
    max_features="log2",
    max_leaf_nodes=104,
    max_samples=0.3090971804694158,
    min_impurity_decrease=0.0072733328339409644,
    min_samples_split=39,
    n_estimators=315,
    random_state=34,
)
multi_rnd = MultiOutputClassifier(rnd_forest, n_jobs=-1)
multi_rnd.fit(X_train, y_train)
multioutput_f1(y_test, multi_rnd.predict(X_test), f1_only=False)

{'f1': 0.42971784168885824,
 'precision': 0.6488333035763875,
 'recall': 0.3358214197776465}

In [14]:
#@title Support Vector Machines
# RBF-kernel SVM wrapped for multi-output prediction, configured with the
# tuned hyper-parameters, then scored on the test split.
svc = SVC(
    C=2.6620236768172543,
    gamma="auto",
    kernel="rbf",
    random_state=38,
)
multi_svc = MultiOutputClassifier(svc, n_jobs=-1)
multi_svc.fit(X_train, y_train)
multioutput_f1(y_test, multi_svc.predict(X_test), f1_only=False)

{'f1': 0.5505135488205711,
 'precision': 0.7387999715095542,
 'recall': 0.4625121093535895}

## 3.2. Aspect-only

In [12]:
#@title Random Forest
# Collapse the labels to aspect presence: any value above 0 becomes 1,
# everything else 0, for both the predictions and the ground truth.
aspect_pred = (multi_rnd.predict(X_test) > 0).astype(int)
aspect = (y_test > 0).astype(int)
multioutput_f1(aspect, aspect_pred, f1_only=False)

{'f1': 0.5299946030816012,
 'precision': 0.7973332970388864,
 'recall': 0.4144756024327603}

In [20]:
#@title Support Vector Machines
# Collapse the labels to aspect presence: any value above 0 becomes 1,
# everything else 0, for both the predictions and the ground truth.
aspect_pred = (multi_svc.predict(X_test) > 0).astype(int)
aspect = (y_test > 0).astype(int)
multioutput_f1(aspect, aspect_pred, f1_only=False)

{'f1': 0.6081985152342393,
 'precision': 0.7551999775871113,
 'recall': 0.509215863827845}