In [1]:
!pip install --quiet "pyvi" "gensim" # for kaggle only

[0m

In [2]:
!git clone https://github.com/minhngt62/nlp-vabsa.git # for kaggle only

Cloning into 'nlp-vabsa'...
remote: Enumerating objects: 85, done.[K
remote: Counting objects: 100% (85/85), done.[K
remote: Compressing objects: 100% (66/66), done.[K
remote: Total 85 (delta 34), reused 61 (delta 14), pack-reused 0[K
Unpacking objects: 100% (85/85), 1.31 MiB | 4.91 MiB/s, done.


In [3]:
%cd "/kaggle/working/nlp-vabsa" # for kaggle only

/kaggle/working/nlp-vabsa


In [4]:
import gensim
from gensim.corpora import Dictionary
from gensim.models import CoherenceModel
import pandas as pd
from vabsa.ml.vectorizers import DenseTfidfVectorizer, LDA
from vabsa.ml.utils import multioutput_f1
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.multioutput import MultiOutputClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
import numpy as np
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
import os
from sklearn.preprocessing import StandardScaler
from scipy.stats import randint, uniform
import joblib
from sklearn.metrics import make_scorer
from sklearn.ensemble import RandomForestClassifier

# 1. Load the Data

In [5]:
#@title Load preprocessed data
def load_csv(path):
    df = pd.read_csv(path)
    del df[df.columns[0]]
    return df

df_train = load_csv("/kaggle/working/nlp-vabsa/datasets/preprocessed/train.csv")
df_dev = load_csv("/kaggle/working/nlp-vabsa/datasets/preprocessed/dev.csv")
df_test = load_csv("/kaggle/working/nlp-vabsa/datasets/preprocessed/test.csv")

df_train.head(2)

Unnamed: 0,content,RESTAURANT#PRICES,FOOD#STYLE&OPTIONS,DRINKS#PRICES,RESTAURANT#MISCELLANEOUS,RESTAURANT#GENERAL,FOOD#PRICES,FOOD#QUALITY,AMBIENCE#GENERAL,DRINKS#QUALITY,DRINKS#STYLE&OPTIONS,LOCATION#GENERAL,SERVICE#GENERAL
0,ảnh chụp từ hôm qua đi chơi với gia đình và nh...,0,3,0,0,0,0,3,0,0,0,0,0
1,hương vị thơm ngon ăn cay cay rất thích nêm nế...,3,2,0,0,2,3,2,2,0,0,0,1


In [6]:
# Concatenate train and dev data
df_train = pd.concat([df_train, df_dev])
df_train.shape

(4251, 13)

In [7]:
#@title Inspect the fields
df_train.columns

Index(['content', 'RESTAURANT#PRICES', 'FOOD#STYLE&OPTIONS', 'DRINKS#PRICES',
       'RESTAURANT#MISCELLANEOUS', 'RESTAURANT#GENERAL', 'FOOD#PRICES',
       'FOOD#QUALITY', 'AMBIENCE#GENERAL', 'DRINKS#QUALITY',
       'DRINKS#STYLE&OPTIONS', 'LOCATION#GENERAL', 'SERVICE#GENERAL'],
      dtype='object')

# 2. Preprocess the Data

## 2.1. Label extraction

In [8]:
#@title Extract the labels
def labeling(df):
    y = df.iloc[:, 1:].to_numpy()
    return y

y_train = labeling(df_train)
y_test = labeling(df_test)

y_train.shape

(4251, 12)

## 2.1. Text processing

In [9]:
#@title Cross-validate for LDA (based on coherence)
params = {
    'num_topics': [48], 
    'alpha': [i / 10 for i in range(1, 16)],
    'eta': [i / 10 for i in range(1, 11)], 
    'iterations': [1000]
}
selector = GridSearchCV(LDA(), params, verbose=2)
selector.fit(df_train["content"])
best_lda = selector.best_estimator_
best_lda.save()
print("Best coherence:", selector.best_score_)
print("Best parameters:", selector.best_params_)

#best_lda = LDA(num_topics=48, alpha=1.1, eta=0.5, iterations=1000)
#est_lda.fit(df_train["content"])

Fitting 5 folds for each of 150 candidates, totalling 750 fits
[CV] END .alpha=0.1, eta=0.1, iterations=1000, num_topics=48; total time=  17.8s
[CV] END .alpha=0.1, eta=0.1, iterations=1000, num_topics=48; total time=  17.6s
[CV] END .alpha=0.1, eta=0.1, iterations=1000, num_topics=48; total time=  17.2s
[CV] END .alpha=0.1, eta=0.1, iterations=1000, num_topics=48; total time=  17.7s
[CV] END .alpha=0.1, eta=0.1, iterations=1000, num_topics=48; total time=  17.5s
[CV] END .alpha=0.1, eta=0.2, iterations=1000, num_topics=48; total time=  17.9s
[CV] END .alpha=0.1, eta=0.2, iterations=1000, num_topics=48; total time=  17.4s
[CV] END .alpha=0.1, eta=0.2, iterations=1000, num_topics=48; total time=  17.4s
[CV] END .alpha=0.1, eta=0.2, iterations=1000, num_topics=48; total time=  15.9s
[CV] END .alpha=0.1, eta=0.2, iterations=1000, num_topics=48; total time=  19.0s
[CV] END .alpha=0.1, eta=0.3, iterations=1000, num_topics=48; total time=  17.1s
[CV] END .alpha=0.1, eta=0.3, iterations=1000,

In [10]:
#@title Transform the data
std_scaler = StandardScaler()

def preprocess_text(df, extractor):
    features = [extractor.predict(x) for x in df_train["content"]]
    return features

X_train = std_scaler.fit_transform(preprocess_text(df_train["content"], best_lda))
X_test = std_scaler.transform(preprocess_text(df_test["content"], best_lda))

X_train[0].shape

(48,)

# 3. Select and Train Models

In [11]:
#@title Evaluate score
def evaluate(selector, filepath=None):
    print("Best F1:", selector.best_score_)
    print("Best param:", selector.best_params_)
    if filepath is not None:
        joblib.dump(selector.best_estimator_, filepath)

In [12]:
#@title Make scoring
multi_f1 = make_scorer(multioutput_f1, greater_is_better=True)

## 3.1. Support Vector Machines

In [13]:
#@title Support Vector Machine
param_distribs = {
    "estimator__C": uniform(loc=1e-2, scale=10),
    "estimator__kernel": ["linear", "rbf", "sigmoid"],
    "estimator__gamma": ["scale", "auto"],
    "estimator__random_state": randint(low=1, high=43),
}
svc_selector = RandomizedSearchCV(
    MultiOutputClassifier(SVC(), n_jobs=-1),
    param_distributions=param_distribs,
    n_iter=100, 
    cv=5,
    error_score="raise", 
    random_state=42,
    scoring=multi_f1,
    verbose=2
)
svc_selector.fit(X_train, y_train)

Fitting 5 folds for each of 100 candidates, totalling 500 fits
[CV] END estimator__C=3.7554011884736247, estimator__gamma=scale, estimator__kernel=sigmoid, estimator__random_state=8; total time=   3.3s
[CV] END estimator__C=3.7554011884736247, estimator__gamma=scale, estimator__kernel=sigmoid, estimator__random_state=8; total time=   2.0s
[CV] END estimator__C=3.7554011884736247, estimator__gamma=scale, estimator__kernel=sigmoid, estimator__random_state=8; total time=   2.0s
[CV] END estimator__C=3.7554011884736247, estimator__gamma=scale, estimator__kernel=sigmoid, estimator__random_state=8; total time=   1.9s
[CV] END estimator__C=3.7554011884736247, estimator__gamma=scale, estimator__kernel=sigmoid, estimator__random_state=8; total time=   1.9s
[CV] END estimator__C=5.996584841970366, estimator__gamma=scale, estimator__kernel=rbf, estimator__random_state=19; total time=   4.3s
[CV] END estimator__C=5.996584841970366, estimator__gamma=scale, estimator__kernel=rbf, estimator__random_s

In [14]:
#@title Evaluate & save the best model
evaluate(svc_selector, filepath="svc.pkl")

Best F1: 0.5475247348588974
Best param: {'estimator__C': 2.6620236768172543, 'estimator__gamma': 'auto', 'estimator__kernel': 'rbf', 'estimator__random_state': 38}


## 3.2. Random Forest

In [15]:
#@title Random Forest
param_distribs = {
    "estimator__n_estimators": randint(low=10, high=500),
    "estimator__criterion": ["gini", "entropy"],
    "estimator__max_depth": randint(low=1, high=101),
    "estimator__min_samples_split": randint(low=2, high=50),
    "estimator__max_features": [None, "sqrt", "log2"],
    "estimator__max_leaf_nodes": randint(low=2, high=200),
    "estimator__min_impurity_decrease": uniform(loc=1e-6, scale=0.5),
    "estimator__max_samples": uniform(loc=0.2, scale=0.8),
    "estimator__random_state": randint(low=1, high=43),
}
rnd_selector = RandomizedSearchCV(
    MultiOutputClassifier(RandomForestClassifier(), n_jobs=-1),
    param_distributions=param_distribs,
    n_iter=100, 
    cv=5,
    error_score="raise", 
    random_state=42,
    scoring=multi_f1,
    verbose=2
)
rnd_selector.fit(X_train, y_train)

Fitting 5 folds for each of 100 candidates, totalling 500 fits
[CV] END estimator__criterion=gini, estimator__max_depth=52, estimator__max_features=None, estimator__max_leaf_nodes=16, estimator__max_samples=0.7855951534491241, estimator__min_impurity_decrease=0.2993302420985183, estimator__min_samples_split=40, estimator__n_estimators=131, estimator__random_state=19; total time=   9.4s
[CV] END estimator__criterion=gini, estimator__max_depth=52, estimator__max_features=None, estimator__max_leaf_nodes=16, estimator__max_samples=0.7855951534491241, estimator__min_impurity_decrease=0.2993302420985183, estimator__min_samples_split=40, estimator__n_estimators=131, estimator__random_state=19; total time=   9.7s
[CV] END estimator__criterion=gini, estimator__max_depth=52, estimator__max_features=None, estimator__max_leaf_nodes=16, estimator__max_samples=0.7855951534491241, estimator__min_impurity_decrease=0.2993302420985183, estimator__min_samples_split=40, estimator__n_estimators=131, estima

In [16]:
#@title Evaluate & save the best model
evaluate(rnd_selector, filepath="rnd.pkl")

Best F1: 0.5235041992304744
Best param: {'estimator__criterion': 'entropy', 'estimator__max_depth': 75, 'estimator__max_features': 'log2', 'estimator__max_leaf_nodes': 104, 'estimator__max_samples': 0.3090971804694158, 'estimator__min_impurity_decrease': 0.0072733328339409644, 'estimator__min_samples_split': 39, 'estimator__n_estimators': 315, 'estimator__random_state': 34}
