In [1]:
import pandas as pd
import os
from sklearn.preprocessing import LabelEncoder

In [2]:
os.getcwd()

'd:\\subject\\nlp\\code\\natural_language_processing\\notebook'

In [59]:
origin_path = r"../data/hwu/hwu"
# Dữ liệu có thể được phân tách bằng tab và không có header
df_train = pd.read_csv(os.path.join(origin_path, 'train.csv'), sep=',')
df_val = pd.read_csv(os.path.join(origin_path, 'val.csv'), sep=',')
df_test = pd.read_csv(os.path.join(origin_path,'test.csv'), sep=',')
print("Train shape:", df_train.shape)
print("Validation shape:", df_val.shape)
print("Test shape:", df_test.shape)
df_train.head()

Train shape: (8954, 2)
Validation shape: (1076, 2)
Test shape: (1076, 2)


Unnamed: 0,text,category
0,what alarms do i have set right now,alarm_query
1,checkout today alarm of meeting,alarm_query
2,report alarm settings,alarm_query
3,see see for me the alarms that you have set to...,alarm_query
4,is there an alarm for ten am,alarm_query


In [60]:
le = LabelEncoder()
le.fit(df_train['category'])
df_train['label'] = le.transform(df_train['category'])
df_val['label'] = le.transform(df_val['category'])
df_test['label'] = le.transform(df_test['category'])

In [61]:
df_train.head()

Unnamed: 0,text,category,label
0,what alarms do i have set right now,alarm_query,0
1,checkout today alarm of meeting,alarm_query,0
2,report alarm settings,alarm_query,0
3,see see for me the alarms that you have set to...,alarm_query,0
4,is there an alarm for ten am,alarm_query,0


# TF-IDF + Linear Regression

In [32]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.metrics import classification_report

tfidf_lr_pipeline = make_pipeline(
    TfidfVectorizer(max_features=5000),
    LogisticRegression(max_iter=1000)
)

tfidf_lr_pipeline.fit(df_train['text'], df_train['label'])
y_pred = tfidf_lr_pipeline.predict(df_test['text'])

print(classification_report(df_test['label'], y_pred))

              precision    recall  f1-score   support

           0       0.90      0.95      0.92        19
           1       1.00      0.73      0.84        11
           2       0.77      0.89      0.83        19
           3       1.00      0.75      0.86         8
           4       0.92      0.80      0.86        15
           5       0.93      1.00      0.96        13
           6       0.45      0.53      0.49        19
           7       0.89      0.89      0.89        19
           8       0.87      0.68      0.76        19
           9       0.59      0.68      0.63        19
          10       0.67      0.75      0.71         8
          11       0.74      0.89      0.81        19
          12       0.78      0.88      0.82         8
          13       0.83      0.79      0.81        19
          14       0.92      0.63      0.75        19
          15       0.81      0.89      0.85        19
          16       1.00      1.00      1.00        19
          17       1.00    

# Word2Vec + Dense

In [6]:
!pip install gensim



In [62]:
import numpy as np
from gensim.models import Word2Vec
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout

In [63]:
# 1. Huấn luyện mô hình Word2Vec trên dữ liệu text của bạn
sentences = [text.split() for text in df_train['text']]
w2v_model = Word2Vec(sentences, vector_size=100, window=5, min_count=1, workers=4, epochs=100)

In [64]:
w2v_model.wv.most_similar("alarm")

[('alarms', 0.6348198056221008),
 ('appointment', 0.5341762900352478),
 ('appointments', 0.4859957695007324),
 ('event', 0.46934521198272705),
 ('item', 0.454486221075058),
 ('friday', 0.44339945912361145),
 ('wednesday', 0.44109663367271423),
 ('meetings', 0.44088345766067505),
 ('thursday', 0.4191473424434662),
 ('meeting', 0.4187237620353699)]

In [65]:
le.classes_

array(['alarm_query', 'alarm_remove', 'alarm_set', 'audio_volume_down',
       'audio_volume_mute', 'audio_volume_up', 'calendar_query',
       'calendar_remove', 'calendar_set', 'cooking_recipe',
       'datetime_convert', 'datetime_query', 'email_addcontact',
       'email_query', 'email_querycontact', 'email_sendemail',
       'general_affirm', 'general_commandstop', 'general_confirm',
       'general_dontcare', 'general_explain', 'general_joke',
       'general_negate', 'general_praise', 'general_quirky',
       'general_repeat', 'iot_cleaning', 'iot_coffee',
       'iot_hue_lightchange', 'iot_hue_lightdim', 'iot_hue_lightoff',
       'iot_hue_lighton', 'iot_hue_lightup', 'iot_wemo_off',
       'iot_wemo_on', 'lists_createoradd', 'lists_query', 'lists_remove',
       'music_likeness', 'music_query', 'music_settings', 'news_query',
       'play_audiobook', 'play_game', 'play_music', 'play_podcasts',
       'play_radio', 'qa_currency', 'qa_definition', 'qa_factoid',
       'qa_maths'

In [21]:
def sentence_to_avg_vector(text, model):
    tokens = text.split()
    vectors = [model.wv[token] for token in tokens if token in model.wv]

    if len(vectors) == 0:
        # Trả về vector zero với đúng kích thước embedding
        return np.zeros(model.vector_size, dtype='float32')

    return np.mean(vectors, axis=0)

def transform_to_vector(df, model):
    # df = df.sample(frac=1, random_state=42)
    texts = df['text'].to_list()
    labels = df['label'].to_list()

    vectors = np.array(
        [sentence_to_avg_vector(text, model) for text in texts],
        dtype='float32'
    )
    return vectors, np.array(labels)

# 3. Tạo dữ liệu train/val/test X_train_avg, X_val_avg, X_test_avg
X_train_avg, y_train = transform_to_vector(df_train, w2v_model)
X_test_avg, y_test = transform_to_vector(df_test, w2v_model)
X_val_avg, y_val = transform_to_vector(df_val, w2v_model)

# 4. Xây dựng mô hình Sequential của Keras
num_classes = len(le.classes_)
model = Sequential([
    Dense(128, activation='relu', input_shape=(w2v_model.vector_size,)),
    Dropout(0.5),
    Dense(num_classes, activation='softmax')
])

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [22]:
y_val

array([ 0,  0,  0, ..., 63, 63, 63], shape=(1076,))

In [23]:
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy']
)

model.fit(
    X_train_avg,
    y_train,
    epochs=50,
    batch_size=32,
    validation_data=(X_val_avg, y_val)
)

Epoch 1/50
[1m280/280[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.3286 - loss: 2.7959 - val_accuracy: 0.6961 - val_loss: 1.4281
Epoch 2/50
[1m280/280[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.6021 - loss: 1.4676 - val_accuracy: 0.7463 - val_loss: 0.9984
Epoch 3/50
[1m280/280[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.6616 - loss: 1.2104 - val_accuracy: 0.7770 - val_loss: 0.8687
Epoch 4/50
[1m280/280[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.7027 - loss: 1.0644 - val_accuracy: 0.7797 - val_loss: 0.7952
Epoch 5/50
[1m280/280[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.7258 - loss: 0.9659 - val_accuracy: 0.7853 - val_loss: 0.7520
Epoch 6/50
[1m280/280[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.7411 - loss: 0.9180 - val_accuracy: 0.7909 - val_loss: 0.7270
Epoch 7/50
[1m280/280[0m 

<keras.src.callbacks.history.History at 0x1e0a0cb2f90>

In [15]:
loss, acc = model.evaluate(X_test_avg, y_test)
print(f"Test accuracy: {acc:.4f}")

y_pred = model.predict(X_test_avg)
y_pred = np.argmax(y_pred, axis=1)
print(classification_report(y_test, y_pred))

[1m34/34[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.8253 - loss: 0.6472 
Test accuracy: 0.8253
[1m34/34[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step 
              precision    recall  f1-score   support

           0       0.94      0.89      0.92        19
           1       0.83      0.91      0.87        11
           2       0.71      0.79      0.75        19
           3       1.00      0.50      0.67         8
           4       0.80      0.80      0.80        15
           5       0.68      1.00      0.81        13
           6       0.45      0.53      0.49        19
           7       1.00      0.89      0.94        19
           8       0.81      0.68      0.74        19
           9       0.81      0.68      0.74        19
          10       0.88      0.88      0.88         8
          11       0.83      0.79      0.81        19
          12       0.89      1.00      0.94         8
          13       0.83      0.79      

## 3. Embedding Pre-trained + LSTM

In [57]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Embedding, LSTM
from tensorflow.keras.callbacks import EarlyStopping

In [28]:
embedding_matrix.shape

(4265, 100)

In [66]:
# 1. Tiền xử lý cho mô hình chuỗi
# a. Tokenizer: Tạo vocab và chuyển text thành chuỗi chỉ số
num_classes = len(le.classes_)

all_sentences = df_train['text'].to_list()


tokenizer = Tokenizer(oov_token="<UNK>")
tokenizer.fit_on_texts(all_sentences)
train_sequences = tokenizer.texts_to_sequences(all_sentences)
test_sequences = tokenizer.texts_to_sequences(df_test['text'].to_list())
val_sequences = tokenizer.texts_to_sequences(df_val['text'].to_list())
# print("Train sequences:", train_sequences)


# b. Padding: Đảm bảo các chuỗi có cùng độ dài
max_len = 50
X_train_pad = pad_sequences(train_sequences, maxlen=max_len, padding='post')
X_test_pad = pad_sequences(test_sequences, maxlen=max_len, padding='post')
X_val_pad = pad_sequences(val_sequences, maxlen=max_len, padding='post')


# 2. Tạo ma trận trọng số cho Embedding Layer từ Word2Vec
vocab_size = len(tokenizer.word_index) + 1

embedding_dim = w2v_model.vector_size
embedding_matrix = np.zeros((vocab_size, embedding_dim))
for word, i in tokenizer.word_index.items():
    if word in w2v_model.wv:
        embedding_matrix[i] = w2v_model.wv[word]

# print("Embedding matrix:", embedding_matrix)


# 3. Xây dựng mô hình Sequential với LSTM
lstm_model_pretrained = Sequential([
    Embedding(
        input_dim=vocab_size,
        output_dim=embedding_dim,
        weights=[embedding_matrix], # Khởi tạo trọng số
        trainable=False # Đóng băng lớp Embedding
    ),
    LSTM(64, dropout=0.2),
    Dense(num_classes, activation='softmax')
])


# 4. Compile, huấn luyện (sử earlystopping)
lstm_model_pretrained.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy']
)

early_stopping = EarlyStopping(
    monitor='val_loss',  # Metric to monitor (e.g., validation loss)
    min_delta=0.001,     # Minimum change in the monitored metric to qualify as an improvement
    patience=50,         # Number of epochs with no improvement after which training will be stopped
    verbose=1,           # Verbosity mode (0 for silent, 1 for updates)
    mode='min',          # 'min' for metrics that should decrease (like loss), 'max' for metrics that should increase (like accuracy)
    restore_best_weights=True # Restores model weights from the epoch with the best value of the monitored metric
)

lstm_model_pretrained.fit(
    X_train_pad,
    y_train,
    epochs=50,
    batch_size=32,
    validation_data=(X_val_pad, y_val),
    shuffle=True,
    callbacks=[early_stopping]
)

Epoch 1/50
[1m280/280[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 12ms/step - accuracy: 0.0280 - loss: 3.9252 - val_accuracy: 0.0390 - val_loss: 3.6831
Epoch 2/50
[1m280/280[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 12ms/step - accuracy: 0.0461 - loss: 3.6377 - val_accuracy: 0.0678 - val_loss: 3.5212
Epoch 3/50
[1m280/280[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 12ms/step - accuracy: 0.0634 - loss: 3.4734 - val_accuracy: 0.0716 - val_loss: 3.4097
Epoch 4/50
[1m280/280[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 12ms/step - accuracy: 0.0834 - loss: 3.2781 - val_accuracy: 0.1348 - val_loss: 3.0812
Epoch 5/50
[1m280/280[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 12ms/step - accuracy: 0.1087 - loss: 3.0756 - val_accuracy: 0.1106 - val_loss: 3.1166
Epoch 6/50
[1m280/280[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 15ms/step - accuracy: 0.1256 - loss: 3.0465 - val_accuracy: 0.1533 - val_loss: 2.8414
Epoch 7/50
[1m280/280

<keras.src.callbacks.history.History at 0x1e0ad1e8bd0>

In [67]:
loss, acc = lstm_model_pretrained.evaluate(X_test_pad, y_test)

print(f"Test accuracy: {acc:.4f}")

y_pred = lstm_model_pretrained.predict(X_test_pad)
y_pred = np.argmax(y_pred, axis=1)
print(classification_report(y_test, y_pred))

[1m34/34[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 0.6757 - loss: 1.1718
Test accuracy: 0.6757
[1m34/34[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step
              precision    recall  f1-score   support

           0       0.90      0.95      0.92        19
           1       1.00      0.73      0.84        11
           2       0.62      0.79      0.70        19
           3       0.00      0.00      0.00         8
           4       0.40      0.27      0.32        15
           5       0.40      0.77      0.53        13
           6       0.38      0.42      0.40        19
           7       0.73      0.58      0.65        19
           8       0.58      0.37      0.45        19
           9       0.77      0.53      0.62        19
          10       0.71      0.62      0.67         8
          11       0.62      0.79      0.70        19
          12       0.71      0.62      0.67         8
          13       0.67      0.63      0.

  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


## 4. Embedding học từ đầu + LSTM

In [48]:
df_train['text'].head()

0                  what alarms do i have set right now
1                      checkout today alarm of meeting
2                                report alarm settings
3    see see for me the alarms that you have set to...
4                         is there an alarm for ten am
Name: text, dtype: object

In [54]:
num_classes = len(le.classes_)

all_sentences = df_train['text'].to_list()


tokenizer = Tokenizer(oov_token="<UNK>")
tokenizer.fit_on_texts(all_sentences)
train_sequences = tokenizer.texts_to_sequences(all_sentences)
test_sequences = tokenizer.texts_to_sequences(df_test['text'].to_list())
val_sequences = tokenizer.texts_to_sequences(df_val['text'].to_list())
# print("Train sequences:", train_sequences)


# b. Padding: Đảm bảo các chuỗi có cùng độ dài
max_len = 50
X_train_pad = pad_sequences(train_sequences, maxlen=max_len, padding='post', truncating='post')
X_test_pad = pad_sequences(test_sequences, maxlen=max_len, padding='post', truncating='post')
X_val_pad = pad_sequences(val_sequences, maxlen=max_len, padding='post', truncating='post')


# 2. Tạo ma trận trọng số cho Embedding Layer từ Word2Vec
vocab_size = len(tokenizer.word_index) + 1


# 4. Xây dựng mô hình
lstm_model_scratch = Sequential([
    Embedding(
        input_dim=vocab_size,
        output_dim=100
    ),
    LSTM(64, dropout=0.2),
    Dense(num_classes, activation='softmax')
])

lstm_model_scratch.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy']
)

early_stopping = EarlyStopping(
    monitor='val_loss',
    min_delta=0.001,
    patience=50,
    verbose=1,
    mode='min',
    restore_best_weights=True
)

history = lstm_model_scratch.fit(
    X_train_pad, y_train,
    epochs=50,
    batch_size=32,
    validation_data=(X_val_pad, y_val),
    shuffle=True,
    callbacks=[early_stopping]
)

Epoch 1/50
[1m280/280[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 16ms/step - accuracy: 0.0163 - loss: 4.1430 - val_accuracy: 0.0177 - val_loss: 4.1278
Epoch 2/50
[1m280/280[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 19ms/step - accuracy: 0.0162 - loss: 4.1360 - val_accuracy: 0.0177 - val_loss: 4.1260
Epoch 3/50
[1m280/280[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 18ms/step - accuracy: 0.0161 - loss: 4.1338 - val_accuracy: 0.0177 - val_loss: 4.1252
Epoch 4/50
[1m280/280[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 19ms/step - accuracy: 0.0168 - loss: 4.1331 - val_accuracy: 0.0177 - val_loss: 4.1249
Epoch 5/50
[1m280/280[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 18ms/step - accuracy: 0.0241 - loss: 4.0843 - val_accuracy: 0.0270 - val_loss: 4.0446
Epoch 6/50
[1m280/280[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 17ms/step - accuracy: 0.0283 - loss: 4.0210 - val_accuracy: 0.0279 - val_loss: 4.0156
Epoch 7/50
[1m280/280

In [55]:
loss, acc = lstm_model_scratch.evaluate(X_test_pad, y_test)

print(f"Test accuracy: {acc:.4f}")

y_pred = lstm_model_scratch.predict(X_test_pad)
y_pred = np.argmax(y_pred, axis=1)
print(classification_report(y_test, y_pred))

[1m34/34[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 0.7686 - loss: 1.2057
Test accuracy: 0.7686
[1m34/34[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step
              precision    recall  f1-score   support

           0       0.94      0.89      0.92        19
           1       0.91      0.91      0.91        11
           2       0.71      0.89      0.79        19
           3       0.67      0.75      0.71         8
           4       0.80      0.80      0.80        15
           5       0.69      0.69      0.69        13
           6       0.48      0.58      0.52        19
           7       0.78      0.95      0.86        19
           8       0.61      0.58      0.59        19
           9       0.83      0.53      0.65        19
          10       0.62      0.62      0.62         8
          11       0.64      0.74      0.68        19
          12       0.67      1.00      0.80         8
          13       0.67      0.95      0.

# Evaluation


In [60]:
def transform_to_vector(texts, model):
    vectors = np.array(
        [sentence_to_avg_vector(text, model) for text in texts],
        dtype='float32'
    )
    return vectors

texts = [
    "can you remind me to not call my mom",
    "is it going to be sunny or rainy tomorrow",
    "find a flight from new york to london but not through paris"
]
labels = ['reminder_create', 'weather_query', 'flight_search']

print("Ground true:")
print(labels)


y_pred = tfidf_lr_pipeline.predict(texts)
label_pred = le.inverse_transform(y_pred)
print("Logistic Regression:")
print(label_pred)

y_pred = model.predict(transform_to_vector(texts, w2v_model))
y_pred = np.argmax(y_pred, axis=1)
label_pred = le.inverse_transform(y_pred)
print("Word2vec + Dense:")
print(label_pred)

max_len = max([len(text) for text in texts])
text_sequences = tokenizer.texts_to_sequences(texts)
texts_pad = pad_sequences(text_sequences, maxlen=max_len, padding='post')

y_pred = lstm_model_pretrained.predict(texts_pad)
y_pred = np.argmax(y_pred, axis=1)
label_pred = le.inverse_transform(y_pred)
print("Embedding (Pretrained) + LSTM:")
print(label_pred)


y_pred = lstm_model_scratch.predict(texts_pad)
y_pred = np.argmax(y_pred, axis=1)
label_pred = le.inverse_transform(y_pred)
print("Embedding (Scratch) + LSTM:")
print(label_pred)



Ground true:
['reminder_create', 'weather_query', 'flight_search']
Logistic Regression:
['calendar_set' 'weather_query' 'general_negate']
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 43ms/step
Word2vec + Dense:
['email_query' 'weather_query' 'email_sendemail']
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 49ms/step
Embedding (Pretrained) + LSTM:
['takeaway_query' 'weather_query' 'social_post']
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 45ms/step
Embedding (Scratch) + LSTM:
['alarm_set' 'alarm_set' 'alarm_set']
