In [1]:
import pandas as pd
import os
from sklearn.preprocessing import LabelEncoder

In [2]:
origin_path = r""
# Dữ liệu có thể được phân tách bằng tab và không có header
df_train = pd.read_csv(os.path.join(origin_path, 'train.csv'), sep=',')
df_val = pd.read_csv(os.path.join(origin_path, 'val.csv'), sep=',')
df_test = pd.read_csv(os.path.join(origin_path,'test.csv'), sep=',')
print("Train shape:", df_train.shape)
print("Validation shape:", df_val.shape)
print("Test shape:", df_test.shape)
df_train.head()

Train shape: (8954, 2)
Validation shape: (1076, 2)
Test shape: (1076, 2)


Unnamed: 0,text,category
0,what alarms do i have set right now,alarm_query
1,checkout today alarm of meeting,alarm_query
2,report alarm settings,alarm_query
3,see see for me the alarms that you have set to...,alarm_query
4,is there an alarm for ten am,alarm_query


In [3]:
le = LabelEncoder()
le.fit(df_train['category'])
df_train['label'] = le.transform(df_train['category'])
df_val['label'] = le.transform(df_val['category'])
df_test['label'] = le.transform(df_test['category'])

In [4]:
df_train.head()

Unnamed: 0,text,category,label
0,what alarms do i have set right now,alarm_query,0
1,checkout today alarm of meeting,alarm_query,0
2,report alarm settings,alarm_query,0
3,see see for me the alarms that you have set to...,alarm_query,0
4,is there an alarm for ten am,alarm_query,0


# TF-IDF + Linear Regression

In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.metrics import classification_report

tfidf_lr_pipeline = make_pipeline(
    TfidfVectorizer(max_features=5000),
    LogisticRegression(max_iter=1000)
)

tfidf_lr_pipeline.fit(df_train['text'], df_train['label'])
y_pred = tfidf_lr_pipeline.predict(df_test['text'])

print(classification_report(df_test['label'], y_pred))

              precision    recall  f1-score   support

           0       0.90      0.95      0.92        19
           1       1.00      0.73      0.84        11
           2       0.77      0.89      0.83        19
           3       1.00      0.75      0.86         8
           4       0.92      0.80      0.86        15
           5       0.93      1.00      0.96        13
           6       0.45      0.53      0.49        19
           7       0.89      0.89      0.89        19
           8       0.87      0.68      0.76        19
           9       0.59      0.68      0.63        19
          10       0.67      0.75      0.71         8
          11       0.74      0.89      0.81        19
          12       0.78      0.88      0.82         8
          13       0.83      0.79      0.81        19
          14       0.92      0.63      0.75        19
          15       0.81      0.89      0.85        19
          16       1.00      1.00      1.00        19
          17       1.00    

# Word2Vec + Dense

In [6]:
!pip install gensim



In [7]:
import numpy as np
from gensim.models import Word2Vec
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout

In [8]:
# 1. Huấn luyện mô hình Word2Vec trên dữ liệu text của bạn
sentences = [text.split() for text in df_train['text']]
w2v_model = Word2Vec(sentences, vector_size=100, window=5, min_count=1, workers=4)

In [9]:
w2v_model.wv.most_similar("alarm")

[('tomorrow', 0.996619701385498),
 ('six', 0.9950720071792603),
 ('at', 0.9950494766235352),
 ('eight', 0.9947173595428467),
 ('am', 0.9946451187133789),
 ('pm', 0.9944387078285217),
 ('seven', 0.9938293099403381),
 ('five', 0.993751585483551),
 ('morning', 0.9935374855995178),
 ('set', 0.9933524131774902)]

In [10]:
le.classes_

array(['alarm_query', 'alarm_remove', 'alarm_set', 'audio_volume_down',
       'audio_volume_mute', 'audio_volume_up', 'calendar_query',
       'calendar_remove', 'calendar_set', 'cooking_recipe',
       'datetime_convert', 'datetime_query', 'email_addcontact',
       'email_query', 'email_querycontact', 'email_sendemail',
       'general_affirm', 'general_commandstop', 'general_confirm',
       'general_dontcare', 'general_explain', 'general_joke',
       'general_negate', 'general_praise', 'general_quirky',
       'general_repeat', 'iot_cleaning', 'iot_coffee',
       'iot_hue_lightchange', 'iot_hue_lightdim', 'iot_hue_lightoff',
       'iot_hue_lighton', 'iot_hue_lightup', 'iot_wemo_off',
       'iot_wemo_on', 'lists_createoradd', 'lists_query', 'lists_remove',
       'music_likeness', 'music_query', 'music_settings', 'news_query',
       'play_audiobook', 'play_game', 'play_music', 'play_podcasts',
       'play_radio', 'qa_currency', 'qa_definition', 'qa_factoid',
       'qa_maths'

In [24]:
def sentence_to_avg_vector(text, model):
    tokens = text.split()
    vectors = [model.wv[token] for token in tokens if token in model.wv]

    if len(vectors) == 0:
        # Trả về vector zero với đúng kích thước embedding
        return np.zeros(model.vector_size, dtype='float32')

    return np.mean(vectors, axis=0)

def transform_to_vector(df, model):
    # df = df.sample(frac=1, random_state=42)
    texts = df['text'].to_list()
    labels = df['label'].to_list()

    vectors = np.array(
        [sentence_to_avg_vector(text, model) for text in texts],
        dtype='float32'
    )
    return vectors, np.array(labels)

# 3. Tạo dữ liệu train/val/test X_train_avg, X_val_avg, X_test_avg
X_train_avg, y_train = transform_to_vector(df_train, w2v_model)
X_test_avg, y_test = transform_to_vector(df_test, w2v_model)
X_val_avg, y_val = transform_to_vector(df_val, w2v_model)

# 4. Xây dựng mô hình Sequential của Keras
num_classes = len(le.classes_)
model = Sequential([
    Dense(128, activation='relu', input_shape=(w2v_model.vector_size,)),
    Dropout(0.5),
    Dense(num_classes, activation='softmax')
])

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [25]:
y_val

array([ 0,  0,  0, ..., 63, 63, 63])

In [26]:
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy']
)

model.fit(
    X_train_avg,
    y_train,
    epochs=200,
    batch_size=32,
    validation_data=(X_val_avg, y_val)
)

Epoch 1/200
[1m280/280[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 10ms/step - accuracy: 0.0167 - loss: 4.1658 - val_accuracy: 0.0344 - val_loss: 4.1133
Epoch 2/200
[1m280/280[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - accuracy: 0.0336 - loss: 4.1150 - val_accuracy: 0.0558 - val_loss: 4.0694
Epoch 3/200
[1m280/280[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.0451 - loss: 4.0668 - val_accuracy: 0.0771 - val_loss: 3.9793
Epoch 4/200
[1m280/280[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.0536 - loss: 3.9746 - val_accuracy: 0.0855 - val_loss: 3.8597
Epoch 5/200
[1m280/280[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - accuracy: 0.0677 - loss: 3.8684 - val_accuracy: 0.0809 - val_loss: 3.7469
Epoch 6/200
[1m280/280[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.0756 - loss: 3.7752 - val_accuracy: 0.1134 - val_loss: 3.6614
Epoch 7/200
[1m280/2

<keras.src.callbacks.history.History at 0x7eef7e13f980>

In [27]:
loss, acc = model.evaluate(X_test_avg, y_test)
print(f"Test accuracy: {acc:.4f}")

y_pred = model.predict(X_test_avg)
y_pred = np.argmax(y_pred, axis=1)
print(classification_report(y_test, y_pred))

[1m34/34[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 0.4555 - loss: 2.0383
Test accuracy: 0.4126
[1m34/34[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step
              precision    recall  f1-score   support

           0       0.42      0.58      0.49        19
           1       0.60      0.27      0.38        11
           2       0.57      0.89      0.69        19
           3       0.60      0.38      0.46         8
           4       0.29      0.13      0.18        15
           5       0.86      0.46      0.60        13
           6       0.11      0.05      0.07        19
           7       0.33      0.53      0.41        19
           8       0.19      0.26      0.22        19
           9       0.15      0.16      0.15        19
          10       0.00      0.00      0.00         8
          11       0.46      0.63      0.53        19
          12       0.50      0.62      0.56         8
          13       0.29      0.42      0.

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


## 3. Embedding Pre-trained + LSTM

In [28]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Embedding, LSTM
from tensorflow.keras.callbacks import EarlyStopping

In [29]:
# 1. Tiền xử lý cho mô hình chuỗi
# a. Tokenizer: Tạo vocab và chuyển text thành chuỗi chỉ số
all_sentences = df_train['text'].to_list()

tokens = set()
for sentence in all_sentences:
    tokens.update(sentence.split())

vocab_size = len(tokens)
# print("Vocab size", vocab_size)

tokenizer = Tokenizer(num_words=vocab_size, oov_token="<UNK>")
tokenizer.fit_on_texts(all_sentences)
train_sequences = tokenizer.texts_to_sequences(all_sentences)
test_sequences = tokenizer.texts_to_sequences(df_test['text'].to_list())
val_sequences = tokenizer.texts_to_sequences(df_val['text'].to_list())
# print("Train sequences:", train_sequences)


# b. Padding: Đảm bảo các chuỗi có cùng độ dài
max_len = 50
X_train_pad = pad_sequences(train_sequences, maxlen=max_len, padding='post')
X_test_pad = pad_sequences(test_sequences, maxlen=max_len, padding='post')
X_val_pad = pad_sequences(val_sequences, maxlen=max_len, padding='post')


# 2. Tạo ma trận trọng số cho Embedding Layer từ Word2Vec
vocab_size = len(tokenizer.word_index) + 1
embedding_dim = w2v_model.vector_size
embedding_matrix = np.zeros((vocab_size, embedding_dim))
for word, i in tokenizer.word_index.items():
    if word in w2v_model.wv:
        embedding_matrix[i] = w2v_model.wv[word]

# print("Embedding matrix:", embedding_matrix)


# 3. Xây dựng mô hình Sequential với LSTM
lstm_model_pretrained = Sequential([
    Embedding(
        input_dim=vocab_size,
        output_dim=embedding_dim,
        weights=[embedding_matrix], # Khởi tạo trọng số
        input_length=max_len,
        trainable=False # Đóng băng lớp Embedding
    ),
    LSTM(128, dropout=0.2),
    Dense(num_classes, activation='softmax')
])


# 4. Compile, huấn luyện (sử earlystopping)
lstm_model_pretrained.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy']
)

early_stopping = EarlyStopping(
    monitor='val_loss',  # Metric to monitor (e.g., validation loss)
    min_delta=0.001,     # Minimum change in the monitored metric to qualify as an improvement
    patience=50,         # Number of epochs with no improvement after which training will be stopped
    verbose=1,           # Verbosity mode (0 for silent, 1 for updates)
    mode='min',          # 'min' for metrics that should decrease (like loss), 'max' for metrics that should increase (like accuracy)
    restore_best_weights=True # Restores model weights from the epoch with the best value of the monitored metric
)

lstm_model_pretrained.fit(
    X_train_pad,
    y_train,
    epochs=200,
    batch_size=32,
    validation_data=(X_val_pad, y_val),
    callbacks=[early_stopping]
)

Epoch 1/200




[1m280/280[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 8ms/step - accuracy: 0.0161 - loss: 4.1498 - val_accuracy: 0.0242 - val_loss: 4.1136
Epoch 2/200
[1m280/280[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 7ms/step - accuracy: 0.0282 - loss: 4.0642 - val_accuracy: 0.0455 - val_loss: 3.9111
Epoch 3/200
[1m280/280[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 7ms/step - accuracy: 0.0445 - loss: 3.9196 - val_accuracy: 0.0660 - val_loss: 3.7704
Epoch 4/200
[1m280/280[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 7ms/step - accuracy: 0.0495 - loss: 3.8314 - val_accuracy: 0.0623 - val_loss: 3.7439
Epoch 5/200
[1m280/280[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 9ms/step - accuracy: 0.0599 - loss: 3.7887 - val_accuracy: 0.0771 - val_loss: 3.6909
Epoch 6/200
[1m280/280[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 7ms/step - accuracy: 0.0671 - loss: 3.7252 - val_accuracy: 0.0911 - val_loss: 3.5703
Epoch 7/200
[1m280/280[0m [32m━

<keras.src.callbacks.history.History at 0x7eef6ffe2270>

In [31]:
loss, acc = lstm_model_pretrained.evaluate(X_test_pad, y_test)

print(f"Test accuracy: {acc:.4f}")

y_pred = lstm_model_pretrained.predict(X_test_pad)
y_pred = np.argmax(y_pred, axis=1)
print(classification_report(y_test, y_pred))

[1m34/34[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 0.5478 - loss: 1.8702
Test accuracy: 0.4591
[1m34/34[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step
              precision    recall  f1-score   support

           0       0.85      0.89      0.87        19
           1       0.89      0.73      0.80        11
           2       0.74      0.89      0.81        19
           3       0.60      0.38      0.46         8
           4       0.40      0.40      0.40        15
           5       0.62      0.62      0.62        13
           6       0.36      0.21      0.27        19
           7       0.41      0.63      0.50        19
           8       0.30      0.37      0.33        19
           9       0.29      0.21      0.24        19
          10       0.62      0.62      0.62         8
          11       0.63      0.63      0.63        19
          12       0.33      0.62      0.43         8
          13       0.30      0.42      0.

## 4. Embedding học từ đầu + LSTM

In [32]:
all_sentences = df_train['text'].to_list()

tokens = set()
for sentence in all_sentences:
    tokens.update(sentence.split())

vocab_size = len(tokens)
max_len = 50

In [48]:
X_train_pad

array([[   9,   99,   24, ...,    0,    0,    0],
       [ 809,   39,   36, ...,    0,    0,    0],
       [ 606,   36,  532, ...,    0,    0,    0],
       ...,
       [  44,    5, 1519, ...,    0,    0,    0],
       [ 202,    5,  386, ...,    0,    0,    0],
       [   9,    6,    2, ...,    0,    0,    0]], dtype=int32)

In [54]:
# 4. Xây dựng mô hình
lstm_model_scratch = Sequential([
    Embedding(
        input_dim=vocab_size,
        output_dim=100,
        input_length=max_len
    ),
    LSTM(128, dropout=0.2),
    Dense(num_classes, activation='softmax')
])

lstm_model_scratch.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy']
)

early_stopping = EarlyStopping(
    monitor='val_loss',
    min_delta=0.001,
    patience=50,
    verbose=1,
    mode='min',
    restore_best_weights=True
)

history = lstm_model_scratch.fit(
    X_train_pad, y_train,
    epochs=200,
    batch_size=32,
    validation_data=(X_val_pad, y_val),
    callbacks=[early_stopping]
)

Epoch 1/200
[1m280/280[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 8ms/step - accuracy: 0.0143 - loss: 4.1492 - val_accuracy: 0.0177 - val_loss: 4.1295
Epoch 2/200
[1m280/280[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 7ms/step - accuracy: 0.0187 - loss: 4.1363 - val_accuracy: 0.0177 - val_loss: 4.1277
Epoch 3/200
[1m280/280[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 10ms/step - accuracy: 0.0153 - loss: 4.1363 - val_accuracy: 0.0177 - val_loss: 4.1258
Epoch 4/200
[1m280/280[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 7ms/step - accuracy: 0.0160 - loss: 4.1305 - val_accuracy: 0.0177 - val_loss: 4.1253
Epoch 5/200
[1m280/280[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 7ms/step - accuracy: 0.0160 - loss: 4.1370 - val_accuracy: 0.0177 - val_loss: 4.1247
Epoch 6/200
[1m280/280[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 7ms/step - accuracy: 0.0175 - loss: 4.1313 - val_accuracy: 0.0177 - val_loss: 4.1248
Epoch 7/200
[1m280/2

In [55]:
loss, acc = lstm_model_scratch.evaluate(X_test_pad, y_test)

print(f"Test accuracy: {acc:.4f}")

y_pred = lstm_model_scratch.predict(X_test_pad)
y_pred = np.argmax(y_pred, axis=1)
print(classification_report(y_test, y_pred))

[1m34/34[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - accuracy: 0.0552 - loss: 4.1512
Test accuracy: 0.0177
[1m34/34[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step
              precision    recall  f1-score   support

           0       0.00      0.00      0.00        19
           1       0.00      0.00      0.00        11
           2       0.02      1.00      0.03        19
           3       0.00      0.00      0.00         8
           4       0.00      0.00      0.00        15
           5       0.00      0.00      0.00        13
           6       0.00      0.00      0.00        19
           7       0.00      0.00      0.00        19
           8       0.00      0.00      0.00        19
           9       0.00      0.00      0.00        19
          10       0.00      0.00      0.00         8
          11       0.00      0.00      0.00        19
          12       0.00      0.00      0.00         8
          13       0.00      0.00      0.

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


# Evaluation


In [60]:
def transform_to_vector(texts, model):
    vectors = np.array(
        [sentence_to_avg_vector(text, model) for text in texts],
        dtype='float32'
    )
    return vectors

texts = [
    "can you remind me to not call my mom",
    "is it going to be sunny or rainy tomorrow",
    "find a flight from new york to london but not through paris"
]
labels = ['reminder_create', 'weather_query', 'flight_search']

print("Ground true:")
print(labels)


y_pred = tfidf_lr_pipeline.predict(texts)
label_pred = le.inverse_transform(y_pred)
print("Logistic Regression:")
print(label_pred)

y_pred = model.predict(transform_to_vector(texts, w2v_model))
y_pred = np.argmax(y_pred, axis=1)
label_pred = le.inverse_transform(y_pred)
print("Word2vec + Dense:")
print(label_pred)

max_len = max([len(text) for text in texts])
text_sequences = tokenizer.texts_to_sequences(texts)
texts_pad = pad_sequences(text_sequences, maxlen=max_len, padding='post')

y_pred = lstm_model_pretrained.predict(texts_pad)
y_pred = np.argmax(y_pred, axis=1)
label_pred = le.inverse_transform(y_pred)
print("Embedding (Pretrained) + LSTM:")
print(label_pred)


y_pred = lstm_model_scratch.predict(texts_pad)
y_pred = np.argmax(y_pred, axis=1)
label_pred = le.inverse_transform(y_pred)
print("Embedding (Scratch) + LSTM:")
print(label_pred)



Ground true:
['reminder_create', 'weather_query', 'flight_search']
Logistic Regression:
['calendar_set' 'weather_query' 'general_negate']
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 43ms/step
Word2vec + Dense:
['email_query' 'weather_query' 'email_sendemail']
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 49ms/step
Embedding (Pretrained) + LSTM:
['takeaway_query' 'weather_query' 'social_post']
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 45ms/step
Embedding (Scratch) + LSTM:
['alarm_set' 'alarm_set' 'alarm_set']
