In [1]:
import pandas as pd
import os

# Directory that holds the train/val/test CSV splits.
# NOTE(review): absolute, machine-specific path - adjust before running elsewhere.
data_path = r'/home/manh/code/nlp/src/data/hwu'
train_path, val_path, test_path = (
    os.path.join(data_path, file_name)
    for file_name in ('train.csv', 'val.csv', 'test.csv')
)

# Every split is parsed with pandas' default CSV settings.
df_train, df_val, df_test = map(pd.read_csv, (train_path, val_path, test_path))

# Report (rows, columns) for each split, then preview the training frame.
for caption, frame in (
    ("Train shape:", df_train),
    ("Validation shape:", df_val),
    ("Test shape:", df_test),
):
    print(caption, frame.shape)
df_train.head()

Train shape: (8954, 2)
Validation shape: (1076, 2)
Test shape: (1076, 2)


Unnamed: 0,text,category
0,what alarms do i have set right now,alarm_query
1,checkout today alarm of meeting,alarm_query
2,report alarm settings,alarm_query
3,see see for me the alarms that you have set to...,alarm_query
4,is there an alarm for ten am,alarm_query


Label encoding

In [2]:
from sklearn.preprocessing import LabelEncoder

# Learn the category -> integer id mapping from the training split only.
label_encoder = LabelEncoder().fit(df_train['category'])

# Apply that same mapping to every split and store it in a new column.
for split_frame in (df_train, df_val, df_test):
    split_frame['label_encoded'] = label_encoder.transform(split_frame['category'])

# Number of distinct categories seen during fitting.
num_classes = len(label_encoder.classes_)
print("Number of classes:", num_classes)

Number of classes: 64


TF-IDF + Logistic Regression

In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.metrics import classification_report
# 1. Build the baseline: TF-IDF features (at most 5000 terms) feeding a
#    logistic-regression classifier.
vectorizer = TfidfVectorizer(max_features=5000)
classifier = LogisticRegression(max_iter=1000)
tfidf_lr_pipeline = make_pipeline(vectorizer, classifier)

# 2. Fit the whole pipeline on the training split.
tfidf_lr_pipeline.fit(df_train['text'], df_train['label_encoded'])

# 3. Predict the test split and print the per-class report.
y_pred = tfidf_lr_pipeline.predict(df_test['text'])
test_report = classification_report(df_test['label_encoded'], y_pred)
print(test_report)

              precision    recall  f1-score   support

           0       0.90      0.95      0.92        19
           1       1.00      0.73      0.84        11
           2       0.81      0.89      0.85        19
           3       1.00      0.75      0.86         8
           4       0.92      0.80      0.86        15
           5       0.93      1.00      0.96        13
           6       0.45      0.53      0.49        19
           7       0.89      0.89      0.89        19
           8       0.81      0.68      0.74        19
           9       0.59      0.68      0.63        19
          10       0.67      0.75      0.71         8
          11       0.74      0.89      0.81        19
          12       0.78      0.88      0.82         8
          13       0.83      0.79      0.81        19
          14       0.92      0.63      0.75        19
          15       0.81      0.89      0.85        19
          16       1.00      1.00      1.00        19
          17       1.00    

WORD2VEC (AVG pooling) + DENSE 

In [8]:
!pip install tensorflow


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.3.1[0m[39;49m -> [0m[32;49m25.3[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [12]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from gensim.models import Word2Vec

# 1. Train a Word2Vec model on the training texts, tokenised by whitespace.
sentences = list(map(str.split, df_train['text']))
w2v_model = Word2Vec(
    sentences,
    vector_size=100,
    window=5,
    min_count=1,  # keep every token, even those seen only once
    workers=4,
    epochs=100,
)

# 2. Turn each sentence into the average of its word vectors.
def sentence_to_avg_vector(text, model):
    """Return the mean word vector of ``text``.

    Args:
        text: Sentence to embed; it is tokenised by splitting on whitespace.
        model: Object exposing ``wv`` (supporting ``word in wv`` and
            ``wv[word]``) and ``vector_size``, e.g. a trained gensim
            ``Word2Vec`` model.

    Returns:
        A 1-D array of length ``model.vector_size``: the element-wise mean of
        the vectors of all in-vocabulary tokens, or all zeros when no token of
        ``text`` is in the vocabulary (this includes the empty string).
    """
    vectors = [model.wv[word] for word in text.split() if word in model.wv]
    if not vectors:
        # gensim stores word vectors as float32; returning float32 zeros keeps
        # the fallback consistent with the averaged path, so stacking sentence
        # vectors does not silently upcast the feature matrix to float64.
        return np.zeros(model.vector_size, dtype=np.float32)
    return np.mean(vectors, axis=0)

# 3. Build the train/val/test feature matrices (one averaged vector per text)
#    and the matching integer label arrays.
X_train_avg, X_val_avg, X_test_avg = (
    np.array([sentence_to_avg_vector(text, w2v_model) for text in frame['text']])
    for frame in (df_train, df_val, df_test)
)
y_train, y_val, y_test = (
    frame['label_encoded'].values for frame in (df_train, df_val, df_test)
)

# 4. Build the Keras Sequential classifier on top of the averaged vectors.
model = Sequential([
    # Declare the input explicitly: passing `input_shape` to the first layer
    # is deprecated in Keras 3 and triggers a UserWarning.
    tf.keras.Input(shape=(w2v_model.vector_size,)),
    Dense(128, activation='relu'),
    Dropout(0.5),
    Dense(num_classes, activation='softmax')
])
# Labels are integer class ids, hence the sparse variant of cross-entropy.
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Stop once val_loss has not improved for 10 epochs and roll back to the
# weights of the best epoch.
early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)
history = model.fit(
    X_train_avg, y_train,
    validation_data=(X_val_avg, y_val),
    epochs=100, batch_size=32,
    callbacks=[early_stopping]
)

# Evaluate on the test split: pick the most probable class per example.
y_pred = model.predict(X_test_avg)
y_pred_classes = np.argmax(y_pred, axis=1)
print(classification_report(y_test, y_pred_classes))

2025-12-06 11:34:43.539638: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
2025-12-06 11:34:49.556922: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:998] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2025-12-06 11:34:49.560488: W tensorflow/core/common_runtime/gpu/gpu_device.cc:2251] Cannot dlopen some GPU libraries. Please make sure the missing libraries mentioned above are installed properly if you would like to use GPU. Follow the guide at https://www.tensorflow.org/install

Epoch 1/100
[1m280/280[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.3372 - loss: 2.7840 - val_accuracy: 0.6673 - val_loss: 1.4282
Epoch 2/100
[1m280/280[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.6004 - loss: 1.4746 - val_accuracy: 0.7565 - val_loss: 0.9804
Epoch 3/100
[1m280/280[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.6647 - loss: 1.1866 - val_accuracy: 0.7714 - val_loss: 0.8481
Epoch 4/100
[1m280/280[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.7015 - loss: 1.0497 - val_accuracy: 0.7900 - val_loss: 0.7819
Epoch 5/100
[1m280/280[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.7247 - loss: 0.9662 - val_accuracy: 0.7909 - val_loss: 0.7383
Epoch 6/100
[1m280/280[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.7381 - loss: 0.9061 - val_accuracy: 0.7974 - val_loss: 0.7085
Epoch 7/100
[1m280/28

In [13]:

# Compute the loss on the test set: the mean negative log-probability that
# the model assigns to the true class of each example.
y_pred_proba = model.predict(X_test_avg)
true_class_proba = y_pred_proba[np.arange(len(y_test)), y_test]
# Clip away exact zeros: np.log(0) is -inf, so a single confidently wrong
# prediction would otherwise turn the whole mean into inf.
true_class_proba = np.clip(true_class_proba, np.finfo(y_pred_proba.dtype).eps, 1.0)
log_loss = -np.mean(np.log(true_class_proba))
print(f"Log Loss on test set: {log_loss}")

[1m34/34[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step 
Log Loss on test set: 0.6759335398674011


# 3. Mô hình Embedding pretrained (Word2Vec) + LSTM

In [15]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Embedding, LSTM

# Fit the tokenizer on the training texts. No `num_words` cap is set on
# purpose: a cap derived from the Word2Vec vocabulary size does not match this
# tokenizer's own vocabulary (Word2Vec was trained on plain str.split tokens,
# whereas Tokenizer lowercases and strips punctuation), and any word whose
# index reaches the cap is silently mapped to <UNK>.
tokenizer = Tokenizer(oov_token="<UNK>")
tokenizer.fit_on_texts(df_train['text'])
# +1 because index 0 is reserved for padding.
vocab_size = len(tokenizer.word_index) + 1

# Convert every split to sequences of integer word indices.
train_sequences = tokenizer.texts_to_sequences(df_train['text'])
val_sequences = tokenizer.texts_to_sequences(df_val['text'])
test_sequences = tokenizer.texts_to_sequences(df_test['text'])

# Padding: make sure all sequences have the same length (zeros appended).
max_len = 100
X_train_pad = pad_sequences(train_sequences, maxlen=max_len, padding='post')
X_val_pad = pad_sequences(val_sequences, maxlen=max_len, padding='post')
X_test_pad = pad_sequences(test_sequences, maxlen=max_len, padding='post')

# Build the weight matrix for the Embedding layer from the Word2Vec vectors.
# Row i holds the vector of the word whose tokenizer index is i; rows of words
# that Word2Vec does not know (and row 0, which no word maps to) stay zero.
embedding_dim = w2v_model.vector_size
vocab_size = 1 + len(tokenizer.word_index)
embedding_matrix = np.zeros((vocab_size, embedding_dim))
known_words = w2v_model.wv
for token, row in tokenizer.word_index.items():
    if token not in known_words:
        continue
    embedding_matrix[row] = known_words[token]
# LSTM classifier on top of frozen embeddings initialised from Word2Vec.
lstm_model_pretrained = Sequential([
    Embedding(
        input_dim=vocab_size,
        output_dim=embedding_dim,
        weights=[embedding_matrix],  # initialise with the pretrained vectors
        # `input_length` is deliberately not passed: it is deprecated in
        # Keras 3 and the LSTM does not need a static sequence length.
        mask_zero=True,  # index 0 is padding and is masked for the LSTM
        trainable=False  # freeze the Embedding layer
    ),
    LSTM(128, dropout=0.2, recurrent_dropout=0.2),
    Dense(num_classes, activation='softmax')
])



Compile, huấn luyện (sử dụng EarlyStopping) và đánh giá

In [16]:
# Integer class ids as targets -> sparse categorical cross-entropy.
lstm_model_pretrained.compile(
    loss='sparse_categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

# Halt when val_loss stalls for 10 epochs and keep the best weights.
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss',
    patience=10,
    restore_best_weights=True,
)

history = lstm_model_pretrained.fit(
    x=X_train_pad,
    y=y_train,
    validation_data=(X_val_pad, y_val),
    batch_size=32,
    epochs=20,
    callbacks=[early_stopping],
)

Epoch 1/20
[1m280/280[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m38s[0m 127ms/step - accuracy: 0.5553 - loss: 2.0437 - val_accuracy: 0.7593 - val_loss: 0.9384
Epoch 2/20
[1m280/280[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m31s[0m 110ms/step - accuracy: 0.7716 - loss: 0.8709 - val_accuracy: 0.8058 - val_loss: 0.6964
Epoch 3/20
[1m280/280[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m29s[0m 103ms/step - accuracy: 0.8237 - loss: 0.6701 - val_accuracy: 0.8234 - val_loss: 0.6030
Epoch 4/20
[1m280/280[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m28s[0m 100ms/step - accuracy: 0.8444 - loss: 0.5782 - val_accuracy: 0.8364 - val_loss: 0.5802
Epoch 5/20
[1m280/280[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m28s[0m 99ms/step - accuracy: 0.8576 - loss: 0.5133 - val_accuracy: 0.8467 - val_loss: 0.5496
Epoch 6/20
[1m280/280[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m29s[0m 102ms/step - accuracy: 0.8753 - loss: 0.4527 - val_accuracy: 0.8476 - val_loss: 0.5176
Epoch 7/20


In [17]:
from sklearn.metrics import classification_report
# Class probabilities for the test split -> most probable class -> report.
y_pred = lstm_model_pretrained.predict(X_test_pad)
y_pred_classes = y_pred.argmax(axis=1)
test_report = classification_report(y_test, y_pred_classes)
print(test_report)

[1m34/34[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 40ms/step
              precision    recall  f1-score   support

           0       1.00      0.95      0.97        19
           1       0.79      1.00      0.88        11
           2       0.95      0.95      0.95        19
           3       0.88      0.88      0.88         8
           4       0.81      0.87      0.84        15
           5       0.86      0.92      0.89        13
           6       0.62      0.68      0.65        19
           7       1.00      0.95      0.97        19
           8       0.82      0.74      0.78        19
           9       0.85      0.58      0.69        19
          10       0.67      0.75      0.71         8
          11       0.74      0.89      0.81        19
          12       0.80      1.00      0.89         8
          13       0.89      0.84      0.86        19
          14       0.92      0.58      0.71        19
          15       0.86      0.95      0.90        19
       

# 4. Embedding học từ đầu + LSTM

In [18]:
# LSTM classifier whose embeddings are learned from scratch during training.
lstm_model_scratch = Sequential([
    Embedding(
        input_dim=vocab_size,
        output_dim=100,  # chosen embedding dimension, e.g. 100
        # `input_length` is deliberately not passed: it is deprecated in
        # Keras 3 and the LSTM does not need a static sequence length.
        mask_zero=True,  # index 0 is padding and is masked for the LSTM
        # No pretrained weights; trainable=True is the default.
    ),
    LSTM(128, dropout=0.2, recurrent_dropout=0.2),
    Dense(num_classes, activation='softmax')
])

# 2. Compile, train (early stopping on validation loss) and evaluate the model.
lstm_model_scratch.compile(
    loss='sparse_categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss',
    patience=10,
    restore_best_weights=True,
)
history = lstm_model_scratch.fit(
    x=X_train_pad,
    y=y_train,
    validation_data=(X_val_pad, y_val),
    batch_size=32,
    epochs=100,
    callbacks=[early_stopping],
)

# Most probable class per test example, then the per-class report.
y_pred = lstm_model_scratch.predict(X_test_pad)
y_pred_classes = y_pred.argmax(axis=1)
print(classification_report(y_test, y_pred_classes))

Epoch 1/100




[1m280/280[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m38s[0m 126ms/step - accuracy: 0.2704 - loss: 3.2282 - val_accuracy: 0.5809 - val_loss: 1.9130
Epoch 2/100
[1m280/280[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m33s[0m 119ms/step - accuracy: 0.6979 - loss: 1.3560 - val_accuracy: 0.7918 - val_loss: 0.9213
Epoch 3/100
[1m280/280[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m32s[0m 113ms/step - accuracy: 0.8405 - loss: 0.7101 - val_accuracy: 0.8401 - val_loss: 0.6986
Epoch 4/100
[1m280/280[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m34s[0m 120ms/step - accuracy: 0.9017 - loss: 0.4431 - val_accuracy: 0.8606 - val_loss: 0.5782
Epoch 5/100
[1m280/280[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m35s[0m 125ms/step - accuracy: 0.9344 - loss: 0.2975 - val_accuracy: 0.8606 - val_loss: 0.5338
Epoch 6/100
[1m280/280[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m35s[0m 124ms/step - accuracy: 0.9539 - loss: 0.2117 - val_accuracy: 0.8690 - val_loss: 0.5349
Epoch 7/100
[1m

loss trên tập test 

In [19]:
# Test-set loss: the mean negative log-probability that the model assigns to
# the true class of each example.
y_pred_proba = lstm_model_scratch.predict(X_test_pad)
true_class_proba = y_pred_proba[np.arange(len(y_test)), y_test]
# Clip away exact zeros: np.log(0) is -inf, so a single confidently wrong
# prediction would otherwise turn the whole mean into inf.
true_class_proba = np.clip(true_class_proba, np.finfo(y_pred_proba.dtype).eps, 1.0)
log_loss = -np.mean(np.log(true_class_proba))
print(f"Log Loss on test set: {log_loss}")

[1m34/34[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 32ms/step
Log Loss on test set: 0.5858633518218994
