In [None]:
import tensorflow as tf
from transformers import AutoTokenizer, TFAutoModel
import pandas as pd
from sklearn.model_selection import train_test_split

# Ask TensorFlow to grow GPU memory on demand instead of reserving it all.
gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
    try:
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
    except RuntimeError as e:
        # Best effort: report the problem and continue with the default
        # allocation strategy rather than aborting the whole run.
        print(e)

    # Mixed precision is only enabled when a GPU is present. On a CPU-only
    # machine TensorFlow warns that the mixed_float16 policy may run slowly,
    # so the default float32 policy is kept there.
    tf.keras.mixed_precision.set_global_policy("mixed_float16")

# Turn on the XLA JIT compiler for graph optimisation.
tf.config.optimizer.set_jit(True)

# Load the PhoBERT tokenizer and encoder from one shared checkpoint name.
_PHOBERT_CHECKPOINT = "vinai/phobert-base"
tokenizer = AutoTokenizer.from_pretrained(_PHOBERT_CHECKPOINT)
phobert = TFAutoModel.from_pretrained(_PHOBERT_CHECKPOINT)

# Freeze the encoder: its weights are marked as non-trainable.
phobert.trainable = False

# Preprocess the data
def preprocess_data(data: pd.DataFrame, max_length: int = 256):
    """Tokenize the ``Content`` column and pair it with the ``Label`` column.

    Args:
        data: Frame holding a ``Content`` column (article text) and a
            ``Label`` column (numeric class label).
        max_length: Length every sequence is padded or truncated to.
            Defaults to 256, the value used by the rest of this script.

    Returns:
        A 3-tuple ``(input_ids, attention_mask, labels)`` of tensors: the
        first two are ``int32`` tokenizer outputs, the last is ``float32``.
    """
    # Empty CSV cells are read by pandas as float NaN, which is not valid
    # tokenizer input. Replace them with empty strings and coerce every
    # remaining value to ``str`` so one bad row cannot abort tokenization.
    texts = data['Content'].fillna('').astype(str).tolist()
    labels = data['Label'].tolist()

    # Tokenize with the module-level PhoBERT tokenizer, padding/truncating
    # every example to exactly ``max_length`` tokens.
    # NOTE(review): PhoBERT is usually fed word-segmented Vietnamese text;
    # confirm whether ``Content`` is already segmented upstream.
    inputs = tokenizer(
        texts,
        padding='max_length',
        truncation=True,
        max_length=max_length,
        return_tensors='tf',
    )

    return (
        tf.convert_to_tensor(inputs['input_ids'], dtype=tf.int32),
        tf.convert_to_tensor(inputs['attention_mask'], dtype=tf.int32),
        tf.convert_to_tensor(labels, dtype=tf.float32),
    )


# Define the PhoBERT-based classification model
def build_model(max_length: int = 256, dropout_rate: float = 0.2, learning_rate: float = 2e-5):
    """Build and compile a binary classifier on top of the PhoBERT encoder.

    The model takes ``input_ids`` and ``attention_mask`` inputs, runs them
    through the module-level ``phobert`` encoder, takes the hidden state of
    the first token as the text embedding, and feeds it through dropout and
    a single sigmoid unit.

    Args:
        max_length: Sequence length of both inputs. Must match the length
            the tokenized data was padded to. Defaults to 256.
        dropout_rate: Dropout rate applied to the text embedding.
        learning_rate: Learning rate of the Adam optimizer.

    Returns:
        A compiled ``tf.keras.Model`` using binary cross-entropy loss and
        the accuracy metric.
    """
    input_ids = tf.keras.layers.Input(shape=(max_length,), dtype=tf.int32, name='input_ids')
    attention_mask = tf.keras.layers.Input(shape=(max_length,), dtype=tf.int32, name='attention_mask')

    # When the encoder is frozen it is used as a fixed feature extractor, so
    # run it in inference mode: otherwise its internal dropout layers stay
    # active during ``fit`` and add noise to features that cannot adapt.
    # A trainable encoder keeps Keras' default training-flag propagation.
    if phobert.trainable:
        encoder_outputs = phobert(input_ids, attention_mask=attention_mask)
    else:
        encoder_outputs = phobert(input_ids, attention_mask=attention_mask, training=False)

    # First element of the encoder output: the per-token hidden states.
    phobert_output = encoder_outputs[0]

    # Use the first token's hidden state (the [CLS]-style position) as the
    # text embedding instead of averaging over all token states.
    text_embedding = phobert_output[:, 0, :]

    dropout = tf.keras.layers.Dropout(dropout_rate)(text_embedding)
    # The output layer is pinned to float32 so the sigmoid stays in full
    # precision even when a mixed-precision policy is active.
    output = tf.keras.layers.Dense(1, activation='sigmoid', dtype=tf.float32)(dropout)

    model = tf.keras.Model(inputs=[input_ids, attention_mask], outputs=output)
    model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate),
                  loss='binary_crossentropy',
                  metrics=['accuracy'])
    return model


# Read both article dumps: the first CSV into real_news, the second into fake_news.
real_news, fake_news = (
    pd.read_csv(csv_path)
    for csv_path in ('./data/vnexpress_dataset.csv', './data/vnexpress_fake_dataset.csv')
)

# Attach the class label: 0 for real_news rows, 1 for fake_news rows.
real_news['Label'], fake_news['Label'] = 0, 1

# Stack both frames into one dataset with a fresh 0..n-1 index.
data = pd.concat([real_news, fake_news], ignore_index=True)

# Split into train (70%), validation (15%) and test (15%).
# Step 1: keep 70% for training and set the remaining 30% aside,
# stratified on the label.
train_texts, holdout_texts, train_labels, holdout_labels = train_test_split(
    data['Content'],
    data['Label'],
    test_size=0.3,
    random_state=42,
    stratify=data['Label'],
)

# Step 2: cut the 30% hold-out in half to get the validation and test
# splits, again stratified on the label.
val_texts, test_texts, val_labels, test_labels = train_test_split(
    holdout_texts,
    holdout_labels,
    test_size=0.5,
    random_state=42,
    stratify=holdout_labels,
)

# Tokenize each split; every call returns (input_ids, attention_mask, labels).
train_inputs, train_mask, train_labels = preprocess_data(pd.DataFrame({'Content': train_texts, 'Label': train_labels}))
val_inputs, val_mask, val_labels = preprocess_data(pd.DataFrame({'Content': val_texts, 'Label': val_labels}))
test_inputs, test_mask, test_labels = preprocess_data(pd.DataFrame({'Content': test_texts, 'Label': test_labels}))


def _make_dataset(input_ids, attention_mask, labels, shuffle=False, batch_size=16):
    """Wrap one tokenized split in a batched, prefetched ``tf.data`` pipeline.

    Args:
        input_ids: Token id tensor for the split.
        attention_mask: Attention mask tensor for the split.
        labels: Label tensor for the split.
        shuffle: When true, reshuffle the examples on every pass over the
            dataset. Meant for the training split only.
        batch_size: Number of examples per batch.

    Returns:
        A ``tf.data.Dataset`` yielding ``(features_dict, labels)`` batches.
    """
    # The dict keys match the names of the model's two Input layers.
    dataset = tf.data.Dataset.from_tensor_slices(
        ({'input_ids': input_ids, 'attention_mask': attention_mask}, labels))
    if shuffle:
        # A buffer covering the whole split shuffles all examples together;
        # the fixed seed matches the seed used for the train/test split.
        # max(..., 1) keeps the buffer size valid for an empty split.
        dataset = dataset.shuffle(buffer_size=max(int(labels.shape[0]), 1), seed=42)
    return dataset.batch(batch_size).prefetch(tf.data.experimental.AUTOTUNE)


# Only the training data is shuffled, so each epoch sees the batches in a
# different order; validation and test keep a fixed order.
train_dataset = _make_dataset(train_inputs, train_mask, train_labels, shuffle=True)

val_dataset = _make_dataset(val_inputs, val_mask, val_labels)

test_dataset = _make_dataset(test_inputs, test_mask, test_labels)

# Build the classifier and train it for three epochs, checking the
# validation split after each one.
model = build_model()
model.fit(train_dataset, epochs=3, validation_data=val_dataset)

# Score the trained model on the held-out test split; evaluate() returns
# the loss followed by the compiled metrics.
test_metrics = model.evaluate(test_dataset)
test_loss, test_acc = test_metrics
print(f'Test Accuracy: {test_acc:.4f}')


The dtype policy mixed_float16 may run slowly because this machine does not have a GPU. Only Nvidia GPUs with compute capability of at least 7.0 run quickly with mixed_float16.


Some layers from the model checkpoint at vinai/phobert-base were not used when initializing TFRobertaModel: ['lm_head']
- This IS expected if you are initializing TFRobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFRobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFRobertaModel were initialized from the model checkpoint at vinai/phobert-base.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFRobertaModel for predictions without further training.


Epoch 1/3
 14/146 [=>............................] - ETA: 28:30 - loss: 0.7970 - accuracy: 0.4464