In [None]:
from transformers import AutoTokenizer, TFAutoModel
import os
import time
import pandas as pd
from sklearn.model_selection import train_test_split

# Export TensorFlow's thread-pool sizes through environment variables.
# NOTE(review): the TF* classes from `transformers` are imported earlier in
# this file, which may already pull in TensorFlow - confirm these variables
# are still honored at that point.
#
# os.cpu_count() reports *logical* CPUs, so it cannot be used for the physical
# core count. psutil (optional) can report physical cores; without it we fall
# back to the logical count.
num_logical_cores = os.cpu_count()
try:
    import psutil
    num_physical_cores = psutil.cpu_count(logical=False) or num_logical_cores
except ImportError:
    num_physical_cores = num_logical_cores

if num_logical_cores is not None:
    # Inter-op parallelism is sized by physical cores, intra-op by logical cores.
    os.environ["TF_NUM_INTEROP_THREADS"] = str(num_physical_cores)
    os.environ["TF_NUM_INTRAOP_THREADS"] = str(num_logical_cores)
    print(f"💥 CPU Threading configured: {num_physical_cores} physical cores, {num_logical_cores} logical cores")

# GPU diagnostic code - Add before TensorFlow import
import subprocess
import sys

def check_nvidia_gpu() -> bool:
    """Run ``nvidia-smi`` and print its output.

    Returns:
        True when the command ran successfully, False when it is missing,
        not executable, exits with a non-zero status, or times out.
    """
    try:
        nvidia_output = subprocess.check_output(
            ['nvidia-smi'], stderr=subprocess.STDOUT, timeout=30
        ).decode('utf-8', errors='replace')  # never crash on odd bytes in a diagnostic
    except (OSError, subprocess.SubprocessError):
        # OSError covers FileNotFoundError and PermissionError;
        # SubprocessError covers CalledProcessError and TimeoutExpired.
        print("❌ nvidia-smi command failed - NVIDIA driver may not be properly installed")
        return False

    print("💥 NVIDIA-SMI Output:")
    print(nvidia_output)
    return True

def check_cuda_installation() -> bool:
    """Run ``nvcc --version`` and print its output.

    Returns:
        True when the command ran successfully, False when it is missing,
        not executable, exits with a non-zero status, or times out.
    """
    try:
        nvcc_output = subprocess.check_output(
            ['nvcc', '--version'], stderr=subprocess.STDOUT, timeout=30
        ).decode('utf-8', errors='replace')  # never crash on odd bytes in a diagnostic
    except (OSError, subprocess.SubprocessError):
        # OSError covers FileNotFoundError and PermissionError;
        # SubprocessError covers CalledProcessError and TimeoutExpired.
        print("❌ CUDA toolkit not found in PATH - CUDA may not be properly installed")
        return False

    print("💥 CUDA Compiler Version:")
    print(nvcc_output)
    return True

# Run both system-level probes; each prints its own findings and returns a bool.
print("\n💥 CHECKING GPU PREREQUISITES:")
has_nvidia_driver = check_nvidia_gpu()
has_cuda = check_cuda_installation()

# Report which interpreter is running and its version, to help spot a kernel
# started from an unexpected environment.
print(f"\n💥 Python executable: {sys.executable}")
print(f"💥 Python version: {sys.version}")

# Now import TensorFlow after setting thread configurations
import tensorflow as tf
print(f"\n💥 TensorFlow version: {tf.__version__}")
print(f"💥 TensorFlow built with CUDA: {tf.test.is_built_with_cuda()}")

# tf.test.is_gpu_available() is deprecated and initializes the GPU runtime,
# which would make a later set_memory_growth() call fail. Listing physical
# devices is the documented replacement; query once and reuse the result.
tf_gpu_available = bool(tf.config.list_physical_devices('GPU'))
print(f"💥 TensorFlow GPU available: {tf_gpu_available}")

# The driver sees a GPU but TensorFlow does not: print likely causes and fixes.
if has_nvidia_driver and not tf_gpu_available:
    print("\n❗ POTENTIAL ISSUE DETECTED:")
    print("   - NVIDIA GPU detected by system but not by TensorFlow")
    print("   - This may be caused by:")
    print("     1. Using CPU-only TensorFlow instead of GPU version")
    print("     2. Incompatible CUDA or cuDNN versions")
    print("     3. Environment configuration issues")
    print("\n💡 RECOMMENDATIONS:")
    print("   - Ensure you have installed tensorflow-gpu or tensorflow>=2.1 with pip")
    print("   - Check compatible CUDA/cuDNN versions for your TensorFlow version")
    print("   - Try: pip install tensorflow==2.10.0 (or another recent version)")
    print("   - For manual GPU setup, see: https://www.tensorflow.org/install/gpu")

# 🚀 **Detect GPUs and configure the runtime (GPU path, or CPU fallback)**
gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
    try:
        # Enable memory growth for all GPUs
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
        print(f"💥 Detected {len(gpus)} GPU(s):")
        for i, gpu in enumerate(gpus):
            print(f"  GPU {i}: {gpu.name}")
        
        logical_gpus = tf.config.experimental.list_logical_devices('GPU')
        print(f"💥 {len(logical_gpus)} Logical GPU(s) available")
        
        # 🚀 **Enable mixed precision on GPU for speed**
        tf.keras.mixed_precision.set_global_policy("mixed_float16")
        print(f"💥 Mixed precision policy: {tf.keras.mixed_precision.global_policy()}")
        
        # 🚀 **Turn on the XLA compiler for better GPU performance**
        tf.config.optimizer.set_jit(True)
        print("💥 XLA JIT compilation enabled")
    except RuntimeError as e:
        # NOTE(review): a RuntimeError anywhere above skips the remaining GPU
        # setup (mixed precision, XLA) while `gpus` stays truthy - confirm
        # that is the intended degradation.
        print(f"❌ GPU error: {e}")
else:
    print("❌ No GPU detected by TensorFlow. Running on CPU.")
    # Configure for CPU - use float32 for better compatibility
    tf.keras.mixed_precision.set_global_policy("float32")
    print(f"💥 Mixed precision policy: {tf.keras.mixed_precision.global_policy()}")
    # Echo the thread counts read back from the environment variables.
    print(f"💥 CPU Optimization: Using {os.environ.get('TF_NUM_INTEROP_THREADS')} inter-op threads, {os.environ.get('TF_NUM_INTRAOP_THREADS')} intra-op threads")
        
    # Disable XLA which isn't needed for CPU
    tf.config.optimizer.set_jit(False)
    print("💥 XLA JIT compilation disabled for CPU")

# CPU memory monitor function
def cpu_memory_usage() -> str:
    """Describe the resident memory (RSS) of the current process.

    Returns:
        ``"Memory usage: <n> MB"`` when psutil is installed and the process
        can be queried, otherwise ``"Memory monitoring not available"``.
    """
    unavailable = "Memory monitoring not available"

    # psutil is an optional dependency; degrade gracefully without it.
    try:
        import psutil
    except ImportError:
        return unavailable

    # Catch only what the query can raise, so KeyboardInterrupt/SystemExit
    # are no longer swallowed as they were by a bare `except:`.
    try:
        rss_bytes = psutil.Process(os.getpid()).memory_info().rss
    except (psutil.Error, OSError):
        return unavailable

    return f"Memory usage: {rss_bytes / (1024 * 1024):.1f} MB"

# Resource monitoring function that works for both CPU and GPU
def resource_usage():
    """Return a list of human-readable memory usage strings.

    With GPUs detected (module-level ``gpus`` is non-empty) the list has one
    entry per GPU reported by ``nvidia-smi``; if that query fails it holds a
    single "not available" message. Without GPUs it holds the single string
    returned by ``cpu_memory_usage()``.
    """
    if not gpus:
        return [cpu_memory_usage()]

    import subprocess

    try:
        result = subprocess.check_output(
            ['nvidia-smi', '--query-gpu=memory.used,memory.total', '--format=csv,nounits,noheader'],
            timeout=10,  # a hung nvidia-smi must not stall the training loop
        ).decode('utf-8')
        # Each output row is "<used>,<total>" in MB for one GPU.
        memory_usage = [tuple(map(int, row.split(','))) for row in result.strip().split('\n')]
        return [
            f"GPU {i}: {used} MB / {total} MB ({used/total:.1%})"
            for i, (used, total) in enumerate(memory_usage)
        ]
    except (OSError, subprocess.SubprocessError, ValueError, ZeroDivisionError):
        # OSError/SubprocessError: nvidia-smi missing, failing or timing out.
        # ValueError: undecodable or unparsable output (e.g. "[N/A]").
        # ZeroDivisionError: a reported total of 0 MB.
        # Anything else, including KeyboardInterrupt, propagates.
        return ["GPU memory monitoring not available"]

# 🔥 **Load the PhoBERT tokenizer and model**
tokenizer = AutoTokenizer.from_pretrained("vinai/phobert-base")
phobert = TFAutoModel.from_pretrained("vinai/phobert-base")
phobert.trainable = False  # ⚡ Freeze PhoBERT so its weights are not updated during training

# Print memory after loading model
print("\n💥 Memory after loading PhoBERT:")
print(resource_usage())

# 🚀 **Data preprocessing function**
def _is_missing_text(value):
    """Return True when *value* is None or a scalar missing marker (NaN, pd.NA, NaT)."""
    if value is None:
        return True
    if isinstance(value, str):
        return False
    # Restrict pd.isna to scalars: on containers it returns an array.
    return bool(pd.api.types.is_scalar(value) and pd.isna(value))


def preprocess_data(texts, labels, max_length=256):
    """Clean text/label pairs and tokenize the texts with the module-level tokenizer.

    Args:
        texts: Sequence of texts (objects exposing ``tolist()`` are converted to lists).
        labels: Sequence of labels aligned with ``texts``.
        max_length: Length every sequence is padded or truncated to.

    Returns:
        A tuple ``(input_ids, attention_mask, labels)`` of tensors with dtypes
        int32, int32 and float32. Rows whose text is missing are dropped from
        all three.

    Raises:
        ValueError: If no text remains after dropping missing entries.
    """
    # Convert to list if they're Series objects
    if hasattr(texts, 'tolist'):
        texts = texts.tolist()
    if hasattr(labels, 'tolist'):
        labels = labels.tolist()

    cleaned_texts = []
    cleaned_labels = []

    for i, (text, label) in enumerate(zip(texts, labels)):
        # Drop missing entries together with their label so both stay aligned.
        if _is_missing_text(text):
            print(f"Warning: Skipping item {i} with None/NaN text")
            continue

        # The tokenizer expects strings; coerce anything else (e.g. numbers).
        cleaned_texts.append(text if isinstance(text, str) else str(text))
        cleaned_labels.append(label)

    if not cleaned_texts:
        raise ValueError("No valid text entries found after cleaning")

    # Tokenize the texts with PhoBERT's tokenizer, padding/truncating to max_length
    inputs = tokenizer(
        cleaned_texts,
        padding='max_length',
        truncation=True,
        max_length=max_length,
        return_tensors='tf',
    )

    return (
        tf.convert_to_tensor(inputs['input_ids'], dtype=tf.int32),
        tf.convert_to_tensor(inputs['attention_mask'], dtype=tf.int32),
        tf.convert_to_tensor(cleaned_labels, dtype=tf.float32),
    )


# 🔥 **Custom Keras layer wrapping PhoBERT**
class CustomPhoBERTLayer(tf.keras.layers.Layer):
    """Keras layer that runs a pretrained transformer and returns its first output.

    Only the checkpoint name is stored in the layer config; ``from_config``
    reloads the model by that name.

    Args:
        phobert_model: The loaded transformer model to call.
        phobert_name: Checkpoint identifier recorded for serialization. It
            should match the checkpoint ``phobert_model`` was loaded from.
        **kwargs: Forwarded to ``tf.keras.layers.Layer``.
    """

    def __init__(self, phobert_model, phobert_name="vinai/phobert-base", **kwargs):
        super().__init__(**kwargs)
        self.phobert = phobert_model
        self.phobert_name = phobert_name  # Store the name for serialization

    def call(self, inputs):
        """Run the wrapped model on ``(input_ids, attention_mask)`` and return element 0 of its output."""
        input_ids, attention_mask = inputs
        return self.phobert(input_ids=input_ids, attention_mask=attention_mask)[0]

    def get_config(self):
        """Return the base layer config extended with ``phobert_name``."""
        config = super().get_config()
        config.update({"phobert_name": self.phobert_name})
        return config

    @classmethod
    def from_config(cls, config):
        """Rebuild the layer, reloading the (frozen) model named in the config."""
        from transformers import TFAutoModel

        config_copy = dict(config)
        # Fall back to the default checkpoint if the key is absent.
        phobert_name = config_copy.pop("phobert_name", "vinai/phobert-base")
        phobert_model = TFAutoModel.from_pretrained(phobert_name)
        phobert_model.trainable = False
        return cls(phobert_model, phobert_name=phobert_name, **config_copy)


# 🔥 **Build the PhoBERT classification model**
def build_model(max_length=256):
    """Build and compile the binary classifier on top of the module-level ``phobert``.

    Args:
        max_length: Token sequence length of both inputs; must match the
            length used when tokenizing.

    Returns:
        A compiled ``tf.keras.Model`` taking ``input_ids`` and
        ``attention_mask`` and producing one sigmoid probability per example.
    """
    input_ids = tf.keras.layers.Input(shape=(max_length,), dtype=tf.int32, name='input_ids')
    attention_mask = tf.keras.layers.Input(shape=(max_length,), dtype=tf.int32, name='attention_mask')

    # ✅ **Use CustomPhoBERTLayer instead of a Lambda around the model**
    phobert_output = CustomPhoBERTLayer(phobert)([input_ids, attention_mask])

    # 📌 **Take the embedding of the first token (position 0) of every sequence**
    text_embedding = tf.keras.layers.Lambda(lambda x: x[:, 0, :])(phobert_output)

    dropout = tf.keras.layers.Dropout(0.1)(text_embedding)
    # Keep the output layer in float32: under a mixed_float16 global policy a
    # float16 sigmoid output feeds a numerically fragile loss. Under a float32
    # policy this changes nothing.
    output = tf.keras.layers.Dense(1, activation='sigmoid', dtype='float32')(dropout)

    model = tf.keras.Model(inputs=[input_ids, attention_mask], outputs=output)
    model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=2e-5),
                  loss='binary_crossentropy',
                  metrics=['accuracy'])
    return model


# 🚀 **Load the data**
real_news = pd.read_csv('./data/vnexpress_dataset.csv')
fake_news = pd.read_csv('./data/vnexpress_fake_dataset.csv')

# Assign labels: 0 = real news, 1 = fake news
real_news['Label'] = 0
fake_news['Label'] = 1
data = pd.concat([real_news, fake_news], ignore_index=True)

# 🚀 **Split into train (70%), validation (15%) and test (15%)**
# First hold out 30% of the rows, stratified by label ...
train_texts, test_texts, train_labels, test_labels = train_test_split(
    data['Content'], data['Label'], test_size=0.3, random_state=42, stratify=data['Label']
)

# ... then cut that held-out 30% in half to get validation and test sets.
val_texts, test_texts, val_labels, test_labels = train_test_split(
    test_texts, test_labels, test_size=0.5, random_state=42, stratify=test_labels
)

# 🚀 **Tokenize the data**
# Each call rebinds the *_labels name to the label tensor returned by preprocess_data.
train_inputs, train_mask, train_labels = preprocess_data(train_texts, train_labels)
val_inputs, val_mask, val_labels = preprocess_data(val_texts, val_labels)
test_inputs, test_mask, test_labels = preprocess_data(test_texts, test_labels)

# 🚀 **Create the TensorFlow datasets**
# Smaller batch size for CPU
batch_size = 8 if not gpus else 16


def _make_dataset(input_ids, attention_mask, labels, shuffle=False):
    """Build a cached, batched, prefetched dataset of ({inputs}, label) pairs.

    When ``shuffle`` is true the examples are reshuffled every epoch, using a
    buffer that covers the whole split.
    """
    dataset = tf.data.Dataset.from_tensor_slices(
        ({'input_ids': input_ids, 'attention_mask': attention_mask}, labels)
    ).cache()
    if shuffle:
        # Shuffle after cache() so the cached order is not frozen in, and
        # before batch() so individual examples (not batches) are shuffled.
        dataset = dataset.shuffle(
            buffer_size=int(labels.shape[0]),
            seed=42,
            reshuffle_each_iteration=True,
        )
    return dataset.batch(batch_size).prefetch(tf.data.AUTOTUNE)


# Only the training split is shuffled; validation and test keep a fixed order.
train_dataset = _make_dataset(train_inputs, train_mask, train_labels, shuffle=True)
val_dataset = _make_dataset(val_inputs, val_mask, val_labels)
test_dataset = _make_dataset(test_inputs, test_mask, test_labels)

# 🚀 **Train the model**
# Build and compile the classifier; training itself starts further below.
model = build_model()

# Keras callback that logs resource usage around every epoch
class ResourceMonitor(tf.keras.callbacks.Callback):
    """Print the output of ``resource_usage()`` at the start and end of each epoch."""

    def _report(self, header):
        # Shared by both hooks: print the header, then the current usage.
        print(header)
        print(resource_usage())

    def on_epoch_begin(self, epoch, logs=None):
        # `epoch` is zero-based; display it one-based.
        self._report(f"\n💥 Resource usage before epoch {epoch+1}:")

    def on_epoch_end(self, epoch, logs=None):
        self._report(f"\n💥 Resource usage after epoch {epoch+1}:")

# Measure training time with a monotonic clock: time.time() follows the wall
# clock and can jump if the system time is adjusted during a long run.
start_time = time.perf_counter()

# Train with monitoring
history = model.fit(
    train_dataset,
    validation_data=val_dataset,
    epochs=3,
    callbacks=[ResourceMonitor()]
)

# Print training time
training_time = time.perf_counter() - start_time
print(f"\n💥 Total training time: {training_time:.2f} seconds")

# 🚀 **Evaluate the model on the test set**
test_loss, test_acc = model.evaluate(test_dataset)
print(f'Test Accuracy: {test_acc:.4f}')


💥 CPU Threading configured: 16 physical cores, 16 logical cores

💥 CHECKING GPU PREREQUISITES:
💥 NVIDIA-SMI Output:
Thu Mar 27 07:01:16 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 572.61                 Driver Version: 572.61         CUDA Version: 12.8     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                  Driver-Model | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA GeForce RTX 3080 Ti   WDDM  |   00000000:01:00.0  On |                  N/A |
|  0%   42C    P8             36W /  400W |    2770MiB /  12288MiB |      0%      Default |
|                                         |                        |                  N/A |
+-------------

Some layers from the model checkpoint at vinai/phobert-base were not used when initializing TFRobertaModel: ['lm_head']
- This IS expected if you are initializing TFRobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFRobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFRobertaModel were initialized from the model checkpoint at vinai/phobert-base.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFRobertaModel for predictions without further training.



💥 Memory after loading PhoBERT:
['Memory usage: 3890.5 MB']

💥 Resource usage before epoch 1:
['Memory usage: 3801.4 MB']
Epoch 1/3
 12/896 ━━━━━━━━━━━━━━━━━━━━ 14:43 999ms/step - accuracy: 0.2448 - loss: 0.8557

In [None]:
# 💾 Save the model
# The target directory is not created anywhere else, so create it here
# (`os` is imported at the top of the notebook).
os.makedirs('./model', exist_ok=True)

try:
    model.save('./model/fake_news_model.keras')
    print("✅ Model saved successfully")
except Exception as e:
    # Best effort: full-model serialization can fail for custom layers, and
    # the weights file written below is enough to restore the model.
    print(f"❌ Error saving model: {e}")
    print("Trying alternative save method...")

# Always write the weights too: the loader falls back to this file whenever
# the full model cannot be loaded, even if saving it succeeded.
model.save_weights('./model/fake_news_model.weights.h5')
print("✅ Model weights saved successfully")


In [None]:
# 📊 Helper for classifying a single news text
def predict_news(text, model, tokenizer):
    """Classify one news text as FAKE or REAL.

    Args:
        text: The text to classify; non-string values are converted with ``str``.
        model: Object whose ``predict`` accepts a dict with ``input_ids`` and
            ``attention_mask`` and returns a nested result indexable as ``[0][0]``.
        tokenizer: Callable that tokenizes the text.

    Returns:
        A dict with ``result`` ("FAKE" when the probability is at least 0.5,
        otherwise "REAL"), ``confidence`` (probability of the reported class)
        and ``probability`` (raw model output), the latter two as floats.
    """
    # Make sure the tokenizer receives a string
    text = text if isinstance(text, str) else str(text)

    # Tokenize with the same settings as used for training
    encoded = tokenizer(text, padding='max_length', truncation=True, max_length=256, return_tensors='tf')
    features = {key: encoded[key] for key in ('input_ids', 'attention_mask')}

    # Single example in, single score out
    scores = model.predict(features, verbose=0)
    probability = scores[0][0]

    # Interpret the score: the model output is the probability of "fake"
    if probability >= 0.5:
        verdict, confidence = "FAKE", probability
    else:
        verdict, confidence = "REAL", 1 - probability

    return dict(
        result=verdict,
        confidence=float(confidence),
        probability=float(probability),
    )

# 📱 Load the saved model
def load_model_for_inference(model_path='./model/fake_news_model.keras',
                             weights_path='./model/fake_news_model.weights.h5'):
    """Load the trained classifier for inference.

    First tries to load the full saved model. If that fails for any reason,
    rebuilds the architecture with ``build_model()`` and loads only the weights.

    Args:
        model_path: Path of the full saved model.
        weights_path: Path of the weights file used by the fallback.

    Returns:
        The loaded (or rebuilt) Keras model.

    Raises:
        Whatever ``build_model()`` or ``load_weights`` raise if the fallback
        fails as well.
    """
    try:
        # Try the full model first, using the module-level CustomPhoBERTLayer
        # rather than a duplicated local copy of the class.
        loaded_model = tf.keras.models.load_model(
            model_path,
            custom_objects={'CustomPhoBERTLayer': CustomPhoBERTLayer}
        )
    except Exception as e:
        # Deliberately broad: any failure here should trigger the
        # weights-only fallback instead of aborting.
        print(f"❌ Error loading full model: {e}")
        print("Loading model from architecture and weights...")

        # Rebuild the architecture and load the weights into it
        new_model = build_model()
        new_model.load_weights(weights_path)
        print("✅ Model loaded from weights successfully")
        return new_model

    print("✅ Model loaded successfully")
    return loaded_model

# 🔍 Try the model on a couple of examples
try:
    # Load model
    inference_model = load_model_for_inference()
    
    # Example of a real news item
    real_news_example = "Thủ tướng Phạm Minh Chính cho biết Việt Nam luôn coi trọng hợp tác với EU và đề nghị EU sớm hoàn tất phê chuẩn Hiệp định Bảo hộ đầu tư Việt Nam-EU."
    
    # Example of a fake news item
    fake_news_example = "Nhà khoa học Việt Nam chế tạo thành công máy phát điện vĩnh cửu không cần nhiên liệu, có thể cung cấp điện miễn phí cho toàn bộ đất nước."
    
    # Predict both examples before printing anything
    real_result = predict_news(real_news_example, inference_model, tokenizer)
    fake_result = predict_news(fake_news_example, inference_model, tokenizer)
    
    # Print the results (the user-facing labels are in Vietnamese)
    print("\n🔍 KIỂM TRA TIN THẬT:")
    print(f"Nội dung: {real_news_example}")
    print(f"Kết quả: {real_result['result']} (độ tin cậy: {real_result['confidence']:.2%})")
    
    print("\n🔍 KIỂM TRA TIN GIẢ:")
    print(f"Nội dung: {fake_news_example}")
    print(f"Kết quả: {fake_result['result']} (độ tin cậy: {fake_result['confidence']:.2%})")
    
except Exception as e:
    # Any failure in loading or prediction is reported instead of raised.
    print(f"❌ Error during inference: {e}")


