In [None]:
from transformers import AutoTokenizer, TFAutoModel
import os
import time
import pandas as pd
from sklearn.model_selection import train_test_split

# Set CPU threading configuration BEFORE importing TensorFlow.
# NOTE(review): `transformers` is imported at the top of this cell and may pull
# in TensorFlow on its own - confirm these variables are still honoured.
num_logical_cores = os.cpu_count()
try:
    # os.cpu_count() only reports logical CPUs; psutil (optional dependency)
    # can tell physical cores apart from hyper-threads.
    import psutil
    num_physical_cores = psutil.cpu_count(logical=False) or num_logical_cores
except ImportError:
    # Without psutil the best available estimate is the logical count.
    num_physical_cores = num_logical_cores
if num_physical_cores is not None:
    # These environment variables need to be set before TensorFlow is imported
    os.environ["TF_NUM_INTEROP_THREADS"] = str(num_physical_cores)
    os.environ["TF_NUM_INTRAOP_THREADS"] = str(num_logical_cores)
    print(f"üí• CPU Threading configured: {num_physical_cores} physical cores, {num_logical_cores} logical cores")

# GPU diagnostic code - Add before TensorFlow import
import subprocess
import sys

def check_nvidia_gpu():
    """Report whether the ``nvidia-smi`` command runs successfully.

    Prints the command output on success, or a diagnostic message on failure.

    Returns:
        bool: True when ``nvidia-smi`` exited with status 0; False when it
        could not be started or exited with a non-zero status.
    """
    try:
        raw_output = subprocess.check_output(['nvidia-smi'], stderr=subprocess.STDOUT)
    except (subprocess.CalledProcessError, OSError):
        # OSError covers a missing binary (FileNotFoundError) as well as one
        # that exists but cannot be executed (e.g. PermissionError).
        print("‚ùå nvidia-smi command failed - NVIDIA driver may not be properly installed")
        return False
    print("üí• NVIDIA-SMI Output:")
    # errors='replace' keeps this best-effort diagnostic from failing on
    # unexpected bytes in the tool output.
    print(raw_output.decode('utf-8', errors='replace'))
    return True

def check_cuda_installation():
    """Report whether the CUDA compiler (``nvcc``) can be run from PATH.

    Prints the ``nvcc --version`` output on success, or a diagnostic message
    on failure.

    Returns:
        bool: True when ``nvcc --version`` exited with status 0; False when it
        could not be started or exited with a non-zero status.
    """
    try:
        raw_output = subprocess.check_output(['nvcc', '--version'], stderr=subprocess.STDOUT)
    except (subprocess.CalledProcessError, OSError):
        # OSError covers a missing binary (FileNotFoundError) as well as one
        # that exists but cannot be executed (e.g. PermissionError).
        print("‚ùå CUDA toolkit not found in PATH - CUDA may not be properly installed")
        return False
    print("üí• CUDA Compiler Version:")
    # errors='replace' keeps this best-effort diagnostic from failing on
    # unexpected bytes in the tool output.
    print(raw_output.decode('utf-8', errors='replace'))
    return True

print("\nüí• CHECKING GPU PREREQUISITES:")
has_nvidia_driver = check_nvidia_gpu()
has_cuda = check_cuda_installation()

# Report which interpreter is running (helps spot environment mix-ups)
print(f"\nüí• Python executable: {sys.executable}")
print(f"üí• Python version: {sys.version}")

# Now import TensorFlow after setting thread configurations
import tensorflow as tf
print(f"\nüí• TensorFlow version: {tf.__version__}")
print(f"üí• TensorFlow built with CUDA: {tf.test.is_built_with_cuda()}")

# tf.test.is_gpu_available() is deprecated in favour of
# tf.config.list_physical_devices('GPU'). Query once and reuse the answer.
# NOTE(review): the deprecated call also appears to initialise the GPU devices,
# which would make the set_memory_growth() call further down raise - confirm.
tf_gpu_available = bool(tf.config.list_physical_devices('GPU'))
print(f"üí• TensorFlow GPU available: {tf_gpu_available}")

# The driver sees a GPU but TensorFlow does not: print troubleshooting hints
if has_nvidia_driver and not tf_gpu_available:
    print("\n‚ùó POTENTIAL ISSUE DETECTED:")
    print("   - NVIDIA GPU detected by system but not by TensorFlow")
    print("   - This may be caused by:")
    print("     1. Using CPU-only TensorFlow instead of GPU version")
    print("     2. Incompatible CUDA or cuDNN versions")
    print("     3. Environment configuration issues")
    print("\nüí° RECOMMENDATIONS:")
    print("   - Ensure you have installed tensorflow-gpu or tensorflow>=2.1 with pip")
    print("   - Check compatible CUDA/cuDNN versions for your TensorFlow version")
    print("   - Try: pip install tensorflow==2.10.0 (or another recent version)")
    print("   - For manual GPU setup, see: https://www.tensorflow.org/install/gpu")

# Detect the available devices and configure TensorFlow for them
gpus = tf.config.experimental.list_physical_devices('GPU')
if not gpus:
    print("‚ùå No GPU detected by TensorFlow. Running on CPU.")
    # CPU path: stay on float32 for better compatibility
    tf.keras.mixed_precision.set_global_policy("float32")
    print(f"üí• Mixed precision policy: {tf.keras.mixed_precision.global_policy()}")
    print(
        f"üí• CPU Optimization: Using {os.environ.get('TF_NUM_INTEROP_THREADS')} inter-op threads, "
        f"{os.environ.get('TF_NUM_INTRAOP_THREADS')} intra-op threads"
    )

    # XLA is switched off on the CPU path
    tf.config.optimizer.set_jit(False)
    print("üí• XLA JIT compilation disabled for CPU")
else:
    try:
        # Turn on memory growth for every GPU before anything else
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
        print(f"üí• Detected {len(gpus)} GPU(s):")
        for i, gpu in enumerate(gpus):
            print(f"  GPU {i}: {gpu.name}")

        logical_gpus = tf.config.experimental.list_logical_devices('GPU')
        print(f"üí• {len(logical_gpus)} Logical GPU(s) available")

        # Enable mixed precision on GPU to speed things up
        tf.keras.mixed_precision.set_global_policy("mixed_float16")
        print(f"üí• Mixed precision policy: {tf.keras.mixed_precision.global_policy()}")

        # Enable the XLA compiler to improve GPU performance
        tf.config.optimizer.set_jit(True)
        print("üí• XLA JIT compilation enabled")
    except RuntimeError as err:
        # Any RuntimeError from the GPU setup above is reported, not re-raised
        print(f"‚ùå GPU error: {err}")

# CPU memory monitor function
def cpu_memory_usage():
    """Describe the resident memory of the current process.

    Returns:
        str: ``"Memory usage: <n> MB"`` built from psutil's RSS figure for the
        current PID, or ``"Memory monitoring not available"`` when psutil is
        missing or the query fails.
    """
    try:
        import psutil
        rss_bytes = psutil.Process(os.getpid()).memory_info().rss
    except Exception:
        # Best-effort monitoring: any ordinary failure degrades to a message.
        # Unlike a bare `except:`, KeyboardInterrupt/SystemExit still propagate.
        return "Memory monitoring not available"
    return f"Memory usage: {rss_bytes / (1024 * 1024):.1f} MB"

# Resource monitoring function that works for both CPU and GPU
def resource_usage():
    """Return a list of human-readable memory usage lines.

    When the module-level ``gpus`` list is non-empty, ``nvidia-smi`` is queried
    and one line per GPU is returned; otherwise the list holds the single line
    produced by ``cpu_memory_usage()``.

    Returns:
        list[str]: usage lines, or ``["GPU memory monitoring not available"]``
        when the ``nvidia-smi`` query or the parsing of its output fails.
    """
    if not gpus:
        return [cpu_memory_usage()]

    import subprocess
    try:
        result = subprocess.check_output(
            ['nvidia-smi', '--query-gpu=memory.used,memory.total', '--format=csv,nounits,noheader']
        ).decode('utf-8')
        report = []
        for i, line in enumerate(result.strip().split('\n')):
            # Each CSV row is "<used>,<total>"
            used, total = map(int, line.split(','))
            report.append(f"GPU {i}: {used} MB / {total} MB ({used/total:.1%})")
        return report
    except (subprocess.SubprocessError, OSError, ValueError, ZeroDivisionError):
        # Command missing or failing, unparsable rows (ValueError also covers
        # decode errors), or a zero total - all degrade to the same message.
        return ["GPU memory monitoring not available"]

# Load the PhoBERT tokenizer and model from the same checkpoint
PHOBERT_CHECKPOINT = "vinai/phobert-base"
tokenizer = AutoTokenizer.from_pretrained(PHOBERT_CHECKPOINT)
phobert = TFAutoModel.from_pretrained(PHOBERT_CHECKPOINT)
# Freeze PhoBERT
phobert.trainable = False

# Report resource usage now that the model is loaded
print("\nüí• Memory after loading PhoBERT:")
print(resource_usage())

# Data preprocessing function
def preprocess_data(texts, labels):
    """Clean text/label pairs and tokenize the texts with the global tokenizer.

    Args:
        texts: sequence of texts; objects exposing ``tolist()`` are converted
            to plain lists first.
        labels: labels paired positionally with ``texts``.

    Returns:
        tuple: ``(input_ids, attention_mask, labels)`` tensors with dtypes
        int32, int32 and float32 respectively.

    Raises:
        ValueError: if no text is left after dropping None/NaN entries.
    """
    # Series-like inputs become plain lists
    texts = texts.tolist() if hasattr(texts, 'tolist') else texts
    labels = labels.tolist() if hasattr(labels, 'tolist') else labels

    kept_texts, kept_labels = [], []
    for idx, (raw_text, raw_label) in enumerate(zip(texts, labels)):
        # Entries that are None or a float NaN are dropped with their label
        is_missing = raw_text is None or (isinstance(raw_text, float) and pd.isna(raw_text))
        if is_missing:
            print(f"Warning: Skipping item {idx} with None/NaN text")
            continue

        # Anything that is not already a string is stringified
        kept_texts.append(raw_text if isinstance(raw_text, str) else str(raw_text))
        kept_labels.append(raw_label)

    if len(kept_texts) == 0:
        raise ValueError("No valid text entries found after cleaning")

    # Tokenize the texts with PhoBERT, padded/truncated to 256 tokens
    encoded = tokenizer(kept_texts, padding='max_length', truncation=True, max_length=256, return_tensors='tf')

    input_ids = tf.convert_to_tensor(encoded['input_ids'], dtype=tf.int32)
    attention_mask = tf.convert_to_tensor(encoded['attention_mask'], dtype=tf.int32)
    label_tensor = tf.convert_to_tensor(kept_labels, dtype=tf.float32)
    return input_ids, attention_mask, label_tensor


# Custom Keras layer wrapping PhoBERT
class CustomPhoBERTLayer(tf.keras.layers.Layer):
    """Keras layer that forwards ``[input_ids, attention_mask]`` to a PhoBERT model.

    The layer returns the first element of the wrapped model's output
    (presumably the last hidden state - confirm against the transformers docs).
    """

    def __init__(self, phobert_model, **kwargs):
        """Store the wrapped model; ``kwargs`` go to the Keras ``Layer`` base."""
        super().__init__(**kwargs)
        self.phobert = phobert_model
        # Checkpoint name kept so the layer can be rebuilt from its config
        self.phobert_name = "vinai/phobert-base"

    def call(self, inputs):
        """Run the wrapped model on an ``(input_ids, attention_mask)`` pair."""
        input_ids, attention_mask = inputs
        model_outputs = self.phobert(input_ids=input_ids, attention_mask=attention_mask)
        return model_outputs[0]

    def get_config(self):
        """Return the base layer config extended with ``phobert_name``."""
        base_config = super().get_config()
        base_config["phobert_name"] = self.phobert_name
        return base_config

    @classmethod
    def from_config(cls, config):
        """Rebuild the layer, reloading a frozen model named by ``phobert_name``."""
        from transformers import TFAutoModel
        remaining = dict(config)
        checkpoint = remaining.pop("phobert_name")
        backbone = TFAutoModel.from_pretrained(checkpoint)
        backbone.trainable = False
        return cls(backbone, **remaining)


# Build the PhoBERT-based classifier
def build_model():
    """Build and compile the binary classifier on top of the global ``phobert``.

    The model takes ``input_ids`` and ``attention_mask`` inputs of length 256,
    runs them through ``CustomPhoBERTLayer``, keeps the first token's vector,
    applies dropout and ends in a single sigmoid unit.

    Returns:
        tf.keras.Model: model compiled with Adam (learning rate 2e-5),
        binary cross-entropy loss and an accuracy metric.
    """
    input_ids = tf.keras.layers.Input(shape=(256,), dtype=tf.int32, name='input_ids')
    attention_mask = tf.keras.layers.Input(shape=(256,), dtype=tf.int32, name='attention_mask')

    # Use CustomPhoBERTLayer instead of a Lambda around the model
    phobert_output = CustomPhoBERTLayer(phobert)([input_ids, attention_mask])

    # Take the embedding of the first token ([CLS])
    text_embedding = tf.keras.layers.Lambda(lambda x: x[:, 0, :])(phobert_output)

    dropout = tf.keras.layers.Dropout(0.1)(text_embedding)
    # Keep the output layer in float32: when the global policy is
    # "mixed_float16" the sigmoid and the loss would otherwise run in float16,
    # which is numerically fragile. Under a float32 policy this is a no-op.
    output = tf.keras.layers.Dense(1, activation='sigmoid', dtype='float32')(dropout)

    model = tf.keras.Model(inputs=[input_ids, attention_mask], outputs=output)
    model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=2e-5),
                  loss='binary_crossentropy',
                  metrics=['accuracy'])
    return model


# Load the data and assign labels: 0 for the real set, 1 for the fake set
real_news = pd.read_csv('./data/vnexpress_dataset.csv').assign(Label=0)
fake_news = pd.read_csv('./data/vnexpress_fake_dataset.csv').assign(Label=1)
data = pd.concat([real_news, fake_news], ignore_index=True)

# Split into train (70%), validation (15%) and test (15%):
# first hold out 30%, then cut that hold-out in half.
train_texts, holdout_texts, train_labels, holdout_labels = train_test_split(
    data['Content'],
    data['Label'],
    test_size=0.3,
    random_state=42,
    stratify=data['Label'],
)
val_texts, test_texts, val_labels, test_labels = train_test_split(
    holdout_texts,
    holdout_labels,
    test_size=0.5,
    random_state=42,
    stratify=holdout_labels,
)

# Tokenize each split
train_inputs, train_mask, train_labels = preprocess_data(train_texts, train_labels)
val_inputs, val_mask, val_labels = preprocess_data(val_texts, val_labels)
test_inputs, test_mask, test_labels = preprocess_data(test_texts, test_labels)

# Create the TensorFlow datasets
# Smaller batch size for CPU
batch_size = 8 if not gpus else 16


def _make_dataset(input_ids, attention_mask, labels, shuffle=False):
    """Build a cached, batched, prefetched dataset from tokenized tensors.

    Args:
        input_ids: token id tensor, one row per example.
        attention_mask: attention mask tensor aligned with ``input_ids``.
        labels: label tensor aligned with ``input_ids``.
        shuffle: when True, reshuffle all examples on every iteration.

    Returns:
        tf.data.Dataset: yields ``({'input_ids', 'attention_mask'}, labels)``
        batches of the module-level ``batch_size``.
    """
    dataset = tf.data.Dataset.from_tensor_slices(
        ({'input_ids': input_ids, 'attention_mask': attention_mask}, labels)
    ).cache()
    if shuffle:
        # Shuffle after cache() and before batch() so each epoch sees the
        # examples in a new order; the buffer spans the whole split.
        dataset = dataset.shuffle(buffer_size=int(labels.shape[0]), reshuffle_each_iteration=True)
    return dataset.batch(batch_size).prefetch(tf.data.AUTOTUNE)


# Only the training split is shuffled; validation and test keep a fixed order
train_dataset = _make_dataset(train_inputs, train_mask, train_labels, shuffle=True)
val_dataset = _make_dataset(val_inputs, val_mask, val_labels)
test_dataset = _make_dataset(test_inputs, test_mask, test_labels)

# Build the compiled model that is trained below
model = build_model()

# Callback that reports resource usage around every epoch
class ResourceMonitor(tf.keras.callbacks.Callback):
    """Print ``resource_usage()`` before and after each training epoch."""

    def _report(self, moment, epoch):
        """Print the usage header for ``moment`` followed by the usage lines."""
        # Epoch numbers are shown 1-based
        print(f"\nüí• Resource usage {moment} epoch {epoch+1}:")
        print(resource_usage())

    def on_epoch_begin(self, epoch, logs=None):
        self._report("before", epoch)

    def on_epoch_end(self, epoch, logs=None):
        self._report("after", epoch)

# Measure training time with a monotonic clock: time.time() follows the wall
# clock and can jump if the system time is adjusted during training.
start_time = time.perf_counter()

# Train with per-epoch resource monitoring
history = model.fit(
    train_dataset,
    validation_data=val_dataset,
    epochs=3,
    callbacks=[ResourceMonitor()]
)

# Print training time
training_time = time.perf_counter() - start_time
print(f"\nüí• Total training time: {training_time:.2f} seconds")

# Evaluate the model on the test set
test_loss, test_acc = model.evaluate(test_dataset)
print(f'Test Accuracy: {test_acc:.4f}')


üí• CPU Threading configured: 16 physical cores, 16 logical cores

üí• CHECKING GPU PREREQUISITES:
üí• NVIDIA-SMI Output:
Thu Mar 27 07:01:16 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 572.61                 Driver Version: 572.61         CUDA Version: 12.8     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                  Driver-Model | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA GeForce RTX 3080 Ti   WDDM  |   00000000:01:00.0  On |                  N/A |
|  0%   42C    P8             36W /  400W |    2770MiB /  12288MiB |      0%      Default |
|                                         |                        |                  N/A |
+----

Some layers from the model checkpoint at vinai/phobert-base were not used when initializing TFRobertaModel: ['lm_head']
- This IS expected if you are initializing TFRobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFRobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFRobertaModel were initialized from the model checkpoint at vinai/phobert-base.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFRobertaModel for predictions without further training.



üí• Memory after loading PhoBERT:
['Memory usage: 3890.5 MB']

üí• Resource usage before epoch 1:
['Memory usage: 3801.4 MB']
Epoch 1/3
[1m 12/896[0m [37m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [1m14:43[0m 999ms/step - accuracy: 0.2448 - loss: 0.8557

In [None]:
# Save the model
import os

# Neither save call below creates the target directory in this cell, so make
# sure it exists first.
os.makedirs('./model', exist_ok=True)

try:
    model.save('./model/fake_news_model.keras')
    print("‚úÖ Model saved successfully")
except Exception as e:
    print(f"‚ùå Error saving model: {e}")
    # Alternative saving method
    print("Trying alternative save method...")
    model.save_weights('./model/fake_news_model.weights.h5')
    print("‚úÖ Model weights saved successfully")
else:
    # load_model_for_inference() falls back to this weights file whenever the
    # full model cannot be loaded, so write it even when the full save worked.
    model.save_weights('./model/fake_news_model.weights.h5')
    print("‚úÖ Model weights saved successfully")


In [None]:
# Helper for classifying a single news text
def predict_news(text, model, tokenizer):
    """Classify one news text as FAKE or REAL.

    Args:
        text: the text to classify; non-strings are converted with ``str()``.
        model: object whose ``predict`` accepts a dict with ``input_ids`` and
            ``attention_mask`` and returns a nested sequence of scores.
        tokenizer: callable producing ``input_ids`` and ``attention_mask``.

    Returns:
        dict: ``result`` ("FAKE" when the score is >= 0.5, else "REAL"),
        ``confidence`` (the score, or ``1 - score`` for REAL) and
        ``probability`` (the raw score), the last two as Python floats.
    """
    # Make sure the text is a string
    text = text if isinstance(text, str) else str(text)

    # Tokenize the text with PhoBERT, padded/truncated to 256 tokens
    encoded = tokenizer(text, padding='max_length', truncation=True, max_length=256, return_tensors='tf')
    features = {
        'input_ids': encoded['input_ids'],
        'attention_mask': encoded['attention_mask'],
    }

    # Predict; the score of the single example sits at [0][0]
    scores = model.predict(features, verbose=0)
    probability = scores[0][0]

    # Interpret the score
    if probability >= 0.5:
        result, confidence = "FAKE", probability
    else:
        result, confidence = "REAL", 1 - probability

    return {
        'result': result,
        'confidence': float(confidence),
        'probability': float(probability),
    }

# Load the saved model
def load_model_for_inference():
    """Load the trained classifier for inference.

    First tries to load the full saved model from
    ``./model/fake_news_model.keras``. If that fails for any reason, the
    architecture is rebuilt with ``build_model()`` and the weights are loaded
    from ``./model/fake_news_model.weights.h5``.

    Returns:
        The loaded (or rebuilt) Keras model.
    """
    try:
        # Try to load the full model. The module-level CustomPhoBERTLayer is
        # reused rather than re-declared here, so the class used for loading
        # cannot drift from the one the model was built with.
        loaded_model = tf.keras.models.load_model(
            './model/fake_news_model.keras',
            custom_objects={'CustomPhoBERTLayer': CustomPhoBERTLayer}
        )
    except Exception as e:
        # Deliberately broad: any loading failure triggers the weights fallback
        print(f"‚ùå Error loading full model: {e}")
        print("Loading model from architecture and weights...")

        # If that did not work, rebuild the model and load the weights
        new_model = build_model()
        new_model.load_weights('./model/fake_news_model.weights.h5')
        print("‚úÖ Model loaded from weights successfully")
        return new_model

    print("‚úÖ Model loaded successfully")
    return loaded_model

# Try the model on a couple of examples
try:
    # Load model
    inference_model = load_model_for_inference()

    # Example of real news
    real_news_example = "Th·ªß t∆∞·ªõng Ph·∫°m Minh Ch√≠nh cho bi·∫øt Vi·ªát Nam lu√¥n coi tr·ªçng h·ª£p t√°c v·ªõi EU v√† ƒë·ªÅ ngh·ªã EU s·ªõm ho√†n t·∫•t ph√™ chu·∫©n Hi·ªáp ƒë·ªãnh B·∫£o h·ªô ƒë·∫ßu t∆∞ Vi·ªát Nam-EU."

    # Example of fake news
    fake_news_example = "Nh√† khoa h·ªçc Vi·ªát Nam ch·∫ø t·∫°o th√†nh c√¥ng m√°y ph√°t ƒëi·ªán vƒ©nh c·ª≠u kh√¥ng c·∫ßn nhi√™n li·ªáu, c√≥ th·ªÉ cung c·∫•p ƒëi·ªán mi·ªÖn ph√≠ cho to√†n b·ªô ƒë·∫•t n∆∞·ªõc."

    # Run both predictions (real first, then fake) before printing anything
    real_result, fake_result = (
        predict_news(example, inference_model, tokenizer)
        for example in (real_news_example, fake_news_example)
    )

    # Print the results, one section per example
    for heading, content, outcome in (
        ("\nüîç KI·ªÇM TRA TIN TH·∫¨T:", real_news_example, real_result),
        ("\nüîç KI·ªÇM TRA TIN GI·∫¢:", fake_news_example, fake_result),
    ):
        print(heading)
        print(f"N·ªôi dung: {content}")
        print(f"K·∫øt qu·∫£: {outcome['result']} (ƒë·ªô tin c·∫≠y: {outcome['confidence']:.2%})")

except Exception as e:
    print(f"‚ùå Error during inference: {e}")


