In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, TFBertForSequenceClassification
import logging

# ----------------------------
# Logging Setup
# ----------------------------
# Configure the root logger to emit INFO-and-above records. Per the stdlib
# docs, basicConfig() does nothing if the root logger already has handlers,
# so re-running this cell in a live kernel is harmless.
logging.basicConfig(level=logging.INFO)
# Module-level logger used by train_bert_model(); inside a notebook kernel
# __name__ is "__main__", hence the "INFO:__main__:" prefix in the output.
logger = logging.getLogger(__name__)

def _configure_gpus():
    """Enable memory growth on every visible GPU and return the GPU list.

    Returns:
        The list from ``tf.config.list_physical_devices('GPU')`` (empty when
        TensorFlow sees no GPU).
    """
    gpus = tf.config.list_physical_devices('GPU')
    if not gpus:
        logger.warning("No GPU detected ❌ — training will run on CPU.")
        return gpus

    logger.info(f"GPUs detected: {gpus}")
    try:
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
        logger.info("GPU is enabled for training ✅")
    except RuntimeError as e:
        # TensorFlow raises RuntimeError when memory growth is changed after
        # the GPUs were initialised (e.g. the cell is re-run in a live
        # kernel). Training can still proceed, so only log it.
        logger.error(e)
    return gpus


def _build_examples(true_df, fake_df, sample_size, seed):
    """Combine the real/fake frames into one shuffled ``text``/``label`` frame.

    Args:
        true_df: Frame of real articles; must contain ``title`` and ``text``.
        fake_df: Frame of fake articles; must contain ``title`` and ``text``.
        sample_size: Keep only the first ``sample_size`` shuffled rows, or all
            rows when ``None``.
        seed: Random state used for the shuffle.

    Returns:
        A new DataFrame with columns ``text`` (title + " " + body) and
        ``label`` (1 = real, 0 = fake). The input frames are not modified.

    Raises:
        ValueError: If either frame lacks a ``title`` or ``text`` column.
    """
    for name, frame in (("True.csv", true_df), ("Fake.csv", fake_df)):
        missing = {'title', 'text'} - set(frame.columns)
        if missing:
            raise ValueError(f"{name} is missing required column(s): {sorted(missing)}")

    # .assign returns copies, so the caller's frames keep their original columns.
    combined = pd.concat(
        [true_df.assign(label=1), fake_df.assign(label=0)],
        ignore_index=True,
    )

    # A NaN title or body would make the concatenation NaN (a float), which
    # the tokenizer cannot handle; treat missing pieces as empty strings.
    title = combined['title'].fillna('').astype(str)
    body = combined['text'].fillna('').astype(str)
    examples = pd.DataFrame({
        'text': (title + " " + body).str.strip(),
        'label': combined['label'],
    })

    # Shuffle first (same permutation as before for a given seed), then drop
    # rows that carry no text at all, then take the demo subset.
    examples = examples.sample(frac=1, random_state=seed).reset_index(drop=True)
    examples = examples[examples['text'] != '']
    if sample_size is not None:
        examples = examples.head(sample_size)
    return examples.reset_index(drop=True)


def train_bert_model(
    data_dir: str = "../News_dataset",
    save_directory: str = './saved_model',
    model_name: str = 'bert-base-uncased',
    sample_size=5000,
    max_length: int = 128,
    batch_size: int = 16,
    epochs: int = 1,
    learning_rate: float = 5e-5,
    val_size: float = 0.2,
    seed: int = 42,
) -> None:
    """
    Loads the local True/Fake news CSVs, preprocesses them, fine-tunes BERT
    for fake news detection, and saves the trained model + tokenizer.

    Called with no arguments it uses the same paths and hyperparameters that
    were previously hard-coded.

    Args:
        data_dir: Directory containing ``True.csv`` and ``Fake.csv``.
        save_directory: Where the fine-tuned model and tokenizer are written.
        model_name: Pretrained checkpoint name passed to ``from_pretrained``.
        sample_size: Number of shuffled rows to train/validate on; ``None``
            uses the whole dataset.
        max_length: Maximum token length; longer inputs are truncated.
        batch_size: Batch size for both training and validation.
        epochs: Number of training epochs.
        learning_rate: Adam learning rate.
        val_size: Fraction of the examples held out for validation.
        seed: Seed for the pandas shuffle, the train/validation split, the
            TensorFlow global RNG and the training shuffle.

    Returns:
        None. On a dataset loading or preprocessing failure the error is
        logged and the function returns early without training.
    """
    from pathlib import Path

    # --- 1. Check GPU ---
    gpus = _configure_gpus()
    # Seed TensorFlow so classifier-head initialisation and batch shuffling
    # are repeatable across runs.
    tf.random.set_seed(seed)

    # --- 2. Load Dataset ---
    logger.info("Loading dataset...")
    data_path = Path(data_dir)
    try:
        true_df = pd.read_csv(data_path / "True.csv")
        fake_df = pd.read_csv(data_path / "Fake.csv")
    except Exception as e:
        logger.error(f"Failed to load dataset. Error: {e}")
        return

    # --- 3. Preprocess ---
    logger.info("Preprocessing data...")
    try:
        df = _build_examples(true_df, fake_df, sample_size, seed)
    except ValueError as e:
        logger.error(f"Failed to preprocess dataset. Error: {e}")
        return
    if df.empty:
        logger.error("No usable examples after preprocessing; aborting.")
        return

    texts = df['text'].tolist()
    labels = df['label'].tolist()

    # Stratify so both splits keep the real/fake ratio; scikit-learn needs at
    # least two examples per class for that, otherwise fall back to a plain split.
    stratify = labels if df['label'].value_counts().min() >= 2 else None
    X_train, X_val, y_train, y_val = train_test_split(
        texts, labels, test_size=val_size, random_state=seed, stratify=stratify
    )

    # --- 4. Tokenization ---
    logger.info("Tokenizing data...")
    tokenizer = BertTokenizer.from_pretrained(model_name)

    train_encodings = tokenizer(X_train, truncation=True, padding=True, max_length=max_length)
    val_encodings = tokenizer(X_val, truncation=True, padding=True, max_length=max_length)

    train_dataset = tf.data.Dataset.from_tensor_slices((
        dict(train_encodings),
        y_train
    ))
    val_dataset = tf.data.Dataset.from_tensor_slices((
        dict(val_encodings),
        y_val
    ))

    # --- 5. Model ---
    logger.info("Loading BERT model...")
    model = TFBertForSequenceClassification.from_pretrained(model_name, num_labels=2)

    optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)
    # The model outputs raw logits, hence from_logits=True.
    loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
    metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')

    model.compile(optimizer=optimizer, loss=loss, metrics=[metric])

    # A shuffle buffer at least as large as the dataset gives a full uniform
    # shuffle; a smaller buffer only mixes examples locally.
    train_dataset_batched = train_dataset.shuffle(len(X_train), seed=seed).batch(batch_size)
    val_dataset_batched = val_dataset.batch(batch_size)

    # --- 6. Training ---
    logger.info("Starting training...")
    with tf.device('/GPU:0' if gpus else '/CPU:0'):
        model.fit(train_dataset_batched, epochs=epochs, validation_data=val_dataset_batched)

    # --- 7. Save Model ---
    logger.info("Saving fine-tuned model and tokenizer...")
    model.save_pretrained(save_directory)
    tokenizer.save_pretrained(save_directory)
    logger.info(f"✅ Model and tokenizer saved in '{save_directory}'")





2025-09-06 21:16:42.382169: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1757173602.480641  127835 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1757173602.511699  127835 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1757173602.551209  127835 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1757173602.551271  127835 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1757173602.551279  127835 computation_placer.cc:177] computation placer alr

In [2]:
# Run the end-to-end pipeline: load the CSVs, tokenize, fine-tune BERT and
# save the model + tokenizer. Kept in its own cell so the definition above
# can be re-run without retraining.
train_bert_model()

W0000 00:00:1757173623.409968  127835 gpu_device.cc:2341] Cannot dlopen some GPU libraries. Please make sure the missing libraries mentioned above are installed properly if you would like to use GPU. Follow the guide at https://www.tensorflow.org/install/gpu for how to download and setup the required libraries for your platform.
Skipping registering GPU devices...
INFO:__main__:Loading dataset...
INFO:__main__:Preprocessing data...
INFO:__main__:Tokenizing data...


KeyboardInterrupt: 