In [3]:
import logging
from pathlib import Path

import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, TFBertForSequenceClassification

# Set up logging.
# basicConfig configures the root logger so INFO-level (and higher) records are emitted.
logging.basicConfig(level=logging.INFO)
# Logger named after the current module; in this notebook that is "__main__",
# which is the prefix seen on the INFO lines in the cell output.
logger = logging.getLogger(__name__)

def _load_news_frame(data_dir):
    """Read ``True.csv`` and ``Fake.csv`` from ``data_dir`` into one labelled frame.

    Rows from ``True.csv`` get ``label == 1`` ('real') and rows from
    ``Fake.csv`` get ``label == 0`` ('fake').

    Returns:
        The concatenated DataFrame, or ``None`` (after logging an error) when a
        file cannot be read/parsed or lacks the ``title``/``text`` columns.
    """
    data_path = Path(data_dir)
    try:
        true_df = pd.read_csv(data_path / "True.csv")
        fake_df = pd.read_csv(data_path / "Fake.csv")
    except (OSError, ValueError) as e:
        # OSError covers missing/unreadable files; pandas parser and decoding
        # errors are ValueError subclasses.
        logger.error(f"Failed to read the dataset from '{data_path}'. Error: {e}")
        logger.error("Please ensure True.csv and Fake.csv exist in that directory and are valid CSV files.")
        return None

    required_columns = {"title", "text"}
    for file_name, frame in (("True.csv", true_df), ("Fake.csv", fake_df)):
        missing = required_columns - set(frame.columns)
        if missing:
            logger.error(f"{file_name} is missing required column(s): {sorted(missing)}")
            return None

    # assign() returns new frames, so the freshly read ones are not mutated.
    return pd.concat(
        [true_df.assign(label=1), fake_df.assign(label=0)],
        ignore_index=True,
    )


def _prepare_examples(df, sample_size, seed):
    """Build shuffled ``(texts, labels)`` lists from the labelled frame.

    Title and body are joined with a single space. Missing values are treated
    as empty strings so one NaN field cannot turn the whole example into NaN,
    and rows whose combined text is blank are dropped.
    """
    combined = df["title"].fillna("").astype(str) + " " + df["text"].fillna("").astype(str)
    examples = pd.DataFrame({"text": combined, "label": df["label"]})

    # Shuffle first (same permutation as before for a given seed), then filter,
    # so the selected subset is unchanged whenever no row is blank.
    examples = examples.sample(frac=1, random_state=seed).reset_index(drop=True)
    examples = examples[examples["text"].str.strip().ne("")]

    if sample_size is not None:
        examples = examples.head(sample_size)

    return examples["text"].tolist(), examples["label"].tolist()


def _encode_as_dataset(tokenizer, texts, labels, max_length):
    """Tokenize ``texts`` and pair the encodings with ``labels`` in a tf.data.Dataset."""
    encodings = tokenizer(texts, truncation=True, padding=True, max_length=max_length)
    return tf.data.Dataset.from_tensor_slices((dict(encodings), labels))


def train_bert_model(
    data_dir="../News_dataset",
    save_directory="./saved_model",
    sample_size=5000,
    max_length=128,
    batch_size=16,
    epochs=1,
    learning_rate=5e-5,
    seed=42,
):
    """
    Fine-tune 'bert-base-uncased' for fake news classification and save it.

    The function loads ``True.csv`` and ``Fake.csv`` from a local directory,
    preprocesses the data, fine-tunes a BERT sequence classifier, and saves
    the trained model and tokenizer. Every default reproduces the values that
    were previously hard-coded, so ``train_bert_model()`` keeps working.

    Args:
        data_dir: Directory containing ``True.csv`` and ``Fake.csv``.
        save_directory: Where the fine-tuned model and tokenizer are written.
        sample_size: Number of shuffled rows to keep (a small subset speeds up
            training); ``None`` uses the whole dataset.
        max_length: Maximum token length passed to the tokenizer.
        batch_size: Batch size for both training and validation.
        epochs: Number of fine-tuning epochs.
        learning_rate: Adam learning rate.
        seed: Seed for the pandas shuffle, the train/validation split, the
            tf.data shuffle and TensorFlow's global random seed.

    Returns:
        None. Problems with the dataset are logged and the function returns
        early without training.

    Raises:
        ValueError: If ``sample_size`` is given but smaller than 1.
    """
    if sample_size is not None and sample_size < 1:
        raise ValueError(f"sample_size must be a positive integer or None, got {sample_size!r}")

    # --- 1. Load Dataset ---
    # The CSV files are read from local disk; nothing is downloaded here.
    logger.info("Loading dataset...")
    df = _load_news_frame(data_dir)
    if df is None:
        return

    # --- 2. Preprocess Data ---
    logger.info("Preprocessing data...")
    texts, labels = _prepare_examples(df, sample_size, seed)
    if len(texts) < 2:
        # train_test_split needs at least one example on each side.
        logger.error(f"Not enough usable examples to train on (found {len(texts)}).")
        return

    # Split into training and validation sets
    X_train, X_val, y_train, y_val = train_test_split(
        texts, labels, test_size=0.2, random_state=seed
    )

    # Seed TensorFlow's global generator so weight init and shuffling are repeatable.
    tf.random.set_seed(seed)

    # --- 3. Tokenization ---
    logger.info("Tokenizing data...")
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    train_dataset = _encode_as_dataset(tokenizer, X_train, y_train, max_length)
    val_dataset = _encode_as_dataset(tokenizer, X_val, y_val, max_length)

    # --- 4. Model Training ---
    logger.info("Initializing and training the BERT model...")
    # Load the pre-trained BERT model for sequence classification
    model = TFBertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

    optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)
    # The model outputs raw logits, hence from_logits=True.
    loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
    metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')
    model.compile(optimizer=optimizer, loss=loss, metrics=[metric])

    # A buffer as large as the training set gives a uniform shuffle every epoch;
    # a smaller buffer only mixes examples locally.
    train_dataset_batched = train_dataset.shuffle(len(X_train), seed=seed).batch(batch_size)
    val_dataset_batched = val_dataset.batch(batch_size)

    # Fine-tune the model.
    # For a real-world scenario, you might train for more epochs (e.g., 3-5).
    model.fit(train_dataset_batched, epochs=epochs, validation_data=val_dataset_batched)

    # --- 5. Save the Model and Tokenizer ---
    logger.info("Saving the fine-tuned model and tokenizer...")
    model.save_pretrained(save_directory)
    tokenizer.save_pretrained(save_directory)
    logger.info(f"Model and tokenizer saved in '{save_directory}'")

In [2]:
! pip install ipywidgets --upgrade

Collecting ipywidgets
  Downloading ipywidgets-8.1.7-py3-none-any.whl.metadata (2.4 kB)
Collecting widgetsnbextension~=4.0.14 (from ipywidgets)
  Downloading widgetsnbextension-4.0.14-py3-none-any.whl.metadata (1.6 kB)
Collecting jupyterlab_widgets~=3.0.15 (from ipywidgets)
  Downloading jupyterlab_widgets-3.0.15-py3-none-any.whl.metadata (20 kB)
Downloading ipywidgets-8.1.7-py3-none-any.whl (139 kB)
Downloading jupyterlab_widgets-3.0.15-py3-none-any.whl (216 kB)
Downloading widgetsnbextension-4.0.14-py3-none-any.whl (2.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.2/2.2 MB[0m [31m981.2 kB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hInstalling collected packages: widgetsnbextension, jupyterlab_widgets, ipywidgets
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3/3[0m [ipywidgets]3[0m [ipywidgets]
[1A[2KSuccessfully installed ipywidgets-8.1.7 jupyterlab_widgets-3.0.15 widgetsnbextension-4.0.14


In [4]:
# Run the whole pipeline with its defaults: read True.csv / Fake.csv from
# ../News_dataset, fine-tune 'bert-base-uncased' for 1 epoch on a 5000-row
# subset, and write the model and tokenizer to ./saved_model.
train_bert_model()

INFO:__main__:Downloading dataset...
INFO:__main__:Preprocessing data...
INFO:__main__:Tokenizing data...
W0000 00:00:1757173118.718134  100959 gpu_device.cc:2341] Cannot dlopen some GPU libraries. Please make sure the missing libraries mentioned above are installed properly if you would like to use GPU. Follow the guide at https://www.tensorflow.org/install/gpu for how to download and setup the required libraries for your platform.
Skipping registering GPU devices...
INFO:__main__:Initializing and training the BERT model...
All PyTorch model weights were used when initializing TFBertForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.




INFO:__main__:Saving the fine-tuned model and tokenizer...
INFO:__main__:Model and tokenizer saved in './saved_model'
