In [2]:
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import AutoModelForSequenceClassification
from torch.optim import AdamW # <-- Change: AdamW is now imported from torch.optim
import os

In [3]:
# --- 1. USER SETTINGS ---
# Tunable knobs for this notebook, kept together so they are easy to find.
NUM_LABELS = 2    # size of the classification head requested from the model
BATCH_SIZE = 16   # examples per batch for every DataLoader built below

In [4]:
# Locations of the pre-tokenized splits, all stored under one folder and
# named '<split>_encodings.pt'.
TOKENIZED_FOLDER = '../tokenized_data/'
TRAIN_ENCODINGS_PATH, VAL_ENCODINGS_PATH, TEST_ENCODINGS_PATH = (
    os.path.join(TOKENIZED_FOLDER, f'{split}_encodings.pt')
    for split in ('train', 'val', 'test')
)

In [5]:
# Load the tokenized data for the train / validation / test splits.
#
# SECURITY NOTE: weights_only=False makes torch.load unpickle arbitrary Python
# objects, which can execute code embedded in the file. Only load files that
# were produced by this project's own tokenization step.
# NOTE(review): presumably the files hold pickled tokenizer outputs that need
# full unpickling -- confirm before switching to weights_only=True.
try:
    train_encodings = torch.load(TRAIN_ENCODINGS_PATH, weights_only=False)
    val_encodings = torch.load(VAL_ENCODINGS_PATH, weights_only=False)
    test_encodings = torch.load(TEST_ENCODINGS_PATH, weights_only=False)
    print("Tokenized data loaded successfully.")
except FileNotFoundError as e:
    print(f"Error: Tokenized files not found. Ensure the 'tokenized_data/' folder is correct. Details: {e}")
    # Re-raise instead of calling exit(): exit() is intended for interactive
    # shells and, inside a notebook, tears down the kernel rather than marking
    # this cell as failed. Re-raising stops "Run All" here with a traceback.
    raise

Tokenized data loaded successfully.


In [6]:
# --- 2. Create Custom PyTorch Dataset Class ---
class NewsDataset(Dataset):
    """Map-style dataset over a dict-like collection of tokenized encodings.

    Args:
        encodings: Mapping from field name to an indexable sequence with one
            entry per example. It must provide ``.items()`` and contain an
            ``'input_ids'`` key, which defines the dataset length.
    """

    def __init__(self, encodings):
        self.encodings = encodings

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of freshly allocated tensors, one per field."""
        item = {}
        for key, val in self.encodings.items():
            value = val[idx]
            if isinstance(value, torch.Tensor):
                # torch.tensor(<tensor>) emits a copy-construct UserWarning on
                # every call; clone().detach() is the equivalent, warning-free copy.
                item[key] = value.clone().detach()
            else:
                item[key] = torch.tensor(value)
        return item

    def __len__(self):
        """Number of examples, taken from the length of the 'input_ids' field."""
        return len(self.encodings['input_ids'])

In [7]:
# --- 3. Instantiate Datasets ---
# Wrap each loaded split in a NewsDataset, in train / val / test order.
train_dataset, val_dataset, test_dataset = map(
    NewsDataset, (train_encodings, val_encodings, test_encodings)
)

# Report how many examples each split holds.
print(
    f"\nTraining Dataset size: {len(train_dataset)}",
    f"Validation Dataset size: {len(val_dataset)}",
    f"Testing Dataset size: {len(test_dataset)}",
    sep="\n",
)


Training Dataset size: 13000
Validation Dataset size: 3000
Testing Dataset size: 4000


In [8]:
# --- 4. Create DataLoaders ---
# All loaders share BATCH_SIZE. Only the training loader shuffles; the
# validation and test loaders keep the stored example order.
train_loader = DataLoader(
    dataset=train_dataset,
    batch_size=BATCH_SIZE,
    shuffle=True,
)
val_loader = DataLoader(
    dataset=val_dataset,
    batch_size=BATCH_SIZE,
    shuffle=False,
)
test_loader = DataLoader(
    dataset=test_dataset,
    batch_size=BATCH_SIZE,
    shuffle=False,
)

In [9]:
# --- 5. Verification Check ---
# Print a short summary of the training DataLoader.
# NOTE: the "(13000 total)" suffix is literal text, not a computed value; it
# will not change if the training set size changes.
print(
    "\n--- DataLoader Verification ---",
    f"Total batches in Training DataLoader: {len(train_loader)}",
    f"Total examples in Training DataLoader: {len(train_loader.dataset)} (13000 total)",
    f"Batch Size used: {BATCH_SIZE}",
    sep="\n",
)


--- DataLoader Verification ---
Total batches in Training DataLoader: 813
Total examples in Training DataLoader: 13000 (13000 total)
Batch Size used: 16


In [10]:
# --- 6. Model Initialization (Preview of next step) ---
# Load the pretrained checkpoint with a sequence-classification head sized to
# NUM_LABELS. Per the loader's own warning in this cell's output, the
# classifier weights are newly initialized and still need to be trained.
MODEL_NAME = 'bert-base-uncased'
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=NUM_LABELS,
)
print(f"\nBERT Model structure loaded for {NUM_LABELS} labels (Fake/Real).")

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



BERT Model structure loaded for 2 labels (Fake/Real).


In [None]:
# Note: We will use a multi-task model (Binary Classification + Category Classification)
# For simplicity here, we only initialize for the main task (Fake/Real)
# We will adjust the model structure in the next step.

# binary_model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=2)
# print(f"BERT Model structure loaded for 2 labels (Fake/Real).")

In [11]:
# Final status message: data loading, DataLoader setup and model loading above are complete.
print("\nProject is now ready for Model Fine-tuning (Step 05).")


Project is now ready for Model Fine-tuning (Step 05).
