<a href="https://colab.research.google.com/github/Anerol18/Fake_News_Detector_NLP_DeepLearning_Project/blob/main/fakenews_project_october_version_stella_nl_V5__from_lorena_with_cv.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Environment setting

In [None]:
# Install project dependencies (run once per fresh runtime).
# The cell source used to contain literal "\n" sequences, which collapsed the
# whole cell into a single comment line, so nothing was ever installed.
# The call below installs into the interpreter that runs this kernel.
import subprocess
import sys
from pathlib import Path

requirements_file = Path('requirements.txt')
if requirements_file.exists():
    subprocess.run(
        [sys.executable, '-m', 'pip', 'install', '-r', str(requirements_file)],
        check=False,  # best effort: a failed install is reported by pip but does not abort the notebook
    )
else:
    # e.g. the notebook was opened on its own, without the rest of the repository
    print(f'{requirements_file} not found; skipping dependency installation')

In [None]:
# Environment setting for Google Colab
#!pip install transformers sentence-transformers tqdm

from tqdm import tqdm

import copy
import random
import time
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from transformers import AutoModel, AutoTokenizer
from torch.utils.data import DataLoader, Dataset

from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_curve, auc

import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import normalize


In [None]:
# Optional flash-attn install + environment diagnostics
import importlib
import platform
import subprocess
import sys
from importlib.metadata import PackageNotFoundError, version

def pkg_version(name):
    """Return the installed version string of distribution ``name``.

    Falls back to the literal ``'not installed'`` when the distribution's
    metadata cannot be found.
    """
    try:
        installed = version(name)
    except PackageNotFoundError:
        installed = 'not installed'
    return installed

# --- Environment report ---------------------------------------------------
cuda_ready = torch.cuda.is_available()

print('Python:', sys.version.split()[0])
print('Platform:', platform.platform())
print('Torch:', torch.__version__)
print('CUDA available:', cuda_ready)
if cuda_ready:
    print('CUDA device:', torch.cuda.get_device_name(0))

reported_packages = ('numpy', 'pandas', 'transformers', 'scikit-learn', 'matplotlib', 'seaborn', 'tqdm')
for pkg in reported_packages:
    print(f'{pkg}: {pkg_version(pkg)}')

# --- Optional flash-attn install (only attempted when CUDA is available) ----
if not cuda_ready:
    print('Skipping flash-attn install because CUDA is not available')
else:
    try:
        import flash_attn  # noqa: F401
        print('flash-attn already installed')
    except Exception:
        # Any failure while importing (not only ImportError) triggers one install attempt.
        print('flash-attn not installed. Attempting optional install...')
        install_cmd = [sys.executable, '-m', 'pip', 'install', 'flash-attn', '--no-build-isolation']
        install_proc = subprocess.run(install_cmd, capture_output=True, text=True)
        if install_proc.returncode != 0:
            print('flash-attn install failed; continuing without it')
            # Only the tail of pip's stderr is shown to keep the output short.
            print(install_proc.stderr[-800:])
        else:
            print('flash-attn install succeeded')


In [None]:
# Select compute device with deterministic precedence: CUDA > MPS > CPU
# `torch.backends.mps` is not present in every PyTorch build, so it is looked up
# defensively; a missing backend simply means "MPS not available".
mps_backend = getattr(torch.backends, "mps", None)

if torch.cuda.is_available():
    device = torch.device("cuda")
    print(f"Using CUDA device: {torch.cuda.get_device_name(0)}")
elif mps_backend is not None and mps_backend.is_available():
    device = torch.device("mps")
    print("Using MPS device")
else:
    device = torch.device("cpu")
    print("Using CPU device")

print(f"Device selected: {device}")


In [None]:
# Setting seed:
def set_seed_fun(seed_number: int):
    """Seed Python's ``random``, NumPy and PyTorch with ``seed_number``.

    When CUDA is available the CUDA generators are seeded as well, cuDNN is
    switched to deterministic mode and its benchmark autotuner is disabled.
    """
    for seed_backend in (random.seed, np.random.seed, torch.manual_seed):
        seed_backend(seed_number)

    if not torch.cuda.is_available():
        return

    torch.cuda.manual_seed(seed_number)
    torch.cuda.manual_seed_all(seed_number)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False


# Fix the seed once for the whole notebook run.
set_seed_fun(42)


In [None]:
# Load the embedding model (dunzhang/stella_en_1.5B_v5)
# The model is moved to the selected device and switched to eval mode; in this
# notebook it is only used to produce embeddings under torch.no_grad().
# NOTE(review): trust_remote_code=True lets transformers run modeling code shipped
# with the model repository — confirm the source is trusted (consider pinning a revision).
model_name = "dunzhang/stella_en_1.5B_v5"
model = AutoModel.from_pretrained(model_name, trust_remote_code=True).to(device).eval()
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)


In [None]:
# Import dataset (prefer local file, fallback to remote URL)
from pathlib import Path
import hashlib

local_dataset_path = Path('final_combined_dataset.csv')
snapshot_hash_path = Path('final_combined_dataset.sha256')
remote_dataset_url = 'https://raw.githubusercontent.com/Anerol18/Fake_News_Detector_NLP_DeepLearning_Project/main/final_combined_dataset.csv'

if not local_dataset_path.exists():
    # No integrity check is performed on the remote copy.
    print('Local dataset not found. Falling back to remote URL.')
    df = pd.read_csv(remote_dataset_url)
else:
    print(f'Loading dataset from local file: {local_dataset_path}')
    df = pd.read_csv(local_dataset_path)

    current_hash = hashlib.sha256(local_dataset_path.read_bytes()).hexdigest()
    print(f'Local dataset SHA256: {current_hash}')

    if not snapshot_hash_path.exists():
        print('No pinned snapshot hash file found (final_combined_dataset.sha256)')
    else:
        # Only the first whitespace-separated token of the hash file is compared.
        expected_hash = snapshot_hash_path.read_text(encoding='utf-8').strip().split()[0]
        if current_hash != expected_hash:
            raise ValueError(
                f'Dataset hash mismatch. Expected {expected_hash}, got {current_hash}. '
                'Update final_combined_dataset.sha256 only if this dataset update is intentional.'
            )
        print('Dataset hash matches pinned snapshot')

# Schema check: both columns are needed by every later cell.
required_columns = {'Text', 'Label'}
missing_columns = required_columns.difference(df.columns)
if missing_columns:
    raise ValueError(f'Dataset is missing required columns: {missing_columns}')

# Basic cleanup for robust downstream processing:
# drop rows with missing values, trim whitespace, lower-case labels, drop empty texts.
df = df.dropna(subset=['Text', 'Label']).assign(
    Text=lambda frame: frame['Text'].astype(str).str.strip(),
    Label=lambda frame: frame['Label'].astype(str).str.strip().str.lower(),
)
df = df.loc[df['Text'] != ''].reset_index(drop=True)


In [None]:
# Prepare data
import warnings

X = df['Text'].values.astype(str)

# Binary target: 1 for 'fake', 0 for everything else. Any label other than
# 'real'/'fake' would therefore be folded into class 0 without notice, so
# surface it before training.
unexpected_labels = sorted(set(df['Label'].unique()) - {'real', 'fake'})
if unexpected_labels:
    warnings.warn(
        f"Labels {unexpected_labels} are neither 'real' nor 'fake' and will be encoded as class 0."
    )

y = (df['Label'] == 'fake').astype(int).values


In [None]:
# Split the data into training, validation, and test sets.
# Step 1 holds out 30% of the rows; step 2 halves that hold-out into validation
# and test. Both steps are stratified on the labels and use the same seed.
split_seed = 42

X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.3, random_state=split_seed, stratify=y
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=split_seed, stratify=y_temp
)


# Embedding

In [None]:
# Modified function without dimension reduction
def generate_stella_embeddings(texts, tokenizer, model, batch_size=32):
    """Embed ``texts`` by mean-pooling the model's token outputs (no dimension reduction).

    Args:
        texts: Sequence of strings (list, tuple or NumPy array).
        tokenizer: Tokenizer called with ``padding="longest"``, truncation to
            512 tokens and ``return_tensors="pt"``.
        model: Model whose first output holds one vector per token.
        batch_size: Number of texts encoded per forward pass.

    Returns:
        A 2-D NumPy array with one row per text; each batch is passed through
        sklearn's ``normalize`` before stacking.

    Raises:
        ValueError: If ``texts`` is empty or ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")

    # Tokenizers expect a list/tuple of strings; a NumPy array is converted here
    # so callers do not have to do it themselves.
    texts = list(texts)
    if not texts:
        raise ValueError("texts is empty; nothing to embed")

    embeddings = []

    with torch.no_grad():
        for start in tqdm(range(0, len(texts), batch_size), desc="Generating Embeddings"):
            batch_texts = texts[start:start + batch_size]

            inputs = tokenizer(batch_texts, padding="longest", truncation=True, max_length=512, return_tensors="pt").to(device)
            attention_mask = inputs["attention_mask"]
            outputs = model(**inputs)[0]

            # Zero out padded positions, then average over the real tokens only.
            last_hidden = outputs.masked_fill(~attention_mask[..., None].bool(), 0.0)
            embeddings_batch = last_hidden.sum(dim=1) / attention_mask.sum(dim=1)[..., None]

            # .float() is a no-op for float32 outputs and keeps the NumPy
            # conversion working if the model runs in half precision.
            embeddings.append(normalize(embeddings_batch.float().cpu().numpy()))

    return np.vstack(embeddings)

In [None]:
#####################################
# benchmark beginning for embedding #
#####################################
# Start timestamp of the embedding stage; paired with time_end_embed in the benchmark summary.
time_start_embed = time.perf_counter()

In [None]:
# NOTE: Vertex AI alternative, currently disabled — this notebook embeds with the Stella model instead.
# embedding_model = TextEmbeddingModel.from_pretrained("text-embedding-004")

In [None]:
# Ensure data is in the correct format:
# each split is converted to a plain Python list when it is a NumPy array.
if isinstance(X_train, np.ndarray):
    X_train = X_train.tolist()
if isinstance(X_val, np.ndarray):
    X_val = X_val.tolist()
if isinstance(X_test, np.ndarray):
    X_test = X_test.tolist()


In [None]:
# Generate embeddings for the train, validation, and test sets
# (one call per split, all with the same tokenizer/model and the default batch size).
X_train_embeddings = generate_stella_embeddings(X_train, tokenizer, model)
X_val_embeddings = generate_stella_embeddings(X_val, tokenizer, model)
X_test_embeddings = generate_stella_embeddings(X_test, tokenizer, model)


In [None]:
#####################################
# benchmark ending for embedding    #
#####################################
# End timestamp of the embedding stage; paired with time_start_embed in the benchmark summary.
time_end_embed = time.perf_counter()

# Training part

## Class functions

Definition of functions to:
- wrap a feature matrix and its labels in a PyTorch `Dataset`
- create a simple neural network architecture
- create a function to convert a duration in seconds into an `(hours, minutes, seconds)` tuple

In [None]:
# Define a Dataset class for PyTorch
class NewsDataset(Dataset):
    """Dataset pairing feature rows ``X`` with class labels ``y``.

    Items are returned as ``(float32 feature tensor, int64 label tensor)``.
    """

    def __init__(self, X, y):
        self.X, self.y = X, y

    def __len__(self):
        # The label container defines the dataset size.
        return len(self.y)

    def __getitem__(self, idx):
        # Features as float32; labels as long, as used with CrossEntropyLoss in this notebook.
        features = torch.tensor(self.X[idx], dtype=torch.float32)
        target = torch.tensor(self.y[idx], dtype=torch.long)
        return features, target

In [None]:
# input_size = 1536 / 768 / 384 / 192

In [None]:
# Define a simple neural network
class SimpleNN(nn.Module):
    """Feed-forward classifier: input_size -> 3072 -> 3072 -> 768 -> 2 logits.

    Every hidden Linear layer is followed by Dropout(p=0.6) and then ReLU.
    Attribute names and creation order are kept stable because they determine
    the ``state_dict`` keys and the order in which parameters are initialised.
    """

    def __init__(self, input_size):
        super().__init__()
        hidden_dropout = 0.6

        # Hidden stage 0: input_size -> 3072
        self.fc0 = nn.Linear(input_size, 3072)
        self.dropout0 = nn.Dropout(p=hidden_dropout)
        self.relu0 = nn.ReLU()

        # Hidden stage 01: 3072 -> 3072
        self.fc01 = nn.Linear(3072, 3072)
        self.dropout01 = nn.Dropout(p=hidden_dropout)
        self.relu01 = nn.ReLU()

        # Hidden stage 1: 3072 -> 768
        self.fc1 = nn.Linear(3072, 768)
        self.dropout1 = nn.Dropout(p=hidden_dropout)
        self.relu1 = nn.ReLU()

        # Output layer: two logits (one per class)
        self.fc3 = nn.Linear(768, 2)

    def forward(self, x):
        """Return the raw (un-normalised) class logits for ``x``."""
        hidden_stages = (
            (self.fc0, self.dropout0, self.relu0),
            (self.fc01, self.dropout01, self.relu01),
            (self.fc1, self.dropout1, self.relu1),
        )
        for linear, dropout, activation in hidden_stages:
            x = activation(dropout(linear(x)))
        return self.fc3(x)


In [None]:
def sec2hms(ss):
    """Split a duration in seconds into an ``(hours, minutes, seconds)`` tuple."""
    hours, remainder = divmod(ss, 3600)
    minutes, seconds = divmod(remainder, 60)
    return hours, minutes, seconds

## Training function

Definition of the training function.

In [None]:
# Function to train the model with stratified K-fold cross-validation

def train_model_cv(
    X_train,
    y_train,
    input_size,
    n_splits=5,
    num_epochs=40,
    batch_size=64,
    lr=2e-5,
    patience=2,
    weight_decay_values=None,
):
    """Grid-search AdamW weight decay with stratified K-fold cross-validation.

    For every weight decay value and every fold a fresh ``SimpleNN`` is trained
    with early stopping on the fold's validation loss.

    Args:
        X_train: Feature matrix (array-like, one row per sample).
        y_train: Integer class labels aligned with ``X_train``.
        input_size: Number of input features of the classifier.
        n_splits: Number of stratified folds.
        num_epochs: Maximum number of epochs per fold.
        batch_size: Mini-batch size for both loaders.
        lr: AdamW learning rate.
        patience: Number of consecutive non-improving epochs before stopping.
        weight_decay_values: Iterable of weight decay values to evaluate.
            Defaults to ``[0, 0.0001, 0.001, 0.01, 0.1]``.

    Returns:
        Dict mapping each weight decay value to a list with one dict per fold,
        holding the per-epoch ``train_losses``/``val_losses``/
        ``train_accuracies``/``val_accuracies`` and the ``best_val_loss``,
        ``best_val_accuracy`` and ``best_epoch`` of that fold.
    """
    if weight_decay_values is None:
        weight_decay_values = [0, 0.0001, 0.001, 0.01, 0.1]
    else:
        weight_decay_values = list(weight_decay_values)

    # The fold indices are applied with array indexing, so accept lists as well.
    X_train = np.asarray(X_train)
    y_train = np.asarray(y_train)

    # The same splitter (and seed) is used for every weight decay value, so all
    # values are compared on identical folds.
    # NOTE(review): no RNG re-seeding happens inside this function, so each
    # (weight decay, fold) run draws its own initialisation and shuffle order.
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
    results = {}

    for weight_decay in weight_decay_values:
        print(f"\n=== Weight decay: {weight_decay} ===")
        fold_results = []

        for fold, (train_idx, val_idx) in enumerate(skf.split(X_train, y_train), start=1):
            print(f"Training on fold {fold}/{n_splits}")

            # Sanity checks: the fold must be a clean partition of the data.
            assert len(set(train_idx).intersection(set(val_idx))) == 0, "train/val indices overlap"
            assert len(train_idx) + len(val_idx) == len(X_train), "fold partition mismatch"

            X_fold_train = X_train[train_idx]
            y_fold_train = y_train[train_idx]
            X_fold_val = X_train[val_idx]
            y_fold_val = y_train[val_idx]

            train_dataset = NewsDataset(X_fold_train, y_fold_train)
            val_dataset = NewsDataset(X_fold_val, y_fold_val)

            train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
            val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

            # Fresh model and optimizer for every fold.
            model = SimpleNN(input_size).to(device)
            criterion = nn.CrossEntropyLoss()
            optimizer = optim.AdamW(model.parameters(), lr=lr, weight_decay=weight_decay)

            best_val_loss = float("inf")
            best_val_accuracy = 0.0
            best_epoch = 0
            best_state_dict = copy.deepcopy(model.state_dict())
            patience_counter = 0

            train_losses, val_losses = [], []
            train_accuracies, val_accuracies = [], []

            for epoch in range(num_epochs):
                # ---- training pass ----
                model.train()
                running_loss = 0.0
                train_correct = 0
                train_total = 0

                for X_batch, y_batch in train_loader:
                    X_batch, y_batch = X_batch.to(device), y_batch.to(device)

                    optimizer.zero_grad()
                    outputs = model(X_batch)
                    loss = criterion(outputs, y_batch)
                    loss.backward()
                    optimizer.step()

                    running_loss += loss.item()
                    preds = outputs.argmax(dim=1)
                    train_total += y_batch.size(0)
                    train_correct += (preds == y_batch).sum().item()

                # Loss is the mean of the per-batch losses; max(..., 1) avoids
                # a division by zero on an empty loader.
                avg_train_loss = running_loss / max(len(train_loader), 1)
                train_accuracy = train_correct / max(train_total, 1)
                train_losses.append(avg_train_loss)
                train_accuracies.append(train_accuracy)

                # ---- validation pass ----
                model.eval()
                val_loss_total = 0.0
                val_correct = 0
                val_total = 0
                with torch.no_grad():
                    for X_batch, y_batch in val_loader:
                        X_batch, y_batch = X_batch.to(device), y_batch.to(device)
                        outputs = model(X_batch)
                        loss = criterion(outputs, y_batch)
                        val_loss_total += loss.item()

                        preds = outputs.argmax(dim=1)
                        val_total += y_batch.size(0)
                        val_correct += (preds == y_batch).sum().item()

                avg_val_loss = val_loss_total / max(len(val_loader), 1)
                val_accuracy = val_correct / max(val_total, 1)
                val_losses.append(avg_val_loss)
                val_accuracies.append(val_accuracy)

                print(
                    f"Epoch {epoch + 1}/{num_epochs} | "
                    f"Train Loss: {avg_train_loss:.6f}, Train Acc: {train_accuracy:.6f}, "
                    f"Val Loss: {avg_val_loss:.6f}, Val Acc: {val_accuracy:.6f}"
                )

                # ---- early stopping on validation loss ----
                if avg_val_loss < best_val_loss:
                    best_val_loss = avg_val_loss
                    best_val_accuracy = val_accuracy
                    best_epoch = epoch + 1
                    best_state_dict = copy.deepcopy(model.state_dict())
                    patience_counter = 0
                else:
                    patience_counter += 1

                if patience_counter >= patience:
                    print("Early stopping")
                    break

            # Restore the best checkpoint; the fold model itself is not returned,
            # only the metrics recorded below.
            model.load_state_dict(best_state_dict)

            fold_results.append(
                {
                    "train_losses": train_losses,
                    "val_losses": val_losses,
                    "train_accuracies": train_accuracies,
                    "val_accuracies": val_accuracies,
                    "best_val_loss": best_val_loss,
                    "best_val_accuracy": best_val_accuracy,
                    "best_epoch": best_epoch,
                }
            )

        results[weight_decay] = fold_results

    return results


# Model improvement

## Training

In [None]:
############################################
# benchmark beginning for Cross Validation #
############################################
# Start timestamp of the cross-validation stage; paired with time_end_cv in the benchmark summary.
time_start_cv = time.perf_counter()

In [None]:
# Classifier input width, read from the embeddings that were actually produced
# (1536 for the Stella setup used here) so it can never drift from the data.
input_size = int(X_train_embeddings.shape[1])

# Quick smoke-run controls (set QUICK_RUN=False for full training)
QUICK_RUN = False
cv_n_splits = 3 if QUICK_RUN else 5
cv_num_epochs = 5 if QUICK_RUN else 40
retrain_num_epochs = 5 if QUICK_RUN else 40


In [None]:
# Train the model
# Runs the weight-decay cross-validation on the training embeddings only;
# the validation and test splits are not touched here.
results = train_model_cv(
    X_train_embeddings,
    y_train,
    input_size,
    n_splits=cv_n_splits,
    num_epochs=cv_num_epochs,
    batch_size=64,
    lr=2e-5,
    patience=2,
)


In [None]:
# Average each weight decay's per-fold best metrics, then keep the weight decay
# with the lowest average validation loss (first one wins on ties).
candidate_weight_decays = list(results)
avg_best_val_losses = [
    np.mean([fold['best_val_loss'] for fold in results[wd_value]])
    for wd_value in candidate_weight_decays
]
avg_best_val_accuracies = [
    np.mean([fold['best_val_accuracy'] for fold in results[wd_value]])
    for wd_value in candidate_weight_decays
]

best_weight_decay, best_val_loss, best_val_accuracy = None, float('inf'), 0.0
for candidate_wd, candidate_loss, candidate_accuracy in zip(
    candidate_weight_decays, avg_best_val_losses, avg_best_val_accuracies
):
    if candidate_loss < best_val_loss:
        best_weight_decay = candidate_wd
        best_val_loss = candidate_loss
        best_val_accuracy = candidate_accuracy

print(
    f"Best Weight Decay: {best_weight_decay:.6f}, "
    f"Best CV Validation Loss: {best_val_loss:.6f}, "
    f"Best CV Validation Accuracy: {best_val_accuracy:.6f}"
)
print("Average Best Validation Losses: ", [f"{item:.6f}" for item in avg_best_val_losses])
print("Average Best Validation Accuracies: ", [f"{item:.6f}" for item in avg_best_val_accuracies])


In [None]:
#########################################
# Benchmark ending for Cross Validation #
#########################################
# End timestamp of the cross-validation stage; paired with time_start_cv in the benchmark summary.
time_end_cv = time.perf_counter()

In [None]:
# Visualize the results: one curve per weight decay, averaged over folds.
fig, ax = plt.subplots(figsize=(12, 6))

for wd, fold_metrics in results.items():
    # Early stopping makes the folds run for different numbers of epochs, so
    # every fold is truncated to the shortest run before averaging.
    shortest_run = min(len(fold['val_losses']) for fold in fold_metrics)
    aligned_val_losses = np.array([fold['val_losses'][:shortest_run] for fold in fold_metrics])

    # Mean validation loss per epoch across the folds.
    ax.plot(aligned_val_losses.mean(axis=0), label=f'Weight Decay: {wd}', marker='o')

ax.set_title('Average Validation Loss vs. Epochs for Different Weight Decay Values')
ax.set_xlabel('Epochs')
ax.set_ylabel('Average Validation Loss')
ax.legend()
ax.grid(True)
ax.set_ylim(bottom=0)  # y-axis starts at 0
fig.savefig("Cross Validation on Weight Decay.png", transparent=True)
plt.show()


## Retrain the model using the best AdamW weight decay

In [None]:
# Function to retrain the model using the best weight decay

def retrain_with_best_decay(
    X_train,
    y_train,
    X_val,
    y_val,
    input_size,
    best_weight_decay,
    num_epochs=40,
    batch_size=64,
    lr=2e-5,
    patience=2,
):
    """Train a fresh ``SimpleNN`` on the training set with the chosen weight decay.

    Early stopping monitors the loss on ``X_val``/``y_val``; the weights of the
    epoch with the lowest validation loss are restored before returning.

    Args:
        X_train, y_train: Training features and labels.
        X_val, y_val: Validation features and labels used for early stopping.
        input_size: Number of input features of the classifier.
        best_weight_decay: AdamW weight decay to use.
        num_epochs: Maximum number of epochs.
        batch_size: Mini-batch size for both loaders.
        lr: AdamW learning rate.
        patience: Number of consecutive non-improving epochs before stopping.

    Returns:
        Tuple ``(model, train_losses, val_losses, train_accuracies,
        val_accuracies)``; the four lists hold one value per completed epoch.
    """
    train_dataset = NewsDataset(X_train, y_train)
    val_dataset = NewsDataset(X_val, y_val)

    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

    model = SimpleNN(input_size).to(device)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.AdamW(model.parameters(), lr=lr, weight_decay=best_weight_decay)

    # Early-stopping bookkeeping; the initial weights act as the fallback checkpoint.
    best_val_loss = float('inf')
    best_epoch = 0
    best_state_dict = copy.deepcopy(model.state_dict())
    patience_counter = 0

    train_losses = []
    val_losses = []
    train_accuracies = []
    val_accuracies = []

    for epoch in range(num_epochs):
        # ---- training pass ----
        model.train()
        running_loss = 0.0
        train_correct = 0
        train_total = 0

        for X_batch, y_batch in train_loader:
            X_batch, y_batch = X_batch.to(device), y_batch.to(device)
            optimizer.zero_grad()
            outputs = model(X_batch)
            loss = criterion(outputs, y_batch)
            loss.backward()
            optimizer.step()

            running_loss += loss.item()
            preds = outputs.argmax(dim=1)
            train_total += y_batch.size(0)
            train_correct += (preds == y_batch).sum().item()

        # Mean of the per-batch losses; max(..., 1) avoids division by zero on an empty loader.
        avg_train_loss = running_loss / max(len(train_loader), 1)
        train_accuracy = train_correct / max(train_total, 1)

        train_losses.append(avg_train_loss)
        train_accuracies.append(train_accuracy)

        # ---- validation pass (no gradients, dropout disabled) ----
        model.eval()
        val_loss_total = 0.0
        val_correct = 0
        val_total = 0

        with torch.no_grad():
            for X_batch, y_batch in val_loader:
                X_batch, y_batch = X_batch.to(device), y_batch.to(device)
                outputs = model(X_batch)
                loss = criterion(outputs, y_batch)
                val_loss_total += loss.item()

                preds = outputs.argmax(dim=1)
                val_total += y_batch.size(0)
                val_correct += (preds == y_batch).sum().item()

        avg_val_loss = val_loss_total / max(len(val_loader), 1)
        val_accuracy = val_correct / max(val_total, 1)

        val_losses.append(avg_val_loss)
        val_accuracies.append(val_accuracy)

        print(
            f"Epoch {epoch + 1}/{num_epochs} | "
            f"Train Loss: {avg_train_loss:.6f}, Train Acc: {train_accuracy:.6f}, "
            f"Val Loss: {avg_val_loss:.6f}, Val Acc: {val_accuracy:.6f}"
        )

        # Checkpoint on strict improvement of the validation loss, otherwise count towards patience.
        if avg_val_loss < best_val_loss:
            best_val_loss = avg_val_loss
            best_epoch = epoch + 1
            best_state_dict = copy.deepcopy(model.state_dict())
            patience_counter = 0
        else:
            patience_counter += 1

        if patience_counter >= patience:
            print("Early stopping")
            break

    # Return the best checkpoint rather than the weights of the last epoch.
    model.load_state_dict(best_state_dict)
    print(f"Restored best model checkpoint from epoch {best_epoch} with val loss {best_val_loss:.6f}")

    return model, train_losses, val_losses, train_accuracies, val_accuracies


In [None]:
##########################################
# Benchmark beginning for best modeling  #
##########################################
# Start timestamp of the final training stage; paired with time_end_model in the benchmark summary.
time_start_model = time.perf_counter()

In [None]:
# Classifier input width, read from the training embeddings instead of a
# hard-coded constant so it always matches the features being trained on.
input_size = int(X_train_embeddings.shape[1])

In [None]:
# #Coverting into int
# label_mapping = {'real': 0, 'fake': 1}

# # Convert y_train and y_val only if they are strings
# y_train = [label_mapping.get(label, label) if isinstance(label, str) else label for label in y_train]
# y_val = [label_mapping.get(label, label) if isinstance(label, str) else label for label in y_val]

# # Ensure all elements are integers before creating tensors
# y_train = [int(label) for label in y_train]  # Convert all elements to integers
# y_val = [int(label) for label in y_val]  # Convert all elements to integers


In [None]:
# Train the model with the best weight decay
# NOTE: this rebinds the global name `model`, which until now referred to the
# Stella embedding model; from here on `model` is the trained classifier (the
# object that is evaluated and saved further down).
model, train_losses, val_losses, train_accuracies, val_accuracies = retrain_with_best_decay(
    X_train_embeddings,
    y_train,
    X_val_embeddings,
    y_val,
    input_size,
    best_weight_decay,
    num_epochs=retrain_num_epochs,
    batch_size=64,
    lr=2e-5,
    patience=2,
)


In [None]:
######################################
# Benchmark ending for best modeling #
######################################
# End timestamp of the final training stage; paired with time_start_model in the benchmark summary.
time_end_model = time.perf_counter()

## Evaluating


In [None]:
# Function to evaluate the model on the test set

def evaluate_model(model, X_test, y_test, batch_size=64):
    """Run ``model`` over the test set and collect predictions and scores.

    Args:
        model: Trained classifier returning two logits per sample.
        X_test: Test features.
        y_test: Test labels; only used to build the dataset, never compared here.
        batch_size: Mini-batch size used for inference (default 64).

    Returns:
        Tuple ``(y_pred, y_score)`` of NumPy arrays: the predicted class index
        and the softmax probability of class 1 for every sample, in input order.
    """
    model.eval()
    test_dataset = NewsDataset(X_test, y_test)
    # shuffle=False keeps the outputs aligned with y_test.
    test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

    y_pred = []
    y_score = []

    with torch.no_grad():
        for X_batch, _ in test_loader:
            X_batch = X_batch.to(device)
            outputs = model(X_batch)
            probs = torch.softmax(outputs, dim=1)
            preds = probs.argmax(dim=1)

            y_pred.extend(preds.cpu().numpy())
            y_score.extend(probs[:, 1].cpu().numpy())

    return np.array(y_pred), np.array(y_score)


# Evaluate the model on the test set
# y_pred holds the predicted class per sample, y_score the probability of class 1.
y_pred, y_score = evaluate_model(model, X_test_embeddings, y_test)

# Metric integrity checks: one score per test label, and every score a valid probability.
assert len(y_score) == len(y_test), "score/prediction length mismatch"
assert np.all((y_score >= 0.0) & (y_score <= 1.0)), "predicted probabilities are out of [0, 1] range"


In [None]:
# Evaluate performance
# The label set is passed explicitly so the report and the confusion matrix
# always cover both classes (2x2 matrix matching the tick labels), even if one
# class is missing from y_test or y_pred.
class_labels = [0, 1]
class_names = ["real", "fake"]

acc = accuracy_score(y_test, y_pred)
class_report = classification_report(y_test, y_pred, labels=class_labels, target_names=class_names)
conf_matrix = confusion_matrix(y_test, y_pred, labels=class_labels)

print(f"Accuracy: {acc:.6f}")
print(f"Classification Report:\n{class_report}")
print(f"Confusion Matrix:\n{conf_matrix}")

# Visualize the confusion matrix (rows: true class, columns: predicted class)
plt.figure(figsize=(8, 6))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues',
            xticklabels=class_names, yticklabels=class_names)
plt.xlabel('Predicted')
plt.ylabel('True')
plt.title('Confusion Matrix')
plt.savefig('Confusion Matrix.png', transparent=True)
plt.show()


In [None]:
# Plot the loss and accuracy curves.
# Each metric is written to its own image file and both are shown side by side.
# (Saving the shared figure right after the first panel produced a
# 'Loss curves.png' with an empty right half and an 'Accuracy curves.png' that
# also contained the loss panel.)
def _draw_history(ax, train_values, val_values, metric):
    """Draw the train/validation series of one metric on ``ax`` (epochs start at 1)."""
    ax.plot(range(1, len(train_values) + 1), train_values, label=f'Train {metric}')
    ax.plot(range(1, len(val_values) + 1), val_values, label=f'Validation {metric}')
    ax.set_xlabel('Epochs')
    ax.set_ylabel(metric)
    ax.set_title(f'Training and Validation {metric}')
    ax.legend()
    ax.grid(True)


history_panels = [
    ('Loss', train_losses, val_losses, 'Loss curves.png'),
    ('Accuracy', train_accuracies, val_accuracies, 'Accuracy curves.png'),
]

# One standalone figure per metric for the saved files.
for metric, train_values, val_values, output_path in history_panels:
    panel_fig, panel_ax = plt.subplots(figsize=(6, 5))
    _draw_history(panel_ax, train_values, val_values, metric)
    panel_fig.tight_layout()
    panel_fig.savefig(output_path, transparent=True)
    plt.close(panel_fig)  # saved only; not shown a second time

# Combined figure for display in the notebook.
fig, axes = plt.subplots(1, 2, figsize=(12, 5))
for ax, (metric, train_values, val_values, _output_path) in zip(axes, history_panels):
    _draw_history(ax, train_values, val_values, metric)
fig.tight_layout()
plt.show()


In [None]:
# Raw (unformatted) metric dumps, kept disabled. They were wrapped in a bare
# string literal, which a notebook echoes back as cell output; real comments
# keep them out of the output.
# print("Train Losses:", train_losses)
# print("Validation Losses:",val_losses)
# print("Train Accuracies:", train_accuracies)
# print("Validation Accuracies:", val_accuracies)

In [None]:
# Print every per-epoch metric series of the final training run, six decimals each.
for caption, series in (
    ('Train Losses: ', train_losses),
    ('Validation Losses: ', val_losses),
    ('Train Accuracies: ', train_accuracies),
    ('Validation Accuracies: ', val_accuracies),
):
    print(caption, [f'{value:.6f}' for value in series])

In [None]:
# Plot ROC curve (scores are the predicted probabilities of class 1)
fpr, tpr, _ = roc_curve(y_test, y_score)
roc_auc = auc(fpr, tpr)

fig, ax = plt.subplots(figsize=(8, 6))
ax.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (AUC = {roc_auc:.2f})')
# Diagonal reference line
ax.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
ax.set_xlim([0.0, 1.0])
ax.set_ylim([0.0, 1.05])
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('Receiver Operating Characteristic (ROC) Curve')
ax.legend(loc="lower right")
fig.savefig('ROC curve.png', transparent=True)
plt.show()


In [None]:
# Report the AUC with six decimals (the plot legend only shows two).
print(f'ROC AUC: {roc_auc:.6f}')

## Benchmark results

In [None]:
#####################################
#          Benchmark results        #
#####################################
# All six timestamps must exist; they are only set when the timer cells above have run.
required_timers = [
    'time_start_embed',
    'time_end_embed',
    'time_start_cv',
    'time_end_cv',
    'time_start_model',
    'time_end_model',
]

defined_names = globals()
missing_timers = [name for name in required_timers if name not in defined_names]
if missing_timers:
    raise RuntimeError(
        'Missing timer variables for benchmark summary. Run the full notebook in order. '
        f'Missing: {missing_timers}'
    )

# Stage durations in seconds
embedding_duration = time_end_embed - time_start_embed
cv_duration = time_end_cv - time_start_cv
modeling_duration = time_end_model - time_start_model

# Same durations split into (hours, minutes, seconds)
embedding_duration_hms = sec2hms(embedding_duration)
cv_duration_hms = sec2hms(cv_duration)
modeling_duration_hms = sec2hms(modeling_duration)

# Print the embedding, cross validation and modeling durations as h:m:s
for stage_label, (hours, minutes, seconds) in (
    ('Embedding duration', embedding_duration_hms),
    ('Cross validation duration', cv_duration_hms),
    ('Best modeling duration', modeling_duration_hms),
):
    print(f'{stage_label} : {hours:.0f}:{minutes:.0f}:{seconds:.3f}')


In [None]:
# Save the model state dictionary and reproducibility metadata
import json

model_path = 'stella_model.pth'
config_path = 'stella_model_config.json'

# Persist the classifier weights (state dict only).
torch.save(model.state_dict(), model_path)

# Metadata describing how the saved weights were produced, built section by section.
classifier_architecture = {
    'input_size': int(input_size),
    'hidden_layers': [3072, 3072, 768],
    'dropout': 0.6,
    'output_classes': 2,
}
label_mapping = {
    'real': 0,
    'fake': 1,
}
training_settings = {
    'best_weight_decay': float(best_weight_decay),
    'cv_n_splits': int(cv_n_splits),
    'cv_num_epochs': int(cv_num_epochs),
    'retrain_num_epochs': int(retrain_num_epochs),
    'seed': 42,
}
dataset_info = {
    'local_csv': 'final_combined_dataset.csv',
    'snapshot_hash_file': 'final_combined_dataset.sha256',
    'num_rows_after_cleanup': int(len(df)),
}

artifact_config = {
    'model_file': model_path,
    'embedding_model_name': model_name,
    'classifier_architecture': classifier_architecture,
    'label_mapping': label_mapping,
    'training': training_settings,
    'dataset': dataset_info,
}

with open(config_path, 'w', encoding='utf-8') as config_file:
    json.dump(artifact_config, config_file, indent=2)

print(f'Model saved: {model_path}')
print(f'Metadata saved: {config_path}')
