In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/the-liar-dataset/data.csv


In [14]:
import pandas as pd 
import numpy as np
from tqdm.auto import tqdm
import torch 
import torch.nn as nn
import torch.optim as optim 
from sklearn.model_selection import train_test_split
from transformers import DistilBertTokenizer , DistilBertModel
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from transformers import BertModel, BertTokenizer
import torch.nn as nn 
from torch.utils.data  import Dataset
from torch.utils.data import Dataset, DataLoader , TensorDataset
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report, accuracy_score
import joblib

In [3]:
class SubjectDataset(Dataset):
    def __init__(self, df):
        self.df = df
        self.maxlen = 256
        self.tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
    
    def __len__(self):
        return len(self.df)
    
    def __getitem__(self, index):
        sample_title = str(self.df['title'].iloc[index])
        sample_content = str(self.df['text'].iloc[index])
        sample = sample_title + " " + sample_content 
        
        encodings = self.tokenizer.encode_plus(
            sample,
            add_special_tokens=True,
            max_length=self.maxlen,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt' 
        )
        
        label = torch.tensor(self.df['label'].iloc[index], dtype=torch.long)
        
        return {
            'input_ids': encodings['input_ids'].flatten(),  
            'attention_mask': encodings['attention_mask'].flatten(),
            'labels': label
        }

In [4]:
data = pd.read_csv("/kaggle/input/the-liar-dataset/data.csv")
data.head()

Unnamed: 0,title,text,subject,date,label
0,BREAKING: GOP Chairman Grassley Has Had Enoug...,"Donald Trump s White House is in chaos, and th...",News,"July 21, 2017",0
1,Failed GOP Candidates Remembered In Hilarious...,Now that Donald Trump is the presumptive GOP n...,News,"May 7, 2016",0
2,Mike Pence’s New DC Neighbors Are HILARIOUSLY...,Mike Pence is a huge homophobe. He supports ex...,News,"December 3, 2016",0
3,California AG pledges to defend birth control ...,SAN FRANCISCO (Reuters) - California Attorney ...,politicsNews,"October 6, 2017",1
4,AZ RANCHERS Living On US-Mexico Border Destroy...,Twisted reasoning is all that comes from Pelos...,politics,"Apr 25, 2017",0


In [5]:
# Method 1: Using kagglehub
import kagglehub
from kagglehub import KaggleDatasetAdapter

# Specify the correct file path - the main CSV file in the dataset
file_path = "data.csv"  # This is typically the main file, adjust if needed

# Load the dataset
df = kagglehub.load_dataset(
    KaggleDatasetAdapter.PANDAS,
    "omarelmoujahid/the-liar-dataset",
    file_path
)

print("First 5 records:")
print(df.head())

  df = kagglehub.load_dataset(


First 5 records:
                                               title  \
0   BREAKING: GOP Chairman Grassley Has Had Enoug...   
1   Failed GOP Candidates Remembered In Hilarious...   
2   Mike Pence’s New DC Neighbors Are HILARIOUSLY...   
3  California AG pledges to defend birth control ...   
4  AZ RANCHERS Living On US-Mexico Border Destroy...   

                                                text       subject  \
0  Donald Trump s White House is in chaos, and th...          News   
1  Now that Donald Trump is the presumptive GOP n...          News   
2  Mike Pence is a huge homophobe. He supports ex...          News   
3  SAN FRANCISCO (Reuters) - California Attorney ...  politicsNews   
4  Twisted reasoning is all that comes from Pelos...      politics   

               date  label  
0     July 21, 2017      0  
1       May 7, 2016      0  
2  December 3, 2016      0  
3  October 6, 2017       1  
4      Apr 25, 2017      0  


In [6]:
def get_bert_embeddings(dataset, bert_model, device):
    embeddings = [] # Will store BERT embeddings for each text
    labels = [] ## for labels
    bert_model = bert_model.to(device)
    bert_model.eval()
    
    loader = DataLoader(dataset, batch_size=32, shuffle=False)
    # Disable gradient calculations since we're only doing inference not classiftion 
    with torch.no_grad():
        for batch in tqdm(loader, desc="Getting BERT embeddings"):
            input_ids = batch['input_ids'].to(device) ## batch to dvice [batch = 16 in this case _size, 256]
            attention_mask = batch['attention_mask'].to(device) ## same here for attention_mask
            
            outputs = bert_model(input_ids=input_ids, attention_mask=attention_mask)
            # Get [CLS] token embeddings
            # Extract [CLS] token embeddings
            # outputs.last_hidden_state shape: [batch_size, 256, 768]
            # [:, 0, :] selects the first token ([CLS]) of each sequence
            batch_embeddings = outputs.last_hidden_state[:, 0, :].cpu().numpy()
            embeddings.extend(batch_embeddings)
            labels.extend(batch['labels'].cpu().numpy())
        # Convert lists to numpy arrays
    return np.array(embeddings), np.array(labels)

In [7]:
def generate_bert_embeddings(df_path, model_name='bert-base-uncased'):
    # Chargement des données
    df = pd.read_csv(df_path)
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    
    # Split des données
    df_train, df_test = train_test_split(df, train_size=0.8, random_state=42)
    
    # Chargement du modèle BERT
    bert_model = BertModel.from_pretrained(model_name)
    
    # Génération des embeddings
    print("Génération des embeddings pour l'ensemble d'entraînement")
    X_train, y_train = get_bert_embeddings(SubjectDataset(df_train), bert_model, device)
    
    print("Génération des embeddings pour l'ensemble de test")
    X_test, y_test = get_bert_embeddings(SubjectDataset(df_test), bert_model, device)
    
    # Sauvegarde des embeddings
    np.save('X_train_embeddings.npy', X_train)
    np.save('y_train_labels.npy', y_train)
    np.save('X_test_embeddings.npy', X_test)
    np.save('y_test_labels.npy', y_test)

    # Sauvegarder le modèle BERT
    bert_model.save_pretrained('./bert_model')
    return X_train, y_train, X_test, y_test ,bert_model

In [8]:
class CNNClassifier(nn.Module):
    def __init__(self, input_dim, num_classes, activation='sigmoid', dropout_rate=0.2):
        super(CNNClassifier, self).__init__()
        
        # Choose activation function
        if activation == 'sigmoid':
            act_func = nn.Sigmoid()
        else:  # ReLU
            act_func = nn.ReLU()
        
        # Define network architecture
        self.layers = nn.Sequential(
            nn.Linear(input_dim, 512),
            act_func,
            nn.Dropout(dropout_rate),
            nn.Linear(512, 256),
            act_func,
            nn.Dropout(dropout_rate),
            nn.Linear(256, num_classes)
        )
    
    def forward(self, x):
        return self.layers(x)

In [9]:
def train_cnn_classifier(X_train, y_train, X_test, y_test , num_classes):
    # Hyperparameter combinations as specified
    hyperparameters = {
        'activation': ['sigmoid', 'ReLU'],
        'batch_size': [64, 128, 512],
        'epochs': [5, 20, 100],
        'learning_rate': [0.001]
    }
    
    # Best metrics tracking
    best_accuracy = 0
    best_model = None
    best_params = None
    
    # Device configuration
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    
    # Grid search
    for activation in hyperparameters['activation']:
        for batch_size in hyperparameters['batch_size']:
            for epochs in hyperparameters['epochs']:
                # Prepare data
                X_train_tensor = torch.FloatTensor(X_train)
                y_train_tensor = torch.LongTensor(y_train)
                X_test_tensor = torch.FloatTensor(X_test)
                y_test_tensor = torch.LongTensor(y_test)
                
                # Create data loaders
                train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
                train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
                
                # Initialize model
                model = CNNClassifier(
                    input_dim=X_train.shape[1], 
                    num_classes=num_classes, 
                    activation=activation,
                    dropout_rate=0.2
                )
                model.to(device)
                
                # Loss and optimizer
                criterion = nn.CrossEntropyLoss()
                ## adam optimazer 
                optimizer = optim.Adam(model.parameters(), lr=0.001)
                
                # Training loop
                model.train()
                for epoch in range(epochs):
                    total_loss = 0
                    for batch_x, batch_y in train_loader:
                        batch_x, batch_y = batch_x.to(device), batch_y.to(device)
                        
                        # Zero the parameter gradients
                        optimizer.zero_grad()
                        
                        # Forward pass
                        outputs = model(batch_x)
                        loss = criterion(outputs, batch_y)
                        
                        # Backward pass and optimize
                        loss.backward()
                        optimizer.step()
                        
                        total_loss += loss.item()
                
                # Evaluation
                model.eval()
                with torch.no_grad():
                    test_outputs = model(X_test_tensor.to(device))
                    _, y_pred = torch.max(test_outputs, 1)
                    y_pred = y_pred.cpu().numpy()
                
                # Compute accuracy
                accuracy = np.mean(y_pred == y_test)
                
                # Update best model if current is better
                if accuracy > best_accuracy:
                    best_accuracy = accuracy
                    best_model = model
                    best_params = {
                        'activation': activation,
                        'batch_size': batch_size,
                        'epochs': epochs
                    }
                
                print(f"Params: {best_params}, Accuracy: {accuracy:.4f}")
    
    # Final evaluation of best model
    print("\nBest Hyperparameters:")
    print(best_params)
    print(f"\nBest Accuracy: {best_accuracy:.4f}")
    
    # Compute full classification report
    best_model.eval()
    with torch.no_grad():
        y_pred = best_model(X_test_tensor.to(device)).argmax(dim=1).cpu().numpy()
    
    print("\nPerformance on test set:")
    print(classification_report(y_test, y_pred))
    
    # Save the best model
     # Sauvegarder le meilleur modèle CNN
    torch.save({
        'model_state_dict': best_model.state_dict(),
        'params': best_params,
        'input_dim': X_train.shape[1],
        'num_classes': num_classes
    }, 'cnn_classifier.pth')
    
    return best_model

In [15]:
def main():
    # Chemin vers votre fichier CSV
    df_path = "/kaggle/input/the-liar-dataset/data.csv"
    
    # Générer les embeddings et sauvegarder les modèles
    X_train, y_train, X_test, y_test, bert_model = generate_bert_embeddings(df_path)
    
    # Entraîner le classificateur CNN
    cnn_model = train_cnn_classifier(X_train, y_train, X_test, y_test ,2 )
    
    return cnn_model

In [16]:
if __name__ == "__main__":
    cnn_model = main()

Génération des embeddings pour l'ensemble d'entraînement


Getting BERT embeddings:   0%|          | 0/1123 [00:00<?, ?it/s]

Génération des embeddings pour l'ensemble de test


Getting BERT embeddings:   0%|          | 0/281 [00:00<?, ?it/s]

Params: {'activation': 'sigmoid', 'batch_size': 64, 'epochs': 5}, Accuracy: 0.9909
Params: {'activation': 'sigmoid', 'batch_size': 64, 'epochs': 20}, Accuracy: 0.9953
Params: {'activation': 'sigmoid', 'batch_size': 64, 'epochs': 100}, Accuracy: 0.9958
Params: {'activation': 'sigmoid', 'batch_size': 64, 'epochs': 100}, Accuracy: 0.9918
Params: {'activation': 'sigmoid', 'batch_size': 64, 'epochs': 100}, Accuracy: 0.9939
Params: {'activation': 'sigmoid', 'batch_size': 64, 'epochs': 100}, Accuracy: 0.9941
Params: {'activation': 'sigmoid', 'batch_size': 64, 'epochs': 100}, Accuracy: 0.9879
Params: {'activation': 'sigmoid', 'batch_size': 64, 'epochs': 100}, Accuracy: 0.9930
Params: {'activation': 'sigmoid', 'batch_size': 64, 'epochs': 100}, Accuracy: 0.9927
Params: {'activation': 'sigmoid', 'batch_size': 64, 'epochs': 100}, Accuracy: 0.9844
Params: {'activation': 'sigmoid', 'batch_size': 64, 'epochs': 100}, Accuracy: 0.9921
Params: {'activation': 'ReLU', 'batch_size': 64, 'epochs': 100}, Acc

In [17]:
def load_models():
    """
    Fonction pour charger les modèles sauvegardés
    """
    # Charger le modèle BERT
    bert_model = BertModel.from_pretrained('./bert_model')
    tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
    
    # Charger le modèle CNN
    cnn_checkpoint = torch.load('cnn_classifier.pth')
    cnn_model = CNNClassifier(
        input_dim=cnn_checkpoint['input_dim'], 
        num_classes=cnn_checkpoint['num_classes'], 
        activation=cnn_checkpoint['params']['activation']
    )
    cnn_model.load_state_dict(cnn_checkpoint['model_state_dict'])
    
    return bert_model, tokenizer, cnn_model

In [18]:
bert_model, tokenizer, cnn_model = load_models()

  cnn_checkpoint = torch.load('cnn_classifier.pth')


In [19]:
def classify_text(text, bert_model, tokenizer, cnn_model, device=None):
    if device is None:
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    
    # Move models to device
    bert_model = bert_model.to(device)
    cnn_model = cnn_model.to(device)
    
    # Set models to evaluation mode
    bert_model.eval()
    cnn_model.eval()
    
    # Tokenize the input text
    max_len = 256
    encoding = tokenizer.encode_plus(
        text,
        add_special_tokens=True,
        max_length=max_len,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt'
    )
    
    # Get tensors
    input_ids = encoding['input_ids'].to(device)
    attention_mask = encoding['attention_mask'].to(device)
    
    # Get BERT embeddings
    with torch.no_grad():
        outputs = bert_model(input_ids=input_ids, attention_mask=attention_mask)
        # Extract [CLS] token embedding
        embedding = outputs.last_hidden_state[:, 0, :].cpu()
    
    # Pass embeddings through CNN classifier
    with torch.no_grad():
        embedding = embedding.to(device)
        logits = cnn_model(embedding)
        prediction = torch.argmax(logits, dim=1).item()
    
    # Map prediction to label
    return "True News" if prediction == 1 else "Fake News"

def test_news_classifier(test_texts, bert_model, tokenizer, cnn_model):
    
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    
    true_labels = []
    predicted_labels = []
    
    for text, label in test_texts:
        prediction = classify_text(text, bert_model, tokenizer, cnn_model, device)
        true_labels.append(label)
        predicted_labels.append(prediction)
        print(f"Text: {text[:100]}...\nTrue Label: {label}\nPrediction: {prediction}\n")
    
    # Calculate Accuracy
    accuracy = accuracy_score(true_labels, predicted_labels) * 100
    print(f"\n✅ Model Accuracy on Test Set: {accuracy:.2f}%")
    
    # Generate classification report
    print("\nClassification Report:")
    print(classification_report(true_labels, predicted_labels))
    
    return accuracy



# Example usage:
if __name__ == "__main__":
    # Load models
    bert_model, tokenizer, cnn_model = load_models()
    
    # Use the specified custom test texts
    custom_texts = [
        # True news examples
        ("Recent scientific research has confirmed that regular exercise improves cardiovascular health and reduces the risk of chronic diseases.", "True News"),
        ("The national weather service has issued a hurricane warning for coastal areas as the storm approaches.", "True News"),
        ("The government announced a new infrastructure plan that will invest in roads, bridges, and public transportation over the next decade.", "True News"),
        ("According to the latest economic report, unemployment rates have decreased by 0.5% in the last quarter.", "True News"),
        ("Scientists have discovered a new species of deep-sea creatures during an oceanographic expedition.", "True News"),
        
        # Fake news examples
        ("Scientists discover that drinking coffee mixed with lemon juice can cure all types of cancer within 24 hours.", "Fake News"),
        ("Government secretly installing mind control devices in all new smartphones to monitor citizens' thoughts.", "Fake News"),
        ("New study proves that the earth is actually flat and all space images are computer-generated by NASA.", "Fake News"),
        ("Experts reveal that dinosaurs never existed and all fossils were planted by a secret organization.", "Fake News"),
        ("Breaking: Celebrity found to be an alien from another galaxy according to leaked government documents.", "Fake News")
    ]
    
    # Test the classifier on the specified dataset
    accuracy = test_news_classifier(custom_texts, bert_model, tokenizer, cnn_model)

  cnn_checkpoint = torch.load('cnn_classifier.pth')


Text: Recent scientific research has confirmed that regular exercise improves cardiovascular health and re...
True Label: True News
Prediction: True News

True Label: True News
Prediction: True News

Text: The government announced a new infrastructure plan that will invest in roads, bridges, and public tr...
True Label: True News
Prediction: True News

Text: According to the latest economic report, unemployment rates have decreased by 0.5% in the last quart...
True Label: True News
Prediction: True News

Text: Scientists have discovered a new species of deep-sea creatures during an oceanographic expedition....
True Label: True News
Prediction: True News

Text: Scientists discover that drinking coffee mixed with lemon juice can cure all types of cancer within ...
True Label: Fake News
Prediction: Fake News

Text: Government secretly installing mind control devices in all new smartphones to monitor citizens' thou...
True Label: Fake News
Prediction: Fake News

Text: New study proves that

In [21]:
def classify_text(text, bert_model, tokenizer, cnn_model, device=None):
    if device is None:
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    
    # Move models to device
    bert_model = bert_model.to(device)
    cnn_model = cnn_model.to(device)
    
    # Set models to evaluation mode
    bert_model.eval()
    cnn_model.eval()
    
    # Tokenize the input text
    max_len = 256
    encoding = tokenizer.encode_plus(
        text,
        add_special_tokens=True,
        max_length=max_len,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt'
    )
    
    # Get tensors
    input_ids = encoding['input_ids'].to(device)
    attention_mask = encoding['attention_mask'].to(device)
    
    # Get BERT embeddings
    with torch.no_grad():
        outputs = bert_model(input_ids=input_ids, attention_mask=attention_mask)
        # Extract [CLS] token embedding
        embedding = outputs.last_hidden_state[:, 0, :].cpu()
    
    # Pass embeddings through CNN classifier
    with torch.no_grad():
        embedding = embedding.to(device)
        logits = cnn_model(embedding)
        prediction = torch.argmax(logits, dim=1).item()
    
    # Map prediction to label
    return "True News" if prediction == 1 else "Fake News"

def test_news_classifier(test_texts, bert_model, tokenizer, cnn_model):
    
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    
    true_labels = []
    predicted_labels = []
    
    for text, label in test_texts:
        prediction = classify_text(text, bert_model, tokenizer, cnn_model, device)
        true_labels.append(label)
        predicted_labels.append(prediction)
        print(f"Text: {text[:100]}...\nTrue Label: {label}\nPrediction: {prediction}\n")
    
    # Calculate Accuracy
    accuracy = accuracy_score(true_labels, predicted_labels) * 100
    print(f"\n✅ Model Accuracy on Test Set: {accuracy:.2f}%")
    
    # Generate classification report
    print("\nClassification Report:")
    print(classification_report(true_labels, predicted_labels))
    
    return accuracy



# Example usage:
if __name__ == "__main__":
    # Load models
    bert_model, tokenizer, cnn_model = load_models()
    
    # Use the specified custom test texts
    cuscustom_texts = [
    # True news examples
    ("Medical researchers have developed a new vaccine that significantly reduces the spread of a seasonal flu virus.", "True News"),
    ("NASA successfully lands a new rover on Mars to explore signs of ancient life.", "True News"),
    ("The United Nations has launched a new initiative to combat climate change and promote sustainable energy.", "True News"),
    ("A recent study shows that a balanced diet and regular physical activity contribute to a longer lifespan.", "True News"),
    ("The national education board has announced new reforms to improve student learning outcomes.", "True News"),

    # Fake news examples
    ("Drinking salt water every morning will make you immune to all diseases, scientists confirm.", "Fake News"),
    ("Secret documents reveal that the moon landing was staged in a Hollywood studio.", "Fake News"),
    ("A new miracle drug allows people to live up to 200 years without aging.", "Fake News"),
    ("Aliens have taken over world governments and are secretly controlling major political decisions.", "Fake News"),
    ("Eating chocolate every day guarantees you will never gain weight, according to hidden ancient texts.", "Fake News")
    ]
    
    # Test the classifier on the specified dataset
    accuracy = test_news_classifier(custom_texts, bert_model, tokenizer, cnn_model)

  cnn_checkpoint = torch.load('cnn_classifier.pth')


Text: Recent scientific research has confirmed that regular exercise improves cardiovascular health and re...
True Label: True News
Prediction: True News

True Label: True News
Prediction: True News

Text: The government announced a new infrastructure plan that will invest in roads, bridges, and public tr...
True Label: True News
Prediction: True News

Text: According to the latest economic report, unemployment rates have decreased by 0.5% in the last quart...
True Label: True News
Prediction: True News

Text: Scientists have discovered a new species of deep-sea creatures during an oceanographic expedition....
True Label: True News
Prediction: True News

Text: Scientists discover that drinking coffee mixed with lemon juice can cure all types of cancer within ...
True Label: Fake News
Prediction: Fake News

Text: Government secretly installing mind control devices in all new smartphones to monitor citizens' thou...
True Label: Fake News
Prediction: Fake News

Text: New study proves that

In [22]:
advanced_test_texts = [
    # True news examples - more complex, with statistics and nuance
    ("A longitudinal study spanning 15 years has found correlations between air pollution levels and respiratory disease incidence in urban areas, with PM2.5 particles showing the strongest association according to the peer-reviewed publication in Environmental Health.", "True News"),
    
    ("The central bank raised interest rates by 0.25% yesterday, citing concerns about inflation which reached 3.8% in the previous quarter, exceeding analyst expectations of 3.2% according to economic data released by the treasury department.", "True News"),
    
    ("Researchers have identified a potential biomarker for early-stage Alzheimer's disease, though they caution that further studies with larger sample sizes are needed before clinical applications can be considered.", "True News"),
    
    ("The peace negotiations ended without formal agreement, but both parties committed to continuing talks next month. Diplomatic sources indicate that progress was made on three of the five key points of contention.", "True News"),
    
    ("The newly discovered exoplanet orbits within its star's habitable zone, but scientists emphasize that this alone doesn't confirm the presence of life or even water, as atmospheric composition data is still being analyzed.", "True News"),
    
    # Fake news examples - more sophisticated, including partial truths, misleading context
    ("Studies show that common household microwave ovens emit radiation that can alter DNA structure in foods, leading to increased cancer risk, according to independent researchers.", "Fake News"),
    
    ("An analysis of voting patterns reveals that 35% of ballots in key districts showed statistical anomalies consistent with algorithmic manipulation, based on expert examination of election data.", "Fake News"),
    
    ("A whistleblower from the pharmaceutical industry has revealed documents showing certain vaccines contain stabilizing compounds that have been linked to autoimmune disorders in preliminary studies not published in mainstream journals.", "Fake News"),
    
    ("Economic experts predict that the current monetary policy will lead to hyperinflation within 18 months, based on historical parallels with similar economic conditions in other countries.", "Fake News"),
    
    ("New evidence suggests that certain meditation techniques can permanently alter brain chemistry to eliminate the need for antidepressant medications, according to leading neuropsychologists.", "Fake News")
]

# To use this dataset with your existing code:
accuracy = test_news_classifier(advanced_test_texts, bert_model, tokenizer, cnn_model)

Text: A longitudinal study spanning 15 years has found correlations between air pollution levels and respi...
True Label: True News
Prediction: Fake News

Text: The central bank raised interest rates by 0.25% yesterday, citing concerns about inflation which rea...
True Label: True News
Prediction: True News

Text: Researchers have identified a potential biomarker for early-stage Alzheimer's disease, though they c...
True Label: True News
Prediction: True News

Text: The peace negotiations ended without formal agreement, but both parties committed to continuing talk...
True Label: True News
Prediction: True News

Text: The newly discovered exoplanet orbits within its star's habitable zone, but scientists emphasize tha...
True Label: True News
Prediction: True News

Text: Studies show that common household microwave ovens emit radiation that can alter DNA structure in fo...
True Label: Fake News
Prediction: Fake News

Text: An analysis of voting patterns reveals that 35% of ballots in ke

In [23]:
comprehensive_test_texts = [
    # TRUE NEWS: Straightforward factual reporting
    ("The stock market index fell 2.3% yesterday following the release of higher-than-expected inflation figures, marking its largest single-day decline in three months.", "True News"),
    ("A new species of deep-sea coral has been discovered at depths of over 2000 meters in the Pacific Ocean, according to marine biologists from the Oceanographic Institute.", "True News"),
    
    # TRUE NEWS: Complex scientific findings with appropriate caveats
    ("A meta-analysis of 42 clinical trials suggests that the new drug may reduce symptoms in approximately 65% of patients with the condition, though researchers caution that individual responses vary significantly.", "True News"),
    ("Climate scientists report that global temperatures have risen by an average of 1.1°C since pre-industrial times, with the rate of warming varying considerably by geographic region and season.", "True News"),
    
    # TRUE NEWS: Political reporting with balance
    ("The senate passed the infrastructure bill with a vote of 67-33, with supporters citing economic benefits while critics expressed concerns about the impact on the national debt.", "True News"),
    ("Diplomatic tensions have escalated between the two nations following disputed maritime border incidents, though representatives from both countries have agreed to UN-mediated talks next week.", "True News"),
    
    # TRUE NEWS: Reports with uncertainty acknowledged
    ("Preliminary archaeological findings suggest the settlement may date back to the 5th century, though carbon dating results are still pending and could alter this timeline.", "True News"),
    ("The economic forecast predicts moderate growth of 2.1-2.6% for the next fiscal year, contingent on stable energy prices and resolution of ongoing supply chain disruptions.", "True News"),
    
    # TRUE NEWS: Reports with statistics and specific attributions
    ("According to the World Health Organization's latest report, vaccination rates have increased by 12% globally, with the most significant improvements in Southeast Asian countries.", "True News"),
    ("The city council approved the budget by a margin of 7-2, allocating 34% to infrastructure, 28% to public safety, and 22% to education, according to official minutes released yesterday.", "True News"),
    
    # FAKE NEWS: Exaggerated claims with pseudo-precision
    ("Scientists have proven that consuming one tablespoon of this natural oil daily reverses aging at the cellular level by exactly 24.7%, based on telomere measurements.", "Fake News"),
    ("Statistical analysis shows with 99.8% certainty that weather patterns have been artificially manipulated using atmospheric technology deployed since 2018.", "Fake News"),
    
    # FAKE NEWS: Conspiracy theories with seeming logic
    ("Internal documents reveal that social media algorithms are specifically designed to induce dopamine addiction patterns identical to those used in gambling machines, as part of a coordinated effort to influence purchasing behaviors.", "Fake News"),
    ("A network of senior government officials has been systematically altering economic data since 2015 to hide evidence of an imminent financial collapse, according to multiple independent analysts.", "Fake News"),
    
    # FAKE NEWS: Misleading health claims
    ("Researchers have discovered that common food additives found in most processed foods directly contribute to the development of autoimmune disorders by disrupting gut barrier function within 30 days of regular consumption.", "Fake News"),
    ("A suppressed study from a prestigious medical university demonstrates that certain frequencies of electromagnetic radiation from everyday devices can alter brain chemistry, leading to increased susceptibility to suggestion.", "Fake News"),
    
    # FAKE NEWS: Political misinformation with partisan framing
    ("Analysis of voting records proves that 82% of decisions made by elected officials directly benefit their corporate donors rather than constituents, an unprecedented level of corruption confirmed by political scientists.", "Fake News"),
    ("Legal experts unanimously agree that recently passed legislation contains hidden provisions that will enable government agencies to monitor private communications without judicial oversight.", "Fake News"),
    
    # FAKE NEWS: Misleading use of real events or partial truths
    ("The recent power grid failures were actually a controlled test of emergency systems designed to prepare for the implementation of resource rationing measures planned for next year.", "Fake News"),
    ("The pharmaceutical company has been aware for over a decade that their bestselling medication causes the very condition it claims to treat, according to documents that regulators have repeatedly refused to investigate.", "Fake News"),
    
    # FAKE NEWS: Sensationalized environmental claims
    ("Marine biologists have confirmed that plastic particles in the ocean are evolving to form novel microstructures that will soon be impossible to remove from marine ecosystems.", "Fake News"),
    ("Environmental engineers have calculated that current resource consumption patterns will deplete all accessible fresh water within 12 years, a timeline that is being deliberately concealed from the public.", "Fake News"),
    
    # AMBIGUOUS/CHALLENGING: Opinion presented as fact
    ("The current administration's economic policies represent the greatest threat to financial stability in a generation and will inevitably lead to widespread economic hardship.", "Fake News"),
    ("The recent cultural shifts in entertainment media clearly demonstrate a coordinated attempt to reshape societal values according to a specific ideological framework.", "Fake News"),
    
    # AMBIGUOUS/CHALLENGING: Prediction presented as certainty
    ("Experts confirm that the housing market will collapse by at least 40% within the next 18 months based on current lending patterns that mirror the 2008 financial crisis.", "Fake News"),
    ("Analysis shows that the current diplomatic approach will inevitably fail to prevent regional conflict, as historical precedents demonstrate with absolute certainty.", "Fake News"),
    
    # AMBIGUOUS/CHALLENGING: Real but uncommon or surprising
    ("The rare astronomical alignment next month will create unusual gravitational effects that may be measurable in sensitive laboratory equipment, according to theoretical physicists.", "True News"),
    ("Researchers have successfully taught an AI system to accurately identify certain medical conditions from photographs with higher accuracy than experienced medical professionals in controlled tests.", "True News"),
    
    # AMBIGUOUS/CHALLENGING: Emerging science with limited evidence
    ("Early-stage research indicates that specific gut bacteria may influence neurochemical production in ways that affect mood and cognitive function, though clinical applications remain years away.", "True News"),
    ("Some quantum physicists propose that information may be preserved after passing through a black hole's event horizon, challenging conventional understanding of how these cosmic entities function.", "True News")
]

# To use this comprehensive dataset with your existing code:
accuracy = test_news_classifier(comprehensive_test_texts, bert_model, tokenizer, cnn_model)

Text: The stock market index fell 2.3% yesterday following the release of higher-than-expected inflation f...
True Label: True News
Prediction: True News

Text: A new species of deep-sea coral has been discovered at depths of over 2000 meters in the Pacific Oce...
True Label: True News
Prediction: True News

Text: A meta-analysis of 42 clinical trials suggests that the new drug may reduce symptoms in approximatel...
True Label: True News
Prediction: True News

Text: Climate scientists report that global temperatures have risen by an average of 1.1°C since pre-indus...
True Label: True News
Prediction: Fake News

Text: The senate passed the infrastructure bill with a vote of 67-33, with supporters citing economic bene...
True Label: True News
Prediction: True News

Text: Diplomatic tensions have escalated between the two nations following disputed maritime border incide...
True Label: True News
Prediction: True News

Text: Preliminary archaeological findings suggest the settlement may d