In [2]:
import pandas as pd
import torch
import requests
from bs4 import BeautifulSoup
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, Trainer, TrainingArguments, pipeline
from datasets import Dataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
import numpy as np

# Run on the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"Using device: {device}")


Using device: cuda


In [3]:
# Load the combined dataset. low_memory=False makes pandas infer each column's
# dtype from the whole file instead of chunk by chunk, which avoids the
# mixed-type DtypeWarning that chunked parsing can emit.
df = pd.read_csv("combined_train_with_ifnd.csv", low_memory=False)

# Collapse the six truthfulness ratings into a binary target:
# 1 = reliable ('true', 'mostly-true'), 0 = unreliable (all other ratings).
label_map = {
    'true': 1,
    'mostly-true': 1,
    'half-true': 0,
    'barely-true': 0,
    'false': 0,
    'pants-fire': 0
}

# Normalise case and surrounding whitespace before the lookup so variants such
# as "True" or " false " still match a key of label_map.
normalized_labels = df['label'].str.strip().str.lower()
mapped_labels = normalized_labels.map(label_map)

# Rows whose label is missing or not in label_map cannot be used for training.
# Report what is being discarded instead of dropping it silently.
unmapped_mask = mapped_labels.isna()
if unmapped_mask.any():
    print(f"Dropping {int(unmapped_mask.sum())} rows with missing or unmapped labels:")
    print(normalized_labels[unmapped_mask].value_counts(dropna=False).head(10))

# .copy() gives an independent frame, so the column assignment below cannot
# trigger a SettingWithCopyWarning.
df = df.loc[~unmapped_mask].copy()
df['label'] = mapped_labels[~unmapped_mask].astype(int)

print("Label distribution after mapping:")
print(df['label'].value_counts())


Label distribution after mapping:
label
1    41438
0    24098
Name: count, dtype: int64


  df = pd.read_csv("combined_train_with_ifnd.csv")


In [4]:
# Hold out 20% of the rows for evaluation. The statements and their labels are
# split together so they stay aligned; random_state pins the shuffle so the
# same split is produced on every run.
train_texts, test_texts, train_labels, test_labels = train_test_split(
    *(df[column].tolist() for column in ('statement', 'label')),
    random_state=42,
    test_size=0.2,
)


In [5]:
# Tokenizer for the distilbert-base-uncased checkpoint.
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")


def tokenize_function(examples):
    """Tokenize the 'text' field of a batch of examples.

    Sequences are truncated and padded with padding="max_length".
    NOTE(review): no explicit max_length is passed, so the padded length is
    presumably the tokenizer's model maximum; confirm, since short statements
    may be padded far more than necessary.
    """
    texts = examples['text']
    return tokenizer(texts, truncation=True, padding="max_length")


# Wrap each split as a Hugging Face Dataset and tokenize it in batches.
train_dataset = Dataset.from_dict({"text": train_texts, "label": train_labels}).map(
    tokenize_function, batched=True
)
test_dataset = Dataset.from_dict({"text": test_texts, "label": test_labels}).map(
    tokenize_function, batched=True
)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

Map:   0%|          | 0/52428 [00:00<?, ? examples/s]

Map:   0%|          | 0/13108 [00:00<?, ? examples/s]

In [6]:
# Load the pretrained DistilBERT weights with a two-class classification head,
# then move the model onto the selected device.
model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=2)
model = model.to(device)

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [7]:
# `evaluation_strategy` was renamed to `eval_strategy` in newer transformers
# releases and the old keyword was later removed. Pick whichever name the
# installed version actually defines so this cell runs on both old and new
# releases.
_eval_strategy_kwarg = (
    "eval_strategy"
    if "eval_strategy" in getattr(TrainingArguments, "__dataclass_fields__", {})
    else "evaluation_strategy"
)

# Evaluate and checkpoint once per epoch; the save and evaluation strategies
# are both "epoch", and load_best_model_at_end restores the checkpoint with the
# highest "accuracy" when training finishes.
training_args = TrainingArguments(
    output_dir="./results",
    save_strategy="epoch",
    logging_dir="./logs",
    logging_steps=50,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    load_best_model_at_end=True,
    save_total_limit=2,
    metric_for_best_model="accuracy",
    **{_eval_strategy_kwarg: "epoch"},
)




In [8]:
def compute_metrics(pred):
    """Compute accuracy from an evaluation prediction.

    Args:
        pred: An object that unpacks into ``(logits, labels)``.

    Returns:
        A dict with a single key, "accuracy", comparing the labels with the
        argmax of the logits over the last axis.
    """
    scores, references = pred
    predicted_classes = np.argmax(scores, axis=-1)
    accuracy = accuracy_score(references, predicted_classes)
    return {"accuracy": accuracy}


In [9]:
# Assemble the Trainer from the model, the training configuration, both
# datasets and the accuracy metric.
# NOTE(review): eval_dataset is the held-out test split. Because training_args
# enables load_best_model_at_end, the best checkpoint is selected on this same
# data, so the reported accuracy is optimistic; consider a separate validation
# split.
trainer = Trainer(
    model,
    training_args,
    compute_metrics=compute_metrics,
    eval_dataset=test_dataset,
    train_dataset=train_dataset,
)


In [None]:
# Fine-tune the model using the settings held in training_args.
trainer.train()


In [None]:
# Evaluate on the Trainer's eval_dataset and report accuracy to four decimals.
# The "eval_accuracy" key is the "accuracy" metric from compute_metrics with
# the Trainer's "eval_" prefix.
results = trainer.evaluate()
print(f"Evaluation Accuracy: {results['eval_accuracy']:.4f}")


In [None]:
# Write the fine-tuned model and then its tokenizer into the same directory so
# the pair can be reloaded together from one path.
for _artifact in (model, tokenizer):
    _artifact.save_pretrained("./distilbert-fake-news")
print("✅ Model and tokenizer saved!")


In [None]:
# Text-classification pipeline over the saved checkpoint. The device index is
# 0 when CUDA is available and -1 otherwise.
classifier = pipeline(
    "text-classification",
    tokenizer="./distilbert-fake-news",
    model="./distilbert-fake-news",
    device=-1 if not torch.cuda.is_available() else 0,
)

def predict_text(text):
    """Classify a piece of text with the fine-tuned pipeline.

    Args:
        text: The statement or article text to classify.

    Returns:
        A ``(label, score)`` tuple. ``label`` is "Reliable (True)" when the
        pipeline predicts ``LABEL_1`` and "Unreliable (False)" otherwise;
        ``score`` is the pipeline's score for the predicted class.
    """
    # Article text scraped from a web page is routinely longer than the model's
    # maximum input length. Ask the tokenizer to truncate so over-long input is
    # classified on its leading tokens instead of failing.
    result = classifier(text, truncation=True)[0]
    label = "Reliable (True)" if result['label'] == 'LABEL_1' else "Unreliable (False)"
    return label, result['score']


In [None]:
def fetch_text_from_url(url):
    """Download a web page and return the text of its paragraph elements.

    Args:
        url: Address of the page to fetch.

    Returns:
        The non-empty paragraph texts joined with single spaces. If the page
        has no paragraph text, returns "No text found."; if anything fails,
        returns a string starting with "Error fetching URL:". Failures are
        reported through the return value rather than raised.
    """
    try:
        response = requests.get(url, timeout=10)
        # Treat 4xx/5xx responses as failures; otherwise the body of an error
        # page would be returned as if it were the article.
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')
        # Strip each paragraph and skip blank ones so whitespace-only markup
        # still falls through to the "No text found." result.
        fragments = (p.get_text().strip() for p in soup.find_all('p'))
        text = " ".join(fragment for fragment in fragments if fragment)
        return text if text else "No text found."
    except Exception as e:
        return f"Error fetching URL: {e}"


In [None]:
def check_news():
    """Interactively classify either a web page or a typed statement.

    Prompts for the input mode (1 = URL, 2 = text statement), gathers the text,
    runs it through predict_text and prints the predicted label with its
    confidence. Prints a message and returns early when the choice is invalid,
    the URL yields no usable text, or the statement is empty.

    Returns:
        None. All results are printed.
    """
    choice = input("\nDo you want to check (1) URL or (2) Text statement? Enter 1 or 2: ").strip()

    if choice == "1":
        url = input("Enter URL: ").strip()
        text = fetch_text_from_url(url)
        # fetch_text_from_url reports failures as sentinel strings instead of
        # raising. Classifying those messages would yield a meaningless
        # verdict, so stop here.
        if text == "No text found." or text.startswith("Error fetching URL:"):
            print(f"\n⚠️ Could not extract article text: {text}")
            return
        print(f"\nExtracted Text (first 500 chars):\n{text[:500]}...\n")
    elif choice == "2":
        text = input("Enter statement: ").strip()
        if not text:
            print("No statement entered.")
            return
    else:
        print("Invalid choice.")
        return

    label, score = predict_text(text)
    print(f"\n🧐 Prediction: {label} (Confidence: {score:.2f})")


In [None]:
# Start the interactive checker; it prompts for a URL or a text statement.
check_news()


In [None]:
!pip install wikipedia

In [None]:
import requests
import wikipedia

import os

# Google Fact Check Tools API key. Keep the key out of the notebook: it is read
# from the GOOGLE_FACT_CHECK_API_KEY environment variable and is an empty
# string when that variable is not set.
# NOTE(review): a literal key was previously committed in this cell; it should
# be revoked and replaced.
API_KEY = os.environ.get("GOOGLE_FACT_CHECK_API_KEY", "")
if not API_KEY:
    print("⚠️ GOOGLE_FACT_CHECK_API_KEY is not set; Google Fact Check lookups will fail.")

def check_google_fact_check(statement):
    """Search the Google Fact Check Tools API for published reviews of a claim.

    Args:
        statement: The claim text to search for.

    Returns:
        A string. On success, one formatted entry per matching claim (statement,
        claimant, fact-checking publisher and verdict, taken from the claim's
        first review), joined with newlines. Otherwise a short message starting
        with "❌" saying that the request failed or that nothing was found.
    """
    endpoint = "https://factchecktools.googleapis.com/v1alpha1/claims:search"
    try:
        # Passing the query through `params` lets requests URL-encode it, so
        # spaces, '&' or '#' in the statement cannot corrupt the request. The
        # timeout keeps an unresponsive server from hanging the cell.
        response = requests.get(
            endpoint,
            params={"query": statement, "key": API_KEY},
            timeout=10,
        )
    except requests.RequestException as e:
        # Report only the exception type: the full message can include the
        # request URL, which carries the API key.
        return f"❌ Google Fact Check request failed: {type(e).__name__}"

    if response.status_code != 200:
        return f"❌ Google Fact Check request failed (HTTP {response.status_code})."

    claims = response.json().get("claims", [])
    if not claims:
        return "❌ No Google fact checks found."

    fact_checks = []
    for claim in claims:
        # Any of these fields may be absent from a result, so fall back to
        # "Unknown" instead of raising KeyError/IndexError.
        text = claim.get("text", "Unknown")
        claimant = claim.get("claimant", "Unknown")
        reviews = claim.get("claimReview") or [{}]
        first_review = reviews[0]
        publisher = first_review.get("publisher", {}).get("name", "Unknown")
        rating = first_review.get("textualRating", "Unknown")

        fact_checks.append(f"🔸 Statement: {text}\n🔸 Claimed by: {claimant}\n🔸 Fact-Checked by: {publisher}\n🔸 Verdict: {rating}\n")

    return "\n".join(fact_checks)


def check_wikipedia(statement):
    """Return a short Wikipedia summary related to a statement.

    Searches Wikipedia for the statement and returns a two-sentence summary of
    the first search hit (of up to three) whose summary can be retrieved.

    Args:
        statement: The text to search Wikipedia for.

    Returns:
        "✅ Wikipedia Summary:" followed by the summary on success, otherwise
        "❌ No relevant Wikipedia data found.". Always a string.
    """
    not_found = "❌ No relevant Wikipedia data found."

    try:
        search_results = wikipedia.search(statement, results=3)
    except Exception:
        # Best-effort lookup: a failed search is reported as "nothing found".
        return not_found

    for title in search_results:
        try:
            summary = wikipedia.summary(title, sentences=2)
        except Exception:
            # This hit could not be summarised (e.g. an ambiguous or missing
            # page); fall through to the next search result.
            continue
        if summary:
            return f"✅ Wikipedia Summary:\n{summary}"

    # Empty search, or no hit produced a summary.
    return not_found

def check_fake_news(statement):
    """Print a combined report from the fact-check sources for a statement.

    Queries check_google_fact_check and then check_wikipedia with the
    statement, and prints each result under a "Final Report" heading.

    Args:
        statement: The claim text to look up.

    Returns:
        None. The report is printed.
    """
    print("\n🔹 Checking Fact Sources...")

    # Both lookups run, in this order, before anything from the report prints.
    findings = [
        check_google_fact_check(statement),
        check_wikipedia(statement),
    ]

    print("\n🔹 Final Report:")
    for finding in findings:
        print(f"\n{finding}")

# Example usage: prompt for a statement, then print the combined report built
# from the Google Fact Check and Wikipedia lookups.
news_statement = input("Enter a news statement to check: ")
check_fake_news(news_statement)