<a href="https://colab.research.google.com/github/manu14357/AI-Powered-Fake-News-Detection/blob/main/Ai_Fake_News_Detection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import os

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
import pandas as pd
import csv

fake_news = pd.read_csv('/content/drive/MyDrive/fakenews/Fake_News.csv', on_bad_lines="skip")
true_news = pd.read_csv('/content/drive/MyDrive/fakenews/True_News.csv', on_bad_lines="skip")


# Check the first few rows of both datasets
print("Fake News Dataset:")
print(fake_news.head())

print("\nTrue News Dataset:")
print(true_news.head())

Fake News Dataset:
                                               title  \
0   Donald Trump Sends Out Embarrassing New Year’...   
1   Drunk Bragging Trump Staffer Started Russian ...   
2   Sheriff David Clarke Becomes An Internet Joke...   
3   Trump Is So Obsessed He Even Has Obama’s Name...   
4   Pope Francis Just Called Out Donald Trump Dur...   

                                                text subject  \
0  Donald Trump just couldn t wish all Americans ...    News   
1  House Intelligence Committee Chairman Devin Nu...    News   
2  On Friday, it was revealed that former Milwauk...    News   
3  On Christmas day, Donald Trump announced that ...    News   
4  Pope Francis used his annual Christmas Day mes...    News   

                date  
0  December 31, 2017  
1  December 31, 2017  
2  December 30, 2017  
3  December 29, 2017  
4  December 25, 2017  

True News Dataset:
                                               title  \
0  As U.S. budget fight looms, Republicans fl

In [4]:
# Add a label column: 1 for fake news, 0 for true news
fake_news['label'] = 1
true_news['label'] = 0

# Combine both datasets
data = pd.concat([fake_news, true_news], ignore_index=True)

# Shuffle the dataset
data = data.sample(frac=1).reset_index(drop=True)

# Check the combined dataset
print(data.head())

                                               title  \
0   A Panicked Trump Ran Around The White House A...   
1  U.S. business seeks action, not trade war, in ...   
2  COULD BEN CARSON BECOME AMERICA’S FIRST BLACK ...   
3  THAILAND THREATENS TO PROSECUTE FACEBOOK Over ...   
4  HILLARY LANDS COVETED Taxpayer Funded, Planned...   

                                                text       subject  \
0  This incredibly awkward story comes from New Y...          News   
1  BEIJING/WASHINGTON (Reuters) - Although worrie...  politicsNews   
2   Stop allowing the progressives to drive God o...      politics   
3  Embarrassing footage of Thailand s king wearin...      politics   
4  Sadly, this will be the only reason many women...      politics   

             date  label  
0   June 30, 2017      1  
1  April 4, 2017       0  
2    Sep 25, 2015      1  
3    May 15, 2017      1  
4     Jan 8, 2016      1  


In [5]:
import nltk
nltk.download('stopwords')
import re
from nltk.corpus import stopwords

# Load stopwords
stop_words = set(stopwords.words('english'))

# Function to clean the text
def clean_text(text):
    # Remove special characters and numbers
    text = re.sub(r'\W', ' ', text)
    text = re.sub(r'\d+', '', text)

    # Convert text to lowercase
    text = text.lower()

    # Remove stopwords
    text = ' '.join([word for word in text.split() if word not in stop_words])

    return text

# Apply the cleaning function to the text column
data['text'] = data['text'].apply(clean_text)

# Check the cleaned data
print(data['text'].head())

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


0    incredibly awkward story comes new york post r...
1    beijing washington reuters although worried pr...
2    stop allowing progressives drive god land ben ...
3    embarrassing footage thailand king wearing cro...
4    sadly reason many women vote hillary crimes su...
Name: text, dtype: object


In [6]:
from sklearn.model_selection import train_test_split

# Split the dataset
X = data['text']
y = data['label']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print("Training set size:", len(X_train))
print("Testing set size:", len(X_test))

Training set size: 35918
Testing set size: 8980


In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Create a TF-IDF vectorizer
vectorizer = TfidfVectorizer(max_features=5000)

# Transform the training and testing sets
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

print("Shape of TF-IDF training data:", X_train_tfidf.shape)

Shape of TF-IDF training data: (35918, 5000)


In [11]:
from sklearn.linear_model import LogisticRegression

# Create a Logistic Regression model
model = LogisticRegression()

# Train the model
model.fit(X_train_tfidf, y_train)

# Evaluate the model
accuracy = model.score(X_test_tfidf, y_test)
print(f"Accuracy: {accuracy:.2f}")

Accuracy: 0.99


In [12]:
from sklearn.metrics import confusion_matrix
import plotly.express as px
import pandas as pd

# Get predictions for the test set
y_pred = model.predict(X_test_tfidf)

# Generate the confusion matrix
cm = confusion_matrix(y_test, y_pred)
cm_labels = ['Class 0', 'Class 1']  # Adjust as per your dataset's classes

# Create a DataFrame for Plotly
cm_df = pd.DataFrame(cm, index=cm_labels, columns=cm_labels)

# Plot the confusion matrix as a heatmap
fig = px.imshow(
    cm_df,
    text_auto=True,
    color_continuous_scale='Blues',
    title='Confusion Matrix',
    labels=dict(x="Predicted Label", y="True Label", color="Count")
)
fig.update_layout(xaxis_title="Predicted Label", yaxis_title="True Label")
fig.show()

In [13]:
import numpy as np

# Get feature names and their coefficients
feature_names = vectorizer.get_feature_names_out()
coefficients = model.coef_.flatten()

# Create a DataFrame with terms and their coefficients
coef_df = pd.DataFrame({'Term': feature_names, 'Coefficient': coefficients})

# Sort by absolute coefficient values in descending order
coef_df = coef_df.reindex(coef_df['Coefficient'].abs().sort_values(ascending=False).index)

# Select the top 20 terms (most impactful)
top_coef_df = coef_df.head(20)

# Plot using Plotly
fig = px.bar(
    top_coef_df,
    x='Term',
    y='Coefficient',
    title='Top 20 Features by Coefficient',
    labels={'Coefficient': 'Logistic Regression Coefficient'},
    text_auto=True
)
fig.update_layout(xaxis_title='Term', yaxis_title='Coefficient', xaxis_tickangle=45)
fig.show()

In [14]:
import plotly.graph_objects as go

# Plot a gauge chart to represent accuracy
fig = go.Figure(go.Indicator(
    mode="gauge+number",
    value=accuracy * 100,  # Convert to percentage
    title={'text': "Model Accuracy (%)"},
    gauge={
        'axis': {'range': [0, 100]},
        'bar': {'color': "blue"},
        'steps': [
            {'range': [0, 50], 'color': "red"},
            {'range': [50, 75], 'color': "orange"},
            {'range': [75, 100], 'color': "green"}
        ]
    }
))
fig.show()

In [15]:
from sklearn.metrics import classification_report

# Get predictions
y_pred = model.predict(X_test_tfidf)

# Print the classification report
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.98      0.99      0.99      4307
           1       0.99      0.98      0.99      4673

    accuracy                           0.99      8980
   macro avg       0.99      0.99      0.99      8980
weighted avg       0.99      0.99      0.99      8980



In [16]:
from sklearn.metrics import classification_report
import pandas as pd
import plotly.graph_objects as go

# Generate the classification report
report = classification_report(y_test, y_pred, output_dict=True)

# Convert the classification report to a pandas DataFrame
report_df = pd.DataFrame(report).transpose()

# Plotly table to display the report
fig = go.Figure(data=[go.Table(
    header=dict(values=list(report_df.columns),
                fill_color='paleturquoise',
                align='center'),
    cells=dict(values=[report_df[col] for col in report_df.columns],
               fill_color='lavender',
               align='center'))
])

fig.update_layout(title="Classification Report")
fig.show()

In [17]:
import joblib

# Save the model
joblib.dump(model, 'fake_news_detector.pkl')

# Save the vectorizer
joblib.dump(vectorizer, 'tfidf_vectorizer.pkl')

['tfidf_vectorizer.pkl']

In [18]:
# Load the saved model and vectorizer
loaded_model = joblib.load('fake_news_detector.pkl')
loaded_vectorizer = joblib.load('tfidf_vectorizer.pkl')

# Example prediction
new_article = ["This is an example news article."]
new_article_tfidf = loaded_vectorizer.transform(new_article)

# Predict
prediction = loaded_model.predict(new_article_tfidf)
print("Fake" if prediction == 1 else "True")

Fake


### **Logistic Regression**

In [20]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix

# Load the datasets
fake_news = pd.read_csv('/content/drive/MyDrive/fakenews/Fake_News.csv', on_bad_lines="skip")
true_news = pd.read_csv('/content/drive/MyDrive/fakenews/True_News.csv', on_bad_lines="skip")

# Display the first few rows of each dataset
print(fake_news.head())
print(true_news.head())

                                               title  \
0   Donald Trump Sends Out Embarrassing New Year’...   
1   Drunk Bragging Trump Staffer Started Russian ...   
2   Sheriff David Clarke Becomes An Internet Joke...   
3   Trump Is So Obsessed He Even Has Obama’s Name...   
4   Pope Francis Just Called Out Donald Trump Dur...   

                                                text subject  \
0  Donald Trump just couldn t wish all Americans ...    News   
1  House Intelligence Committee Chairman Devin Nu...    News   
2  On Friday, it was revealed that former Milwauk...    News   
3  On Christmas day, Donald Trump announced that ...    News   
4  Pope Francis used his annual Christmas Day mes...    News   

                date  
0  December 31, 2017  
1  December 31, 2017  
2  December 30, 2017  
3  December 29, 2017  
4  December 25, 2017  
                                               title  \
0  As U.S. budget fight looms, Republicans flip t...   
1  U.S. military to accept t

In [21]:
# Add labels for fake and true news
fake_news['label'] = 0
true_news['label'] = 1

# Combine the datasets
data = pd.concat([fake_news, true_news], axis=0)

# Shuffle the data
data = data.sample(frac=1).reset_index(drop=True)

# Check for missing values
print(data.isnull().sum())

title      0
text       0
subject    0
date       0
label      0
dtype: int64


In [22]:
import nltk
nltk.download('punkt_tab')
nltk.download('wordnet') # Download the wordnet dataset

import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

# Initialize lemmatizer and stopwords
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

# Function to clean text
def clean_text(text):
    # Remove special characters and digits
    text = re.sub(r'\W+', ' ', text)
    # Tokenize the text
    tokens = word_tokenize(text.lower())
    # Remove stopwords and lemmatize
    filtered_tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]
    return ' '.join(filtered_tokens)

# Apply the clean_text function to the dataset
data['clean_text'] = data['text'].apply(clean_text)

# Check the cleaned data
print(data[['text', 'clean_text']].head())

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


                                                text  \
0   I think we are in a real moment like Nixon ??...   
1  PARIS (Reuters) - Police said a security alert...   
2  The grieving daughter in law of Vice President...   
3  OTTAWA (Reuters) - U.S. Treasury Secretary Ste...   
4  GENEVA (Reuters) - The U.N. s freedom of speec...   

                                          clean_text  
0  think real moment like nixon trump dogwhistles...  
1  paris reuters police said security alert briti...  
2  grieving daughter law vice president joe biden...  
3  ottawa reuters u treasury secretary steven mnu...  
4  geneva reuters u n freedom speech expert said ...  


In [23]:
import nltk
nltk.download()

NLTK Downloader
---------------------------------------------------------------------------
    d) Download   l) List    u) Update   c) Config   h) Help   q) Quit
---------------------------------------------------------------------------
Downloader> q


True

In [24]:
# Define features and labels
X = data['clean_text']
y = data['label']

# Split into training and testing data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print(f"Training size: {len(X_train)}, Testing size: {len(X_test)}")

Training size: 35918, Testing size: 8980


In [25]:
import plotly.graph_objects as go

# Training and testing sizes
train_size = len(X_train)
test_size = len(X_test)

# Create the bar chart
fig = go.Figure(data=[go.Bar(
    x=['Training Data', 'Testing Data'],
    y=[train_size, test_size],
    marker_color=['blue', 'orange']
)])

fig.update_layout(
    title="Training vs Testing Data Size",
    xaxis_title="Data Type",
    yaxis_title="Number of Samples",
    showlegend=False
)

fig.show()

In [26]:
import nltk
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [27]:
!pip install gradio

Collecting gradio
  Downloading gradio-5.20.1-py3-none-any.whl.metadata (16 kB)
Collecting aiofiles<24.0,>=22.0 (from gradio)
  Downloading aiofiles-23.2.1-py3-none-any.whl.metadata (9.7 kB)
Collecting fastapi<1.0,>=0.115.2 (from gradio)
  Downloading fastapi-0.115.11-py3-none-any.whl.metadata (27 kB)
Collecting ffmpy (from gradio)
  Downloading ffmpy-0.5.0-py3-none-any.whl.metadata (3.0 kB)
Collecting gradio-client==1.7.2 (from gradio)
  Downloading gradio_client-1.7.2-py3-none-any.whl.metadata (7.1 kB)
Collecting groovy~=0.1 (from gradio)
  Downloading groovy-0.1.2-py3-none-any.whl.metadata (6.1 kB)
Collecting markupsafe~=2.0 (from gradio)
  Downloading MarkupSafe-2.1.5-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.0 kB)
Collecting pydub (from gradio)
  Downloading pydub-0.25.1-py2.py3-none-any.whl.metadata (1.4 kB)
Collecting python-multipart>=0.0.18 (from gradio)
  Downloading python_multipart-0.0.20-py3-none-any.whl.metadata (1.8 kB)
Collecting ruff>=0.9.3

In [6]:
# Import necessary libraries
import pandas as pd
import numpy as np
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
import joblib
import gradio as gr

# Download NLTK resources
import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

# Load datasets
fake_news = pd.read_csv('/content/drive/MyDrive/fakenews/Fake_News.csv', on_bad_lines="skip")
true_news = pd.read_csv('/content/drive/MyDrive/fakenews/True_News.csv', on_bad_lines="skip")

# Add labels and combine datasets
fake_news['label'] = 0
true_news['label'] = 1
data = pd.concat([fake_news, true_news], axis=0).sample(frac=1).reset_index(drop=True)

# Text preprocessing function
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

def clean_text(text):
    text = re.sub(r'\W+', ' ', text)  # Remove special characters
    text = re.sub(r'\d+', '', text)   # Remove numbers
    tokens = word_tokenize(text.lower())
    filtered_tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]
    return ' '.join(filtered_tokens)

data['clean_text'] = data['text'].apply(clean_text)

# Prepare features and split data
X = data['clean_text']
y = data['label']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Vectorize text data
tfidf = TfidfVectorizer(max_features=5000)
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

# Train Logistic Regression model
model = LogisticRegression(max_iter=1000)
model.fit(X_train_tfidf, y_train)

# Save model and vectorizer with specific names
joblib.dump(model, 'logistic_regression.pkl')
joblib.dump(tfidf, 'tfidf_vectorizer.pkl')

# Evaluate the model
y_pred = model.predict(X_test_tfidf)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Accuracy: 0.9884187082405346

Classification Report:
               precision    recall  f1-score   support

           0       0.99      0.99      0.99      4656
           1       0.98      0.99      0.99      4324

    accuracy                           0.99      8980
   macro avg       0.99      0.99      0.99      8980
weighted avg       0.99      0.99      0.99      8980



In [7]:
# Gradio prediction function
def predict_news(title, text, subject, date):
    combined_text = f"{title} {text} {subject} {date}"
    cleaned_text = clean_text(combined_text)
    input_data = tfidf.transform([cleaned_text])
    prediction = model.predict(input_data)[0]
    return "Fake News" if prediction == 0 else "Real News"

# Create Gradio interface
interface = gr.Interface(
    fn=predict_news,
    inputs=[
        gr.Textbox(label="Title", placeholder="Enter news title..."),
        gr.Textbox(lines=5, label="Content", placeholder="Paste news content..."),
        gr.Textbox(label="Subject", placeholder="Politics, Technology, etc..."),
        gr.Textbox(label="Date", placeholder="YYYY-MM-DD")
    ],
    outputs=gr.Label(label="Prediction"),
    title="Logistic Regression Fake News Detector",
    description="Detect fake news using Logistic Regression model"
)

# Launch the app
interface.launch()

Running Gradio in a Colab notebook requires sharing enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://ec0126063ee1a57389.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)




# **New Version** **AI Explanation**
**meta/llama-3.3-70b-instruct**

In [29]:
!pip install gradio joblib requests openai



In [32]:
import gradio as gr
import joblib
import requests

# Load the trained model and the TF-IDF vectorizer
model = joblib.load('fake_news_detector.pkl')
vectorizer = joblib.load('tfidf_vectorizer.pkl')

# NVIDIA API endpoint and headers
NVIDIA_API_URL = "https://integrate.api.nvidia.com/v1/chat/completions"
NVIDIA_API_KEY = "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"  # Replace with your NVIDIA API Key

# Combined prediction and explanation function
def predict_news(title, text, subject, date, url):
    combined_text = f"Title: {title}\nText: {text}\nSubject: {subject}\nDate: {date}\nURL: {url}"

    # Local model prediction (Final Prediction)
    input_data = vectorizer.transform([combined_text])
    local_prediction = model.predict(input_data)
    final_prediction = "Fake News" if local_prediction == 1 else "Real News"

    # Query NVIDIA's model for prediction and explanation
    nvidia_payload = {
        "model": "meta/llama-3.3-70b-instruct",
        "messages": [
            {
                "role": "system",
                "content": "You are a fact-checking AI assistant. Analyze the news content and classify it as Fake or Real, providing a detailed explanation."
            },
            {
                "role": "user",
                "content": combined_text
            }
        ]
    }
    headers = {"Authorization": f"Bearer {NVIDIA_API_KEY}"}
    try:
        response = requests.post(NVIDIA_API_URL, json=nvidia_payload, headers=headers)
        response.raise_for_status()
        nvidia_result = response.json()["choices"][0]["message"]["content"]
        if "fake" in nvidia_result.lower():
            ai_model_prediction = "Fake News"
        else:
            ai_model_prediction = "Real News"
    except Exception as e:
        ai_model_prediction = "Unknown (AI Model Error)"
        nvidia_result = f"Error while fetching explanation: {str(e)}"

    # Generate the final result
    result = (
        f"Model Prediction (Final Prediction): {final_prediction}\n\n"
        f"AI Model Prediction/Explanation:\n{nvidia_result}"
    )
    return result

# Define the Gradio interface
interface = gr.Interface(
    fn=predict_news,
    inputs=[
        gr.Textbox(label="Title of the news"),
        gr.Textbox(lines=10, label="Text of the news"),
        gr.Textbox(label="Subject of the news"),
        gr.Textbox(label="Date of the news"),
        gr.Textbox(label="URL of the news (optional)")
    ],
    outputs="text",
    title="Advanced Fake News Detector",
    description=(
        "Input the title, text, subject, and date of the news to predict whether it's real or fake. "
        "The system uses both a trained model and advanced AI for a comprehensive analysis."
    )
)

# Launch the interface
interface.launch()

Running Gradio in a Colab notebook requires sharing enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://a3737505e822167dd2.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)




# **Random Forest Classifier**

In [34]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import joblib

# Load datasets
fake_news = pd.read_csv('/content/drive/MyDrive/fakenews/Fake_News.csv', on_bad_lines="skip")
true_news = pd.read_csv('/content/drive/MyDrive/fakenews/True_News.csv', on_bad_lines="skip")


# Add labels: 0 for Fake, 1 for True
fake_news['label'] = 0
true_news['label'] = 1

# Combine the datasets
data = pd.concat([fake_news, true_news], axis=0)
data = data.sample(frac=1).reset_index(drop=True)  # Shuffle the data

# Check for missing values and drop rows with any
data = data.dropna()

# Clean the text data
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import nltk

nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

def clean_text(text):
    # Remove special characters, numbers, and punctuations
    text = re.sub(r'\W+', ' ', text)
    text = re.sub(r'\d+', '', text)
    # Tokenize, remove stopwords, and lemmatize
    tokens = word_tokenize(text.lower())
    filtered_tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]
    return ' '.join(filtered_tokens)

data['clean_text'] = data['text'].apply(clean_text)

# Define features (X) and labels (y)
X = data['clean_text']
y = data['label']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Convert text data to numerical features using TF-IDF Vectorizer
vectorizer = TfidfVectorizer(max_features=5000)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Build the Random Forest model
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train_tfidf, y_train)

# Evaluate the model
y_pred = model.predict(X_test_tfidf)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))

# Save the trained model and vectorizer
joblib.dump(model, 'random_forest_fake_news_detector.pkl')
joblib.dump(vectorizer, 'tfidf_vectorizer.pkl')

# Test the model with new data
def predict_news(title, text, subject):
    combined_text = f"{title} {text} {subject}"
    input_data = vectorizer.transform([combined_text])
    prediction = model.predict(input_data)
    return "Fake News" if prediction == 0 else "Real News"

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Accuracy: 0.9976614699331848

Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00      4766
           1       1.00      1.00      1.00      4214

    accuracy                           1.00      8980
   macro avg       1.00      1.00      1.00      8980
weighted avg       1.00      1.00      1.00      8980



In [35]:
# Example usage
title = "Breaking News"
text = "This is a sample news text to check if the news is fake or real."
subject = "Politics"
print(predict_news(title, text, subject))

Fake News


In [36]:
import plotly.graph_objects as go
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix

# Generate the confusion matrix
cm = confusion_matrix(y_test, y_pred)

# Plotting the confusion matrix using Plotly
fig = go.Figure(data=go.Heatmap(
    z=cm,
    x=['Predicted Fake', 'Predicted Real'],
    y=['Actual Fake', 'Actual Real'],
    colorscale='Viridis',
    colorbar=dict(title='Count')
))

fig.update_layout(
    title="Confusion Matrix",
    xaxis_title="Predicted Labels",
    yaxis_title="Actual Labels"
)

fig.show()

In [37]:
import plotly.express as px

# Get feature importances
importances = model.feature_importances_

# Create a DataFrame for feature importances
features = vectorizer.get_feature_names_out()
feature_importance_df = pd.DataFrame({'feature': features, 'importance': importances})

# Sort the features by importance
feature_importance_df = feature_importance_df.sort_values(by='importance', ascending=False)

# Create a bar plot using Plotly Express
fig = px.bar(feature_importance_df.head(20), x='feature', y='importance',
             title="Top 20 Important Features in Predicting Fake News",
             labels={'feature': 'Features', 'importance': 'Importance'},
             color='importance', color_continuous_scale='Viridis')

fig.update_layout(
    xaxis_tickangle=-45,
    xaxis_title="Features",
    yaxis_title="Importance"
)

fig.show()


# **ANN**

In [38]:
# Import necessary libraries
import pandas as pd
import numpy as np
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report, accuracy_score
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
import nltk

# Download NLTK resources
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

# Load datasets
fake_news = pd.read_csv('/content/drive/MyDrive/fakenews/Fake_News.csv', on_bad_lines="skip")
true_news = pd.read_csv('/content/drive/MyDrive/fakenews/True_News.csv', on_bad_lines="skip")


# Add labels: 0 for Fake, 1 for True
fake_news['label'] = 0
true_news['label'] = 1

# Combine the datasets
data = pd.concat([fake_news, true_news], axis=0)
data = data.sample(frac=1).reset_index(drop=True)  # Shuffle the data

# Clean the text data
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

def clean_text(text):
    # Remove special characters, numbers, and punctuations
    text = re.sub(r'\W+', ' ', text)
    text = re.sub(r'\d+', '', text)
    # Tokenize, remove stopwords, and lemmatize
    tokens = word_tokenize(text.lower())
    filtered_tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]
    return ' '.join(filtered_tokens)

data['clean_text'] = data['text'].apply(clean_text)

# Define features (X) and labels (y)
X = data['clean_text']
y = data['label']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Convert text data to numerical features using TF-IDF Vectorizer
vectorizer = TfidfVectorizer(max_features=5000)
X_train_tfidf = vectorizer.fit_transform(X_train).toarray()
X_test_tfidf = vectorizer.transform(X_test).toarray()

# Build the ANN Model
model = Sequential()
model.add(Dense(128, input_dim=X_train_tfidf.shape[1], activation='relu'))  # Input Layer
model.add(Dropout(0.3))  # Dropout to reduce overfitting
model.add(Dense(64, activation='relu'))  # Hidden Layer 1
model.add(Dropout(0.3))  # Dropout Layer
model.add(Dense(1, activation='sigmoid'))  # Output Layer

# Compile the Model
model.compile(optimizer=Adam(learning_rate=0.001), loss='binary_crossentropy', metrics=['accuracy'])

# Add Early Stopping to prevent overfitting
early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

# Train the Model
history = model.fit(
    X_train_tfidf, y_train,
    validation_split=0.2,
    epochs=1,
    batch_size=64,
    callbacks=[early_stopping],
    verbose=1
)

# Evaluate the Model
y_pred = (model.predict(X_test_tfidf) > 0.5).astype(int)
print("\nAccuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))

# Save the Model and Vectorizer
model.save('ann_fake_news_detector.h5')
joblib.dump(vectorizer, 'tfidf_vectorizer.pkl')

# Load the Model and Make Predictions
from tensorflow.keras.models import load_model

def predict_news_ann(title, text, subject):
    combined_text = f"{title} {text} {subject}"
    input_data = vectorizer.transform([combined_text]).toarray()
    loaded_model = load_model('ann_fake_news_detector.h5')
    prediction = loaded_model.predict(input_data)
    return "Fake News" if prediction[0][0] > 0.5 else "Real News"


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!

Do not pass an `input_shape`/`input_dim` argument to a layer. When using Sequential models, prefer using an `Input(shape)` object as the first layer in the model instead.



[1m449/449[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 13ms/step - accuracy: 0.9139 - loss: 0.2569 - val_accuracy: 0.9857 - val_loss: 0.0440
[1m281/281[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step





Accuracy: 0.987305122494432

Classification Report:
               precision    recall  f1-score   support

           0       0.98      0.99      0.99      4703
           1       0.99      0.98      0.99      4277

    accuracy                           0.99      8980
   macro avg       0.99      0.99      0.99      8980
weighted avg       0.99      0.99      0.99      8980



In [39]:
# Example Usage
title = "Breaking News"
text = "This is an example news article."
subject = "Politics"
print(predict_news_ann(title, text, subject))



[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 74ms/step
Real News


In [40]:
import plotly.graph_objects as go

# Extract loss and accuracy from the history object
loss = history.history['loss']
val_loss = history.history['val_loss']
accuracy = history.history['accuracy']
val_accuracy = history.history['val_accuracy']

# Plotting the loss curve
fig_loss = go.Figure()
fig_loss.add_trace(go.Scatter(x=list(range(1, len(loss) + 1)), y=loss, mode='lines', name='Training Loss'))
fig_loss.add_trace(go.Scatter(x=list(range(1, len(val_loss) + 1)), y=val_loss, mode='lines', name='Validation Loss'))
fig_loss.update_layout(title="Training and Validation Loss", xaxis_title="Epochs", yaxis_title="Loss")
fig_loss.show()

# Plotting the accuracy curve
fig_accuracy = go.Figure()
fig_accuracy.add_trace(go.Scatter(x=list(range(1, len(accuracy) + 1)), y=accuracy, mode='lines', name='Training Accuracy'))
fig_accuracy.add_trace(go.Scatter(x=list(range(1, len(val_accuracy) + 1)), y=val_accuracy, mode='lines', name='Validation Accuracy'))
fig_accuracy.update_layout(title="Training and Validation Accuracy", xaxis_title="Epochs", yaxis_title="Accuracy")
fig_accuracy.show()

In [41]:
import plotly.graph_objects as go
from sklearn.metrics import confusion_matrix

# Generate confusion matrix
cm = confusion_matrix(y_test, y_pred)

# Plotting the confusion matrix
fig_cm = go.Figure(data=go.Heatmap(
    z=cm,
    x=['Predicted Fake', 'Predicted Real'],
    y=['Actual Fake', 'Actual Real'],
    colorscale='Viridis',
    colorbar=dict(title='Count')
))

fig_cm.update_layout(
    title="Confusion Matrix",
    xaxis_title="Predicted Labels",
    yaxis_title="Actual Labels"
)

fig_cm.show()

# **LSTM**

In [46]:
# Import necessary libraries
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, Bidirectional
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.models import load_model
import joblib

# Download NLTK resources
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

# Load datasets
fake_news = pd.read_csv('/content/drive/MyDrive/fakenews/Fake_News.csv', on_bad_lines="skip")
true_news = pd.read_csv('/content/drive/MyDrive/fakenews/True_News.csv', on_bad_lines="skip")

# Add labels: 0 for Fake, 1 for True
fake_news['label'] = 0
true_news['label'] = 1

# Combine the datasets
data = pd.concat([fake_news, true_news], axis=0)
data = data.sample(frac=1).reset_index(drop=True)  # Shuffle the data

# Text cleaning function
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

def clean_text(text):
    # Remove special characters and digits
    text = re.sub(r'\W+', ' ', text)
    text = re.sub(r'\d+', '', text)
    # Tokenize, remove stopwords, and lemmatize
    tokens = word_tokenize(text.lower())
    filtered_tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]
    return ' '.join(filtered_tokens)

# Create combined text feature
data['combined_text'] = data['title'].fillna('') + " " + data['text'].fillna('') + " " + data['subject'].fillna('')
data['clean_text'] = data['combined_text'].apply(clean_text)

# Prepare data
X = data['clean_text']
y = data['label']

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Tokenize the text data
tokenizer = Tokenizer(num_words=10000, oov_token="<OOV>")
tokenizer.fit_on_texts(X_train)

# Save tokenizer for inference
joblib.dump(tokenizer, 'lstm_tokenizer.pkl')

# Convert text to sequences
X_train_sequences = tokenizer.texts_to_sequences(X_train)
X_test_sequences = tokenizer.texts_to_sequences(X_test)

# Pad sequences
max_len = 512  # Increased length to capture more context
X_train_padded = pad_sequences(X_train_sequences, maxlen=max_len, padding='post', truncating='post')
X_test_padded = pad_sequences(X_test_sequences, maxlen=max_len, padding='post', truncating='post')

# Build the LSTM model
model = Sequential([
    Embedding(input_dim=10000, output_dim=256, input_length=max_len),
    Bidirectional(LSTM(128, return_sequences=True)),
    Dropout(0.4),
    Bidirectional(LSTM(64)),
    Dropout(0.4),
    Dense(128, activation='relu'),
    Dense(1, activation='sigmoid')
])

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Add early stopping
early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

# Train the model
history = model.fit(
    X_train_padded, y_train,
    validation_split=0.2,
    epochs=2,
    batch_size=32,
    callbacks=[early_stopping],
    verbose=1
)

# Evaluate the model
loss, accuracy = model.evaluate(X_test_padded, y_test)
print(f"\nTest Accuracy: {accuracy:.4f}")

# Save the model
model.save('lstm_fake_news_detector.h5')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Epoch 1/2



Argument `input_length` is deprecated. Just remove it.



[1m898/898[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3137s[0m 3s/step - accuracy: 0.9614 - loss: 0.0745 - val_accuracy: 0.9981 - val_loss: 0.0181
Epoch 2/2
[1m898/898[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3151s[0m 3s/step - accuracy: 0.9989 - loss: 0.0077 - val_accuracy: 0.9972 - val_loss: 0.0164
[1m281/281[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m225s[0m 802ms/step - accuracy: 0.9991 - loss: 0.0056





Test Accuracy: 0.9987


In [49]:
def predict_news_lstm(title, text, subject):
    combined_text = f"{title} {text} {subject}"
    cleaned_text = clean_text(combined_text)

    # Load tokenizer and model
    tokenizer = joblib.load('lstm_tokenizer.pkl')
    loaded_model = load_model('lstm_fake_news_detector.h5')

    # Preprocess input
    sequence = tokenizer.texts_to_sequences([cleaned_text])
    padded_sequence = pad_sequences(
        sequence,
        maxlen=512,
        padding='post',
        truncating='post'  # Fixed parameter name
    )

    # Predict
    prediction = loaded_model.predict(padded_sequence)
    return "Fake News" if prediction[0][0] > 0.5 else "Real News"

# Example usage
title = "Breaking News"
text = "This is an example news article."
subject = "Politics"
print(predict_news_lstm(title, text, subject))



[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1s/step
Real News


# **Hybrid Model**

In [9]:
import gradio as gr
import joblib
import numpy as np
import requests
from tensorflow.keras.models import load_model
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Load all models and preprocessors
log_reg = joblib.load('logistic_regression.pkl')
rf_model = joblib.load('random_forest_fake_news_detector.pkl')
ann_model = load_model('ann_fake_news_detector.h5')
lstm_model = load_model('lstm_fake_news_detector.h5')
tfidf = joblib.load('tfidf_vectorizer.pkl')
tokenizer = joblib.load('tokenizer.pkl')

# NVIDIA API configuration
NVIDIA_API_URL = "https://integrate.api.nvidia.com/v1/chat/completions"
NVIDIA_API_KEY = "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"

def preprocess_text(text):
    # Add your text cleaning/preprocessing steps here
    return text

def hybrid_prediction(title, text, subject, date, url):
    combined_text = f"{title} {text} {subject} {date} {url}"
    clean_text = preprocess_text(combined_text)

    # Get predictions from all models
    tfidf_vec = tfidf.transform([clean_text])

    log_pred = log_reg.predict_proba(tfidf_vec)[0][1]
    rf_pred = rf_model.predict_proba(tfidf_vec)[0][1]

    # ANN prediction
    ann_vec = tfidf.transform([clean_text]).toarray()
    ann_pred = ann_model.predict(ann_vec)[0][0]

    # LSTM prediction
    seq = tokenizer.texts_to_sequences([clean_text])
    padded = pad_sequences(seq, maxlen=200, padding='post')
    lstm_pred = lstm_model.predict(padded)[0][0]

    # Hybrid prediction (weighted average)
    final_prob = (log_pred * 0.2 +
                 rf_pred * 0.3 +
                 ann_pred * 0.25 +
                 lstm_pred * 0.25)

    final_pred = "Fake News" if final_prob > 0.5 else "Real News"

    # Get AI explanation
    try:
        response = requests.post(
            NVIDIA_API_URL,
            headers={"Authorization": f"Bearer {NVIDIA_API_KEY}"},
            json={
                "model": "meta/llama-3.3-70b-instruct",
                "messages": [
                    {"role": "system", "content": "You are a fact-checking AI. Analyze this news and explain why it's fake or real"},
                    {"role": "user", "content": combined_text}
                ]
            }
        )
        explanation = response.json()['choices'][0]['message']['content']
    except:
        explanation = "Explanation service unavailable"

    return f"""
    **Final Prediction:** {final_pred} (Confidence: {final_prob:.2f})
    **Logistic Regression:** {log_pred:.2f}
    **Random Forest:** {rf_pred:.2f}
    **ANN:** {ann_pred:.2f}
    **LSTM:** {lstm_pred:.2f}

    **AI Explanation:**
    {explanation}
    """

# Advanced Gradio Interface
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown("# Advanced Hybrid Fake News Detector")
    with gr.Row():
        with gr.Column():
            title = gr.Textbox(label="News Title", placeholder="Enter news title...")
            text = gr.Textbox(lines=5, label="News Content", placeholder="Paste news content here...")
            subject = gr.Textbox(label="News Subject", placeholder="Politics, Technology, etc...")
            date = gr.Textbox(label="Publication Date", placeholder="YYYY-MM-DD")
            url = gr.Textbox(label="Source URL", placeholder="https://...")
            btn = gr.Button("Analyze News")
        with gr.Column():
            output = gr.Markdown(label="Analysis Results")

    btn.click(hybrid_prediction,
             inputs=[title, text, subject, date, url],
             outputs=output)

demo.launch()

https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


Running Gradio in a Colab notebook requires sharing enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://aad69dc1a824e1b4e0.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


