### Fake News Detection using Text Mining + ML

### Import Libraries & Load the Data

In [13]:
import pandas as pd

# Load both datasets
fake_df = pd.read_csv('Fake.csv')
true_df = pd.read_csv('True.csv')

# Add labels manually
fake_df['label'] = 1  # 1 for Fake
true_df['label'] = 0  # 0 for Real

# Combine the datasets
df = pd.concat([fake_df, true_df], ignore_index=True)

# Shuffle the dataset (important for model training)
df = df.sample(frac=1, random_state=42).reset_index(drop=True)

print(df.head())


                                               title  \
0  Ben Stein Calls Out 9th Circuit Court: Committ...   
1  Trump drops Steve Bannon from National Securit...   
2  Puerto Rico expects U.S. to lift Jones Act shi...   
3   OOPS: Trump Just Accidentally Confirmed He Le...   
4  Donald Trump heads for Scotland to reopen a go...   

                                                text       subject  \
0  21st Century Wire says Ben Stein, reputable pr...       US_News   
1  WASHINGTON (Reuters) - U.S. President Donald T...  politicsNews   
2  (Reuters) - Puerto Rico Governor Ricardo Rosse...  politicsNews   
3  On Monday, Donald Trump once again embarrassed...          News   
4  GLASGOW, Scotland (Reuters) - Most U.S. presid...  politicsNews   

                  date  label  
0    February 13, 2017      1  
1       April 5, 2017       0  
2  September 27, 2017       0  
3         May 22, 2017      1  
4       June 24, 2016       0  


In [14]:
for index, row in df.head().iterrows():
    print(f"\nRow {index + 1}")
    print(f"Title   : {row['title']}")
    print(f"Text    : {row['text'][:100]}...")  # Limiting long text for readability
    print(f"Subject : {row['subject']}")
    print(f"Date    : {row['date']}")
    print(f"Label   : {'Fake' if row['label'] == 1 else 'Real'}")


Row 1
Title   : Ben Stein Calls Out 9th Circuit Court: Committed a ‘Coup d’état’ Against the Constitution
Text    : 21st Century Wire says Ben Stein, reputable professor from, Pepperdine University (also of some Holl...
Subject : US_News
Date    : February 13, 2017
Label   : Fake

Row 2
Title   : Trump drops Steve Bannon from National Security Council
Text    : WASHINGTON (Reuters) - U.S. President Donald Trump removed his chief strategist Steve Bannon from th...
Subject : politicsNews
Date    : April 5, 2017 
Label   : Real

Row 3
Title   : Puerto Rico expects U.S. to lift Jones Act shipping restrictions
Text    : (Reuters) - Puerto Rico Governor Ricardo Rossello said on Wednesday he expected the federal governme...
Subject : politicsNews
Date    : September 27, 2017 
Label   : Real

Row 4
Title   :  OOPS: Trump Just Accidentally Confirmed He Leaked Israeli Intelligence To Russia (VIDEO)
Text    : On Monday, Donald Trump once again embarrassed himself and his country by accidentally 

### Basic Cleaning & Preprocessing

In [2]:
# Fill any missing values (if any)
df['text'] = df['text'].fillna('')

# Combine title + text to get full content
df['content'] = df['title'] + " " + df['text']

# Final data for modeling
X = df['content']
y = df['label']


### Text Vectorization (TF-IDF)



In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Create the vectorizer
vectorizer = TfidfVectorizer(stop_words='english', max_df=0.7)

# Transform the text
X_vectorized = vectorizer.fit_transform(X)


### Train-Test Split

In [4]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X_vectorized, y, test_size=0.2, random_state=42)


### Model Training (Logistic Regression First)



In [5]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# Train the model
model = LogisticRegression()
model.fit(X_train, y_train)

# Predict
y_pred = model.predict(X_test)

# Evaluate
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))


Accuracy: 0.984521158129176
              precision    recall  f1-score   support

           0       0.98      0.99      0.98      4270
           1       0.99      0.98      0.99      4710

    accuracy                           0.98      8980
   macro avg       0.98      0.98      0.98      8980
weighted avg       0.98      0.98      0.98      8980



### Add a Function to Predict Custom News Input

In [6]:
# Function to check if a given news text is fake or real
def check_news(news_text):
    # Transform the text using the same TF-IDF vectorizer
    text_vector = vectorizer.transform([news_text])
    
    # Make prediction
    prediction = model.predict(text_vector)[0]

    # Display result
    if prediction == 1:
        print("🟥 This news is likely FAKE.")
    else:
        print("🟩 This news is likely REAL.")


### Try It with a Sample:

In [7]:
sample_news = """
World Leaders Announce New Climate Agreement to Cut Emissions by 50% Before 2030.
The United Nations praised the decision, citing it as a major step forward in the fight against climate change.
"""

check_news(sample_news)


🟩 This news is likely REAL.


### Let the User Input News Manually

In [9]:
user_input = input("Paste the news headline or article here:\n")
check_news(user_input)


Paste the news headline or article here:
customer segmentation books using data mining
🟥 This news is likely FAKE.


In [11]:
sample_news = """ The International Monetary Fund has approved a $182 million financing package to support Rwanda's economic development"""

check_news(sample_news)

🟩 This news is likely REAL.


In [12]:
import pickle

# Save model and vectorizer
pickle.dump(model, open("model.pkl", "wb"))
pickle.dump(vectorizer, open("vectorizer.pkl", "wb"))