#### Data Collection
Load the dataset

In [9]:
import pandas as pd

In [10]:
# Load fake and real news datasets
df_fake = pd.read_csv("Fake.csv")
df_real = pd.read_csv("True.csv")

In [11]:
# Assign labels (0 = Fake, 1 = Real)
df_fake['label'] = 0  
df_real['label'] = 1 

In [12]:
# Merge both datasets
df = pd.concat([df_fake, df_real], ignore_index=True)

In [13]:
# Shuffle the dataset
df = df.sample(frac=1, random_state=42).reset_index(drop=True)

# Shuffling the dataset is important, and here's why:

# Why Do We Shuffle the Dataset?
# Shuffling ensures that the model does not learn any unintended patterns from the dataset’s original order. If fake news articles appear first and real news articles later, the model might learn position-based patterns rather than actual differences in the text.

# What Does This Code Do?
# df = df.sample(frac=1, random_state=42).reset_index(drop=True)

# sample(frac=1) → Selects 100% of the data but in a shuffled order.

# random_state=42 → Ensures reproducibility so the same shuffle happens every time.

# reset_index(drop=True) → Resets the index after shuffling (otherwise, the index values will be mixed).

# What Happens If You Don’t Shuffle?
# The model might see only one class at a time during training (e.g., first fake news, then real news), leading to biased learning.

# Training and test sets might have an uneven distribution of classes, affecting model performance.

# Conclusion: Shuffling is crucial to ensure fair and unbiased learning. Always shuffle when working with machine learning datasets. 

In [14]:
# Display dataset info
print(df.info())
print(df['label'].value_counts())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 44898 entries, 0 to 44897
Data columns (total 5 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   title    44898 non-null  object
 1   text     44898 non-null  object
 2   subject  44898 non-null  object
 3   date     44898 non-null  object
 4   label    44898 non-null  int64 
dtypes: int64(1), object(4)
memory usage: 1.7+ MB
None
label
0    23481
1    21417
Name: count, dtype: int64


#### Text Processing
We will clean the text by removing stopwords, punctuation, and unnecessary symbols.

In [15]:
import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [16]:
# Define the function BEFORE using it
def clean_text(text):
    text = str(text).lower()  # Convert to lowercase
    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
    text = re.sub(r'http\S+', '', text)  # Remove URLs
    text = re.sub(r'\d+', '', text)  # Remove numbers
    text = " ".join(text.split())  # Remove extra spaces

    stop_words = set(stopwords.words('english'))
    text = " ".join(word for word in text.split() if word not in stop_words)

    return text


In [17]:
# Now apply it to the dataset
df["clean_text"] = df["text"].apply(clean_text)

In [18]:
# Check cleaned text
print(df.head())  

                                               title  \
0  Ben Stein Calls Out 9th Circuit Court: Committ...   
1  Trump drops Steve Bannon from National Securit...   
2  Puerto Rico expects U.S. to lift Jones Act shi...   
3   OOPS: Trump Just Accidentally Confirmed He Le...   
4  Donald Trump heads for Scotland to reopen a go...   

                                                text       subject  \
0  21st Century Wire says Ben Stein, reputable pr...       US_News   
1  WASHINGTON (Reuters) - U.S. President Donald T...  politicsNews   
2  (Reuters) - Puerto Rico Governor Ricardo Rosse...  politicsNews   
3  On Monday, Donald Trump once again embarrassed...          News   
4  GLASGOW, Scotland (Reuters) - Most U.S. presid...  politicsNews   

                  date  label  \
0    February 13, 2017      0   
1       April 5, 2017       1   
2  September 27, 2017       1   
3         May 22, 2017      0   
4       June 24, 2016       1   

                                          c

#### Feature Engineering (Convert Text to Numbers)
We will use TF-IDF Vectorization to convert text into numerical format.

 Where is TF-IDF Used?
 
 The TF-IDF vectorizer (tfidf.transform([text])) converts the raw text into numerical features before feeding it into the models.

In [19]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Keep only top 5000 words
tfidf = TfidfVectorizer(max_features=5000)  

# Convert text into numerical form
X = tfidf.fit_transform(df["clean_text"])  

# Target (0 = Fake, 1 = Real)
y = df["label"]  

# Check dimensions
print(X.shape)  

(44898, 5000)


#### Train-Test Split
We will split the data into 80% training and 20% testing.

In [20]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print("Training Size:", X_train.shape)
print("Testing Size:", X_test.shape)

Training Size: (35918, 5000)
Testing Size: (8980, 5000)


#### Train ML Models
We will train 3 models:
 Logistic Regression,
 Random Forest and
 Naive Bayes.

In [21]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report

In [22]:
# Train Logistic Regression
lr_model = LogisticRegression()
lr_model.fit(X_train, y_train)
y_pred_lr = lr_model.predict(X_test)
print("Logistic Regression Accuracy:", accuracy_score(y_test, y_pred_lr))
print(classification_report(y_test, y_pred_lr))

Logistic Regression Accuracy: 0.9867483296213808
              precision    recall  f1-score   support

           0       0.99      0.98      0.99      4710
           1       0.98      0.99      0.99      4270

    accuracy                           0.99      8980
   macro avg       0.99      0.99      0.99      8980
weighted avg       0.99      0.99      0.99      8980



In [23]:
# Train Random Forest
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)
y_pred_rf = rf_model.predict(X_test)
print("Random Forest Accuracy:", accuracy_score(y_test, y_pred_rf))
print(classification_report(y_test, y_pred_rf))

Random Forest Accuracy: 0.9977728285077951
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      4710
           1       1.00      1.00      1.00      4270

    accuracy                           1.00      8980
   macro avg       1.00      1.00      1.00      8980
weighted avg       1.00      1.00      1.00      8980



In [24]:
# Train Naive Bayes
nb_model = MultinomialNB()
nb_model.fit(X_train, y_train)
y_pred_nb = nb_model.predict(X_test)
print("Naive Bayes Accuracy:", accuracy_score(y_test, y_pred_nb))
print(classification_report(y_test, y_pred_nb))

Naive Bayes Accuracy: 0.9351893095768374
              precision    recall  f1-score   support

           0       0.94      0.94      0.94      4710
           1       0.93      0.93      0.93      4270

    accuracy                           0.94      8980
   macro avg       0.93      0.94      0.94      8980
weighted avg       0.94      0.94      0.94      8980



#### Hyperparameter Tuning (Optional for Optimization)
If needed, we can tune hyperparameters to improve model accuracy.


In [25]:
# Example for Random Forest:

from sklearn.model_selection import GridSearchCV

# Define parameter grid
param_grid = {'n_estimators': [50, 100, 200], 'max_depth': [10, 20, None]}

# Grid Search
grid_search = GridSearchCV(RandomForestClassifier(random_state=42), param_grid, cv=5)
grid_search.fit(X_train, y_train)

print("Best Parameters:", grid_search.best_params_)
best_rf_model = grid_search.best_estimator_

Best Parameters: {'max_depth': None, 'n_estimators': 200}


#### Model Evaluation
We will evaluate models using Accuracy, Precision, Recall, and F1-score.

In [26]:
from sklearn.metrics import confusion_matrix

# Confusion Matrix for Logistic Regression
print("Confusion Matrix - Logistic Regression")
print(confusion_matrix(y_test, y_pred_lr))

# Confusion Matrix for Random Forest
print("Confusion Matrix - Random Forest")
print(confusion_matrix(y_test, y_pred_rf))

# Confusion Matrix for Naive Bayes
print("Confusion Matrix - Naive Bayes")
print(confusion_matrix(y_test, y_pred_nb))

Confusion Matrix - Logistic Regression
[[4635   75]
 [  44 4226]]
Confusion Matrix - Random Forest
[[4700   10]
 [  10 4260]]
Confusion Matrix - Naive Bayes
[[4408  302]
 [ 280 3990]]


In [28]:
import pandas as pd

# Load datasets
df_true = pd.read_csv("True.csv")
df_fake = pd.read_csv("Fake.csv")

# Display first few rows to verify
print("True News Data:")
display(df_true.head())

print("Fake News Data:")
display(df_fake.head())


True News Data:


Unnamed: 0,title,text,subject,date
0,"As U.S. budget fight looms, Republicans flip t...",WASHINGTON (Reuters) - The head of a conservat...,politicsNews,"December 31, 2017"
1,U.S. military to accept transgender recruits o...,WASHINGTON (Reuters) - Transgender people will...,politicsNews,"December 29, 2017"
2,Senior U.S. Republican senator: 'Let Mr. Muell...,WASHINGTON (Reuters) - The special counsel inv...,politicsNews,"December 31, 2017"
3,FBI Russia probe helped by Australian diplomat...,WASHINGTON (Reuters) - Trump campaign adviser ...,politicsNews,"December 30, 2017"
4,Trump wants Postal Service to charge 'much mor...,SEATTLE/WASHINGTON (Reuters) - President Donal...,politicsNews,"December 29, 2017"


Fake News Data:


Unnamed: 0,title,text,subject,date
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,"December 31, 2017"
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,"December 31, 2017"
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,"December 30, 2017"
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",News,"December 29, 2017"
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,News,"December 25, 2017"


####  Load the Required Libraries

In [30]:
pip install scikit-learn

Note: you may need to restart the kernel to use updated packages.


In [31]:
import pickle

#### Load the Trained Model and Vectorizer
Load the TF-IDF vectorizer and Logistic Regression model (or whatever model you trained):

In [32]:
# Load saved model and vectorizer
tfidf = pickle.load(open("tfidf_vectorizer.pkl", "rb"))
lr_model = pickle.load(open("logistic_model.pkl", "rb"))
rf_model = pickle.load(open("random_forest_model.pkl", "rb"))
nb_model = pickle.load(open("naive_bayes_model.pkl", "rb"))

#### Define a Function to Predict Using All Models

In [33]:
def predict_news(text):
    # Convert the input text into TF-IDF features
    transformed_text = tfidf.transform([text])  
    
    # Predictions from different models
    lr_pred = lr_model.predict(transformed_text)[0]  # Logistic Regression
    rf_pred = rf_model.predict(transformed_text)[0]  # Random Forest
    nb_pred = nb_model.predict(transformed_text)[0]  # Naive Bayes

    return {
        "Logistic Regression": "Real News" if lr_pred == 1 else "Fake News",
        "Random Forest": "Real News" if rf_pred == 1 else "Fake News",
        "Naive Bayes": "Real News" if nb_pred == 1 else "Fake News"
    }


#### Test with a News Article

In [34]:
news_text = "The government has announced new policies for economic growth."
result = predict_news(news_text)

# Print results from all models
for model, prediction in result.items():
    print(f"{model}: {prediction}")


Logistic Regression: Fake News
Random Forest: Fake News
Naive Bayes: Real News
