In [6]:
import numpy as np
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
     
    

In [7]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\parul\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [8]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\parul\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [9]:
# printing the stopwords in English
print(stopwords.words('english'))

['a', 'about', 'above', 'after', 'again', 'against', 'ain', 'all', 'am', 'an', 'and', 'any', 'are', 'aren', "aren't", 'as', 'at', 'be', 'because', 'been', 'before', 'being', 'below', 'between', 'both', 'but', 'by', 'can', 'couldn', "couldn't", 'd', 'did', 'didn', "didn't", 'do', 'does', 'doesn', "doesn't", 'doing', 'don', "don't", 'down', 'during', 'each', 'few', 'for', 'from', 'further', 'had', 'hadn', "hadn't", 'has', 'hasn', "hasn't", 'have', 'haven', "haven't", 'having', 'he', "he'd", "he'll", 'her', 'here', 'hers', 'herself', "he's", 'him', 'himself', 'his', 'how', 'i', "i'd", 'if', "i'll", "i'm", 'in', 'into', 'is', 'isn', "isn't", 'it', "it'd", "it'll", "it's", 'its', 'itself', "i've", 'just', 'll', 'm', 'ma', 'me', 'mightn', "mightn't", 'more', 'most', 'mustn', "mustn't", 'my', 'myself', 'needn', "needn't", 'no', 'nor', 'not', 'now', 'o', 'of', 'off', 'on', 'once', 'only', 'or', 'other', 'our', 'ours', 'ourselves', 'out', 'over', 'own', 're', 's', 'same', 'shan', "shan't", 'she

In [10]:
# loading the dataset to a pandas DataFrame
news_dataset = pd.read_csv(r"Fake_news_dataset.csv")
     

In [11]:
news_dataset.shape

(4000, 24)

In [12]:
# print the first 5 rows of the dataframe
news_dataset.head()
     

Unnamed: 0,id,title,author,text,state,date_published,source,category,sentiment_score,word_count,...,num_shares,num_comments,political_bias,fact_check_rating,is_satirical,trust_score,source_reputation,clickbait_score,plagiarism_score,label
0,1,Breaking News 1,Jane Smith,This is the content of article 1. It contains ...,Tennessee,30-11-2021,The Onion,Entertainment,-0.22,1302,...,47305,450,Center,FALSE,1,76,6,0.84,53.35,Fake
1,2,Breaking News 2,Emily Davis,This is the content of article 2. It contains ...,Wisconsin,02-09-2021,The Guardian,Technology,0.92,322,...,39804,530,Left,Mixed,1,1,5,0.85,28.28,Fake
2,3,Breaking News 3,John Doe,This is the content of article 3. It contains ...,Missouri,13-04-2021,New York Times,Sports,0.25,228,...,45860,763,Center,Mixed,0,57,1,0.72,0.38,Fake
3,4,Breaking News 4,Alex Johnson,This is the content of article 4. It contains ...,North Carolina,08-03-2020,CNN,Sports,0.94,155,...,34222,945,Center,TRUE,1,18,10,0.92,32.2,Fake
4,5,Breaking News 5,Emily Davis,This is the content of article 5. It contains ...,California,23-03-2022,Daily Mail,Technology,-0.01,962,...,35934,433,Right,Mixed,0,95,6,0.66,77.7,Real


In [13]:
# counting the number of missing values in the dataset
news_dataset.isnull().sum()

id                   0
title                0
author               0
text                 0
state                0
date_published       0
source               0
category             0
sentiment_score      0
word_count           0
char_count           0
has_images           0
has_videos           0
readability_score    0
num_shares           0
num_comments         0
political_bias       0
fact_check_rating    0
is_satirical         0
trust_score          0
source_reputation    0
clickbait_score      0
plagiarism_score     0
label                0
dtype: int64

In [14]:
# replacing the null values with empty string
news_dataset = news_dataset.fillna('')
     

In [15]:
# merging the author name and news title
news_dataset['content'] = news_dataset['author']+' '+news_dataset['title']

In [16]:
print(news_dataset['content'])

0            Jane Smith Breaking News 1
1           Emily Davis Breaking News 2
2              John Doe Breaking News 3
3          Alex Johnson Breaking News 4
4           Emily Davis Breaking News 5
                     ...               
3995        John Doe Breaking News 3996
3996    Alex Johnson Breaking News 3997
3997    Alex Johnson Breaking News 3998
3998        John Doe Breaking News 3999
3999        John Doe Breaking News 4000
Name: content, Length: 4000, dtype: object


In [17]:
# separating the data & label
X = news_dataset.drop(columns='label', axis=1)
Y = news_dataset['label']

In [18]:
print(X)
print(Y)
     

        id               title        author  \
0        1     Breaking News 1    Jane Smith   
1        2     Breaking News 2   Emily Davis   
2        3     Breaking News 3      John Doe   
3        4     Breaking News 4  Alex Johnson   
4        5     Breaking News 5   Emily Davis   
...    ...                 ...           ...   
3995  3996  Breaking News 3996      John Doe   
3996  3997  Breaking News 3997  Alex Johnson   
3997  3998  Breaking News 3998  Alex Johnson   
3998  3999  Breaking News 3999      John Doe   
3999  4000  Breaking News 4000      John Doe   

                                                   text           state  \
0     This is the content of article 1. It contains ...       Tennessee   
1     This is the content of article 2. It contains ...       Wisconsin   
2     This is the content of article 3. It contains ...        Missouri   
3     This is the content of article 4. It contains ...  North Carolina   
4     This is the content of article 5. It conta

In [19]:
port_stem = PorterStemmer()

In [20]:
def stemming(content):
    stemmed_content = re.sub('[^a-zA-Z]',' ',content)
    stemmed_content = stemmed_content.lower()
    stemmed_content = stemmed_content.split()
    stemmed_content = [port_stem.stem(word) for word in stemmed_content if not word in stopwords.words('english')]
    stemmed_content = ' '.join(stemmed_content)
    return stemmed_content
     

In [21]:
news_dataset['content'] = news_dataset['content'].apply(stemming)

In [22]:
print(news_dataset['content'])

0         jane smith break news
1         emili davi break news
2           john doe break news
3       alex johnson break news
4         emili davi break news
                 ...           
3995        john doe break news
3996    alex johnson break news
3997    alex johnson break news
3998        john doe break news
3999        john doe break news
Name: content, Length: 4000, dtype: object


In [23]:
#separating the data and label
X = news_dataset['content'].values
Y = news_dataset['label'].values
     

In [25]:
print(X)

['jane smith break news' 'emili davi break news' 'john doe break news' ...
 'alex johnson break news' 'john doe break news' 'john doe break news']


In [26]:
print(Y)
     

['Fake' 'Fake' 'Fake' ... 'Fake' 'Real' 'Real']


In [27]:
Y.shape

(4000,)

In [28]:
# converting the textual data to numerical data
vectorizer = TfidfVectorizer()
vectorizer.fit(X)

X = vectorizer.transform(X)
     

In [29]:
print(X)
     

  (0, 1)	0.2537535642970132
  (0, 6)	0.6600069155748003
  (0, 10)	0.2537535642970132
  (0, 11)	0.6600069155748003
  (1, 1)	0.2513177760238013
  (1, 3)	0.6609382538894616
  (1, 5)	0.6609382538894616
  (1, 10)	0.2513177760238013
  (2, 1)	0.2513177760238013
  (2, 4)	0.6609382538894616
  (2, 7)	0.6609382538894616
  (2, 10)	0.2513177760238013
  (3, 0)	0.6598854317699118
  (3, 1)	0.2540693152229076
  (3, 8)	0.6598854317699118
  (3, 10)	0.2540693152229076
  (4, 1)	0.2513177760238013
  (4, 3)	0.6609382538894616
  (4, 5)	0.6609382538894616
  (4, 10)	0.2513177760238013
  (5, 1)	0.2513177760238013
  (5, 4)	0.6609382538894616
  (5, 7)	0.6609382538894616
  (5, 10)	0.2513177760238013
  (6, 1)	0.2513177760238013
  :	:
  (3993, 10)	0.2540693152229076
  (3994, 1)	0.2537535642970132
  (3994, 6)	0.6600069155748003
  (3994, 10)	0.2537535642970132
  (3994, 11)	0.6600069155748003
  (3995, 1)	0.2513177760238013
  (3995, 4)	0.6609382538894616
  (3995, 7)	0.6609382538894616
  (3995, 10)	0.2513177760238013
  (3

In [30]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2, stratify=Y, random_state=2)

In [31]:
model = LogisticRegression()
  

In [32]:
model.fit(X_train, Y_train)

In [33]:
# accuracy score on the training data
X_train_prediction = model.predict(X_train)
training_data_accuracy = accuracy_score(X_train_prediction, Y_train)

In [34]:
print('Accuracy score of the training data : ', training_data_accuracy)

Accuracy score of the training data :  0.511875


In [35]:
# accuracy score on the test data
X_test_prediction = model.predict(X_test)
test_data_accuracy = accuracy_score(X_test_prediction, Y_test)

In [36]:
print('Accuracy score of the test data : ', test_data_accuracy)

Accuracy score of the test data :  0.535


In [37]:
X_new = X_test[5]

prediction = model.predict(X_new)
print(prediction)

if (prediction[0]==0):
  print('The news is Real')
else:
  print('The news is Fake')

['Real']
The news is Fake


In [38]:
X_new = X_test[3]

prediction = model.predict(X_new)
print(prediction)

if (prediction[0]==0):
  print('The news is Real')
else:
  print('The news is Fake')

['Fake']
The news is Fake
