<a href="https://colab.research.google.com/github/nikhilsingh132/FAKE-NEWS-ML-PROJECT/blob/main/FAKE_NEWS_DETECTION_ML_PROJECT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **CONTENT OF DATASET**
1. ID: Unique id for a news article
2. TITLE: Title of a news article
3. AUTHOR: Author of the news article
4. TEXT: The text of the article
5. LABEL: A label that marks whether the news article is real or fake (0 = real, 1 = fake)

## **IMPORTING LIBRARIES**

In [1]:
import numpy as np
import pandas as pd
import re  
# re provides regular expressions, which let us match (and here, strip out) specific patterns of characters in text
from nltk.corpus import stopwords
# Stopwords are the most common words in a language that carry little meaning on their own, e.g. a, an, the, him, our.
from nltk.stem.porter import PorterStemmer
# Stemming is the process of reducing a word to its root form, e.g. (acting, acted, acts) --> act
from sklearn.feature_extraction.text import TfidfVectorizer
# TfidfVectorizer turns text into numeric features: each word is weighted by how often it occurs in a document (term frequency), scaled down when it occurs in many documents (inverse document frequency)
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [2]:
# Load the article table from the CSV file in the current working directory.
fakenews_data = pd.read_csv('dataset.csv')

In [3]:
# (rows, columns) of the frame as loaded, before any cleaning.
fakenews_data.shape

(20800, 5)

In [4]:
# Preview the first rows to sanity-check the columns and their contents.
fakenews_data.head()

Unnamed: 0,id,title,author,text,label
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1
3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1
4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,1


# **DATA PREPROCESSING**

In [5]:
# Count the missing cells in each column.
fakenews_data.isnull().sum()

id           0
title      558
author    1957
text        39
label        0
dtype: int64

In [6]:
# Keep only complete rows: a row is removed as soon as any one of its cells is missing.
fakenews_data = fakenews_data.dropna(axis=0, how='any')

In [7]:
# (rows, columns) after the incomplete rows were dropped.
fakenews_data.shape

(18285, 5)

In [8]:
# Build the model's text feature: "<author> <title>" for every article.
author_col = fakenews_data['author']
title_col = fakenews_data['title']
fakenews_data['content'] = author_col + ' ' + title_col

In [9]:
# Show the combined author + title strings.
print(fakenews_data['content'])

0        Darrell Lucus House Dem Aide: We Didn’t Even S...
1        Daniel J. Flynn FLYNN: Hillary Clinton, Big Wo...
2        Consortiumnews.com Why the Truth Might Get You...
3        Jessica Purkiss 15 Civilians Killed In Single ...
4        Howard Portnoy Iranian woman jailed for fictio...
                               ...                        
20795    Jerome Hudson Rapper T.I.: Trump a ’Poster Chi...
20796    Benjamin Hoffman N.F.L. Playoffs: Schedule, Ma...
20797    Michael J. de la Merced and Rachel Abrams Macy...
20798    Alex Ansary NATO, Russia To Hold Parallel Exer...
20799              David Swanson What Keeps the F-35 Alive
Name: content, Length: 18285, dtype: object


# **STEMMING**

In [10]:
port_stem = PorterStemmer()

# Build the stop-word lookup once. Calling stopwords.words('english') for every
# token re-creates the whole list each time and then scans it linearly; a
# frozenset gives the same membership answers with constant-time lookups.
_ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))

# Matches every character that is not an ASCII letter.
_NON_LETTERS = re.compile('[^a-zA-Z]')


def stemming(content):
    """Normalise a piece of text into a space-separated string of word stems.

    Steps, in order:
      1. replace every character that is not an ASCII letter with a space,
      2. lower-case the text,
      3. split it into words on whitespace,
      4. drop English stop words and stem the remaining words with ``port_stem``,
      5. join the stems back together with single spaces.

    Parameters
    ----------
    content : str
        Raw text to normalise.

    Returns
    -------
    str
        The stemmed, stop-word-free text; an empty string if no words remain.
    """
    words = _NON_LETTERS.sub(' ', content).lower().split()
    stems = [port_stem.stem(word) for word in words if word not in _ENGLISH_STOPWORDS]
    return ' '.join(stems)

In [11]:
# Overwrite each combined string with the result of stemming() applied to it.
fakenews_data['content'] = fakenews_data['content'].apply(stemming)

In [12]:
# Show the 'content' column after stemming has been applied.
print(fakenews_data['content'])

0        darrel lucu hous dem aid even see comey letter...
1        daniel j flynn flynn hillari clinton big woman...
2                   consortiumnew com truth might get fire
3        jessica purkiss civilian kill singl us airstri...
4        howard portnoy iranian woman jail fiction unpu...
                               ...                        
20795    jerom hudson rapper trump poster child white s...
20796    benjamin hoffman n f l playoff schedul matchup...
20797    michael j de la merc rachel abram maci said re...
20798    alex ansari nato russia hold parallel exercis ...
20799                            david swanson keep f aliv
Name: content, Length: 18285, dtype: object


In [13]:
# Separate the features (stemmed text) from the target (label) as NumPy arrays.
X = fakenews_data['content'].to_numpy()
Y = fakenews_data['label'].to_numpy()

In [14]:
# Show the feature strings and then the labels, one array per line.
print(X, Y, sep='\n')

['darrel lucu hous dem aid even see comey letter jason chaffetz tweet'
 'daniel j flynn flynn hillari clinton big woman campu breitbart'
 'consortiumnew com truth might get fire' ...
 'michael j de la merc rachel abram maci said receiv takeov approach hudson bay new york time'
 'alex ansari nato russia hold parallel exercis balkan'
 'david swanson keep f aliv']
[1 0 1 ... 0 1 1]


In [15]:
# Convert the text into a sparse TF-IDF matrix. fit_transform learns the
# vocabulary / IDF weights and produces the matrix in one pass instead of
# tokenising the whole corpus twice (once in fit, once in transform).
# NOTE(review): the vectorizer is fitted on every row before the train/test
# split below, so the IDF weights are learned from rows that later land in the
# test set. Consider splitting the raw text first and fitting on the training
# part only.
vector = TfidfVectorizer()
X = vector.fit_transform(X)

In [16]:
# Print the sparse matrix: each entry is listed as (row, feature index) followed by its weight.
print(X)

  (0, 14626)	0.2853880981846006
  (0, 12567)	0.25566372256502734
  (0, 8310)	0.3609049070394367
  (0, 8048)	0.29347549279156676
  (0, 7190)	0.24556189342497173
  (0, 6552)	0.21745594418933306
  (0, 4637)	0.23016077319140021
  (0, 3543)	0.2684494960336511
  (0, 3359)	0.3609049070394367
  (0, 2757)	0.2466340295002162
  (0, 2312)	0.3745612250433202
  (0, 247)	0.26982554594264346
  (1, 15663)	0.3053027963338981
  (1, 6377)	0.19285723710368197
  (1, 5140)	0.7119376870709988
  (1, 3328)	0.2623789770430963
  (1, 2619)	0.19368327535633711
  (1, 2066)	0.38191890436039194
  (1, 1764)	0.1509985164277699
  (1, 1391)	0.29617980713962144
  (2, 14560)	0.4180284001448272
  (2, 8973)	0.4948460479407663
  (2, 5579)	0.3490632212946542
  (2, 5031)	0.38709995799949964
  (2, 2895)	0.4581003415623782
  :	:
  (18282, 12239)	0.252743907968046
  (18282, 11515)	0.2748252773264482
  (18282, 11321)	0.24588400571511215
  (18282, 9605)	0.07665665104558947
  (18282, 8942)	0.1712955017712004
  (18282, 8879)	0.29296479

# **SPLITTING DATA**

In [17]:
# Hold out 30% of the rows for testing. Stratifying on Y keeps the share of
# real and fake articles nearly the same in both parts, and the fixed
# random_state makes the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.3,
    stratify=Y,
    random_state=10,
)

In [18]:
# Train a logistic-regression classifier (default settings) on the training split.
model = LogisticRegression()
model.fit(X_train, Y_train)

LogisticRegression()

**USING LOGISTIC REGRESSION ON TRAINING DATA**

In [19]:
# Accuracy on the rows the model was trained on (an optimistic upper bound).
X_train_prediction = model.predict(X_train)
# accuracy_score expects (y_true, y_pred). Accuracy itself is symmetric, so the
# number is unchanged, but the documented order keeps this cell correct if the
# metric is later swapped for an asymmetric one such as precision or recall.
training_data_accuracy = accuracy_score(Y_train, X_train_prediction)
print('Accuracy score of the training data : ', training_data_accuracy)

Accuracy score of the training data :  0.9891397765450426


**USING LOGISTIC REGRESSION ON TESTING DATA**

In [20]:
# Accuracy on the held-out rows the model did not see during training.
X_test_prediction = model.predict(X_test)
# accuracy_score expects (y_true, y_pred). Accuracy itself is symmetric, so the
# number is unchanged, but the documented order keeps this cell correct if the
# metric is later swapped for an asymmetric one such as precision or recall.
test_data_accuracy = accuracy_score(Y_test, X_test_prediction)
print('Accuracy score of the test data : ', test_data_accuracy)

Accuracy score of the test data :  0.983230040102078


# **PREDICTING THE DATA FROM OUR TEST DATA**

In [21]:
# Classify the first row of the test split and report the result in words.
X_new = X_test[0]
prediction = model.predict(X_new)
print(prediction)

# Label 0 means real; any other predicted label is reported as fake.
verdict = 'The news is Real' if prediction[0] == 0 else 'The news is Fake'
print(verdict)

[1]
The news is Fake


In [22]:
# True label of the first test row, for comparison with the model's prediction for that row.
print(Y_test[0])

1
