#**Importing the Dependencies**

In [None]:
import numpy as np
import pandas as pd
import re
from nltk.corpus import stopwords #stopwords noice that dont add any value
from nltk.stem.porter import PorterStemmer #removes prefix and suffix and gives root words
from sklearn.feature_extraction.text import TfidfVectorizer #convert text into feature vectors
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [None]:
# Fetch the NLTK stop-word corpus that stopwords.words('english') reads from.
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [None]:
# Print the English stop-word list (common words that are filtered out during stemming).
print(stopwords.words('english'))

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

#**Data Pre-processing**

In [None]:
# Load the training CSV into a pandas DataFrame.
# NOTE(review): the absolute '/content/...' path looks environment-specific (e.g. Colab) — confirm or make it configurable.
news_dataset = pd.read_csv('/content/train.csv')

In [None]:
# (number of rows, number of columns) of the loaded frame
news_dataset.shape

(20800, 5)

In [None]:
# Preview the first five rows to see the available columns.
print(news_dataset.head())

   id                                              title              author  \
0   0  House Dem Aide: We Didn’t Even See Comey’s Let...       Darrell Lucus   
1   1  FLYNN: Hillary Clinton, Big Woman on Campus - ...     Daniel J. Flynn   
2   2                  Why the Truth Might Get You Fired  Consortiumnews.com   
3   3  15 Civilians Killed In Single US Airstrike Hav...     Jessica Purkiss   
4   4  Iranian woman jailed for fictional unpublished...      Howard Portnoy   

                                                text  label  
0  House Dem Aide: We Didn’t Even See Comey’s Let...      1  
1  Ever get the feeling your life circles the rou...      0  
2  Why the Truth Might Get You Fired October 29, ...      1  
3  Videos 15 Civilians Killed In Single US Airstr...      1  
4  Print \nAn Iranian woman has been sentenced to...      1  


In [None]:
# Count the missing values in each column.
news_dataset.isnull().sum()

id           0
title      558
author    1957
text        39
label        0
dtype: int64

In [None]:
# Replace every missing value (in all columns) with an empty string instead of
# dropping rows, so string concatenation on the text columns never sees NaN.
news_dataset = news_dataset.fillna('')

In [None]:
# Build the model input text: author name and news title joined by a single space.
news_dataset['content'] = news_dataset['author']+' '+news_dataset['title']

In [None]:
# Inspect the merged author + title column.
print(news_dataset.content)

0        Darrell Lucus House Dem Aide: We Didn’t Even S...
1        Daniel J. Flynn FLYNN: Hillary Clinton, Big Wo...
2        Consortiumnews.com Why the Truth Might Get You...
3        Jessica Purkiss 15 Civilians Killed In Single ...
4        Howard Portnoy Iranian woman jailed for fictio...
                               ...                        
20795    Jerome Hudson Rapper T.I.: Trump a ’Poster Chi...
20796    Benjamin Hoffman N.F.L. Playoffs: Schedule, Ma...
20797    Michael J. de la Merced and Rachel Abrams Macy...
20798    Alex Ansary NATO, Russia To Hold Parallel Exer...
20799              David Swanson What Keeps the F-35 Alive
Name: content, Length: 20800, dtype: object


In [None]:
# Separate the label column (Y) from the remaining feature columns (X).
Y = news_dataset['label']
X = news_dataset.drop(columns=['label'])

In [None]:
# Show the feature frame followed by the label series.
print(X)
print(Y)

          id                                              title  \
0          0  House Dem Aide: We Didn’t Even See Comey’s Let...   
1          1  FLYNN: Hillary Clinton, Big Woman on Campus - ...   
2          2                  Why the Truth Might Get You Fired   
3          3  15 Civilians Killed In Single US Airstrike Hav...   
4          4  Iranian woman jailed for fictional unpublished...   
...      ...                                                ...   
20795  20795  Rapper T.I.: Trump a ’Poster Child For White S...   
20796  20796  N.F.L. Playoffs: Schedule, Matchups and Odds -...   
20797  20797  Macy’s Is Said to Receive Takeover Approach by...   
20798  20798  NATO, Russia To Hold Parallel Exercises In Bal...   
20799  20799                          What Keeps the F-35 Alive   

                                          author  \
0                                  Darrell Lucus   
1                                Daniel J. Flynn   
2                             Consortiu

Stemming:

Stemming is the process of reducing a word to its root form by stripping suffixes.

Example:
acting, acted, acts --> act

In [None]:
# Single shared Porter stemmer instance, used by stemming() for every word.
port_stem = PorterStemmer()

In [None]:
_ENGLISH_STOPWORDS = None  # filled on first call; rebuilding the stop-word list for every word is very slow


def stemming(content):
  """Normalise a piece of text into space-separated, stemmed tokens.

  Every character that is not an ASCII letter is replaced by a space, the text
  is lower-cased and split on whitespace, English stop words are dropped, the
  remaining words are Porter-stemmed, and the stems are joined back together
  with single spaces.

  Args:
    content: the text to normalise (str).

  Returns:
    A str of stemmed tokens separated by single spaces ('' if nothing survives).
  """
  global _ENGLISH_STOPWORDS
  if _ENGLISH_STOPWORDS is None:
    # A frozenset gives constant-time membership tests and is built only once.
    _ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))

  words = re.sub('[^a-zA-Z]', ' ', content).lower().split()
  stems = [port_stem.stem(word) for word in words if word not in _ENGLISH_STOPWORDS]
  # Join with a space, not '': otherwise all stems of a row fuse into one long
  # token and the vectorizer sees a single, practically unique term per row.
  return ' '.join(stems)

In [None]:
# Run stemming() on every row of the content column, replacing the raw text.
news_dataset['content'] = news_dataset['content'].apply(stemming)

In [None]:
# Print the processed content column.
print(news_dataset['content']) # after stemming(): letters only, all lower-case

0        darrellucuhousdemaidevenseecomeyletterjasoncha...
1        danieljflynnflynnhillariclintonbigwomancampubr...
2                        consortiumnewcomtruthmightgetfire
3        jessicapurkissciviliankillsinglusairstrikidentifi
4        howardportnoyiranianwomanjailfictionunpublishs...
                               ...                        
20795      jeromhudsonrappertrumpposterchildwhitesupremaci
20796    benjaminhoffmannflplayoffschedulmatchupoddnewy...
20797    michaeljdelamercrachelabrammacisaidreceivtakeo...
20798        alexansarinatorussiaholdparallelexercisbalkan
20799                                davidswansonkeepfaliv
Name: content, Length: 20800, dtype: object


In [None]:
# Separate the data and the label: X is the processed content text, Y the labels
# (both taken as arrays via .values). This replaces the earlier X and Y.
X = news_dataset['content'].values
Y = news_dataset['label'].values

In [None]:
# Inspect the processed text that will be vectorised.
print(X)

['darrellucuhousdemaidevenseecomeyletterjasonchaffetztweet'
 'danieljflynnflynnhillariclintonbigwomancampubreitbart'
 'consortiumnewcomtruthmightgetfire' ...
 'michaeljdelamercrachelabrammacisaidreceivtakeovapproachhudsonbaynewyorktime'
 'alexansarinatorussiaholdparallelexercisbalkan' 'davidswansonkeepfaliv']


In [None]:
# Inspect the label array.
print(Y)

[1 0 1 ... 0 1 1]


In [None]:
# Convert the textual data to numerical TF-IDF features
# (term frequency x inverse document frequency). Y is already numeric.
# The text is read from the dataframe column rather than from X so that this
# cell can be re-run: X is overwritten below with the sparse feature matrix.
# NOTE(review): the vocabulary and IDF weights are learned from all rows,
# including those that later land in the test split — consider fitting on the
# training split only.
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(news_dataset['content'].values)

In [None]:
# Inspect the vectorised features; entries print as (row, feature index) followed by the weight.
print(X)

  (0, 5358)	1.0
  (1, 5051)	1.0
  (2, 4667)	1.0
  (3, 10068)	1.0
  (4, 8336)	1.0
  (5, 5141)	1.0
  (6, 12328)	1.0
  (7, 1003)	1.0
  (8, 7111)	1.0
  (9, 13247)	1.0
  (10, 85)	1.0
  (11, 4358)	1.0
  (12, 1157)	1.0
  (13, 9431)	1.0
  (14, 1691)	1.0
  (15, 9059)	1.0
  (16, 13377)	1.0
  (17, 17920)	1.0
  (18, 5952)	1.0
  (19, 9596)	1.0
  (20, 14535)	1.0
  (21, 9979)	1.0
  (22, 15437)	1.0
  (23, 13050)	1.0
  (24, 6030)	1.0
  :	:
  (20775, 1820)	1.0
  (20776, 9289)	1.0
  (20777, 412)	1.0
  (20778, 14343)	1.0
  (20779, 18568)	1.0
  (20780, 4234)	1.0
  (20781, 8106)	1.0
  (20782, 5955)	1.0
  (20783, 4049)	1.0
  (20784, 5703)	1.0
  (20785, 1607)	1.0
  (20786, 7814)	1.0
  (20787, 8589)	1.0
  (20788, 10187)	1.0
  (20789, 19697)	1.0
  (20790, 291)	1.0
  (20791, 5016)	1.0
  (20792, 10566)	1.0
  (20793, 16782)	1.0
  (20794, 12191)	1.0
  (20795, 9993)	1.0
  (20796, 2412)	1.0
  (20797, 13456)	1.0
  (20798, 885)	1.0
  (20799, 5619)	1.0


#**splitting the data**

In [None]:
# Hold out 20% of the rows for testing. Stratifying on Y keeps the proportion
# of 0 and 1 labels the same in both splits; the fixed seed makes the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    stratify=Y,
    random_state=2,
)

#**Training the model**

In [None]:
# Logistic regression classifier with the library's default settings.
model = LogisticRegression()

In [None]:
# Train the classifier on the training split only.
model.fit(X_train, Y_train)

#Model evaluation

In [None]:
# Accuracy score on the training data
X_train_pred = model.predict(X_train)
# accuracy_score expects (y_true, y_pred): ground-truth labels first, predictions second.
training_data_accuracy = accuracy_score(Y_train, X_train_pred)
print(training_data_accuracy*100)

99.27283653846153


In [None]:
# Accuracy score on the held-out test data
X_test_pred = model.predict(X_test)
# accuracy_score expects (y_true, y_pred): ground-truth labels first, predictions second.
test_data_accuracy = accuracy_score(Y_test, X_test_pred)
print(test_data_accuracy*100)

53.31730769230769


#Predictive system

In [None]:
# Classify a single held-out sample: row 0 of the test matrix.
X_new = X_test[0]
prediction = model.predict(X_new)
print(prediction)

# A predicted label of 0 is reported as real news, any other label as fake.
print('The news is real' if prediction[0] == 0 else 'The news is fake')

[0]
The news is real


In [None]:
# True label of the same test row, for comparison with the prediction above.
print(Y_test[0])

1
