In [1]:
import numpy as np
import re
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [2]:
news_df = pd.read_csv('train.csv', nrows=3000)

In [3]:
news_df.head()

Unnamed: 0.1,Unnamed: 0,title,text,label
0,0,LAW ENFORCEMENT ON HIGH ALERT Following Threat...,No comment is expected from Barack Obama Membe...,1
1,1,,Did they post their votes for Hillary already?,1
2,2,UNBELIEVABLE! OBAMA’S ATTORNEY GENERAL SAYS MO...,"Now, most of the demonstrators gathered last ...",1
3,3,"Bobby Jindal, raised Hindu, uses story of Chri...",A dozen politically active pastors came here f...,0
4,4,SATAN 2: Russia unvelis an image of its terrif...,"The RS-28 Sarmat missile, dubbed Satan 2, will...",1


# About the Dataset:

Unnamed:0 : unique id for a news article                                     
title: the title of a news article                                                                      
text: the text of the article; could be incomplete                                        
label: a label that marks whether the news article is real or fake:                              
    1: Fake news                                                 
    0: real News                                        

# 1 Preprocessing 

In [4]:
news_df.isnull().sum()

Unnamed: 0     0
title         23
text           1
label          0
dtype: int64

In [5]:
news_df.shape

(3000, 4)

In [6]:
news_df = news_df.fillna(' ')

In [7]:
news_df.isnull().sum()

Unnamed: 0    0
title         0
text          0
label         0
dtype: int64

In [8]:
news_df['content'] = news_df['title']

In [9]:
news_df

Unnamed: 0.1,Unnamed: 0,title,text,label,content
0,0,LAW ENFORCEMENT ON HIGH ALERT Following Threat...,No comment is expected from Barack Obama Membe...,1,LAW ENFORCEMENT ON HIGH ALERT Following Threat...
1,1,,Did they post their votes for Hillary already?,1,
2,2,UNBELIEVABLE! OBAMA’S ATTORNEY GENERAL SAYS MO...,"Now, most of the demonstrators gathered last ...",1,UNBELIEVABLE! OBAMA’S ATTORNEY GENERAL SAYS MO...
3,3,"Bobby Jindal, raised Hindu, uses story of Chri...",A dozen politically active pastors came here f...,0,"Bobby Jindal, raised Hindu, uses story of Chri..."
4,4,SATAN 2: Russia unvelis an image of its terrif...,"The RS-28 Sarmat missile, dubbed Satan 2, will...",1,SATAN 2: Russia unvelis an image of its terrif...
...,...,...,...,...,...
2995,2995,HOW HILLARY DESTROYED This Man’s Life To Hide ...,Does anyone even care that this American man w...,1,HOW HILLARY DESTROYED This Man’s Life To Hide ...
2996,2996,Republicans gear up for Supreme Court battle a...,WASHINGTON (Reuters) - Republican lawmakers an...,0,Republicans gear up for Supreme Court battle a...
2997,2997,Why’d You Do That? Printing Donald Trump’s Vul...,Coverage of the leaked 2005 audio tape of Don...,0,Why’d You Do That? Printing Donald Trump’s Vul...
2998,2998,Trump says didn't tell Flynn to discuss sancti...,WASHINGTON (Reuters) - President Donald Trump ...,0,Trump says didn't tell Flynn to discuss sancti...


# separating the data & label

In [10]:
X = news_df.drop('label',axis=1)
y = news_df['label']

In [11]:
print(X)

      Unnamed: 0                                              title  \
0              0  LAW ENFORCEMENT ON HIGH ALERT Following Threat...   
1              1                                                      
2              2  UNBELIEVABLE! OBAMA’S ATTORNEY GENERAL SAYS MO...   
3              3  Bobby Jindal, raised Hindu, uses story of Chri...   
4              4  SATAN 2: Russia unvelis an image of its terrif...   
...          ...                                                ...   
2995        2995  HOW HILLARY DESTROYED This Man’s Life To Hide ...   
2996        2996  Republicans gear up for Supreme Court battle a...   
2997        2997  Why’d You Do That? Printing Donald Trump’s Vul...   
2998        2998  Trump says didn't tell Flynn to discuss sancti...   
2999        2999  Bruising debate brings a reluctant Trump aroun...   

                                                   text  \
0     No comment is expected from Barack Obama Membe...   
1        Did they post their 

# Stemming:

Stemming is the process of reducing a word to its Root word

example: hung         hanged        hanging ======hang

# Steps:
lower case                 
splitting                             
removing stopwords                              
stemming                                   

In [12]:
ps = PorterStemmer()
def stemming(content):
    stemmed_content = re.sub('[^a-zA-Z]',' ',content)
    stemmed_content = stemmed_content.lower()
    stemmed_content = stemmed_content.split()
    stemmed_content = [ps.stem(word) for word in stemmed_content if not word in stopwords.words('english')]
    stemmed_content = ' '.join(stemmed_content)
    return stemmed_content

In [13]:
news_df['content'] = news_df['content'].apply(stemming)

In [14]:
news_df['content']

0       law enforc high alert follow threat cop white ...
1                                                        
2       unbeliev obama attorney gener say charlott rio...
3       bobbi jindal rais hindu use stori christian co...
4       satan russia unv imag terrifi new supernuk wes...
                              ...                        
2995      hillari destroy man life hide incompet benghazi
2996      republican gear suprem court battl scalia death
2997              print donald trump vulgar new york time
2998    trump say tell flynn discuss sanction russia w...
2999    bruis debat bring reluct trump around afghanis...
Name: content, Length: 3000, dtype: object

# separating the data and label


In [15]:
X = news_df['content'].values
y = news_df['label'].values

# converting the textual data to numerical data

In [16]:
vector = TfidfVectorizer()
vector.fit(X)
X = vector.transform(X)

In [17]:
print(X)

  (0, 120)	0.3455615443805985
  (0, 490)	0.33704443213108315
  (0, 1075)	0.26678178055236856
  (0, 1604)	0.3131813231600438
  (0, 1897)	0.30872892864241513
  (0, 1990)	0.38836586853071775
  (0, 2288)	0.2857229957314485
  (0, 2826)	0.2461416587102448
  (0, 4963)	0.2603281172508895
  (0, 4994)	0.2685448118439821
  (0, 5312)	0.14168086062411728
  (0, 5429)	0.2222785497392055
  (2, 297)	0.3011484453176523
  (2, 744)	0.3240946992467278
  (2, 819)	0.3423081656375447
  (2, 2031)	0.2723164103290891
  (2, 2323)	0.27001231886790056
  (2, 3419)	0.2356275529848124
  (2, 3447)	0.17441228850470553
  (2, 3653)	0.28603413424477786
  (2, 3892)	0.22709080665381978
  (2, 4210)	0.3423081656375447
  (2, 4329)	0.1631232606660867
  (2, 4737)	0.2029511635755935
  (2, 5178)	0.3423081656375447
  :	:
  (2996, 4332)	0.4570402570676526
  (2996, 4854)	0.3505990374124837
  (2997, 1453)	0.32781906124928545
  (2997, 3372)	0.2285267242910502
  (2997, 3843)	0.5862909298059142
  (2997, 5017)	0.24437959105761645
  (2997, 

# Splitting the dataset to training & test data

In [18]:
X_train, X_test, Y_train, Y_test = train_test_split(X, y, test_size = 0.2, stratify=y, random_state=2)

In [19]:
X_train.shape

(2400, 5563)

# Training the Model: Logistic Regression

In [20]:
model = LogisticRegression()
model.fit(X_train,Y_train)

In [21]:
# on training set
train_y_pred = model.predict(X_train)
print(accuracy_score(train_y_pred,Y_train))

0.955


In [22]:
# on testing set
testing_y_pred = model.predict(X_test)
print(accuracy_score(testing_y_pred,Y_test))

0.855


# Detection System

In [30]:
input_data = X_test[10]
prediction = model.predict(input_data)

In [31]:
if prediction[0] == 0:
    print('The News Is Real')
else:
    print('The News is Fake')

The News is Fake


In [33]:
news_df['content'][10]

'gop senat smack punchabl alt right nazi internet'