In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
import re
import string

In [3]:
fake_df = pd.read_csv('Fake.csv')
real_df = pd.read_csv('True.csv')

In [4]:
fake_df['class'] = 0 
real_df['class'] = 1

In [5]:
fake_df

Unnamed: 0,title,text,subject,date,class
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,"December 31, 2017",0
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,"December 31, 2017",0
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,"December 30, 2017",0
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",News,"December 29, 2017",0
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,News,"December 25, 2017",0
...,...,...,...,...,...
23476,McPain: John McCain Furious That Iran Treated ...,21st Century Wire says As 21WIRE reported earl...,Middle-east,"January 16, 2016",0
23477,JUSTICE? Yahoo Settles E-mail Privacy Class-ac...,21st Century Wire says It s a familiar theme. ...,Middle-east,"January 16, 2016",0
23478,Sunnistan: US and Allied ‘Safe Zone’ Plan to T...,Patrick Henningsen 21st Century WireRemember ...,Middle-east,"January 15, 2016",0
23479,How to Blow $700 Million: Al Jazeera America F...,21st Century Wire says Al Jazeera America will...,Middle-east,"January 14, 2016",0


In [6]:
df = pd.concat([fake_df, real_df], ignore_index=True, sort=False)

In [7]:
df.head(5)

Unnamed: 0,title,text,subject,date,class
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,"December 31, 2017",0
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,"December 31, 2017",0
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,"December 30, 2017",0
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",News,"December 29, 2017",0
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,News,"December 25, 2017",0


In [8]:
df = df.drop(["title", "subject", "date"], axis=1)
df.head(5)

Unnamed: 0,text,class
0,Donald Trump just couldn t wish all Americans ...,0
1,House Intelligence Committee Chairman Devin Nu...,0
2,"On Friday, it was revealed that former Milwauk...",0
3,"On Christmas day, Donald Trump announced that ...",0
4,Pope Francis used his annual Christmas Day mes...,0


#### Randomly shuffling the dataframe 

In [9]:
df = df.sample(frac = 1)
df.head(5)

Unnamed: 0,text,class
33303,"RICHMOND, Va. (Reuters) - Virginia Governor Te...",1
16271,In what amounts to an 11th hour gift by the ...,0
13875,The Leftist agenda in action blurring the line...,0
10555,A nervous Nancy Pelosi responded Thursday to c...,0
32829,"WASHINGTON (Reuters) - A judge, called a “hate...",1


In [10]:
# Reset index
df.reset_index(inplace = True)
df.drop(["index"], axis = 1, inplace = True)

In [11]:
# Cleaning
def wordopt(text):
    text = text.lower()
    text = re.sub('\[.*?\]', '', text)
    text = re.sub("\\W"," ",text) 
    text = re.sub('https?://\S+|www\.\S+', '', text)
    text = re.sub('<.*?>+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub('\n', '', text)
    text = re.sub('\w*\d\w*', '', text)    
    return text

df["text"] = df["text"].apply(wordopt)

In [12]:
df.head(5)

Unnamed: 0,text,class
0,richmond va reuters virginia governor te...,1
1,in what amounts to an hour gift by the outg...,0
2,the leftist agenda in action blurring the line...,0
3,a nervous nancy pelosi responded thursday to c...,0
4,washington reuters a judge called a hate...,1


#### Defining dependent and independent variable as x and y

In [13]:
x = df["text"]
y = df["class"]

In [14]:
x.head(5)

0    richmond  va   reuters    virginia governor te...
1    in what amounts to an  hour  gift  by the outg...
2    the leftist agenda in action blurring the line...
3    a nervous nancy pelosi responded thursday to c...
4    washington  reuters    a judge  called a  hate...
Name: text, dtype: object

In [15]:
y.head(5)

0    1
1    0
2    0
3    0
4    1
Name: class, dtype: int64

#### Splitting the dataset into training set and testing set. 

In [16]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25)

#### Convert text to vectors

In [17]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [18]:
vectorization = TfidfVectorizer()
x_train = vectorization.fit_transform(x_train)
x_test = vectorization.transform(x_test)

# Logistic Regression

In [19]:
from sklearn.linear_model import LogisticRegression

In [20]:
LR_model = LogisticRegression()
LR_model.fit(x_train,y_train)

LogisticRegression()

In [21]:
LR_model.score(x_test, y_test)

0.9883296213808463

# Model Testing

In [29]:
def output_lable(n):
    if n == 0:
        return "Fake News"
    elif n == 1:
        return "Real News"

def manual_testing(news):
    testing_news = {"text":[news]}
    new_def_test = pd.DataFrame(testing_news)
    new_def_test["text"] = new_def_test["text"].apply(wordopt) 
    new_x_test = new_def_test["text"]
    new_x_test = vectorization.transform(new_x_test)

    pred_LR = LR_model.predict(new_x_test)
    
    return LR_model.predict_proba(new_x_test),output_lable(pred_LR[0])

In [30]:
news = str("The head of a conservative Republican faction in the U.S. Congress, who voted this month for")
manual_testing(news)

(array([[0.50858791, 0.49141209]]), 'Fake News')

In [31]:
news = str("Donald Trump Sends Out Embarrassing New Year’s Eve Message; This is Disturbing")
manual_testing(news)

(array([[0.96143794, 0.03856206]]), 'Fake News')

In [34]:
X_new = x_test[3]

prediction = LR_model.predict(X_new)
print(prediction)

if (prediction[0]==0):
  print('The news is Real')
else:
  print('The news is Fake')

[0]
The news is Real


# LSTM

In [35]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer

In [43]:
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(max_vocab, 32),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64,  return_sequences=True)),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(16)),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dropout(0.5),
    tf.keras.layers.Dense(1)
])

model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, None, 32)          320000    
                                                                 
 bidirectional_2 (Bidirectio  (None, None, 128)        49664     
 nal)                                                            
                                                                 
 bidirectional_3 (Bidirectio  (None, 32)               18560     
 nal)                                                            
                                                                 
 dense_2 (Dense)             (None, 64)                2112      
                                                                 
 dropout_1 (Dropout)         (None, 64)                0         
                                                                 
 dense_3 (Dense)             (None, 1)                

In [38]:
# We are going to use early stop, which stops when the validation loss no longer improve.
early_stop = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=2, restore_best_weights=True)
model.compile(loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
              optimizer=tf.keras.optimizers.Adam(1e-4),
              metrics=['accuracy'])


In [41]:

history = model.fit(x_train, y_train, epochs=2,validation_split=0.1, batch_size=30, shuffle=True, callbacks=[early_stop])



InvalidArgumentError: Graph execution error:

2 root error(s) found.
  (0) INVALID_ARGUMENT:  TypeError: 'SparseTensor' object is not subscriptable
Traceback (most recent call last):

  File "c:\Users\ouyan\anaconda3\lib\site-packages\tensorflow\python\ops\script_ops.py", line 268, in __call__
    return func(device, token, args)

  File "c:\Users\ouyan\anaconda3\lib\site-packages\tensorflow\python\ops\script_ops.py", line 146, in __call__
    outputs = self._call(device, args)

  File "c:\Users\ouyan\anaconda3\lib\site-packages\tensorflow\python\ops\script_ops.py", line 153, in _call
    ret = self._func(*args)

  File "c:\Users\ouyan\anaconda3\lib\site-packages\tensorflow\python\autograph\impl\api.py", line 642, in wrapper
    return func(*args, **kwargs)

  File "c:\Users\ouyan\anaconda3\lib\site-packages\keras\engine\data_adapter.py", line 477, in py_method
    return [slice_array(inp) for inp in flat_inputs]

  File "c:\Users\ouyan\anaconda3\lib\site-packages\keras\engine\data_adapter.py", line 477, in <listcomp>
    return [slice_array(inp) for inp in flat_inputs]

  File "c:\Users\ouyan\anaconda3\lib\site-packages\keras\engine\data_adapter.py", line 475, in slice_array
    return training_utils.slice_arrays(data, ind.numpy(),

  File "c:\Users\ouyan\anaconda3\lib\site-packages\keras\engine\training_utils.py", line 47, in slice_arrays
    entries = [[x[i:i + 1] for i in indices] for x in arrays]

  File "c:\Users\ouyan\anaconda3\lib\site-packages\keras\engine\training_utils.py", line 47, in <listcomp>
    entries = [[x[i:i + 1] for i in indices] for x in arrays]

  File "c:\Users\ouyan\anaconda3\lib\site-packages\keras\engine\training_utils.py", line 47, in <listcomp>
    entries = [[x[i:i + 1] for i in indices] for x in arrays]

TypeError: 'SparseTensor' object is not subscriptable


	 [[{{node EagerPyFunc}}]]
	 [[IteratorGetNext]]
	 [[IteratorGetNext/_2]]
  (1) INVALID_ARGUMENT:  TypeError: 'SparseTensor' object is not subscriptable
Traceback (most recent call last):

  File "c:\Users\ouyan\anaconda3\lib\site-packages\tensorflow\python\ops\script_ops.py", line 268, in __call__
    return func(device, token, args)

  File "c:\Users\ouyan\anaconda3\lib\site-packages\tensorflow\python\ops\script_ops.py", line 146, in __call__
    outputs = self._call(device, args)

  File "c:\Users\ouyan\anaconda3\lib\site-packages\tensorflow\python\ops\script_ops.py", line 153, in _call
    ret = self._func(*args)

  File "c:\Users\ouyan\anaconda3\lib\site-packages\tensorflow\python\autograph\impl\api.py", line 642, in wrapper
    return func(*args, **kwargs)

  File "c:\Users\ouyan\anaconda3\lib\site-packages\keras\engine\data_adapter.py", line 477, in py_method
    return [slice_array(inp) for inp in flat_inputs]

  File "c:\Users\ouyan\anaconda3\lib\site-packages\keras\engine\data_adapter.py", line 477, in <listcomp>
    return [slice_array(inp) for inp in flat_inputs]

  File "c:\Users\ouyan\anaconda3\lib\site-packages\keras\engine\data_adapter.py", line 475, in slice_array
    return training_utils.slice_arrays(data, ind.numpy(),

  File "c:\Users\ouyan\anaconda3\lib\site-packages\keras\engine\training_utils.py", line 47, in slice_arrays
    entries = [[x[i:i + 1] for i in indices] for x in arrays]

  File "c:\Users\ouyan\anaconda3\lib\site-packages\keras\engine\training_utils.py", line 47, in <listcomp>
    entries = [[x[i:i + 1] for i in indices] for x in arrays]

  File "c:\Users\ouyan\anaconda3\lib\site-packages\keras\engine\training_utils.py", line 47, in <listcomp>
    entries = [[x[i:i + 1] for i in indices] for x in arrays]

TypeError: 'SparseTensor' object is not subscriptable


	 [[{{node EagerPyFunc}}]]
	 [[IteratorGetNext]]
0 successful operations.
0 derived errors ignored. [Op:__inference_train_function_10723]