In [1]:
import pandas as pd
import numpy as np

from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import one_hot

from tensorflow.keras.layers import Dense, LSTM, Input,GlobalMaxPool1D,Dropout
from keras.utils.vis_utils import plot_model
from tensorflow.keras.layers import Embedding, Input,GlobalMaxPool1D,Dropout,concatenate
from tensorflow.keras.models import Model, Sequential

import nltk
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score, roc_auc_score

In [117]:
# print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))

In [2]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /Users/max/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [4]:
data = pd.read_csv("fake_news_labelled.csv")
data.head()

Unnamed: 0,id,title,author,text,label
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1
3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1
4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,1


In [5]:
# drop nulls, keep only title, text and labels
data = data[['title', "text","label"]].dropna()

In [6]:
# 10 articles labelled fake news
data[data.label==1][:10]

Unnamed: 0,title,text,label
0,House Dem Aide: We Didn’t Even See Comey’s Let...,House Dem Aide: We Didn’t Even See Comey’s Let...,1
2,Why the Truth Might Get You Fired,"Why the Truth Might Get You Fired October 29, ...",1
3,15 Civilians Killed In Single US Airstrike Hav...,Videos 15 Civilians Killed In Single US Airstr...,1
4,Iranian woman jailed for fictional unpublished...,Print \nAn Iranian woman has been sentenced to...,1
6,Life: Life Of Luxury: Elton John’s 6 Favorite ...,Ever wonder how Britain’s most iconic pop pian...,1
12,Russian Researchers Discover Secret Nazi Milit...,The mystery surrounding The Third Reich and Na...,1
13,US Officials See No Link Between Trump and Russia,Clinton Campaign Demands FBI Affirm Trump's Ru...,1
14,"Re: Yes, There Are Paid Government Trolls On S...","Yes, There Are Paid Government Trolls On Socia...",1
17,Anonymous Donor Pays $2.5 Million To Release E...,A Caddo Nation tribal leader has just been fre...,1
18,FBI Closes In On Hillary!,FBI Closes In On Hillary! Posted on Home » Hea...,1


In [7]:
# 10 articles labelled credible
data[data.label==0][:10]

Unnamed: 0,title,text,label
1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Ever get the feeling your life circles the rou...,0
5,Jackie Mason: Hollywood Would Love Trump if He...,"In these trying times, Jackie Mason is the Voi...",0
7,Benoît Hamon Wins French Socialist Party’s Pre...,"PARIS — France chose an idealistic, traditi...",0
8,Excerpts From a Draft Script for Donald Trump’...,Donald J. Trump is scheduled to make a highly ...,0
9,"A Back-Channel Plan for Ukraine and Russia, Co...",A week before Michael T. Flynn resigned as nat...,0
10,Obama’s Organizing for Action Partners with So...,"Organizing for Action, the activist group that...",0
11,"BBC Comedy Sketch ""Real Housewives of ISIS"" Ca...",The BBC produced spoof on the “Real Housewives...,0
15,"In Major League Soccer, Argentines Find a Home...",Guillermo Barros Schelotto was not the first A...,0
16,Wells Fargo Chief Abruptly Steps Down - The Ne...,The scandal engulfing Wells Fargo toppled its ...,0
19,Chuck Todd: ’BuzzFeed Did Donald Trump a Polit...,Wednesday after Donald Trump’s press confere...,0


In [30]:
X = data.drop('label',axis=1)
y = data['label']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=53421)

In [118]:
X_test[0:1]

Unnamed: 0,title,text
18889,BREAKING – Obama Betrays America in MASSIVE Mo...,Email Print President Obama has proven he is n...


In [None]:
vo_sizevo_size = 500
X_train.reset_index(inplace=True)

ps_title_tr = PorterStemmer()
ps_text_tr = PorterStemmer()
corpus_title_tr = []
corpus_text_tr = []

iters = len(X_train)

for i in range(0, iters):
    print("Status: %s / %s" %(i, iters), end="\r")
    
    #preproc title
    review = re.sub('[^a-zA-Z]', ' ',X_train['title'][i])
    review = review.lower()
    review = review.split()
    
    review = [ps_title_tr.stem(word) for word in review if not word in stopwords.words('english')]
    review = ' '.join(review)
    corpus_title_tr.append(review)
    
    #preproc text
    review = re.sub('[^a-zA-Z]', ' ',X_train['text'][i])
    review = review.lower()
    review = review.split()
    
    review = [ps_text_tr.stem(word) for word in review if not word in stopwords.words('english')]
    review = ' '.join(review)
    corpus_text_tr.append(review)

onehot_title_tr = [one_hot(words, vo_size) for words in corpus_title_tr]
onehot_text_tr = [one_hot(words, vo_size) for words in corpus_text_tr]

sent_length_title = 20
sent_length_text = 1000
embedded_doc_title = pad_sequences(onehot_title_tr, padding='pre', maxlen=sent_length_title)
embedded_doc_text = pad_sequences(onehot_text_tr, padding='pre', maxlen=sent_length_text)

X_train_title = np.array(embedded_doc_title)
X_train_text = np.array(embedded_doc_text)
print(X_train_title[0:5])
print(X_train_text[0:5])

print(X_train_title.shape)

In [31]:
X_test

Unnamed: 0,title,text
18889,BREAKING – Obama Betrays America in MASSIVE Mo...,Email Print President Obama has proven he is n...
812,Schools All Over America Are Closing On Electi...,Schools All Over America Are Closing On Electi...
19487,Why Do Health Costs Keep Rising? These People ...,"DANVILLE, Pa. — The Geisinger Health Plan, ..."
7087,’We’re Digging Coal Again’: Trump Celebrates O...,President Donald Trump celebrated the opening ...
9270,Illegal Immigrant Caught at Border with Child ...,"McALLEN, Texas — A Salvadoran national has ..."
...,...,...
14417,Humana to Drop Out of Obamacare Exchanges in 2...,The health insurance giant Humana will exit th...
158,"Burlesque Dancer Fired, Investigated by Secret...",The Secret Service is investigating a burlesqu...
13999,Look how easily you can be robbed while pumpin...,Print \nAn increasingly common form of theft h...
14263,Kurds decide to get on US nerves,Kurds decide to get on US nerves 07.11.2016 | ...


In [26]:
X_final_title=np.array(embedded_doc_title)
X_final_text=np.array(embedded_doc_text)
y_final=np.array(y_train)
print(X_final_title.shape,y_final.shape)
print(X_final_text.shape,y_final.shape)

(0, 20) (16162,)
(0, 1000) (16162,)


In [44]:
X_test[0:1]

Unnamed: 0,title,text
18889,BREAKING – Obama Betrays America in MASSIVE Mo...,Email Print President Obama has proven he is n...


In [34]:
# test
messages=X_test.copy()
messages.reset_index(inplace=True)

ps_title =PorterStemmer()
ps_text =PorterStemmer()
corpus_title = []
corpus_text = []

for i in range(0, len(messages)):
    print("Status: %s / %s" %(i, len(messages)), end="\r")
    
    #preproc title
    review = re.sub('[^a-zA-Z]', ' ',messages['title'][i])
    review = review.lower()
    review = review.split()
    
    review = [ps_title.stem(word) for word in review if not word in stopwords.words('english')]
    review = ' '.join(review)
    corpus_title.append(review)
    
    #preproc text
    review = re.sub('[^a-zA-Z]', ' ',messages['text'][i])
    review = review.lower()
    review = review.split()
    
    review = [ps_text.stem(word) for word in review if not word in stopwords.words('english')]
    review = ' '.join(review)
    corpus_text.append(review)

onehot_rep_title = [one_hot(words, vo_size) for words in corpus_title]
onehot_rep_text = [one_hot(words, vo_size) for words in corpus_text]

sent_length_title = 20
sent_length_text = 1000
embedded_doc_title=pad_sequences(onehot_rep_title, padding='pre', maxlen=sent_length_title)
embedded_doc_text=pad_sequences(onehot_rep_text, padding='pre', maxlen=sent_length_text)

X_final_title=np.array(embedded_doc_title)
X_final_text=np.array(embedded_doc_text)
print(X_final_title.shape)
print(X_final_text.shape)




(4041, 20)40 / 4041
(4041, 1000)


In [22]:
embedding_vec_title = 10
embedding_vec_text = 100

In [23]:
input_title = Input(shape=(sent_length_title,))
input_text = Input(shape=(sent_length_text,))

emb_title = Embedding(vo_size, embedding_vector_feature_title)(input_title)
lstm_title = LSTM(128, return_sequences=False)(emb_title)

emb_text = Embedding(vo_size, embedding_vector_feature_text)(input_text)
lstm_text = LSTM(128, return_sequences=True)(emb_text)

max_pool_text = GlobalMaxPool1D()(lstm_text)
dropout_1_text = Dropout(0.1)(max_pool_text)
dense_1_text = Dense(50, activation='relu')(dropout_1_text)
dropout_2_text = Dropout(0.1)(dense_1_text)

out = concatenate([lstm_title,dropout_2_text],axis=-1)
output=Dense(1, activation='sigmoid')(out)

model = Model(inputs=[input_title, input_text], outputs=output)
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
print(model.summary())


Model: "model_1"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_4 (InputLayer)           [(None, 1000)]       0           []                               
                                                                                                  
 embedding_3 (Embedding)        (None, 1000, 100)    50000       ['input_4[0][0]']                
                                                                                                  
 lstm_3 (LSTM)                  (None, 1000, 128)    117248      ['embedding_3[0][0]']            
                                                                                                  
 global_max_pooling1d_1 (Global  (None, 128)         0           ['lstm_3[0][0]']                 
 MaxPooling1D)                                                                              

In [37]:
model.fit(x=[X_train_title, X_train_text], y=y_final, batch_size=128, epochs=10, verbose=1, validation_split=0.2)


Epoch 1/10

2022-05-15 21:36:20.940453: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.
2022-05-15 21:36:21.025219: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.
2022-05-15 21:36:21.025266: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.


Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x3044f3e20>

In [38]:
# predict final
y_pred_final = model.predict([X_final_title,X_final_text])
y_prob = pd.DataFrame(y_pred_final)
y_prob['0'] = 1 - y_prob[0]
y_class = pd.DataFrame(y_prob.values.argmax(axis=-1))
y_class[0] = np.where(y_class[0]==1, 0, 1)



2022-05-16 01:07:14.794928: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.
2022-05-16 01:07:14.870265: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.
2022-05-16 01:07:14.870300: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.


In [116]:
y_pred_final

array([[8.3586490e-01],
       [9.9996650e-01],
       [5.1796576e-08],
       ...,
       [9.9992287e-01],
       [9.9996603e-01],
       [6.0107550e-03]], dtype=float32)

In [40]:
# nice, 91.6% 

In [39]:
accuracy_score(y_test, y_class[0])

0.9163573372927494

## Gets harder with Custom Input

In [106]:
# https://www.theguardian.com/politics/2022/may/15/margaret-thatcher-statue-grantham-egged-within-hours-of-it-being-installed
title1 = "Margaret Thatcher statue egged within hours of it being installed"
text1 = """Warnings that a new statue of Margaret Thatcher would attract egg-throwing protests came true within two hours of it being installed in her home town of Grantham on Sunday.

The bronze statue was, without ceremony, placed on a 3-metre (10ft) high plinth to make it more difficult for protesters to inflict any damage.

Shortly afterwards a man was seen throwing eggs from behind a temporary fence and, when one connected, a cry of “oi” could be heard.

The egg-throwing came as a surprise to nobody in Grantham. There is pride but also heightened awareness of how divisive a figure Thatcher remains.

After it was installed on a warm Sunday morning, a number of people stopped to take selfies. But loud booing could also be heard from passing motorists.

The Labour councillor Lee Steptoe said the egg-throwing was “absolutely inevitable”. """

In [107]:
def process_input(title, text):
    messages= pd.DataFrame({'title':[title], 'text':[text]})
    messages.reset_index(inplace=True)

    ps_title =PorterStemmer()
    ps_text =PorterStemmer()
    corpus_title = []
    corpus_text = []

    for i in range(0, len(messages)):
        print("Status: %s / %s" %(i, len(messages)), end="\r")

        #preproc title
        review = re.sub('[^a-zA-Z]', ' ',messages['title'][i])
        review = review.lower()
        review = review.split()

        review = [ps_title.stem(word) for word in review if not word in stopwords.words('english')]
        review = ' '.join(review)
        corpus_title.append(review)

        #preproc text
        review = re.sub('[^a-zA-Z]', ' ',messages['text'][i])
        review = review.lower()
        review = review.split()

        review = [ps_text.stem(word) for word in review if not word in stopwords.words('english')]
        review = ' '.join(review)
        corpus_text.append(review)

    onehot_rep_title = [one_hot(words, vo_size) for words in corpus_title]
    onehot_rep_text = [one_hot(words, vo_size) for words in corpus_text]

    sent_length_title = 20
    sent_length_text = 1000
    embedded_doc_title=pad_sequences(onehot_rep_title, padding='pre', maxlen=sent_length_title)
    embedded_doc_text=pad_sequences(onehot_rep_text, padding='pre', maxlen=sent_length_text)

    X_final_title=np.array(embedded_doc_title)
    X_final_text=np.array(embedded_doc_text)
    
    return [X_final_title, X_final_text]

In [113]:
process_input(title1, text1)

Status: 0 / 1

[array([[  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         251, 473, 318, 443, 379, 105, 346]], dtype=int32),
 array([[  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
           0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
           0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
           0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
           0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
           0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
           0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
           0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
           0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
           0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
           0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
           0,   0,   0,   0,   0,   0,   0,   0,   0

In [109]:
X_final_title[0:1]

array([[  0,   0,   0,   0,   0,   0,   0, 372, 432, 150,  88, 108, 337,
        317, 416,  57, 360, 304, 380,  82]], dtype=int32)

In [114]:
model.predict(process_input(title1, text1))


Status: 0 / 1

array([[0.9934454]], dtype=float32)

In [111]:
X_test[0:1]['title']

18889    BREAKING – Obama Betrays America in MASSIVE Mo...
Name: title, dtype: object

In [112]:
# test
messages=X_test[0:1].copy()
messages.reset_index(inplace=True)

ps_title =PorterStemmer()
ps_text =PorterStemmer()
corpus_title = []
corpus_text = []

for i in range(0, len(messages)):
    print("Status: %s / %s" %(i, len(messages)), end="\r")
    
    #preproc title
    review = re.sub('[^a-zA-Z]', ' ',messages['title'][i])
    review = review.lower()
    review = review.split()
    
    review = [ps_title.stem(word) for word in review if not word in stopwords.words('english')]
    review = ' '.join(review)
    corpus_title.append(review)
    
    #preproc text
    review = re.sub('[^a-zA-Z]', ' ',messages['text'][i])
    review = review.lower()
    review = review.split()
    
    review = [ps_text.stem(word) for word in review if not word in stopwords.words('english')]
    review = ' '.join(review)
    corpus_text.append(review)

onehot_rep_title = [one_hot(words, vo_size) for words in corpus_title]
onehot_rep_text = [one_hot(words, vo_size) for words in corpus_text]

sent_length_title = 20
sent_length_text = 1000
embedded_doc_title=pad_sequences(onehot_rep_title, padding='pre', maxlen=sent_length_title)
embedded_doc_text=pad_sequences(onehot_rep_text, padding='pre', maxlen=sent_length_text)

X_final_title=np.array(embedded_doc_title)
X_final_text=np.array(embedded_doc_text)
print(X_final_title)
print(X_final_text)



Status: 0 / 1[[  0   0   0   0   0   0   0 372 432 150  88 108 337 317 416  57 360 304
  380  82]]
[[  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   