In [1]:
# !pip install scikit-learn
# !pip install pydot
# !pip install graphviz
# !pip install matplotlib
# !pip install plotly
# !pip install cufflinks==0.8.2

In [2]:
import time
# Wall-clock start of the run; the last cell subtracts this from time.time() to report the total duration.
start_time = time.time()

In [3]:
# Importing the Libraries
import pandas as pd
import numpy as np
import tensorflow as tf
import keras
import re
import unicodedata
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.model_selection import train_test_split
from keras import Model
from keras.preprocessing.text import Tokenizer
from keras.utils.data_utils import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, SpatialDropout1D, Dropout, Flatten, Dropout
from keras.callbacks import ModelCheckpoint, EarlyStopping
from IPython.core.interactiveshell import InteractiveShell
import matplotlib.pyplot as plt
import plotly.figure_factory as ff
InteractiveShell.ast_node_interactivity = 'all'
from plotly.offline import iplot

In [4]:
# Load the fake and the true news articles from the CSV files in the working directory
fake_news_df, true_news_df = pd.read_csv('Fake.csv'), pd.read_csv('True.csv')

In [5]:
# Preview two randomly drawn rows from each frame (no seed is set, so the rows differ on every run)

display(fake_news_df.sample(2), true_news_df.sample(2))

Unnamed: 0,title,text,subject,date
3080,Republicans BETRAY Paul Ryan And Mitch McConn...,Many Republicans in Congressional leadership a...,News,"January 10, 2017"
8320,You Won’t Believe What Hit This Pro-TPP Polit...,George W. Bush was famously attacked with flyi...,News,"February 5, 2016"


Unnamed: 0,title,text,subject,date
17864,Pentagon says diplomatic tension with Turkey n...,WASHINGTON (Reuters) - A diplomatic dispute be...,worldnews,"October 10, 2017"
16604,Missing persons agency opens high-tech global ...,THE HAGUE (Reuters) - The organization that id...,worldnews,"October 24, 2017"


## Data Cleaning Functions

In [6]:
## Data Cleaning  ###

# Remove URLs / web links from the text (the pattern matches addresses, not HTML tags)
def remove_html(text):
    """Strip URL-like substrings from ``text``.

    Despite the function name, the pattern targets web addresses
    (``http://`` / ``https://`` links, ``www.`` hosts and ``domain.tld/`` paths),
    not HTML tags. Each match is replaced by an empty string; the whitespace
    around it is left untouched.
    """
    url_pattern = r'''(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'".,<>?«»“”‘’]))'''
    return re.sub(url_pattern, "", text)

# Count the number of words in the string
def len_text(text):
    """Return how many whitespace-separated tokens ``text`` contains."""
    return len(text.split())
    
# Remove White Spaces
def remove_white_space(text):
    """Trim ``text`` and collapse every internal whitespace run to one space.

    Parameters
    ----------
    text : str
        The string to normalise.

    Returns
    -------
    str
        ``text`` without leading/trailing whitespace and with each run of
        whitespace characters replaced by a single ASCII space.
    """
    # Raw strings are required here: "\s" inside a normal string literal is an
    # invalid escape sequence and triggers a warning on current Python versions.
    text = re.sub(r"^\s+|\s+$", "", text, flags=re.UNICODE)  # strip both ends
    text = " ".join(re.split(r"\s+", text, flags=re.UNICODE))  # squeeze inner runs
    return text

# Removing the Accented Chars
def remove_accented_chars(text):
    """Return ``text`` reduced to ASCII.

    The string is NFKD-normalised so accented letters split into a base letter
    plus combining marks; encoding to ASCII with ``'ignore'`` then drops the
    marks and any other character that has no ASCII representation.
    """
    decomposed = unicodedata.normalize('NFKD', text)
    ascii_bytes = decomposed.encode('ascii', 'ignore')
    return ascii_bytes.decode('utf-8', 'ignore')


# Removing Special characters
def remove_special_characters(text):
    """Delete every character that is not a letter, digit, whitespace or full stop.

    Parameters
    ----------
    text : str
        The string to filter.

    Returns
    -------
    str
        ``text`` with all characters outside ``[a-zA-Z0-9\\s.]`` removed.
    """
    # The upper-case range must end at "Z": the former "A-z" range also covered
    # the ASCII characters between "Z" and "a" ([ \ ] ^ _ `), so they were kept.
    pattern = r'[^a-zA-Z0-9\s.]'
    return re.sub(pattern, '', text)

## Data Analysis (Before Data Cleaning)

In [7]:
# Spot-check two random rows per frame before any cleaning function has been applied
display(fake_news_df.sample(2), true_news_df.sample(2))

Unnamed: 0,title,text,subject,date
18906,THE HORRIBLE END GAME: BERNIE SANDERS Calls fo...,,left-news,"Mar 26, 2017"
3380,WATCH: Kellyanne Conway Accuses Obama Of Bein...,If someone had called Ronald Reagan un-America...,News,"December 16, 2016"


Unnamed: 0,title,text,subject,date
16529,Bosnian pensioners stage street protests for p...,SARAJEVO (Reuters) - Thousands of pensioners f...,worldnews,"October 25, 2017"
21186,France says North Korea close to long-range mi...,PARIS (Reuters) - France s foreign minister sa...,worldnews,"September 1, 2017"


In [8]:
# (rows, columns) of the fake frame and of the true frame
display(fake_news_df.shape, true_news_df.shape)

(23481, 4)

(21417, 4)

#### FAKE News

In [9]:
# Distinct `subject` labels present in each frame
display(set(fake_news_df['subject']), set(true_news_df['subject']))

{'Government News', 'Middle-east', 'News', 'US_News', 'left-news', 'politics'}

{'politicsNews', 'worldnews'}

In [10]:
# Word count (whitespace-separated tokens) of every fake article body
fake_news_df['len_sent'] = fake_news_df['text'].apply(len_text)

In [11]:
# Number of fake articles per subject
fake_news_df['subject'].value_counts()

News               9050
politics           6841
left-news          4459
Government News    1570
US_News             783
Middle-east         778
Name: subject, dtype: int64

In [12]:
# Per-subject summary statistics (count, mean, std, quartiles, min/max) of the word counts
fake_news_df.groupby(['subject']).describe()

Unnamed: 0_level_0,len_sent,len_sent,len_sent,len_sent,len_sent,len_sent,len_sent,len_sent
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max
subject,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
Government News,1570.0,387.404459,389.428324,0.0,155.0,304.5,506.0,4547.0
Middle-east,778.0,772.548843,1040.851331,24.0,182.25,351.5,919.5,8135.0
News,9050.0,441.253812,152.060179,36.0,344.0,410.0,506.0,3909.0
US_News,783.0,780.527458,1049.778528,24.0,183.0,355.0,927.5,8135.0
left-news,4459.0,392.736264,363.389015,0.0,186.5,318.0,509.5,7033.0
politics,6841.0,346.752083,369.246542,0.0,120.0,276.0,462.0,7033.0


In [13]:
# random.seed(123)
# fake_news_df[fake_news_df['subject']=='politics']['text'].sample(2, random_state=123).to_list()

##### Conclusion for Statistical analysis on Fake News Dataframe

- "News" has the most instances (9050), followed by "politics" [6841], "left-news" [4459], "Government News" [1570], "US_News" [783], and "Middle-east" [778]
- The news corpus is largely aligned towards the "left-news" and "politics" 
- The shortest articles in Middle-east, News and US_News contain 24, 36 and 24 words, respectively.
- There are various instances in Govt News, left-news and politics where the news is empty
- Many dirty records can be found in the data, e.g.
    - HTML characters/code
    - Extra white space in the text
    - Accented characters
    - Special characters

##### Cleaning Aspects

- Remove the instances with length less than 10
- Split the total instances by each group in same proportion

#### TRUE News

In [14]:
# Distinct `subject` labels of both frames, repeated here for the true-news analysis
display(set(fake_news_df['subject']), set(true_news_df['subject']))

{'Government News', 'Middle-east', 'News', 'US_News', 'left-news', 'politics'}

{'politicsNews', 'worldnews'}

In [15]:
# Word count (whitespace-separated tokens) of every true article body
true_news_df['len_sent'] = true_news_df['text'].apply(len_text)

In [16]:
# Number of true articles per subject
true_news_df['subject'].value_counts()

politicsNews    11272
worldnews       10145
Name: subject, dtype: int64

In [17]:
# Per-subject summary statistics (count, mean, std, quartiles, min/max) of the word counts
true_news_df.groupby(['subject']).describe()

Unnamed: 0_level_0,len_sent,len_sent,len_sent,len_sent,len_sent,len_sent,len_sent,len_sent
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max
subject,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
politicsNews,11272.0,408.09599,287.852067,0.0,153.75,385.0,571.0,5172.0
worldnews,10145.0,360.689601,255.46069,23.0,146.0,324.0,486.0,2927.0


##### Conclusion for Statistical analysis on True News Dataframe

- "politicsNews" has the most instances (11272), followed by "worldnews" [10145]
- There are various instances in "politicsNews" where the news is empty

##### Cleaning Aspects

- Remove the instances with length less than 10

### Overall Conclusion
- Set minimum length of the instances to 20
- Make sure to remove bias in the data, i.e. all groups should contain the same number of instances
- Combine the fields (if required) i.e. combining the "subjects" of the data 

## Data Analysis (After Data Cleaning)

#### FAKE News

In [18]:
# First row of the fake frame; its text has not been passed through the cleaning functions yet
fake_news_df.head(1)

Unnamed: 0,title,text,subject,date,len_sent
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,"December 31, 2017",495


In [19]:
# Run the cleaning steps in order: URLs -> accents -> special characters -> whitespace
fake_news_df['text'] = (
    fake_news_df['text']
    .apply(remove_html)
    .apply(remove_accented_chars)
    .apply(remove_special_characters)
    .apply(remove_white_space)
)
# Word counts must be recomputed because cleaning changes the token boundaries
fake_news_df['len_sent'] = fake_news_df['text'].apply(len_text)

In [20]:
# One random row to eyeball the cleaned text
fake_news_df.sample(1)

Unnamed: 0,title,text,subject,date,len_sent
22526,Boiler Room #63 – Us and THEM!,Tune in to the Alternate Current Radio Network...,US_News,"July 6, 2016",127


In [21]:
# Inspect the rows whose cleaned text has 20 words or fewer (len_sent < 21), longest first
fake_news_df[fake_news_df['len_sent'] < 21].sort_values(by="len_sent",ascending=False)

Unnamed: 0,title,text,subject,date,len_sent
12616,CROOKED SOROS: Trump Will Win Popular Vote In ...,Very interesting remarks from a guy who is in ...,politics,"Oct 26, 2016",20
19279,THE BEST WAY To Clear Soros’ Anti-Trump Rioter...,disruptj20 protesters on the move many leaving...,left-news,"Jan 20, 2017",20
13697,WHOA! Did Donald Trump Just Imply Obama Is Wor...,And if Trump did indeed imply Obama was workin...,politics,"Jun 13, 2016",20
20217,SHOCKING SUMMARY Of The DNC Convention So Far…...,What a crazy group of professional agitators a...,left-news,"Jul 26, 2016",20
13731,WHAAAT? DNC PLATFORM MEMBER Makes EXTREME Stat...,Wow This lady who is a DNC Platform Committee ...,politics,"Jun 8, 2016",20
...,...,...,...,...,...
11960,DEFIANT DEMOCRATS Announce Effort To Rehang Pa...,,politics,"Jan 10, 2017",0
11957,SICKENING! MTV HOST MOCKS Senator Jeff Session...,,politics,"Jan 10, 2017",0
11943,“YOU ARE FAKE NEWS!” TRUMP DESTROYS CNN Right ...,,politics,"Jan 11, 2017",0
11936,“LITTLE” MARCO RUBIO GRILLS Trump’s Secretary ...,,politics,"Jan 11, 2017",0


In [22]:
# Keep only articles longer than 20 words and renumber the rows from 0
fake_news_df = fake_news_df.query('len_sent > 20').reset_index(drop=True)
display(fake_news_df.shape, fake_news_df.sample(2))

(22298, 5)

Unnamed: 0,title,text,subject,date,len_sent
6834,Conservatives Can’t Believe Canadian PM Would...,Canada s new liberal Prime Minister went viral...,News,"April 18, 2016",599
6852,Republicans Punish Georgia Governor For Refus...,Georgia conservatives really wanted to enshrin...,News,"April 17, 2016",394


In [23]:
# Articles per subject once the rows with 20 words or fewer have been dropped
fake_news_df['subject'].value_counts()

News               9050
politics           6076
left-news          4175
Government News    1436
US_News             783
Middle-east         778
Name: subject, dtype: int64

In [24]:
# Per-subject summary statistics of the word counts after cleaning and short-row removal
fake_news_df.groupby(['subject']).describe()

Unnamed: 0_level_0,len_sent,len_sent,len_sent,len_sent,len_sent,len_sent,len_sent,len_sent
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max
subject,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
Government News,1436.0,422.641365,388.364912,21.0,197.0,328.0,532.5,4547.0
Middle-east,778.0,770.447301,1039.657765,24.0,181.0,350.5,919.25,8123.0
News,9050.0,440.269945,151.671089,34.0,343.0,409.0,505.0,3909.0
US_News,783.0,778.392082,1048.512705,24.0,181.0,354.0,927.0,8123.0
left-news,4175.0,418.351856,360.366879,21.0,212.0,337.0,525.0,7019.0
politics,6076.0,389.046741,369.820987,21.0,182.0,306.0,491.0,7019.0


In [25]:
# Class label column: every row of this frame is fake news, encoded as 0
fake_news_df = fake_news_df.assign(real_or_fake=0)

In [26]:
# Sanity check: row count after filtering, column count including the new label
print("Shape of Fake News: ", fake_news_df.shape)

Shape of Fake News:  (22298, 6)


#### TRUE News

In [27]:
# Run the cleaning steps in order: URLs -> accents -> special characters -> whitespace
true_news_df['text'] = (
    true_news_df['text']
    .apply(remove_html)
    .apply(remove_accented_chars)
    .apply(remove_special_characters)
    .apply(remove_white_space)
)
# Word counts must be recomputed because cleaning changes the token boundaries
true_news_df['len_sent'] = true_news_df['text'].apply(len_text)

In [28]:
# Inspect the rows whose cleaned text has 20 words or fewer (len_sent < 21), longest first
true_news_df[true_news_df['len_sent'] < 21].sort_values(by="len_sent",ascending=False)

Unnamed: 0,title,text,subject,date,len_sent
8970,Graphic: Supreme Court roundup,,politicsNews,"June 16, 2016",0


In [29]:
# Per-subject summary statistics of the word counts after cleaning (short rows are still included here)
true_news_df.groupby(['subject']).describe()

Unnamed: 0_level_0,len_sent,len_sent,len_sent,len_sent,len_sent,len_sent,len_sent,len_sent
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max
subject,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
politicsNews,11272.0,406.239088,287.164243,0.0,152.0,383.0,569.0,5141.0
worldnews,10145.0,358.979694,254.80338,22.0,145.0,323.0,485.0,2925.0


In [30]:
# Keep only articles longer than 20 words and renumber the rows from 0
true_news_df = true_news_df.query('len_sent > 20').reset_index(drop=True)
display(true_news_df.shape, true_news_df.sample(2))

(21416, 5)

Unnamed: 0,title,text,subject,date,len_sent
9218,Judge issues final order upholding Alabama sam...,Reuters A federal judge in Alabama has issued ...,politicsNews,"June 8, 2016",357
17543,Facebook will help investigators release Russi...,WASHINGTON Reuters Facebook Inc FB.O Chief Ope...,worldnews,"October 11, 2017",771


In [31]:
# Articles per subject once the rows with 20 words or fewer have been dropped
true_news_df['subject'].value_counts()

politicsNews    11271
worldnews       10145
Name: subject, dtype: int64

In [32]:
# Per-subject summary statistics of the word counts after cleaning and short-row removal
true_news_df.groupby(['subject']).describe()

Unnamed: 0_level_0,len_sent,len_sent,len_sent,len_sent,len_sent,len_sent,len_sent,len_sent
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max
subject,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
politicsNews,11271.0,406.275131,287.151485,21.0,152.0,383.0,569.0,5141.0
worldnews,10145.0,358.979694,254.80338,22.0,145.0,323.0,485.0,2925.0


In [33]:
# Class label column: every row of this frame is true news, encoded as 1
true_news_df = true_news_df.assign(real_or_fake=1)

In [34]:
# Sanity check: row count after filtering, column count including the new label
print("Shape of True News: ", true_news_df.shape)

Shape of True News:  (21416, 6)


In [35]:
# Down-sample the fake news so both classes contribute the same number of rows.
# A fixed random_state (the same seed used for the train/test split) makes the
# selected subset identical on every run; without it each run trained on different data.
fake_news_df = (
    fake_news_df
    .sample(n=true_news_df.shape[0], random_state=100)
    .reset_index(drop=True)
)
print("Shape of Fake News: ", fake_news_df.shape)

Shape of Fake News:  (21416, 6)


### Modelling Section - Pre-Requirement

In [36]:
# Stack both classes into one frame, then shuffle the rows so the classes are interleaved.
# The shuffle is seeded so that the row order is reproducible between runs.
final_df = pd.concat([fake_news_df, true_news_df], ignore_index=True)
final_df = final_df.sample(frac=1, random_state=100).reset_index(drop=True)
display(final_df.shape, final_df.head(2))

(42832, 6)

Unnamed: 0,title,text,subject,date,len_sent,real_or_fake
0,Putin says Russia will respond if Russian medi...,SOCHI Russia Reuters Russian President Vladimi...,worldnews,"October 19, 2017",63,1
1,Kenya watchdog says investigating police over ...,This September 29 has been corrected to fix da...,worldnews,"September 29, 2017",417,1


In [37]:
### Converting text-information into list
# NOTE(review): this list is only used for the length check below; `news_text` is
# reassigned to the padded integer sequences in the tokenisation step further down.
news_text = final_df['text'].to_list()
len(news_text)

42832

In [38]:
# Creating One-Hot encoding for the 'real_or_fake' feature
le = LabelEncoder()
# Sparse output is OneHotEncoder's default. The explicit `sparse=True` keyword is
# omitted because newer scikit-learn releases renamed it to `sparse_output` and
# reject the old spelling; relying on the default works with old and new versions.
oe = OneHotEncoder()

In [39]:
# One-hot encode the label column; the encoder expects a 2-D (n_samples, 1) array
real_fake = oe.fit_transform(
    final_df['real_or_fake'].to_numpy().reshape(-1, 1)
)
# real_fake

In [40]:
MAX_SEQUENCE_LENGTH = 250  # length every article is padded/truncated to; also the Embedding input_length
MAX_NB_WORDS = 50000       # vocabulary cap for the Tokenizer; also the Embedding input dimension
EMBEDDING_DIM = 100        # size of each learned word vector
# The backslash in the filter set is written as "\\" — a bare "\]" is an invalid
# escape sequence in a normal string literal. The resulting filter string is unchanged.
tokenizer = Tokenizer(num_words=MAX_NB_WORDS, filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~', lower=True)

In [41]:
# NOTE(review): the vocabulary is fitted on the whole corpus, i.e. before the
# train/test split performed later, so test-set words influence the word index.
tokenizer.fit_on_texts(final_df['text'].values)  #  to update the internal vocabulary for the texts list
word_index = tokenizer.word_index  # mapping word -> integer index built by fit_on_texts
print('Found %s unique tokens.' % len(word_index))

Found 153632 unique tokens.


In [42]:
# print("Word Index Sample: ", word_index)

![Word Index](Image/word_index.JPG)

In [43]:
# Map every article to its list of word indices, then pad/truncate each list
# to MAX_SEQUENCE_LENGTH so the result is one rectangular integer array.
token_sequences = tokenizer.texts_to_sequences(final_df['text'].values)
news_text = pad_sequences(token_sequences, maxlen=MAX_SEQUENCE_LENGTH)
print('Shape of data tensor:', news_text.shape)

Shape of data tensor: (42832, 250)


In [44]:
# Two-column one-hot target: column 0 <-> label 0 (fake), column 1 <-> label 1 (real).
# The dtype is pinned because get_dummies' default differs between pandas versions
# (uint8 in older releases, bool in newer ones); float32 is what the loss operates on.
Y = pd.get_dummies(final_df['real_or_fake'], dtype='float32').values
print('Shape of label tensor:', Y.shape)

Shape of label tensor: (42832, 2)


In [45]:
# Hold out 20% of the padded sequences for evaluation; the split is seeded for reproducibility
X_train, X_test, y_train, y_test = train_test_split(
    news_text,
    Y,
    test_size=0.2,
    random_state=100,
)
for features, targets in ((X_train, y_train), (X_test, y_test)):
    print(features.shape, targets.shape)

(34265, 250) (34265, 2)
(8567, 250) (8567, 2)


In [46]:
# Embedding -> SpatialDropout1D -> LSTM -> softmax over the two classes (fake / real)
model = Sequential()
model.add(Embedding(MAX_NB_WORDS, EMBEDDING_DIM, input_length=MAX_SEQUENCE_LENGTH))
model.add(SpatialDropout1D(0.2))
model.add(LSTM(100, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(2, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
# summary() prints the table itself and returns None, so wrapping it in print()
# appended a stray "None" line to the output.
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 250, 100)          5000000   
                                                                 
 spatial_dropout1d (SpatialD  (None, 250, 100)         0         
 ropout1D)                                                       
                                                                 
 lstm (LSTM)                 (None, 100)               80400     
                                                                 
 dense (Dense)               (None, 2)                 202       
                                                                 
Total params: 5,080,602
Trainable params: 5,080,602
Non-trainable params: 0
_________________________________________________________________
None


In [47]:
# Layer-by-layer overview: output shapes and parameter counts
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 250, 100)          5000000   
                                                                 
 spatial_dropout1d (SpatialD  (None, 250, 100)         0         
 ropout1D)                                                       
                                                                 
 lstm (LSTM)                 (None, 100)               80400     
                                                                 
 dense (Dense)               (None, 2)                 202       
                                                                 
Total params: 5,080,602
Trainable params: 5,080,602
Non-trainable params: 0
_________________________________________________________________


In [48]:
# Render the layer graph to model_plot.png; requires pydot and graphviz to be installed
tf.keras.utils.plot_model(model, to_file='model_plot.png', show_shapes=True, show_layer_names=True)

You must install pydot (`pip install pydot`) and install graphviz (see instructions at https://graphviz.gitlab.io/download/) for plot_model/model_to_dot to work.


In [49]:
epochs = 3
batch_size = 32

# EarlyStopping monitors `val_loss`, which Keras only computes when validation
# data is supplied. Without it the callback never receives its metric and cannot
# work, so the last 10% of the training rows are held out for validation.
# NOTE(review): patience=3 cannot trigger within epochs=3 — raise `epochs`
# (or lower `patience`) if early stopping is meant to shorten training.
history = model.fit(
    X_train,
    y_train,
    epochs=epochs,
    batch_size=batch_size,
    validation_split=0.1,
    callbacks=[EarlyStopping(monitor='val_loss', patience=3, min_delta=0.0001)],
)

Epoch 1/3
Epoch 2/3
Epoch 3/3


In [50]:
# Loss and accuracy on the held-out test split
accr = model.evaluate(X_test, y_test)
test_loss, test_accuracy = accr[0], accr[1]
print('Test set\n  Loss: {:0.3f}\n  Accuracy: {:0.3f}'.format(test_loss, test_accuracy))

Test set
  Loss: 0.074
  Accuracy: 0.978


In [56]:
news_content = ["""Russian embassy in Canada weaponizes social media to fuel support for Ukraine invasion."""]
# Apply the same cleaning steps, in the same order, that the training text went
# through; otherwise tokens such as words with apostrophes or accents would not
# match the vocabulary the tokenizer was fitted on.
cleaned_content = [
    remove_white_space(
        remove_special_characters(
            remove_accented_chars(
                remove_html(article)
            )
        )
    )
    for article in news_content
]
seq = tokenizer.texts_to_sequences(cleaned_content)
padded = pad_sequences(seq, maxlen=MAX_SEQUENCE_LENGTH)
pred = model.predict(padded)
labels = ["fake", "real"]  # index 0 <-> real_or_fake == 0, index 1 <-> real_or_fake == 1
# Take the argmax per row so the cell stays correct when several articles are
# scored at once (argmax over the whole 2-D array would return a flattened index).
for class_probabilities in pred:
    print(class_probabilities, labels[int(np.argmax(class_probabilities))])

[[0.06032694 0.93967307]] real


In [57]:
# Total wall-clock time since the timestamp taken at the top of the notebook
elapsed_seconds = time.time() - start_time
print("--- %s seconds ---" % elapsed_seconds)

--- 1527.5098209381104 seconds ---
