In [None]:
# !pip install scikit-learn
# !pip install pydot
# !pip install graphviz

In [None]:
import time
# Wall-clock start of the run; the final cell subtracts this to report total elapsed seconds.
start_time = time.time()

In [None]:
# Importing the Libraries
import pandas as pd
import numpy as np
import tensorflow as tf
import keras
import re
import unicodedata
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.model_selection import train_test_split
from keras import Model
from keras.preprocessing.text import Tokenizer
from keras.utils.data_utils import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, SpatialDropout1D, Dropout, Flatten, Dropout
from keras.callbacks import ModelCheckpoint, EarlyStopping
from IPython.core.interactiveshell import InteractiveShell
import matplotlib.pyplot as plt
import plotly.figure_factory as ff
# Echo the value of every top-level expression in a cell, not only the last one.
InteractiveShell.ast_node_interactivity = 'all'
from plotly.offline import iplot
# cufflinks was referenced below without ever being imported, which raised
# NameError on a fresh kernel; import it before configuring it.
import cufflinks
cufflinks.go_offline()
cufflinks.set_config_file(world_readable=True, theme='pearl')

In [None]:
# Load the two source CSV files (paths are relative to the notebook's working directory).
# Rows from Fake.csv are later labelled 0 and rows from True.csv are labelled 1.

fake_news_df = pd.read_csv('Fake.csv')
true_news_df = pd.read_csv('True.csv')

In [None]:
# Preview two randomly chosen rows from each dataframe (unseeded, so the rows differ per run)

display(fake_news_df.sample(2), true_news_df.sample(2))

## Data Cleaning Functions

In [None]:
## Data Cleaning  ###

# Remove web links (URLs) from the data
def remove_html(text):
    """Return ``text`` with every URL-like substring deleted.

    Despite its name, this function targets links rather than HTML markup:
    the pattern matches (case-insensitively) runs that start with
    ``http://``/``https://``, ``www.`` or a ``domain.tld/`` prefix.
    Matches are replaced by the empty string; surrounding whitespace is kept.
    """
    url_pattern = r'''(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'".,<>?«»“”‘’]))'''
    return re.sub(url_pattern, "", text)

# Count the number of words in the string
def len_text(text):
    """Return how many whitespace-separated tokens ``text`` contains."""
    words = text.split()
    return len(words)
    
# Remove White Spaces
def remove_white_space(text):
    """Trim leading/trailing whitespace and collapse inner runs to one space.

    ``str.split()`` with no arguments splits on any run of whitespace and
    discards empty leading/trailing pieces, so joining the pieces with a
    single space does both jobs at once. This replaces the earlier regex
    version, whose non-raw ``"\\s"`` literals were invalid escape sequences
    (a SyntaxWarning on recent Python versions).

    Returns an empty string for empty or whitespace-only input.
    """
    return " ".join(text.split())

# Replace accented characters with their unaccented ASCII base letters
def remove_accented_chars(text):
    """Return an ASCII-only version of ``text``.

    The text is NFKD-normalised, which splits an accented letter into its
    base letter plus combining marks. Encoding to ASCII with ``'ignore'``
    then drops every non-ASCII code point, including characters that have
    no ASCII decomposition at all.
    """
    decomposed = unicodedata.normalize('NFKD', text)
    ascii_bytes = decomposed.encode('ascii', 'ignore')
    return ascii_bytes.decode('utf-8', 'ignore')


# Removing Special characters
def remove_special_characters(text):
    """Delete every character that is not an ASCII letter, digit, whitespace or '.'.

    The character class uses ``A-Z``. The previous ``A-z`` range also covered
    the ASCII characters between 'Z' and 'a' (``[``, backslash, ``]``, ``^``,
    ``_`` and the backtick), so those were silently kept.
    """
    pattern = r'[^a-zA-Z0-9\s.]'
    return re.sub(pattern, '', text)

## Data Analysis (Before Data Cleaning)

In [None]:
# Two random rows from each dataframe, shown before any cleaning is applied
display(fake_news_df.sample(2), true_news_df.sample(2))

In [None]:
# (rows, columns) of each dataframe
display(fake_news_df.shape, true_news_df.shape)

#### FAKE News

In [None]:
# Distinct values of the 'subject' column in each dataframe
display(set(fake_news_df['subject']), set(true_news_df['subject']))

In [None]:
# Word count of every article body, stored in a new 'len_sent' column
fake_news_df['len_sent'] = fake_news_df['text'].apply(len_text)

In [None]:
# Number of rows per subject
fake_news_df['subject'].value_counts()

In [None]:
# Per-subject summary statistics (count, mean, min, max, ...) of the numeric columns, e.g. 'len_sent'
fake_news_df.groupby(['subject']).describe()

In [None]:
# random.seed(123)
# fake_news_df[fake_news_df['subject']=='politics']['text'].sample(2, random_state=123).to_list()

##### Conclusion for Statistical analysis on Fake News Dataframe

- The "News" subject has the most instances (9050), followed by "politics" [6841], "left-news" [4459], "Govt News" [1570], "US_news" [783], and "Middle-east" [778]
- The news corpus is largely aligned towards the "left-news" and "politics" 
- The minimum text length (in words) for Middle-east, News and US_News is 24, 36 and 24, respectively.
- There are various instances in Govt News, left-news and politics where the news text is empty
- Many dirty records can be found in the data, containing e.g.:
    - HTML characters/code
    - Extra white space in the text
    - Accented characters
    - Special characters

##### Cleaning Aspects

- Remove the instances with length less than 10
- Split the total instances by each group in same proportion

#### TRUE News

In [None]:
# Distinct values of the 'subject' column in each dataframe
display(set(fake_news_df['subject']), set(true_news_df['subject']))

In [None]:
# Word count of every article body, stored in a new 'len_sent' column
true_news_df['len_sent'] = true_news_df['text'].apply(len_text)

In [None]:
# Number of rows per subject
true_news_df['subject'].value_counts()

In [None]:
# Per-subject summary statistics (count, mean, min, max, ...) of the numeric columns, e.g. 'len_sent'
true_news_df.groupby(['subject']).describe()

##### Conclusion for Statistical analysis on True News Dataframe

- "politicsNews" has the most instances (11272), followed by "worldnews" [10145]
- There are various instances in "politicsNews" where the news text is empty

##### Cleaning Aspects

- Remove the instances with length less than 10

### Overall Conclusion
- Keep only instances with more than 20 words (stricter than the threshold of 10 proposed in the per-dataset sections above)
- Make sure to remove bias in the data, i.e. all groups should possess the same number of instances
- Combine the fields (if required) i.e. combining the "subjects" of the data 

## Data Analysis (After Data Cleaning)

#### FAKE News

In [None]:
# First row of the fake-news dataframe, shown before the cleaning functions are applied
fake_news_df.head(1)

In [None]:
# Clean the article text in a fixed order: strip links, fold accents to ASCII,
# drop special characters, then normalise whitespace.
fake_news_df['text'] = (
    fake_news_df['text']
    .apply(remove_html)
    .apply(remove_accented_chars)
    .apply(remove_special_characters)
    .apply(remove_white_space)
)
# Recompute the word count on the cleaned text.
fake_news_df['len_sent'] = fake_news_df['text'].apply(len_text)

In [None]:
# Spot-check one random row after cleaning
fake_news_df.sample(1)

In [None]:
# Inspect the rows whose cleaned text has 20 words or fewer (len_sent < 21), longest first
fake_news_df[fake_news_df['len_sent'] < 21].sort_values(by="len_sent",ascending=False)

In [None]:
# Drop short articles: keep rows with more than 20 words, then renumber the index.
fake_news_df = fake_news_df.loc[fake_news_df['len_sent'] > 20].reset_index(drop=True)
display(fake_news_df.shape, fake_news_df.sample(2))

In [None]:
# Number of rows per subject after the short articles were removed
fake_news_df['subject'].value_counts()

In [None]:
# Per-subject summary statistics of the numeric columns (e.g. 'len_sent') after cleaning and filtering
fake_news_df.groupby(['subject']).describe()

In [None]:
# Add the target column 'real_or_fake': every row of the fake-news dataframe gets label 0
fake_news_df['real_or_fake'] = 0

In [None]:
# (rows, columns) of the cleaned, labelled fake-news dataframe
print("Shape of Fake News: ", fake_news_df.shape)

#### TRUE News

In [None]:
# Clean the article text in a fixed order: strip links, fold accents to ASCII,
# drop special characters, then normalise whitespace.
true_news_df['text'] = (
    true_news_df['text']
    .apply(remove_html)
    .apply(remove_accented_chars)
    .apply(remove_special_characters)
    .apply(remove_white_space)
)
# Recompute the word count on the cleaned text.
true_news_df['len_sent'] = true_news_df['text'].apply(len_text)

In [None]:
# Inspect the rows whose cleaned text has 20 words or fewer (len_sent < 21), longest first
true_news_df[true_news_df['len_sent'] < 21].sort_values(by="len_sent",ascending=False)

In [None]:
# Per-subject summary statistics of the numeric columns (e.g. 'len_sent') after cleaning, before short rows are dropped
true_news_df.groupby(['subject']).describe()

In [None]:
# Drop short articles: keep rows with more than 20 words, then renumber the index.
true_news_df = true_news_df.loc[true_news_df['len_sent'] > 20].reset_index(drop=True)
display(true_news_df.shape, true_news_df.sample(2))

In [None]:
# Number of rows per subject after the short articles were removed
true_news_df['subject'].value_counts()

In [None]:
# Per-subject summary statistics of the numeric columns (e.g. 'len_sent') after cleaning and filtering
true_news_df.groupby(['subject']).describe()

In [None]:
# Add the target column 'real_or_fake': every row of the true-news dataframe gets label 1
true_news_df['real_or_fake'] = 1

In [None]:
# (rows, columns) of the cleaned, labelled true-news dataframe
print("Shape of True News: ", true_news_df.shape)

In [None]:
# Make sure to have same number of news instances in both the dataframe.
# Both frames are capped at the size of the smaller one: sampling more rows than
# a frame holds (without replacement) raises ValueError, which the previous
# fake-only down-sampling hit whenever the fake set was the smaller of the two.
# A fixed random_state keeps the selected rows identical across runs.
n_balanced = min(len(fake_news_df), len(true_news_df))
fake_news_df = fake_news_df.sample(n_balanced, random_state=100).reset_index(drop=True)
if len(true_news_df) > n_balanced:
    true_news_df = true_news_df.sample(n_balanced, random_state=100).reset_index(drop=True)
print("Shape of Fake News: ", fake_news_df.shape)

### Modelling Section - Pre-Requirement

In [None]:
# Stack the two labelled dataframes, then shuffle the rows so the classes are interleaved.
final_df = pd.concat([fake_news_df,true_news_df])
# Seeded shuffle: without a random_state the row order (and therefore the
# train/test membership further down) changed on every run.
final_df = final_df.sample(frac=1, random_state=100).reset_index(drop=True)
display(final_df.shape, final_df.sample(2))

In [None]:
### Converting text-information into list
# NOTE(review): `news_text` is reassigned further down to the padded token sequences,
# so this list only serves the length check on the next line.
news_text = final_df['text'].to_list()
len(news_text)

In [None]:
# Creating One-Hot encoding for the 'real_or_fake' feature
le = LabelEncoder()
# Sparse output is OneHotEncoder's default, so no keyword is passed: the `sparse`
# argument was renamed `sparse_output` in scikit-learn 1.2 and removed in 1.4,
# where `OneHotEncoder(sparse=True)` raises TypeError.
oe = OneHotEncoder()

In [None]:
# One-hot encode the label column; reshape(-1, 1) turns it into the single-column 2-D input the encoder is given.
# NOTE(review): `real_fake` is not referenced again in the visible cells; the labels used for training are built with pd.get_dummies below.
real_fake = oe.fit_transform(final_df.real_or_fake.values.reshape(-1,1))
real_fake

In [None]:
# Length every token sequence is padded/truncated to (also the Embedding input_length)
MAX_SEQUENCE_LENGTH = 250
# Vocabulary cap passed to the Tokenizer and used as the Embedding input dimension
MAX_NB_WORDS = 50000
# Size of each word-embedding vector
EMBEDDING_DIM = 100
# The backslash in the filter list is written as an explicit `\\`: the previous
# `\]` was an invalid escape sequence (SyntaxWarning on recent Python versions).
# The resulting filter string is unchanged.
tokenizer = Tokenizer(num_words=MAX_NB_WORDS, filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~', lower=True)

In [None]:
# NOTE(review): the vocabulary is fitted on all rows of final_df, i.e. before the train/test split below.
tokenizer.fit_on_texts(final_df['text'].values)  #  to update the internal vocabulary for the texts list
word_index = tokenizer.word_index  # the tokenizer's word index, kept only to report its size below
print('Found %s unique tokens.' % len(word_index))

In [None]:
# print("Word Index Sample: ", word_index)

![Word Index](Image/word_index.JPG)

In [None]:
# From here on `news_text` holds the model input, replacing the list of raw strings assigned earlier.
news_text = tokenizer.texts_to_sequences(final_df['text'].values) # converting tokens of text corpus into a sequence of integers
# Bring every sequence to the fixed length MAX_SEQUENCE_LENGTH
news_text = pad_sequences(news_text, maxlen=MAX_SEQUENCE_LENGTH)
print('Shape of data tensor:', news_text.shape)

In [None]:
# One indicator column per distinct label value (0 = fake, 1 = true), matching the 2-unit softmax output of the model
Y = pd.get_dummies(final_df['real_or_fake']).values
print('Shape of label tensor:', Y.shape)

In [None]:
# Hold out 20% of the rows as the test set; random_state fixes which rows are held out for a given input order
X_train, X_test, y_train, y_test= train_test_split(news_text, Y, test_size=0.2, random_state=100)
print(X_train.shape,y_train.shape)
print(X_test.shape,y_test.shape)

In [None]:
# Embedding -> spatial dropout -> single LSTM layer -> 2-way softmax classifier
model = Sequential()
model.add(Embedding(MAX_NB_WORDS, EMBEDDING_DIM, input_length=MAX_SEQUENCE_LENGTH))
model.add(SpatialDropout1D(0.2))
model.add(LSTM(100, dropout=0.2, recurrent_dropout=0.2))
# Two output units because the labels are one-hot encoded (fake / true)
model.add(Dense(2, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
# summary() prints the table itself and returns None, so wrapping it in print()
# only added a stray "None" line to the output.
model.summary()

In [None]:
# Layer-by-layer overview of the model (output shapes and parameter counts)
model.summary()

In [None]:
# Render the architecture diagram to model_plot.png (relies on the pydot / graphviz packages listed in the first cell)
tf.keras.utils.plot_model(model, to_file='model_plot.png', show_shapes=True, show_layer_names=True)

In [None]:
# NOTE: with patience=3, early stopping can only trigger when epochs is raised above 3.
epochs = 1
batch_size = 32

# Hold out 10% of the training data for validation. EarlyStopping monitors
# `val_loss`, which Keras only computes when validation data is supplied;
# without it the callback is inert and `history` has no `val_*` entries for
# the plots further down.
history = model.fit(X_train, y_train, epochs=epochs, batch_size=batch_size, validation_split=0.1, callbacks=[EarlyStopping(monitor='val_loss', patience=3, min_delta=0.0001)])

In [None]:
# Evaluate on the held-out test set; accr[0] is the loss and accr[1] the accuracy metric set in compile()
accr = model.evaluate(X_test,y_test)
print('Test set\n  Loss: {:0.3f}\n  Accuracy: {:0.3f}'.format(accr[0],accr[1]))

In [None]:
# Training loss per epoch, plus the validation loss when it was recorded.
plt.title('Loss')
plt.xlabel('epoch')
plt.ylabel('loss')
plt.plot(history.history['loss'], label='train')
# `val_loss` exists only if fit() was given validation data; guard the lookup
# instead of raising KeyError. The curve is validation data, not the test set.
if 'val_loss' in history.history:
    plt.plot(history.history['val_loss'], label='validation')
plt.legend()
plt.show();

In [None]:
# Training accuracy per epoch, plus the validation accuracy when it was recorded.
# With metrics=['accuracy'] current Keras stores the history under 'accuracy' /
# 'val_accuracy'; older releases used 'acc' / 'val_acc'. Pick whichever is present.
acc_key = 'accuracy' if 'accuracy' in history.history else 'acc'
val_acc_key = 'val_' + acc_key
plt.title('Accuracy')
plt.xlabel('epoch')
plt.ylabel('accuracy')
plt.plot(history.history[acc_key], label='train')
# Validation entries exist only if fit() was given validation data.
if val_acc_key in history.history:
    plt.plot(history.history[val_acc_key], label='validation')
plt.legend()
plt.show();

In [None]:
# Total wall-clock time since start_time was recorded at the top of the notebook
print("--- %s seconds ---" % (time.time() - start_time))