In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for folder, _subdirs, files in os.walk('/kaggle/input'):
    # Print the full path of every file found under the input directory.
    for path in (os.path.join(folder, name) for name in files):
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import matplotlib.pyplot as plt
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import seaborn as sns

In [None]:
import warnings
# Silence every warning for the rest of the session.
# NOTE(review): this also hides deprecation warnings raised by the libraries used below.
warnings.filterwarnings('ignore')

In [None]:
# Loading datasets: the true-news and fake-news articles are read from two separate CSV files.
truedf=pd.read_csv('../input/fake-news-detection/True.csv')
fakedf=pd.read_csv('../input/fake-news-detection/Fake.csv')

In [None]:
# Summary of the true-news frame (columns, dtypes, non-null counts).
truedf.info()

In [None]:
# Summary of the fake-news frame (columns, dtypes, non-null counts).
fakedf.info()

In [None]:
# Preview the first rows of the true-news frame.
truedf.head()

In [None]:
# Preview the first rows of the fake-news frame.
fakedf.head()

In [None]:
# Attach the target column: 1 marks rows loaded from True.csv, 0 marks rows loaded from Fake.csv.
truedf['label']=1
fakedf['label']=0

In [None]:
# Check for null values in the true-news dataframe (one boolean per column).
truedf.isnull().any()

In [None]:
# Check for null values in the fake-news dataframe (one boolean per column).
fakedf.isnull().any()

In [None]:
# Combining to one dataset: true rows are stacked above fake rows and the
# original per-file index is discarded in favour of a fresh one.
df=pd.concat([truedf,fakedf]).reset_index(drop=True)

In [None]:
# Summary of the combined frame (columns, dtypes, non-null counts).
df.info()

In [None]:
# Check for class imbalance: bar chart and pie chart of the label counts
# (1 = true news, 0 = fake news).
label_counts = df['label'].value_counts()  # computed once so pie labels and wedges stay aligned
fig, ax = plt.subplots(1, 2, figsize=(19, 5))
# Pass the column by keyword: seaborn >= 0.12 treats the first positional
# argument as `data`, so a positional Series is no longer read as `x`.
g1 = sns.countplot(x='label', data=df, ax=ax[0], palette="Set3")
g1.set_title("Count of Real and Fake data")
g1.set_ylabel("Count")
g1.set_xlabel("Label")
# Draw on the second axes explicitly instead of relying on pyplot's implicit "current axes".
g2 = ax[1].pie(label_counts.values, explode=[0, 0], labels=label_counts.index,
               autopct='%1.1f%%', colors=['SkyBlue', 'LightPink'])
# plt.show() matches the other plotting cells in this notebook.
plt.show()

In [None]:
#Removing unnecessary columns and combining title and text
# Rebind instead of dropping in place, and tolerate an already-missing column,
# so re-running this cell does not raise KeyError on 'date'.
df = df.drop(columns=['date'], errors='ignore')
# Headline and body are joined with a single space into one text field.
df['titletext'] = df['title'] + ' ' + df['text']
df.head()

In [None]:
import gensim
from gensim.utils import simple_preprocess

In [None]:
#Data Cleaning
_ENGLISH_STOPWORDS = None  # lazily-built frozenset shared by every dclean() call


def _english_stopwords():
    """Return NLTK's English stop words as a frozenset, loading the corpus only once."""
    global _ENGLISH_STOPWORDS
    if _ENGLISH_STOPWORDS is None:
        # stopwords.words() rebuilds the list on each call; cache it and use a
        # set so membership tests do not scan a list for every token.
        _ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))
    return _ENGLISH_STOPWORDS


def dclean(data):
    """Tokenise a piece of text and drop uninformative tokens.

    The text is split with gensim's ``simple_preprocess``. Tokens that are
    NLTK English stop words, or that have three or fewer characters, are
    discarded.

    Args:
        data: the text to clean.

    Returns:
        list: the kept tokens, in their original order.
    """
    sw = _english_stopwords()
    return [word for word in gensim.utils.simple_preprocess(data)
            if word not in sw and len(word) > 3]

In [None]:
# Replace each combined title+text string with its list of cleaned tokens.
# NOTE(review): the column changes type in place (string -> list), so this cell
# is not safe to re-run on its own; re-run from the cell that builds 'titletext'.
df['titletext']=df['titletext'].apply(dclean)

In [None]:
# Preview the frame after cleaning ('titletext' now holds token lists).
df.head()

In [None]:
# Collapse each token list back into a single space-separated string.
df['words'] = df['titletext'].map(" ".join)
df.head()

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 30% of the rows as the test set; random_state pins the shuffle so the split is repeatable.
xtrain,xtest,ytrain,ytest=train_test_split(df['words'],df['label'],test_size=0.3,random_state=42)

In [None]:
from tensorflow.keras.preprocessing.text import one_hot, Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Flatten, Embedding, Input, LSTM, Conv1D, MaxPool1D, Bidirectional
from tensorflow.keras.models import Model
import tensorflow as tf

In [None]:
# Vocabulary size: the number of distinct tokens across every cleaned article.
list_of_words = [word for news in df.titletext for word in news]
totalwords = len(set(list_of_words))
totalwords

In [None]:
#Tokenising words
# The tokenizer is fitted on the training texts only; that same fitted tokenizer
# then encodes both the training and the test texts as integer sequences.
tz = Tokenizer(num_words = totalwords)
tz.fit_on_texts(xtrain)
trainseq=tz.texts_to_sequences(xtrain)
testseq=tz.texts_to_sequences(xtest)

In [None]:
# Force every article to exactly 400 token ids. Both splits use the same padding
# and truncation sides so test inputs are laid out exactly like the training
# inputs (pad_sequences pads at the front unless padding='post' is given).
ptrainseq=pad_sequences(trainseq,maxlen=400,padding='post',truncating='post')
ptestseq=pad_sequences(testseq,maxlen=400,padding='post',truncating='post')

In [None]:
# Convert both label containers to NumPy arrays.
ytrain, ytest = np.asarray(ytrain), np.asarray(ytest)

In [None]:
# Wrap the padded sequences and the label arrays as TensorFlow tensors for training.
trainx=tf.convert_to_tensor(ptrainseq)
trainy=tf.convert_to_tensor(ytrain)
testx=tf.convert_to_tensor(ptestseq)
testy=tf.convert_to_tensor(ytest)

In [None]:
# Binary classifier: embedding -> LSTM -> dense hidden layer -> single sigmoid output.
model = Sequential([
    # Embedding layer: a 128-dimensional vector per token id
    Embedding(totalwords, output_dim=128),
    # LSTM
    LSTM(units=128, dropout=0.2),
    # Dense layers
    Dense(128, activation='relu'),
    Dense(1, activation='sigmoid'),
])
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])
model.summary()

In [None]:
# Train for 10 epochs in batches of 64, with 30% of the training tensors set aside for validation.
# NOTE(review): the explicit '/GPU:0' placement assumes a GPU is attached to the session — confirm.
with tf.device('/GPU:0'):
    history = model.fit(trainx, trainy, validation_split=0.3, epochs=10, batch_size=64, shuffle=True, verbose = 1)

In [None]:
# Report accuracy as a percentage on the training and the test split.
# Element [1] of the evaluate() result is taken as the accuracy
# (presumably [loss, acc], since 'acc' is the only compiled metric — verify).
print("Training Data Accuracy is: " , model.evaluate(ptrainseq,ytrain)[1]*100 , "%")
print("Testing Data Accuracy is: " , model.evaluate(ptestseq,ytest)[1]*100 , "%")

In [None]:
# Training vs. validation accuracy per epoch.
# "val_acc" is measured on the validation_split portion of the training data
# passed to model.fit, not on the held-out test set, so the curve is labelled
# "Validation" rather than "Test".
plt.figure()
plt.plot(history.history["acc"], label = "Train")
plt.plot(history.history["val_acc"], label = "Validation")
plt.title("Accuracy")
plt.ylabel("Acc")
plt.xlabel("epochs")
plt.legend()
plt.show()

In [None]:
# Training vs. validation loss per epoch.
# "val_loss" comes from the validation_split portion of the training data, not
# from the held-out test set, so the curve is labelled "Validation".
plt.figure()
plt.plot(history.history["loss"], label = "Train")
plt.plot(history.history["val_loss"], label = "Validation")
plt.title("Loss")
# The y-axis shows loss values, so it is labelled "Loss".
plt.ylabel("Loss")
plt.xlabel("epochs")
plt.legend()
plt.show()

In [None]:
# Persist the trained model to 'fakenewsmodel.h5' (a relative path, so it lands in the current working directory).
# NOTE(review): the fitted tokenizer `tz` is not saved in the visible cells; it would be needed to encode new text for this model.
model.save('fakenewsmodel.h5')