In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()
import tensorflow as tf

from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Dense, Input, GlobalMaxPooling1D, LSTM, Embedding
from tensorflow.keras.models import Model

In [6]:
df = pd.read_csv('news.csv')

In [7]:
df['text'] = df['title'] + ' ' + df['text']

In [8]:
df.head()

Unnamed: 0.1,Unnamed: 0,title,text,label
0,8476,You Can Smell Hillary’s Fear,You Can Smell Hillary’s Fear Daniel Greenfield...,FAKE
1,10294,Watch The Exact Moment Paul Ryan Committed Pol...,Watch The Exact Moment Paul Ryan Committed Pol...,FAKE
2,3608,Kerry to go to Paris in gesture of sympathy,Kerry to go to Paris in gesture of sympathy U....,REAL
3,10142,Bernie supporters on Twitter erupt in anger ag...,Bernie supporters on Twitter erupt in anger ag...,FAKE
4,875,The Battle of New York: Why This Primary Matters,The Battle of New York: Why This Primary Matte...,REAL


In [11]:
df.drop(columns =['Unnamed: 0', 'title'], inplace=True)

In [13]:
df['label'] = df['label'].map({'REAL':0, 'FAKE': 1})

In [15]:
Y = df['label'].values
X = df['text']

In [16]:
X_train, X_test, Y_train, Y_test = train_test_split(X,Y,test_size=0.33)

In [17]:
MAX_VOCAB_SIZE = 20000
tokenizer = Tokenizer(num_words = MAX_VOCAB_SIZE)
tokenizer.fit_on_texts(X_train)
sequences_train = tokenizer.texts_to_sequences(X_train)
sequences_test = tokenizer.texts_to_sequences(X_test)

In [18]:
word2idx = tokenizer.word_index
V = len(word2idx)
print('# of tokens:', V)

# of tokens: 80764


In [19]:
data_train = pad_sequences(sequences_train)
T = data_train.shape[1]
data_train.shape

(4244, 16944)

In [20]:
data_test = pad_sequences(sequences_test, maxlen =T)
data_test.shape

(2091, 16944)

In [21]:
D = 20

M = 15

i = Input(shape=(T,))
x = Embedding(V+1,D)(i)
x = LSTM(M, return_sequences=True)(x)
x = GlobalMaxPooling1D()(x)
x = Dense(1, activation = 'sigmoid')(x)
model = Model(i,x)

In [None]:
model.compile(
    loss = 'binary_crossentropy',
    optimizer= 'adam',
    metrics = ['accuracy']
)
r = model.fit(
    data_train, Y_train,
    epochs=10,
    validation_data = (data_test, Y_test)
)

Epoch 1/10
  4/133 [..............................] - ETA: 1:44:50 - loss: 0.6936 - accuracy: 0.5000

In [None]:
plt.plot(r.history['loss'],label='loss')
plt.plot(r.history['val_loss'],label='val_loss')
plt.legend()

In [None]:
plt.plot(r.history['accuracy'],label='acc')
plt.plot(r.history['val_accuracy'],label='val_acc')
plt.legend()