In [1]:
import pickle
import random

import numpy as np
import pandas as pd
from nltk.tokenize import RegexpTokenizer
from tensorflow.keras.layers import LSTM, Activation, Dense, Input
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.optimizers import Adam

In [2]:
# Mount Google Drive at /content/drive so the dataset stored there can be read.
# NOTE: google.colab is only available inside a Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
# Load the news dataset from the mounted Drive folder into a DataFrame.
csv_path = '/content/drive/MyDrive/Colab-Notebooks/projects/Next Word Prediction/fake_or_real_news.csv'
text_data = pd.read_csv(csv_path)

In [4]:
# Peek at the last five rows to confirm the columns (id, title, text, label).
text_data.tail(n=5)

Unnamed: 0,id,title,text,label
6330,4490,State Department says it can't find emails fro...,The State Department told the Republican Natio...,REAL
6331,8062,The ‘P’ in PBS Should Stand for ‘Plutocratic’ ...,The ‘P’ in PBS Should Stand for ‘Plutocratic’ ...,FAKE
6332,8622,Anti-Trump Protesters Are Tools of the Oligarc...,Anti-Trump Protesters Are Tools of the Oligar...,FAKE
6333,4021,"In Ethiopia, Obama seeks progress on peace, se...","ADDIS ABABA, Ethiopia —President Obama convene...",REAL
6334,4330,Jeb Bush Is Suddenly Attacking Trump. Here's W...,Jeb Bush Is Suddenly Attacking Trump. Here's W...,REAL


In [5]:
# Gather every article body and concatenate them into one long string,
# with a single space between consecutive articles.
text = text_data["text"].tolist()
joined_text = " ".join(text)

In [6]:
# Work on a prefix of the corpus only (presumably to keep the one-hot
# tensors built later at a manageable size).
max_chars = 300_000
partial_text = joined_text[:max_chars]

In [7]:
# Tokenize on runs of word characters (punctuation is discarded) after
# lowercasing, so the vocabulary is case-insensitive.
tokenizer = RegexpTokenizer(pattern=r'\w+')
lowered_text = partial_text.lower()
tokens = tokenizer.tokenize(lowered_text)

In [8]:
# Show the first eleven tokens as a sanity check on the tokenizer output.
tokens[:11]

['daniel',
 'greenfield',
 'a',
 'shillman',
 'journalism',
 'fellow',
 'at',
 'the',
 'freedom',
 'center',
 'is']

In [9]:
# Build the vocabulary (np.unique returns the distinct tokens in sorted order)
# and a token -> position lookup used for one-hot encoding.
unique_tokens = np.unique(tokens)
unique_token_index = dict(zip(unique_tokens, range(len(unique_tokens))))

In [10]:
# Slide a window of n_words tokens over the corpus: each window is one
# training input and the token right after it is the prediction target.
n_words = 10
window_starts = range(len(tokens) - n_words)
input_words = [tokens[start:start + n_words] for start in window_starts]
next_words = [tokens[start + n_words] for start in window_starts]

In [11]:
# Allocate the one-hot tensors as booleans:
#   x -> (samples, n_words, vocabulary size), one row per context position
#   y -> (samples, vocabulary size), one row per target word
vocab_size = len(unique_tokens)
x = np.zeros(shape=(len(input_words), n_words, vocab_size), dtype=bool)
y = np.zeros(shape=(len(next_words), vocab_size), dtype=bool)

In [12]:
# Display the target matrix; at this point it is still all False because
# the one-hot encoding has not been written yet.
y

array([[False, False, False, ..., False, False, False],
       [False, False, False, ..., False, False, False],
       [False, False, False, ..., False, False, False],
       ...,
       [False, False, False, ..., False, False, False],
       [False, False, False, ..., False, False, False],
       [False, False, False, ..., False, False, False]])

In [13]:
# One-hot encode every training window into x and its target word into y.
# The loop variables are given distinct names: the original inner loop
# reused `words`, shadowing the outer window while iterating over it.
for i, window in enumerate(input_words):
  for j, word in enumerate(window):
    # Position j of sample i is the vocabulary column of this word.
    x[i, j, unique_token_index[word]] = True
  y[i, unique_token_index[next_words[i]]] = True

In [14]:
# Two stacked LSTM layers followed by a softmax over the whole vocabulary.
# The input shape is declared with an explicit Input layer; passing
# `input_shape=` to the first LSTM is deprecated for Sequential models and
# makes Keras emit a UserWarning.
model = Sequential()
model.add(Input(shape=(n_words, len(unique_tokens))))
# return_sequences=True so the second LSTM receives the full sequence.
model.add(LSTM(128, return_sequences=True))
model.add(LSTM(128))
model.add(Dense(len(unique_tokens)))
model.add(Activation('softmax'))

  super().__init__(**kwargs)


In [15]:
# Print a summary of the model's layers.
model.summary()

In [16]:
# Configure training: categorical cross-entropy matches the one-hot targets,
# optimized with Adam at a learning rate of 0.01, tracking accuracy.
optimizer = Adam(learning_rate=0.01)
model.compile(optimizer=optimizer, loss='categorical_crossentropy', metrics=['accuracy'])

In [17]:
# Train on the one-hot windows for 30 epochs in shuffled mini-batches of 128.
model.fit(
  x,
  y,
  batch_size=128,
  epochs=30,
  shuffle=True,
)

Epoch 1/30
[1m400/400[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 20ms/step - accuracy: 0.0557 - loss: 7.4043
Epoch 2/30
[1m400/400[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 21ms/step - accuracy: 0.0561 - loss: 6.9419
Epoch 3/30
[1m400/400[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 20ms/step - accuracy: 0.0578 - loss: 6.8988
Epoch 4/30
[1m400/400[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 20ms/step - accuracy: 0.0643 - loss: 6.8031
Epoch 5/30
[1m400/400[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 20ms/step - accuracy: 0.0771 - loss: 6.5487
Epoch 6/30
[1m400/400[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 21ms/step - accuracy: 0.0812 - loss: 6.5339
Epoch 7/30
[1m400/400[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 21ms/step - accuracy: 0.1079 - loss: 6.1398
Epoch 8/30
[1m400/400[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 20ms/step - accuracy: 0.1150 - loss: 5.8629
Epoch 9/30
[1m400/400[0m 

<keras.src.callbacks.history.History at 0x7eafc96ecad0>

In [18]:
def predict_next_word(input_text, n_best):
  """Return the vocabulary indices of the `n_best` most likely next words.

  The text is lowercased and tokenized with the same tokenizer used to
  build the training data, and only the last `n_words` tokens are used as
  context, so inputs of any length (with or without punctuation) are safe.

  Args:
    input_text: Context string to continue.
    n_best: How many candidate indices to return. Values larger than the
      vocabulary are clamped; values <= 0 yield an empty array.

  Returns:
    A 1-D integer array of indices into `unique_tokens`. The indices are the
    top `n_best` by predicted probability but are NOT sorted by probability
    (np.argpartition only guarantees the partition, not the order).

  Raises:
    KeyError: If a context token is not in the training vocabulary.
  """
  # Tokenize like the training text instead of str.split(), which would keep
  # punctuation attached to words and miss the vocabulary lookup. Keep only
  # the trailing n_words tokens: the model input has exactly n_words slots
  # and indexing past them would raise IndexError.
  words = tokenizer.tokenize(input_text.lower())[-n_words:]
  x = np.zeros((1, n_words, len(unique_tokens)))
  for i, word in enumerate(words):
    x[0, i, unique_token_index[word]] = 1
  predictions = model.predict(x)[0]
  # Clamp the request to the vocabulary size; argpartition rejects a kth
  # outside the array bounds.
  n_best = min(n_best, len(predictions))
  if n_best <= 0:
    # Without this guard `[-0:]` would return the entire vocabulary.
    return np.array([], dtype=np.intp)
  return np.argpartition(predictions, -n_best)[-n_best:]

In [19]:
# Ask for the five most likely continuations of a ten-word seed and map the
# returned vocabulary indices back to their words.
seed_text = "daniel greenfield a shillman journalism fellow at the freedom center"
possible = predict_next_word(seed_text, 5)
candidate_words = [unique_tokens[idx] for idx in possible]
print(candidate_words)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 597ms/step
[np.str_('for'), np.str_('forces'), np.str_('after'), np.str_('perhaps'), np.str_('is')]


In [20]:
def generate_text(input_text, text_length, creativity=3):
  """Extend `input_text` by `text_length` generated words.

  At each step the last `n_words` tokens of the running text are fed to
  `predict_next_word`, and the next word is drawn uniformly at random from
  its `creativity` best candidates.

  Args:
    input_text: Seed text to continue.
    text_length: Number of words to append.
    creativity: Size of the candidate pool sampled at each step; larger
      values give more varied (and less likely) continuations.

  Returns:
    The seed words followed by the generated words, joined by single spaces.
  """
  word_sequence = input_text.split()
  # Tokenized view of the running text, maintained incrementally instead of
  # re-tokenizing the whole sequence on every step. Generated words come
  # from the vocabulary, so each one is already exactly one token.
  context_tokens = tokenizer.tokenize(input_text.lower())
  for _ in range(text_length):
    # Always condition on the most recent n_words tokens. A start offset that
    # advances by one per step only tracks the end of the text when the seed
    # has exactly n_words tokens; shorter seeds never grow their context and
    # longer seeds lag behind the words just generated.
    sub_sequence = " ".join(context_tokens[-n_words:])
    try:
      choice = unique_tokens[random.choice(predict_next_word(sub_sequence, creativity))]
    except KeyError:
      # A context word is outside the training vocabulary: fall back to a
      # random vocabulary word. Only KeyError is caught so that real
      # failures and KeyboardInterrupt are not silently swallowed.
      choice = random.choice(unique_tokens)
    word_sequence.append(choice)
    context_tokens.append(choice)
  return " ".join(word_sequence)

In [21]:
# Generate 100 words from a ten-word seed, sampling among the top five
# candidates at each step.
seed_text = "daniel greenfield a shillman journalism fellow at the freedom center"
generate_text(seed_text, text_length=100, creativity=5)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 45ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 42ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 43ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 42ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 43ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 42ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 46ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 42ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 42ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 42ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 41ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 45ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 44ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 47

'daniel greenfield a shillman journalism fellow at the freedom center after democrats so will ready for my heart for there were a surreal candidate in their enough by broker international school with wednesday later from clinton happens won then humanity cannon became free incendiary since he served not in favor that this d have take listeners long was against it knows how hillary is colorado c to 11 bush and both parties so there were him all at this specifically told her rights fear about find in a private sense young care too young way that kerry also has avoid one of why senate as she has faced a involved'

In [22]:
# Persist the trained model together with the vocabulary array and the
# token -> index mapping, which are both needed to encode text at inference.
model.save("next_word_model.h5")
artifacts = (
    ("unique_tokens.pkl", unique_tokens),
    ("token_index.pkl", unique_token_index),
)
for pickle_path, artifact in artifacts:
    with open(pickle_path, "wb") as f:
        pickle.dump(artifact, f)

