In [1]:
import pickle
import random

import numpy as np
import pandas as pd
from nltk.tokenize import RegexpTokenizer

from tensorflow.keras.layers import LSTM, Activation, Dense, Input
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.optimizers import RMSprop

In [2]:
# Mount Google Drive at /content/gdrive so the dataset stored there can be
# read by the CSV-loading cell. NOTE(review): `google.colab` is presumably
# only importable inside a Colab runtime -- confirm before running elsewhere.
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [3]:
# Read the news dataset from the mounted Drive folder.
# (When running locally, the file was previously read from
# 'Downloads/fake_or_real_news.csv' instead.)
csv_path = '/content/gdrive/My Drive/Colab Notebooks/fake_or_real_news.csv'
text_df = pd.read_csv(csv_path)
text_df.head()

Unnamed: 0,id,title,text,label
0,8476,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",FAKE
1,10294,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,FAKE
2,3608,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,REAL
3,10142,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",FAKE
4,875,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,REAL


In [4]:
# Pull every article body out of the `text` column and glue them together
# into one space-separated corpus string.
text = text_df["text"].tolist()
joined_text = " ".join(text)

In [5]:
# Work with the first million characters of the corpus only
# (presumably to keep tokenisation and the one-hot arrays manageable).
partial_text = joined_text[:1_000_000]

In [6]:
# Tokenise the lower-cased corpus: the pattern \w+ keeps runs of word
# characters and drops everything between them.
tokenizer = RegexpTokenizer(r"\w+")
lowercase_text = partial_text.lower()
tokens = tokenizer.tokenize(lowercase_text)


In [None]:
# Preview the first tokens only; rendering the full list would flood the output.
tokens[:20]

In [None]:
# Vocabulary: the distinct tokens of the corpus (np.unique returns them sorted).
unique_tokens = np.unique(tokens)

# Dictionary mapping each token to its position in `unique_tokens`.
unique_tokens_index = {token: idx for idx, token in enumerate(unique_tokens)}

# Preview a few entries rather than rendering the whole mapping.
list(unique_tokens_index.items())[:10]

Now, how many words does the AI look at to predict the next word?

In [9]:
# Context length: how many consecutive tokens form one training input.
n_words = 10

# Slide a window of `n_words` tokens over the corpus. Each window is one
# input sample and the token immediately after it is the prediction target.
input_words = [tokens[start:start + n_words] for start in range(len(tokens) - n_words)]
next_words = [tokens[start + n_words] for start in range(len(tokens) - n_words)]

In [None]:
# Show the first few windows only; the full list has one entry per token position.
input_words[:5]

In [None]:
# Show the targets that belong to the first few windows only.
next_words[:5]

In [12]:
# One-hot training arrays, initialised to all False. These are dense NumPy
# arrays that will be mostly zeros:
#   X has shape (samples, n_words, vocabulary size)
#   Y has shape (samples, vocabulary size)
vocab_size = len(unique_tokens)
X = np.zeros((len(input_words), n_words, vocab_size), dtype=bool)
Y = np.zeros((len(next_words), vocab_size), dtype=bool)

In [None]:
# Inspect the target array as a table (pandas truncates the display).
pd.DataFrame(data=Y)

In [14]:
# Fill in the one-hot encodings: for every sample, flag the vocabulary index
# of each context word in X and of the target word in Y.
for sample_idx, (window, target) in enumerate(zip(input_words, next_words)):
    for position, word in enumerate(window):
        X[sample_idx, position, unique_tokens_index[word]] = True
    Y[sample_idx, unique_tokens_index[target]] = True

In [15]:
# Inspect the target array as a table (pandas truncates the display).
pd.DataFrame(data=Y)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,14379,14380,14381,14382,14383,14384,14385,14386,14387,14388
0,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
169672,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
169673,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
169674,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
169675,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


Let's build, compile, and train the model


In [16]:
# Network: two stacked 128-unit LSTM layers followed by a dense layer with one
# output per vocabulary entry, turned into probabilities by a softmax.
model = Sequential()
# Declare the input shape with an explicit Input layer. Passing `input_shape`
# to the first LSTM instead makes recent Keras versions emit a UserWarning
# that recommends this form.
model.add(Input(shape=(n_words, len(unique_tokens))))
# return_sequences=True so the second LSTM receives the output at every
# position of the window rather than only the last one.
model.add(LSTM(128, return_sequences=True))
model.add(LSTM(128))
model.add(Dense(len(unique_tokens)))
model.add(Activation("softmax"))


  super().__init__(**kwargs)


In [17]:
# Configure training: RMSprop with learning rate 0.01, categorical
# cross-entropy against the one-hot targets, accuracy reported as a metric.
optimizer = RMSprop(learning_rate=0.01)
model.compile(
    optimizer=optimizer,
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

In [None]:
# Train for 5 epochs on shuffled batches of 128 samples and keep the
# `.history` attribute of the result so it can be pickled later.
fit_result = model.fit(X, Y, batch_size=128, epochs=5, shuffle=True)
history = fit_result.history

In [None]:
# Persist the trained model and its training history next to the notebook.
model.save("text_gen_model2.h5")
with open("history2.p", "wb") as history_file:
    history_file.write(pickle.dumps(history))

In [None]:
# Reload the saved model and training history (lets the cells below run
# without repeating the training step).
model = load_model("text_gen_model2.h5")
# NOTE: unpickling can execute arbitrary code, so only load a history file
# that this notebook wrote itself. The `with` block closes the file handle,
# which a bare `pickle.load(open(...))` would leave open.
with open("history2.p", "rb") as f:
    history = pickle.load(f)

In [None]:
def predict_next_word(input_text, n_best):
    """Return the vocabulary indices of the ``n_best`` most likely next words.

    The text is lower-cased, split on whitespace and one-hot encoded into an
    array of shape ``(1, n_words, len(unique_tokens))`` that is passed to
    ``model.predict``.

    Args:
        input_text: Context words separated by whitespace. Inputs shorter
            than ``n_words`` words leave the trailing positions all-zero.
        n_best: Number of candidate indices to return.

    Returns:
        Array of ``n_best`` indices into ``unique_tokens``. The order is
        unspecified: ``np.argpartition`` selects the top entries without
        sorting them.

    Raises:
        KeyError: If a word is not a key of ``unique_tokens_index``.
        IndexError: If ``input_text`` contains more than ``n_words`` words.
    """
    input_text = input_text.lower()
    encoded = np.zeros((1, n_words, len(unique_tokens)))
    for position, word in enumerate(input_text.split()):
        # The lookup table is named `unique_tokens_index`; the previous
        # spelling `unique_token_index` was undefined and raised NameError.
        encoded[0, position, unique_tokens_index[word]] = 1

    # model.predict returns one row per sample; there is a single sample here.
    predictions = model.predict(encoded)[0]
    return np.argpartition(predictions, -n_best)[-n_best:]

In [None]:
# Ask the model for its five most likely continuations of a ten-word prompt.
possible = predict_next_word(
    "I will have to look into this thing because I",
    n_best=5,
)

In [None]:
# Translate the predicted indices back into words, one per line.
for word in unique_tokens[possible]:
    print(word)

In [3]:
def generate_text(input_text, n_words, creativity=3, context_size=None):
    """Extend ``input_text`` by ``n_words`` generated words.

    Each step feeds the most recent ``context_size`` tokens to
    ``predict_next_word`` and appends one of its ``creativity`` candidates,
    picked at random.

    Args:
        input_text: Seed text; its words are kept verbatim at the start of
            the result.
        n_words: Number of words to generate. This parameter shadows the
            notebook-level context length of the same name, so it must not be
            used as the window width.
        creativity: How many top candidates to sample from at each step.
        context_size: Number of trailing tokens used as context. Defaults to
            the sequence length the model was built with
            (``model.input_shape[1]``).

    Returns:
        The seed words followed by the generated words, joined by spaces.
    """
    if context_size is None:
        context_size = model.input_shape[1]

    word_sequence = input_text.split()
    for _ in range(n_words):
        # Re-tokenise what we have so far and keep only the newest tokens, so
        # the window always ends at the latest word however long the seed is.
        context_tokens = tokenizer.tokenize(" ".join(word_sequence).lower())
        sub_sequence = " ".join(context_tokens[-context_size:])
        try:
            candidates = predict_next_word(sub_sequence, creativity)
            choice = unique_tokens[random.choice(candidates)]
        except (KeyError, IndexError):
            # Best-effort fallback for lookup failures only (e.g. a word that
            # is not in the vocabulary). Other errors are real bugs and are
            # allowed to surface instead of silently producing random text.
            choice = random.choice(unique_tokens)
        word_sequence.append(choice)
    return " ".join(word_sequence)

In [None]:
# Generate 100 more words, sampling each from the model's top 10 candidates.
generate_text(
    "I will have to look into this thing because I",
    n_words=100,
    creativity=10,
)

In [None]:
# Print the five most likely continuations of another ten-word prompt.
top_indices = predict_next_word("The president will most likely not be there to help", 5)
for word in unique_tokens[top_indices]:
    print(word)