In [1]:
import numpy as np
import pandas as pd
import json
import keras
import keras.preprocessing.text as kpt
import tensorflow as tf
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation
from keras.preprocessing.text import Tokenizer

# Seed for the sources of randomness this notebook controls.
seed = 123
# Vocabulary size: only the 5000 most frequent words found in the dataset are
# used as features (this is also the width of the network's input layer).
max_words = 5000

# `seed` used to be defined but never applied, so no two runs were comparable.
# Seed NumPy's global RNG so that every NumPy-based random draw in this
# session is repeatable.
# NOTE(review): weight initialisation and dropout are drawn by the TensorFlow
# backend, which has its own seed; set it as well (the exact call depends on
# the installed TensorFlow version) for full run-to-run reproducibility.
np.random.seed(seed)

Using TensorFlow backend.


In [2]:
# Load the two corpora; every column is read as a string.
positive_df = pd.read_csv("radiocanada_small.csv", dtype='str')
negative_df = pd.read_csv("fakenews.csv", dtype='str')
print(positive_df.shape)
print(negative_df.shape)

# Rows from radiocanada_small.csv get label 1, rows from fakenews.csv label 0.
X_positive = positive_df["text"]
y_positive = pd.DataFrame(np.ones(positive_df.shape[0]))

X_negative = negative_df["text"]
y_negative = pd.DataFrame(np.zeros(negative_df.shape[0]))

# Stack the two sets: all label-1 rows first, then all label-0 rows.
# Features and labels are concatenated in the same order so they stay aligned.
train_x = pd.concat([X_positive, X_negative], axis=0)
train_y = pd.concat([y_positive, y_negative], axis=0)

# `Series.as_matrix()` was deprecated in pandas 0.23 and removed in 1.0;
# `.values` yields the same ndarray on old and new pandas versions alike.
train_x = train_x.values.astype('str')

print("X Shape:", train_x.shape)
print("y shape:", train_y.shape)

(964, 7)
(752, 6)
X Shape: (1716,)
y shape: (1716, 1)


In [3]:
# Tokenizer configured with num_words=max_words
tokenizer = Tokenizer(num_words=max_words)
# Let the tokenizer learn its vocabulary from the training texts
tokenizer.fit_on_texts(train_x)

# word -> integer id mapping learned by the tokenizer
dictionary = tokenizer.word_index
# Write the mapping to disk so the same ids can be reused later
with open('dictionary.json', 'w') as vocabulary_out:
    vocabulary_out.write(json.dumps(dictionary))


def convert_text_to_index_array(text):
    """Convert a raw text into the list of integer ids of its words.

    `kpt.text_to_word_sequence` splits `text` into a list of word tokens;
    each token is then looked up in the module-level `dictionary`
    (the tokenizer's word -> id mapping).

    The result holds one id per recognised word, so its length varies from
    text to text -- nothing is padded or truncated here.

    Words that are not in `dictionary` are skipped rather than raising
    `KeyError`, so the function can also be applied to text the tokenizer
    was not fitted on.
    """
    return [dictionary[word]
            for word in kpt.text_to_word_sequence(text)
            if word in dictionary]

# For each text, replace every token by its id in the tokenizer's word_index.
allWordIndices = [convert_text_to_index_array(text) for text in train_x]

# The per-text id lists have different lengths. Converting such a ragged
# list with a plain np.asarray() raises ValueError on NumPy >= 1.24 (and
# warns on earlier releases), so request an object array explicitly.
allWordIndices = np.asarray(allWordIndices, dtype=object)

In [4]:
# Encode the 0/1 labels as one-hot rows over 2 classes
train_y = keras.utils.to_categorical(train_y, num_classes=2)
# Turn the per-text id sequences into a matrix in 'binary' mode
# (one row per text; the model below expects `max_words` columns)
train_x = tokenizer.sequences_to_matrix(allWordIndices, mode='binary')

In [5]:
# Feed-forward classifier over the `max_words`-wide input:
# Dense(512, relu) -> Dropout(0.5) -> Dense(256, sigmoid) -> Dropout(0.5)
# -> Dense(2, softmax), one output unit per class.
model = Sequential([
    Dense(512, input_shape=(max_words,), activation='relu'),
    Dropout(0.5),
    Dense(256, activation='sigmoid'),
    Dropout(0.5),
    Dense(2, activation='softmax'),
])

In [6]:
# Training configuration: Adam optimizer, categorical cross-entropy loss
# (labels are one-hot over 2 classes), accuracy reported as the metric.
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

In [7]:
# Keras carves the `validation_split` fraction off the *last* rows of the
# arrays, before any shuffling. The rows here are ordered by class (all
# label-1 texts, then all label-0 texts), so without an explicit shuffle the
# validation set would contain label-0 samples only and its accuracy would
# say nothing about the other class. Shuffle features and labels with the
# same seeded permutation so they stay aligned and the split is repeatable.
shuffled_order = np.random.RandomState(seed).permutation(len(train_x))
model.fit(train_x[shuffled_order], train_y[shuffled_order],
  batch_size=32,
  epochs=10,
  verbose=1,
  validation_split=0.1,  # last 10% of the (now shuffled) rows
  shuffle=True)          # additionally reshuffle the training rows each epoch

Train on 1544 samples, validate on 172 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7efe3cd1f748>

In [8]:
# Persist the trained model in two files:
# the architecture, serialised to JSON ...
model_json = model.to_json()
with open('model.json', mode='w') as architecture_out:
    architecture_out.write(model_json)

# ... and the learned weights.
model.save_weights('model.h5')