Below is the model for Fake News detection over the Buzzfeed-Webis Fake News Corpus 2016. 

In [1]:
import numpy as np
np.random.seed(1337) # Fix a random seed to make (sorta) reproducible results

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV # Tools for splitting data, tuning hyperparameters
from sklearn.linear_model import LogisticRegressionCV # Logreg model
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix # Evaluation tools
from nltk.tokenize import word_tokenize # Tokenizer


import xml.etree.ElementTree as ET
import os
from gensim.models import Word2Vec, KeyedVectors

# CNN - Second Implementation (Medium post)
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential, Model
from keras.layers import Dense, Flatten, Embedding, Input, Conv1D, MaxPooling1D, Dropout, LSTM
from keras.utils import to_categorical
from keras import backend as K
from keras.regularizers import l2
from keras.wrappers.scikit_learn import KerasClassifier # Wrapper to use Keras model in sklearn


Using TensorFlow backend.
  return f(*args, **kwds)


The first step is to download the Buzzfeed-Webis corpus, which is provided in the form of XML files. read_files will read each file and parse the XML tree to retrieve a tuple of the body of the text ('mainText') and the veracity label ('veracity')

In [8]:
possibilities = ['mixture of true and false', 'mostly false', 'mostly true']
# possibilities = ['mixture of true and false', 'mostly false', 'no factual content', 'mostly true']


def read_files(cols, orientation="all"):
    """
    For each xml file return a matrix of values asked for
    """
    path = 'data/train/'
    for filename in os.listdir(path):
        data_row = []
        if not filename.endswith('.xml'): continue
        xmlfile = os.path.join(path, filename)
        tree = ET.parse(xmlfile)

# FOR TESTING WHOLE DATASET, use below:        
#         if not tree.find("mainText").text: continue
#         if orientation != "all" and tree.find("orientation").text != orientation:
#             continue
        
#COMMENT IN THE FOLLOWING two if statements to test the partisan-only dataset
        if not tree.find("mainText").text or tree.find("veracity").text == "no factual content": continue
        if orientation == "all" and tree.find("orientation").text == 'mainstream':
            continue    
        if cols == "mainText":
            if tree.find("mainText").text:
                yield tree.find("mainText").text
            else:
                continue
        elif cols == "veracity":
            v = possibilities.index(tree.find("veracity").text)
            yield v
        elif cols == "both":
            if tree.find("mainText").text:
                v = possibilities.index(tree.find("veracity").text)
                yield tree.find("mainText").text, v
            else:
                continue
        else:
            for col in cols:
                try:
                    data_row.append(float(tree.find(col).text))
                except:
                    data_row.append(0.0)
            yield data_row

We call this function to get a list of the main text of each article ('documents') as well as a matching list of the labels ('predictions')

In [9]:
def feature_matrix(cols):
    data = []
    for row in read_files(cols):
        data.append(np.array(row))
    return np.array(data)

def get_document_text():
    data = []
    for row in read_files("mainText"):
        if not row:
            continue
        else:
            data.append(row)
    return data

def get_veracity():
    data = []
    for row in read_files("veracity"):
        data.append(row)
    return data

def get_document_text_and_veracity():
    docs, preds = [], []
    for row in read_files("both"):
        if not row[0]:
            continue
        else:
            docs.append(row[0])
            preds.append(row[1])
    return docs, preds

documents, predictions = get_document_text_and_veracity()

print(len(documents),predictions.count(0),predictions.count(1),predictions.count(2))
# maximum = 0
# minimum = 1000
# for doc in documents:
#     l = word_tokenize(doc)
#     if len(l) > maximum:
#         maximum = len(l)
#     if len(l) < minimum:
#         minimum = len(l)    

730 201 82 447


Now we load the Google News pre-trained word embeddings for use in our model. These embeddings are trained using a combination of CBOW and skip-grams over a corpus of over 100 billion words from Google News.  

In [7]:
file = 'data/GoogleNews-vectors-negative300.bin'
embeddings = KeyedVectors.load_word2vec_format(file, binary=True)

To represent entire articles using the Google News word embeddings, we replace each string with it's matching embedding and then taken the elementwise mean, max or min of the entire document. This takes a document of N words from being N separate vectors to being a single 1D vector (300 by 1)

In [5]:
def avg_docvec(docText,embeddings):
    """
    This function converts the text of a document (input as a string) to word embeddings, then
    takes the elementwise average of the embeddings to return a single vector.
    """
    docVec = np.zeros(300) # Initialize array for the document
    tokens = word_tokenize(docText) # Creates a list of word tokens (e.g. "Test words" -> ['Test', 'words'])
    denominator = 0.0 # To take the average, will only count tokens for which we have embeddings in the total  
    for token in tokens:
        try:
            v = embeddings[token]
            np.add(docVec,v,out=docVec)
            denominator += 1.0
        except: # Ignore tokens that aren't in the Google News embeddings
            continue
    np.divide(docVec,denominator,out=docVec) 
    return docVec

In [6]:
def max_docvec(docText,embeddings):
    """
    Converts the text of a document (input as a string) to word embeddings, then takes the elementwise
    max of the embeddings to return a single vector of the maximum elements.
    """
    docVec = 0
    tokens = word_tokenize(docText) # Creates a list of word tokens (e.g. "Test words" -> ['Test', 'words'])
    startIndex = 0
    for i in range(len(tokens)): # Initialize the doc vec as the first token that is in the embeddings
        try:
            v = embeddings[tokens[i]]
            docVec = v
            startIndex = i
            break
        except:
            continue
    
    for token in tokens[startIndex:]:
        try:
            v = embeddings[token]
            np.max(docVec,v,out=docVec)
        except: # Ignore tokens that aren't in the Google News embeddings
            continue
    return docVec

In [7]:
def min_docvec(docText,embeddings):
    """
    Converts the text of a document (input as a string) to word embeddings, then takes the elementwise
    min of the embeddings to return a single vector of the minimum elements.
    """
    docVec = 0
    tokens = word_tokenize(docText) # Creates a list of word tokens (e.g. "Test words" -> ['Test', 'words'])
    startIndex = 0
    for i in range(len(tokens)): # Initialize the doc vec as the first token that is in the embeddings
        try:
            v = embeddings[tokens[i]]
            docVec = v
            startIndex = i
            break
        except:
            continue
    for token in tokens[startIndex:]: # Loop over words in the article, starting at first valid word
        try:
            v = embeddings[token]
            np.min(docVec,v,out=docVec) # Only keep min elements
        except: # Ignore tokens that aren't in the Google News embeddings
            continue
    return docVec

In [38]:
def docs_to_matrix(documents,embeddings,method='avg'):
    """
    Takes a list of document text strings and returns a matrix of document embeddings.
    The method specifies how the word vectors are combined for the document: average is 
    element-wise average, min is element-wise min and max is element-wise max. 
    """
    matrix = []
    count = 0
    for i in range(len(documents)):
        vector = 0
        if method.lower() == 'avg':
            vector = avg_docvec(documents[i],embeddings)
        elif method.lower() == 'min':
            vector = min_docvec(documents[i],embeddings)
        elif method.lower() == 'max':
            vector = max_docvec(documents[i],embeddings)
        else:
            print("Please enter method argument as min, max or avg")
            return
        if i == 0:
            matrix = vector
        else:
            matrix = np.column_stack((matrix,vector)) # Concat all vectors into a matrix of order (300,N of docs)
            count += 1
    matrix = matrix.reshape((len(documents),300)) # For sklearn, reshape the matrix into order (N of docs,300), so rows = docs
    return matrix

articles_matrix = docs_to_matrix(documents,embeddings,'avg')     

1604


Here we make the training and test datasets using the article representations and predictions. Then we create a logistic regression object, fit the model on the training data and check it on the test info.

#### Notes on testing: 
* A majority classifier gets outcome Precision = 0.62, Recall = 0.78, F = 0.69. Unregularized tf-idf acts as a majority classifier.
* Using regularized tf-idf logreg (C = 100), Precision/Recall/F =  0.82, 0.80, 0.73. This is no longer a majority classifier. 
* Representing articles as elementwise average of the word embeddings does not change the model from being a majority classifier, even when using extreme regularization (C = .0001). This applies for min and max document vectors as well. 

In [39]:
# Use the TF-IDF representation of each article as a feature matrix
# sklearn_tfidf = TfidfVectorizer(norm='l2',min_df=0, use_idf=True, smooth_idf=False, sublinear_tf=True, tokenizer=word_tokenize)
# articles_matrix = sklearn_tfidf.fit_transform(documents)
# X_train, X_test, y_train, y_test = train_test_split(articles_matrix, predictions, test_size = .3, random_state=25)

# Use the combination of pre-trained word embeddings in each article as a feature matrix
X_train, X_test, y_train, y_test = train_test_split(articles_matrix, predictions, test_size = .3, random_state=25)

In [40]:
# Logistic Regression
logreg = LogisticRegressionCV(penalty='l2', scoring="f1",Cs=[.00001,.0001,.001,.01,.1,.2,.5,.8,1,2,5,10,100,1000])
logreg.fit(X_train,y_train)
y_pred = logreg.predict(X_test)
print(y_pred)

  'precision', 'predicted', average, warn_for)


[3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 0 3 3 3 3 3 3 1 3 3 1 3 3 3 3 3 3 3 3
 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 0 3 3 3 3 3 3 3 3
 3 3 3 3 3 3 3 3 3 3 0 3 3 3 3 3 3 3 3 3 3 3 0 3 3 3 3 3 3 3 3 3 3 3 3 0 3
 3 3 3 3 3 3 3 3 3 3 3 3 1 3 3 3 3 3 3 0 3 3 3 3 3 3 3 3 0 3 3 3 3 3 3 3 3
 3 3 3 3 3 3 3 3 3 3 0 3 3 3 3 3 3 3 3 3 3 3 3 3 3 2 3 3 3 3 0 3 3 3 3 3 3
 1 3 3 3 3 3 3 3 3 2 3 3 3 3 3 3 3 3 3 3 3 1 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 0 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 1
 3 3 3 3 3 3 3 3 3 3 3 3 1 3 3 3 3 3 3 2 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 1
 3 3 3 3 3 3 3 3 3 3 3 3 3 0 3 3 3 3 3 3 3 0 3 3 3 3 3 3 3 0 3 3 3 3 3 2 3
 3 3 3 3 3 3 3 3 3 1 3 3 3 3 3 0 3 3 3 3 0 3 3 3 3 3 3 3 3 3 3 3 0 3 3 3 3
 3 3 3 3 3 3 3 3 3 3 3 3 3 1 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 2 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 0 3
 3 3 3 3 3 3 3 3 0 3 3 3 3 3 3 3 3 1 0 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
 3]


#### Model Evaluation: 
Here we evaluate the effectiveness of our model using a confusion matrix, precision, recall and F1-score

In [41]:
conf_matrix = confusion_matrix(y_test, y_pred)
print(conf_matrix)
print(classification_report(y_test, y_pred))
print(accuracy_score(y_test,y_pred))

[[  3   0   1  53]
 [  2   4   0  24]
 [  0   2   0  15]
 [ 14   5   4 355]]
             precision    recall  f1-score   support

          0       0.16      0.05      0.08        57
          1       0.36      0.13      0.20        30
          2       0.00      0.00      0.00        17
          3       0.79      0.94      0.86       378

avg / total       0.66      0.75      0.70       482

0.751037344398


In [61]:
# CNN Implementation based off Keras models
# Best for this seed? len=800,dim=300,dropout=0.5,filters=300,k=5 

# Define hyperparameters
MAX_SEQUENCE_LENGTH = 700
TEST_SPLIT = 0.2
VALIDATION_SPLIT = 0.2
EMBEDDING_DIM = 300 # Google News embeddings are 300 dimensional 
DROPOUT = 0.5 # Dropout strength 
FILTERS = 300 # Number of filters in the convolutional layers
k = 6 # Sliding k window size for convolutional layers

# Prepare tokenizer
t = Tokenizer()
t.fit_on_texts(documents)
vocab_size = len(t.word_index) + 1

# integer encode the documents
encoded_docs = t.texts_to_sequences(documents)

# pad our doc sequences to a max length of MAX_SEQUENCE_LENGTH words
data = pad_sequences(encoded_docs, maxlen=MAX_SEQUENCE_LENGTH, padding='post')

# Re-using the list of integer labels generated earlier, make a binary class matrix
labels = to_categorical(np.asarray(predictions))
possibilities = ['mixture of true and false', 'mostly false', 'no factual content', 'mostly true']
# Split into train and test sets - I try 2 versions, one from Keras and one using sklearn functions

# Split the full dataset into data used for training and the final test stage
x_training_set, x_final_test, y_training_set, y_final_test = train_test_split(data, labels, test_size = TEST_SPLIT, random_state=25) 

# Secondary split of training data into test and training
x_train, x_dev, y_train, y_dev = train_test_split(x_training_set, y_training_set, test_size = VALIDATION_SPLIT, random_state=17)

# create a weight matrix for words in training docs

embedding_weights = np.zeros((vocab_size, 300))
for word, i in t.word_index.items():
    embedding_vector = None
    try:
        embedding_vector = embeddings[word] # Get the vector for a given word
    except:
        embedding_vector = None
    if embedding_vector is not None:
        embedding_weights[i] = embedding_vector

In [62]:
# Create the embedding layer from the embedding matrix

def create_cnn(embedding_weights=embedding_weights,embedding_dim=300,max_sequence_len=800,filters=300,k=6,dropout=0.9):    
    embedding_layer = Embedding(vocab_size,
                                EMBEDDING_DIM,
                                weights=[embedding_weights],
                                input_length=MAX_SEQUENCE_LENGTH,
                                trainable=False)

    # Build the layers of the model
    sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32') 
    embedded_sequences = embedding_layer(sequence_input) 

    x = Conv1D(FILTERS, k, activation='relu')(embedded_sequences) 
    x = MaxPooling1D(k)(x)
    x = Dropout(DROPOUT)(x)
    x = Conv1D(FILTERS, k, activation='relu')(x)
    x = MaxPooling1D(k)(x)
    x = Dropout(DROPOUT)(x)
    x = Conv1D(FILTERS, k, activation='relu')(x)
    x = MaxPooling1D(int(x.shape[1]))(x)  # This layer pools the entire previous layer
    x = Flatten()(x)
    x = Dense(FILTERS, activation='relu')(x)
    preds = Dense(len(possibilities), activation='softmax')(x)
    
    model = Model(sequence_input, preds)
    model.compile(loss='categorical_crossentropy',
                  optimizer='rmsprop',
                  metrics=['acc'])
    return model

# Train the model
# model = create_cnn(embedding_weights,EMBEDDING_DIM,MAX_SEQUENCE_LENGTH,FILTERS,k,DROPOUT)
# model.fit(x_train, y_train,epochs=1, batch_size=FILTERS)
# model.evaluate(x_final_test,y_final_test,verbose=0)

def make_classifications_list(binary_targets):
    """This is  for turning the output of Keras models into a list of class integers"""
    y_true = list(binary_targets)
    for i in range(len(y_true)):
        classification = None
        for j in range(y_true[i].shape[0]):
            if y_true[i][j] == 1.:
                classification = j
        y_true[i] = classification
    return y_true

model = create_cnn(embedding_weights,EMBEDDING_DIM,MAX_SEQUENCE_LENGTH,FILTERS,k,DROPOUT)
model.fit(x_train, y_train,epochs=1, batch_size=FILTERS)
y_prob = model.predict(x_final_test,batch_size=FILTERS)
y_pred = y_prob.argmax(axis=-1) # Get the predicted class (not probabilites of each)
ypred = list(y_pred) # Turn array into list
y_true = make_classifications_list(y_final_test) # Turn Matrix of targets into list
print(classification_report(y_true, y_pred))
print(accuracy_score(y_true,y_pred))
scores = model.evaluate(x_final_test,y_final_test,verbose=0)
print("Accuracy: %.2f%%" % (scores[1]*100))

[3, 3, 3, 0, 3, 3, 3, 3, 3, 0, 3, 3, 3, 3, 3, 0, 3, 3, 3, 3, 3, 3, 1, 3, 3, 3, 2, 3, 3, 0, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 1, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 0, 3, 3, 1, 3, 3, 3, 3, 3, 3, 3, 3, 3, 0, 3, 3, 3, 3, 3, 3, 1, 0, 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 3, 3, 3, 3, 3, 3, 3, 1, 3, 0, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 3, 3, 3, 3, 0, 3, 3, 2, 3, 3, 3, 3, 3, 3, 0, 3, 3, 0, 3, 3, 0, 3, 3, 3, 3, 3, 3, 3, 3, 3, 0, 3, 0, 3, 3, 3, 3, 0, 3, 0, 1, 3, 3, 3, 3, 3, 3, 3, 3, 2, 3, 3, 3, 3, 3, 3, 3, 3, 1, 3, 3, 1, 3, 3, 3, 3, 0, 2, 3, 0, 3, 3, 3, 3, 3, 1, 3, 3, 3, 1, 3, 1, 3, 3, 3, 0, 0, 3, 3, 3, 3, 3, 3, 2, 3, 3, 3, 3, 3, 1, 3, 3, 3, 3, 3, 3, 1, 3, 3, 3, 0, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 0, 0, 3, 3, 3, 2, 3, 3, 1, 3, 3, 3, 3, 3, 3, 3, 3, 3, 1, 0, 3, 3, 3, 0, 3, 3, 3, 3, 0, 3, 3, 1, 3, 3, 3, 3, 3, 0, 3, 3, 3, 3, 3, 3, 3, 3, 0, 0, 3, 3, 3, 3, 3, 3, 3, 1, 1, 3, 3, 3, 2, 3, 3, 3, 3, 3, 0, 3, 3, 3, 3, 0, 3, 3, 0, 3, 3, 3, 3, 3, 3]
Epoch 1/1


  'precision', 'predicted', average, warn_for)


             precision    recall  f1-score   support

          0       0.00      0.00      0.00        32
          1       0.00      0.00      0.00        18
          2       0.00      0.00      0.00        10
          3       0.81      1.00      0.90       261

avg / total       0.66      0.81      0.73       321

0.81308411215
Accuracy: 81.31%


In [196]:
# RNN Implementation Test Case
# Best results: 78.12% accuracy, epochs = 1, LSTM = 128, Dropout = 0.5, adam optimized

# Initialize embedding layer
embedding_layer = Embedding(vocab_size,
                            EMBEDDING_DIM,
                            weights=[embedding_weights],
                            input_length=MAX_SEQUENCE_LENGTH,
                            trainable=False)
sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32') 
embedded_sequences = embedding_layer(sequence_input) 

rnn = Sequential()
rnn.add(embedding_layer)
rnn.add(Dropout(0.5))
rnn.add(LSTM(128))
rnn.add(Dropout(0.5))
rnn.add(Dense(len(possibilities), activation='sigmoid'))
rnn.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
print(rnn.summary())
rnn.fit(x_train, y_train, epochs=1, batch_size=64)

scores = rnn.evaluate(x_dev, y_dev, verbose=0)
print("Accuracy: %.2f%%" % (scores[1]*100))

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_90 (Embedding)     (None, 700, 300)          11065200  
_________________________________________________________________
dropout_141 (Dropout)        (None, 700, 300)          0         
_________________________________________________________________
lstm_9 (LSTM)                (None, 128)               219648    
_________________________________________________________________
dropout_142 (Dropout)        (None, 128)               0         
_________________________________________________________________
dense_147 (Dense)            (None, 4)                 516       
Total params: 11,285,364
Trainable params: 220,164
Non-trainable params: 11,065,200
_________________________________________________________________
None
Epoch 1/1
Accuracy: 81.31%
