Below is the model for Fake News detection over the Buzzfeed-Webis Fake News Corpus 2016. 

In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from nltk.tokenize import word_tokenize
import numpy as np

import xml.etree.ElementTree as ET
import os
from gensim.models import Word2Vec, KeyedVectors

The first step is to download the Buzzfeed-Webis corpus, which is provided in the form of XML files. `read_files` reads each file and parses its XML tree to retrieve a tuple of the body of the article ('mainText') and the veracity label ('veracity').

In [None]:
def read_files(path='data/train/'):
    """
    For each xml file return the main text and the veracity label.

    Files are visited in sorted filename order so that repeated calls
    yield the articles in the same sequence regardless of the order in
    which the filesystem happens to list them.

    Args:
        path: Directory containing the corpus XML files. Entries whose
            names do not end in '.xml' are skipped.

    Yields:
        A (main_text, veracity) tuple of strings per XML file. Either item
        is None when the corresponding element is missing from the file or
        has no text content.
    """
    for filename in sorted(os.listdir(path)):
        if not filename.endswith('.xml'):
            continue
        root = ET.parse(os.path.join(path, filename)).getroot()
        main_text = root.find('mainText')
        veracity = root.find('veracity')
        # find() returns None for an absent element; report that as a None
        # field instead of failing with AttributeError on `.text`.
        yield (
            main_text.text if main_text is not None else None,
            veracity.text if veracity is not None else None,
        )

def tokenize(doc):
    """Lower-case `doc` and split it on single space characters."""
    # NOTE: the embedding code uses NLTK's word_tokenize instead; this simple
    # splitter may be redundant and is a candidate for removal.
    lowered = doc.lower()
    return lowered.split(" ")

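We call this function to get a list of the main text of each article ('documents') as well as a matching list of the labels ('predictions'), where each label is encoded as its integer index in 'possibilities'. Articles without main text are skipped.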

In [18]:
# Veracity labels; a label's position in this list is its integer class id.
possibilities = ['mixture of true and false', 'mostly false', 'no factual content', 'mostly true']

# Read the corpus once so that texts and labels are guaranteed to line up
# (two independent passes would parse every file twice and would rely on both
# passes listing the files in the same order). Articles without text are dropped.
labelled = [(text, label) for text, label in read_files() if text is not None]
documents = [text for text, _ in labelled]
# list.index raises ValueError for a label that is not in `possibilities`.
predictions = [possibilities.index(label) for _, label in labelled]

Now we load the Google News pre-trained word embeddings for use in our model. These embeddings are trained using a combination of CBOW and skip-grams over a corpus of over 100 billion words from Google News.  

In [3]:
# Location of the pre-trained Google News word2vec vectors on disk.
file = 'data/GoogleNews-vectors-negative300.bin'
# binary=True: the .bin file is in word2vec's binary format, not the text format.
# NOTE(review): this presumably loads the full vector table into memory -- confirm
# the machine has enough RAM before running this cell.
embeddings = KeyedVectors.load_word2vec_format(file, binary=True)

array([-0.07910156,  0.12158203, -0.00842285,  0.30664062, -0.15429688,
        0.10742188,  0.08398438, -0.0267334 , -0.01831055,  0.1484375 ,
       -0.15429688, -0.14160156, -0.21679688,  0.11767578, -0.20605469,
        0.1796875 ,  0.42578125,  0.07128906,  0.20117188, -0.19628906,
        0.04956055,  0.05932617, -0.09375   ,  0.20898438,  0.01696777,
        0.01385498, -0.37109375, -0.00872803, -0.03491211, -0.03320312,
        0.10742188, -0.01879883, -0.390625  ,  0.26757812, -0.12158203,
        0.08300781, -0.02612305,  0.34960938,  0.12890625,  0.28515625,
        0.359375  , -0.01104736,  0.05126953,  0.08300781,  0.05322266,
       -0.17285156,  0.14941406,  0.23925781,  0.56640625,  0.19824219,
       -0.30078125,  0.17480469, -0.11328125, -0.25976562, -0.31054688,
       -0.03088379, -0.49609375, -0.30664062,  0.07763672, -0.21777344,
        0.27539062,  0.15039062,  0.22949219,  0.30859375,  0.08154297,
       -0.03613281, -0.26953125,  0.14160156, -0.19921875,  0.04

To represent entire articles using the Google News word embeddings, we replace each token with its matching embedding and then take the elementwise mean over the entire document. This takes a document of N words from being N separate vectors to being a single 1D vector.

In [20]:
# Note: Is there a more efficient way to do this? Looping might be slow for long articles
def avg_docvec(docText,embeddings):
    """
    This function converts the text of a document (input as a string) to word embeddings, then
    takes the elementwise average of the embeddings to return a single vector.

    Args:
        docText: The document text as a single string.
        embeddings: Mapping from token to embedding vector that raises
            KeyError for unknown tokens (e.g. gensim KeyedVectors). Its
            `vector_size` attribute is used for the output dimension when
            present; otherwise 300 is assumed.

    Returns:
        A 1-D float array holding the mean embedding of the tokens that have
        an embedding. If no token has one (including empty text) the result
        is the all-zero vector rather than NaNs from a division by zero.
    """
    dim = getattr(embeddings, 'vector_size', 300)
    docVec = np.zeros(dim) # Accumulator for the sum of token embeddings
    known = 0 # Only tokens that have an embedding count towards the average
    for token in word_tokenize(docText): # e.g. "Test words" -> ['Test', 'words']
        try:
            v = embeddings[token]
        except KeyError: # Ignore tokens that aren't in the embeddings
            continue
        docVec += v
        known += 1
    if known:
        docVec /= known
    return docVec

# v = embeddings['Trump']
# docVec = np.zeros(300)
# np.add(docVec,v,out=docVec)
# np.divide(docVec,v,out=docVec)
# print(docVec)

In [None]:
def docs_to_matrix(documents,embeddings):
    """
    Takes a list of document text strings and returns a matrix of document embeddings.

    Each document is reduced to one vector by avg_docvec and the vectors are
    stacked as rows, so the result has one row per sample and one column per
    embedding dimension -- the (n_samples, n_features) layout that sklearn
    estimators expect.

    Args:
        documents: Sequence of document text strings.
        embeddings: Token-to-vector mapping passed through to avg_docvec.

    Returns:
        A 2-D array of shape (len(documents), dim). For an empty `documents`
        the result has zero rows; dim is then taken from
        `embeddings.vector_size` when available, else 300.
    """
    vectors = [avg_docvec(doc, embeddings) for doc in documents]
    if not vectors:
        return np.zeros((0, getattr(embeddings, 'vector_size', 300)))
    # Stack once at the end: concatenating 1-D vectors along axis 0 would give
    # a single flat array, and doing it per document copies the data each time.
    return np.vstack(vectors)

In [None]:
# Calculate TF-IDF over the main text of each article, creating a tf-idf matrix
# representation of all articles (one row per article, one column per term).
# min_df=1 keeps every term that occurs in at least one document -- the same
# vocabulary as the previous integer 0, which recent scikit-learn releases
# reject as out of range for an integer min_df.
sklearn_tfidf = TfidfVectorizer(norm='l2', min_df=1, use_idf=True, smooth_idf=False, sublinear_tf=True, tokenizer=tokenize)
sklearn_representation = sklearn_tfidf.fit_transform(documents)

In [None]:
# Split the TF-IDF matrix and the integer labels into training and test sets:
# 30% of the articles are held out for testing, and the fixed random_state
# makes the split reproducible for a given input order.
X_train, X_test, y_train, y_test = train_test_split(sklearn_representation, predictions, test_size = .3, random_state=25)
# Logistic regression with scikit-learn's default hyperparameters.
LogReg = LogisticRegression()
LogReg.fit(X_train, y_train)
# Predicted class ids for the held-out articles (indices into `possibilities`).
y_pred = LogReg.predict(X_test)
print(y_pred)