Below is the model for Fake News detection over the Buzzfeed-Webis Fake News Corpus 2016. 

In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
import xml.etree.ElementTree as ET
import os
from gensim.models import Word2Vec, KeyedVectors

The first step is to download the Buzzfeed-Webis corpus, which is provided as a set of XML files. `read_files` reads each file, parses its XML tree, and yields a tuple containing the body of the article ('mainText') and its veracity label ('veracity').

In [4]:
def read_files(path='data/train/'):
    """
    Yield a (main text, veracity label) tuple for each XML file in *path*.

    Files are visited in sorted filename order so that repeated calls, and
    runs on different machines, produce the same sequence; os.listdir alone
    returns entries in arbitrary order.

    path: directory containing the corpus XML files. Entries whose name does
        not end in '.xml' are ignored.

    Each yielded item is (text of the 'mainText' element, text of the
    'veracity' element). Either value is None when the element is empty or
    absent from the file.
    """
    for filename in sorted(os.listdir(path)):
        if not filename.endswith('.xml'):
            continue
        tree = ET.parse(os.path.join(path, filename))
        main_text = tree.find('mainText')
        veracity = tree.find('veracity')
        # find() returns None for a missing element; report that as None
        # instead of failing on `.text`, so callers can filter such files.
        yield (
            None if main_text is None else main_text.text,
            None if veracity is None else veracity.text,
        )

def tokenize(doc):
    """
    Lower-case *doc* and split it into whitespace-separated tokens.

    Splitting on any run of whitespace (rather than on a single space
    character) avoids empty-string tokens for consecutive spaces and keeps
    words separated by newlines or tabs from being glued together.
    """
    return doc.lower().split()

We call this function to build a list of the main text of each article ('documents') and a matching list of labels ('predictions'), where each label is encoded as its index in 'possibilities'. Articles with no main text are skipped.

In [5]:
# The four veracity labels; a label's position here is its integer class id.
possibilities = ['mixture of true and false', 'mostly false', 'no factual content', 'mostly true']

# Read the corpus once so each XML file is parsed a single time and every text
# stays paired with its own label. Articles without main text are dropped.
labelled_articles = [(text, label) for text, label in read_files() if text is not None]
documents = [text for text, _ in labelled_articles]
# list.index raises ValueError for a label that is not in `possibilities`.
predictions = [possibilities.index(label) for _, label in labelled_articles]

Now we load the Google News pre-trained word embeddings for use in our model. These embeddings were trained using a combination of CBOW and skip-grams on a corpus of more than 100 billion words from Google News.

In [7]:
# Load the pre-trained Google News vectors; binary=True because the file is
# stored in the binary word2vec format rather than as text.
embeddings = KeyedVectors.load_word2vec_format(
    'data/GoogleNews-vectors-negative300.bin',
    binary=True,
)

array([ -2.55126953e-02,   7.75146484e-03,  -6.12792969e-02,
        -1.47705078e-02,   9.42382812e-02,  -1.07910156e-01,
         5.02929688e-02,   1.84570312e-01,   3.08593750e-01,
        -3.56445312e-02,   5.59082031e-02,  -1.61132812e-01,
        -2.73437500e-01,   1.00097656e-01,  -2.48046875e-01,
         8.93554688e-02,   1.13525391e-02,  -2.41699219e-02,
         5.15136719e-02,  -3.39843750e-01,  -1.58203125e-01,
        -2.14843750e-01,   4.64843750e-01,   3.92578125e-01,
        -7.41577148e-03,   3.06640625e-01,  -2.08984375e-01,
        -4.85839844e-02,   2.55859375e-01,  -2.48046875e-01,
        -9.42382812e-02,   1.94335938e-01,  -3.63281250e-01,
        -1.16699219e-01,   7.86132812e-02,   5.15136719e-02,
         1.76757812e-01,   4.34570312e-02,  -2.69531250e-01,
        -2.27539062e-01,   2.22656250e-01,  -1.78710938e-01,
         2.75390625e-01,   5.07812500e-01,  -3.22265625e-01,
        -2.22656250e-01,   1.91406250e-01,   2.43164062e-01,
         2.79541016e-02,

In [None]:
# Calculate TF-IDF over the main text of each article, creating vector representations of them
sklearn_tfidf = TfidfVectorizer(
    norm='l2',
    # An integer min_df is an absolute document count and must be >= 1;
    # 1 keeps every term, which is what the former value of 0 intended.
    min_df=1,
    use_idf=True,
    smooth_idf=False,
    sublinear_tf=True,
    tokenizer=tokenize,
    # The default token_pattern is unused when a tokenizer is supplied;
    # disabling it explicitly avoids scikit-learn's warning about that.
    token_pattern=None,
)
# Learn the vocabulary and IDF weights from the documents and transform them
# into a document-term matrix in one step.
sklearn_representation = sklearn_tfidf.fit_transform(documents)

In [None]:
# Hold out 30% of the articles for testing; the fixed random_state makes the
# split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    sklearn_representation,
    predictions,
    test_size=0.3,
    random_state=25,
)

# Fit a logistic-regression classifier with default settings on the training
# portion, then predict a class index for each held-out article.
LogReg = LogisticRegression().fit(X_train, y_train)
y_pred = LogReg.predict(X_test)
print(y_pred)
