Below is the model for Fake News detection over the Buzzfeed-Webis Fake News Corpus 2016. 

In [261]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from nltk.tokenize import word_tokenize
import numpy as np

import xml.etree.ElementTree as ET
import os
from gensim.models import Word2Vec, KeyedVectors
from keras.models import Sequential
from keras.layers import Dense, Flatten
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences


The first step is to download the Buzzfeed-Webis corpus, which is provided in the form of XML files. read_files will read each file and parse the XML tree to retrieve a tuple of the body of the text ('mainText') and the veracity label ('veracity')

In [147]:
def read_files(cols):
    """
    For each xml file return a matrix of values asked for
    """
    path = 'data/train/'
    possibilities = ['mixture of true and false', 'mostly false', 'no factual content', 'mostly true']
    for filename in os.listdir(path):
        data_row = []
        if not filename.endswith('.xml'): continue
        xmlfile = os.path.join(path, filename)
        tree = ET.parse(xmlfile)
        if cols == "mainText":
            if tree.find("mainText").text:
                yield tree.find("mainText").text
            else:
                yield ''
        elif cols == "veracity":
            v = possibilities.index(tree.find("veracity").text)
            yield v
        else:
            for col in cols:
                try:
                    data_row.append(int(tree.find(col).text))
                except:
                    data_row.append(0)
            yield data_row

We call this function to get a list of the main text of each article ('documents') as well as a matching list of the labels ('predictions')

In [201]:
def feature_matrix(cols):
    data = []
    for row in read_files(cols):
        data.append(row)
    return np.array(data)

def get_document_text():
    data = []
    for row in read_files("mainText"):
        data.append(row)
    return data

def get_veracity():
    data = []
    for row in read_files("veracity"):
        data.append(row)
    return data

documents = get_document_text()
predictions = get_veracity()

Now we load the Google News pre-trained word embeddings for use in our model. These embeddings are trained using a combination of CBOW and skip-grams over a corpus of over 100 billion words from Google News.  

In [149]:
file = 'data/GoogleNews-vectors-negative300.bin'
embeddings = KeyedVectors.load_word2vec_format(file, binary=True)

To represent entire articles using the Google News word embeddings, we replace each string with it's matching embedding and then taken the elementwise mean, max or min of the entire document. This takes a document of N words from being N separate vectors to being a single 1D vector (300 by 1)

In [150]:
# Note: Is there a more efficient way to do this? Looping might be slow for long articles
def avg_docvec(docText,embeddings):
    """
    This function converts the text of a document (input as a string) to word embeddings, then
    takes the elementwise average of the embeddings to return a single vector.
    """
    docVec = np.zeros(300) # Initialize array for the document
    tokens = word_tokenize(docText) # Creates a list of word tokens (e.g. "Test words" -> ['Test', 'words'])
    denominator = 0.0 # To take the average, will only count tokens for which we have embeddings in the total  
    for token in tokens:
        try:
            v = embeddings[token]
            np.add(docVec,v,out=docVec)
            denominator += 1.0
        except: # Ignore tokens that aren't in the Google News embeddings
            continue
    np.divide(docVec,denominator,out=docVec) 
    return docVec

In [151]:
def max_docvec(docText,embeddings):
    """
    Converts the text of a document (input as a string) to word embeddings, then takes the elementwise
    max of the embeddings to return a single vector of the maximum elements.
    """
    docVec = 0
    tokens = word_tokenize(docText) # Creates a list of word tokens (e.g. "Test words" -> ['Test', 'words'])
    startIndex = 0
    for i in range(len(tokens)): # Initialize the doc vec as the first token that is in the embeddings
        try:
            v = embeddings[tokens[i]]
            docVec = v
            startIndex = i
            break
        except:
            continue
    
    for token in tokens[startIndex:]:
        try:
            v = embeddings[token]
            np.max(docVec,v,out=docVec)
        except: # Ignore tokens that aren't in the Google News embeddings
            continue
    return docVec

In [152]:
def min_docvec(docText,embeddings):
    """
    Converts the text of a document (input as a string) to word embeddings, then takes the elementwise
    min of the embeddings to return a single vector of the minimum elements.
    """
    docVec = 0
    tokens = word_tokenize(docText) # Creates a list of word tokens (e.g. "Test words" -> ['Test', 'words'])
    startIndex = 0
    for i in range(len(tokens)): # Initialize the doc vec as the first token that is in the embeddings
        try:
            v = embeddings[tokens[i]]
            docVec = v
            startIndex = i
            break
        except:
            continue
    
    for token in tokens[startIndex:]: # Loop over words in the article, starting at first valid word
        try:
            v = embeddings[token]
            np.min(docVec,v,out=docVec) # Only keep min elements
        except: # Ignore tokens that aren't in the Google News embeddings
            continue
    return docVec

In [291]:
def docs_to_matrix(documents,embeddings,method='avg'):
    """
    Takes a list of document text strings and returns a matrix of document embeddings.
    The method specifies how the word vectors are combined for the document: average is 
    element-wise average, min is element-wise min and max is element-wise max. 
    """
    matrix = []
    for i in range(len(documents)):
        vector = 0
        if method.lower() == 'avg':
            vector = avg_docvec(documents[i],embeddings)
        elif method.lower() == 'min':
            vector = min_docvec(documents[i],embeddings)
        elif method.lower() == 'max':
            vector = max_docvec(documents[i],embeddings)
        else:
            print("Please enter method argument as min, max or avg")
            return
        if i == 0:
            matrix = vector
        else:
            matrix = np.column_stack((matrix,vector)) # Concat all vectors into a matrix of order (300,N of docs)
    matrix = matrix.reshape((len(documents),300)) # For sklearn, reshape the matrix into order (N of docs,300), so rows = docs
    return matrix

articles_matrix = docs_to_matrix(documents,embeddings,'avg')        

Here we make the training and test datasets using the article representations and predictions. Then we create a logistic regression object, fit the model on the training data and check it on the test info.

#### Notes on testing: 
* A majority classifier gets outcome Precision = 0.62, Recall = 0.78, F = 0.69. Unregularized tf-idf acts as a majority classifier.
* Using regularized tf-idf logreg (C = 100), Precision/Recall/F =  0.82, 0.80, 0.73. This is no longer a majority classifier. 
* Representing articles as elementwise average of the word embeddings does not change the model from being a majority classifier, even when using extreme regularization (C = .0001). This applies for min and max document vectors as well. 

In [292]:
# # Use the TF-IDF representation of each article as a feature matrix
# sklearn_tfidf = TfidfVectorizer(norm='l2',min_df=0, use_idf=True, smooth_idf=False, sublinear_tf=True, tokenizer=tokenize)
# articles_matrix = sklearn_tfidf.fit_transform(documents)
# X_train, X_test, y_train, y_test = train_test_split(articles_matrix, predictions, test_size = .3, random_state=25)

# Use the combination of pre-trained word embeddings in each article as a feature matrix
X_train, X_test, y_train, y_test = train_test_split(articles_matrix, predictions, test_size = .3, random_state=25)

In [314]:
# Logistic Regression
# Issue is that the model is optimizing towards accuracy, which biases it to majority class. Need different metric

# logreg = LogisticRegression(penalty='l2')
# params = {'C':[.00001,.0001,.001,.01,.1,.2,.5,.8,1,2,5,10,100]} # Dict of values to search over for regularization strength
# best_logreg = GridSearchCV(logreg,params,scoring='accuracy')
# best_logreg.fit(X_train,y_train)
# print(best_logreg.best_score_)
# logreg = best_logreg.best_estimator_
# print(logreg)

logreg = LogisticRegressionCV(penalty='l2', scoring="f1",Cs=[.00001,.0001,.001,.01,.1,.2,.5,.8,1,2,5,10,100,1000])
logreg.fit(X_train,y_train)
y_pred = logreg.predict(X_test)
print(y_pred)

  'precision', 'predicted', average, warn_for)


[3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 0 3 3 3 3 3 3 1 3 3 1 3 3 3 3 3 3 3 3
 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 0 3 3 3 3 3 3 3 3
 3 3 3 3 3 3 3 3 3 3 0 3 3 3 3 3 3 3 3 3 3 3 0 3 3 3 3 3 3 3 3 3 3 3 3 0 3
 3 3 3 3 3 3 3 3 3 3 3 3 1 3 3 3 3 3 3 0 3 3 3 3 3 3 3 3 0 3 3 3 3 3 3 3 3
 3 3 3 3 3 3 3 3 3 3 0 3 3 3 3 3 3 3 3 3 3 3 3 3 3 2 3 3 3 3 0 3 3 3 3 3 3
 1 3 3 3 3 3 3 3 3 2 3 3 3 3 3 3 3 3 3 3 3 1 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 0 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 1
 3 3 3 3 3 3 3 3 3 3 3 3 1 3 3 3 3 3 3 2 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 1
 3 3 3 3 3 3 3 3 3 3 3 3 3 0 3 3 3 3 3 3 3 0 3 3 3 3 3 3 3 0 3 3 3 3 3 2 3
 3 3 3 3 3 3 3 3 3 1 3 3 3 3 3 0 3 3 3 3 0 3 3 3 3 3 3 3 3 3 3 3 0 3 3 3 3
 3 3 3 3 3 3 3 3 3 3 3 3 3 1 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 2 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 0 3
 3 3 3 3 3 3 3 3 0 3 3 3 3 3 3 3 3 1 0 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
 3]


#### Model Evaluation: 
Here we evaluate the effectiveness of our model using a confusion matrix, precision, recall and F1-score

In [315]:
conf_matrix = confusion_matrix(y_test, y_pred)
print(conf_matrix)
print(classification_report(y_test, y_pred))

[[  3   0   1  53]
 [  2   4   0  24]
 [  0   2   0  15]
 [ 14   5   4 355]]
             precision    recall  f1-score   support

          0       0.16      0.05      0.08        57
          1       0.36      0.13      0.20        30
          2       0.00      0.00      0.00        17
          3       0.79      0.94      0.86       378

avg / total       0.66      0.75      0.70       482



In [262]:
CNN Implementation 1

# fix random seed for reproducibility
np.random.seed(7)

features = feature_matrix([
    "number_of_quotes",
    "number_of_links"
])

#Calculate TF-IDF over the main text of each article, creating vector representations of them
tokenize = lambda doc: doc.lower().split(" ")
sklearn_tfidf = TfidfVectorizer(norm='l2',min_df=0, use_idf=True, smooth_idf=False, sublinear_tf=True, tokenizer=tokenize)
sklearn_representation = sklearn_tfidf.fit_transform(documents)

X_train, X_test, y_train, y_test = train_test_split(sklearn_representation, predictions, test_size = 0, random_state=25)


X = np.concatenate((X_train.todense() ,features), axis=1)
Y = np.reshape(y_train, (len(y_train), ))
dim = X.shape

# model
model = Sequential()
model.add(Dense(12, input_dim=dim[1], activation='relu'))
model.add(Dense(8, activation='relu'))
model.add(Dense(1, activation='sigmoid'))

# Compile model
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Fit the model
model.fit(X, Y, epochs=70, batch_size=10)

# evaluate the model
scores = model.evaluate(X, Y)
print("\n%s: %.2f%%" % (model.metrics_names[1], scores[1]*100))

In [327]:
# CNN - Second Implementation (Medium post)
import numpy as np
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Flatten
from keras.layers import Embedding
from keras.layers import Input, Conv1D, MaxPooling1D
from keras.utils import to_categorical
from keras import backend as K
K.set_image_dim_ordering('tf')

    

In [328]:
# Define hyperparameters
MAX_SEQUENCE_LENGTH = 700 # Semi-arbitrary, not sure how to tune this. Articles range from 6 - 6500 words with mean of 659
VALIDATION_SPLIT = 0.3 
EMBEDDING_DIM = 300 # Google News embeddings are 300 dimensional

# Prepare tokenizer
t = Tokenizer()
t.fit_on_texts(documents)
vocab_size = len(t.word_index) + 1

# integer encode the documents
encoded_docs = t.texts_to_sequences(documents)

# pad our doc sequences to a max length of 700 words
data = pad_sequences(encoded_docs, maxlen=MAX_SEQUENCE_LENGTH, padding='post')

# Re-using the list of integer labels generated earlier, make a binary class matrix
labels = to_categorical(np.asarray(predictions))

# Split into train and test sets - I try 2 versions, one from Keras and one using sklearn functions

## sklearn version of train and test split
# x_train, x_test, y_train, y_test = train_test_split(data, predictions, test_size = 0.3, random_state=25) 

# Keras blog post version of train-test split
indices = np.arange(data.shape[0])
np.random.shuffle(indices)
data = data[indices]
labels = labels[indices]
nb_validation_samples = int(VALIDATION_SPLIT * data.shape[0])


# Note I skip a step here - embeddings already loaded into memory as KeyedVectors above

# create a weight matrix for words in training docs
embedding_matrix = np.zeros((vocab_size, 300))
for word, i in t.word_index.items():
    embedding_vector = None
    try:
        embedding_vector = embeddings[word] # Get the vector for a given word
    except:
        embedding_vector = None
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector        

In [329]:
# Create the embedding layer from the embedding matrix
embedding_layer = Embedding(vocab_size,
                            EMBEDDING_DIM,
                            weights=[embedding_matrix],
                            input_length=MAX_SEQUENCE_LENGTH,
                            trainable=False)

# Create the model
sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
embedded_sequences = embedding_layer(sequence_input)
x = Conv1D(128, 5, activation='relu')(embedded_sequences)
x = MaxPooling1D(5)(x)
x = Conv1D(128, 5, activation='relu')(x)
x = MaxPooling1D(5)(x)
x = Conv1D(128, 5, activation='relu')(x)
x = MaxPooling1D(35)(x)  # global max pooling
x = Flatten()(x)
x = Dense(128, activation='relu')(x)
preds = Dense(len(labels_index), activation='softmax')(x)

model = Model(sequence_input, preds)
model.compile(loss='categorical_crossentropy',
              optimizer='rmsprop',
              metrics=['acc'])

# Train the model
model.fit(x_train, y_train, validation_data=(x_test, y_test),
          epochs=2, batch_size=128)



ValueError: Negative dimension size caused by subtracting 35 from 23 for 'max_pooling1d_21/MaxPool' (op: 'MaxPool') with input shapes: [?,23,1,128].