In [222]:
import csv

import numpy as np

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.metrics.pairwise import cosine_similarity

In [80]:
# Using the same list of stopwords as the authors of the paper.
# This list doesn't exactly match sklearn's, which is why it is spelled out
# here and passed explicitly as `stop_words` to the vectorizers below.
# NOTE(review): "amoungst" looks like a misspelling; presumably it is kept
# verbatim from the paper's list -- confirm before "fixing" it.
STOP_WORDS = [
    "a", "about", "above", "across", "after", "afterwards", "again", "against", "all", "almost", "alone", "along",
    "already", "also", "although", "always", "am", "among", "amongst", "amoungst", "amount", "an", "and", "another",
    "any", "anyhow", "anyone", "anything", "anyway", "anywhere", "are", "around", "as", "at", "back", "be",
    "became", "because", "become", "becomes", "becoming", "been", "before", "beforehand", "behind", "being",
    "below", "beside", "besides", "between", "beyond", "bill", "both", "bottom", "but", "by", "call", "can", "co",
    "con", "could", "cry", "de", "describe", "detail", "do", "done", "down", "due", "during", "each", "eg", "eight",
    "either", "eleven", "else", "elsewhere", "empty", "enough", "etc", "even", "ever", "every", "everyone",
    "everything", "everywhere", "except", "few", "fifteen", "fifty", "fill", "find", "fire", "first", "five", "for",
    "former", "formerly", "forty", "found", "four", "from", "front", "full", "further", "get", "give", "go", "had",
    "has", "have", "he", "hence", "her", "here", "hereafter", "hereby", "herein", "hereupon", "hers", "herself",
    "him", "himself", "his", "how", "however", "hundred", "i", "ie", "if", "in", "inc", "indeed", "interest",
    "into", "is", "it", "its", "itself", "keep", "last", "latter", "latterly", "least", "less", "ltd", "made",
    "many", "may", "me", "meanwhile", "might", "mill", "mine", "more", "moreover", "most", "mostly", "move", "much",
    "must", "my", "myself", "name", "namely", "neither", "nevertheless", "next", "nine", "nobody", "now", "nowhere",
    "of", "off", "often", "on", "once", "one", "only", "onto", "or", "other", "others", "otherwise", "our", "ours",
    "ourselves", "out", "over", "own", "part", "per", "perhaps", "please", "put", "rather", "re", "same", "see",
    "serious", "several", "she", "should", "show", "side", "since", "sincere", "six", "sixty", "so", "some",
    "somehow", "someone", "something", "sometime", "sometimes", "somewhere", "still", "such", "system", "take",
    "ten", "than", "that", "the", "their", "them", "themselves", "then", "thence", "there", "thereafter", "thereby",
    "therefore", "therein", "thereupon", "these", "they", "thick", "thin", "third", "this", "those", "though",
    "three", "through", "throughout", "thru", "thus", "to", "together", "too", "top", "toward", "towards", "twelve",
    "twenty", "two", "un", "under", "until", "up", "upon", "us", "very", "via", "was", "we", "well", "were", "what",
    "whatever", "when", "whence", "whenever", "where", "whereafter", "whereas", "whereby", "wherein", "whereupon",
    "wherever", "whether", "which", "while", "whither", "who", "whoever", "whole", "whom", "whose", "why", "will",
    "with", "within", "without", "would", "yet", "you", "your", "yours", "yourself", "yourselves"
]

# Passed as `max_features` to both vectorizers fitted below.
VOCABULARY_SIZE = 5000


In [171]:
# Helper functions to pull data from the disk into memory.
# Caveat: the same headline is listed for multiple samples. This code
# ensures that only one instance of each headline and each body is kept in memory.


def get_or_put(list_, value):
    """Returns the index of value in list_, appending it first if not present."""
    if value not in list_:
        # New value: it goes to the end, so its index is the last position.
        list_.append(value)
        return len(list_) - 1
    return list_.index(value)

    
def read_stance(filepath, has_labels=True):
    """Reads the stances present on the filepath csv file.

    The file is opened as UTF-8 with ``newline=''`` (as the csv module
    requires) so that line breaks inside quoted fields are kept intact and
    decoding does not depend on the platform's default encoding.

    :param filepath: path to a csv file with 'Headline' and 'Body ID' columns,
        plus a 'Stance' column when has_labels is True.
    :param has_labels: when True, each sample also carries a 'label' key with
        the value of the 'Stance' column.
    :return (a,b) where a is a list of dicts that describe each sample and b
    is the list of unique headlines, in order of first appearance. The
    'headline' value of each element of a is the index of its text in b;
    'body_id' is the 'Body ID' column converted to int.
    :raises KeyError: if a required column is missing from the file.
    :raises ValueError: if a 'Body ID' value is not an integer.
    """
    samples = list()
    headlines = list()
    # headline text -> position in `headlines`; avoids rescanning the list
    # for every row of the file.
    headline_index = dict()

    with open(filepath, 'r', newline='', encoding='utf-8') as file:
        reader = csv.DictReader(file)
        for line in reader:
            headline = line['Headline']
            hid = headline_index.get(headline)
            if hid is None:
                hid = len(headlines)
                headline_index[headline] = hid
                headlines.append(headline)

            node = {
                'headline': hid,
                'body_id': int(line['Body ID']),
            }

            if has_labels:
                node['label'] = line['Stance']

            samples.append(node)

    return samples, headlines


def read_bodies(filepath):
    """Reads the article bodies present on the filepath csv file.

    The file is opened as UTF-8 with ``newline=''`` (as the csv module
    requires) so that line breaks inside the quoted article text are kept
    intact and decoding does not depend on the platform's default encoding.

    :param filepath: path to a csv file with 'Body ID' and 'articleBody' columns.
    :return (a,b) where a is the list of body texts in order of first
    appearance and b is a dict mapping each integer body id to the index of
    its text in a. When a body id is repeated, only its first text is kept.
    :raises KeyError: if a required column is missing from the file.
    :raises ValueError: if a 'Body ID' value is not an integer.
    """
    ordered_bodies = list()
    bodies_index = dict()

    with open(filepath, 'r', newline='', encoding='utf-8') as file:
        reader = csv.DictReader(file)
        for line in reader:
            body_id = int(line['Body ID'])

            if body_id not in bodies_index:
                # The index is taken before appending, i.e. the slot the
                # text is about to occupy.
                bodies_index[body_id] = len(ordered_bodies)
                ordered_bodies.append(line['articleBody'])

    return ordered_bodies, bodies_index


In [173]:
# Loading training and test data

def show_statistics(amount_samples, amount_heads, amount_bodies, fold):
    """Prints the sample, headline and body counts of the `fold` split."""
    report = (
        'Data statistics for {} split:'.format(fold),
        ' - Amount of samples:   {}'.format(amount_samples),
        ' - Amount of headlines: {}'.format(amount_heads),
        ' - Amount of bodies:    {}'.format(amount_bodies),
    )
    for row in report:
        print(row)


# Training split: stances are read with their labels.
train_samples, train_headlines = read_stance(
    '../fakenewschallenge/train_stances.csv')
train_bodies, train_bodies_map = read_bodies(
    '../fakenewschallenge/train_bodies.csv')

# Test split: the stances file is read without labels.
test_samples, test_headlines = read_stance(
    '../fakenewschallenge/test_stances_unlabeled.csv', has_labels=False)
test_bodies, test_bodies_map = read_bodies(
    '../fakenewschallenge/test_bodies.csv')

print()
show_statistics(
    amount_samples=len(train_samples),
    amount_heads=len(train_headlines),
    amount_bodies=len(train_bodies),
    fold='train',
)
print()
show_statistics(
    amount_samples=len(test_samples),
    amount_heads=len(test_headlines),
    amount_bodies=len(test_bodies),
    fold='test',
)



Data statistics for train split:
 - Amount of samples:   49972
 - Amount of headlines: 1648
 - Amount of bodies:    1683

Data statistics for test split:
 - Amount of samples:   25413
 - Amount of headlines: 894
 - Amount of bodies:    904


In [175]:
# Fitting the two vectorizers used to build the feature vector of each sample.

print('Fitting and transforming TF vectorizer.')
all_train_texts = train_headlines + train_bodies

# Rows of `all_tfs` follow `all_train_texts`: headlines first, then bodies.
# Despite its name, `first_headline` is therefore the row of the first *body*,
# i.e. the offset to add to a body index when looking it up in `all_tfs`.
first_headline = len(train_headlines)

# Term-frequency vectors (IDF weighting switched off), fitted on training texts only.
tf_vectorizer = TfidfVectorizer(
    max_features=VOCABULARY_SIZE,
    stop_words=STOP_WORDS,
    use_idf=False,
)
all_tfs = tf_vectorizer.fit_transform(all_train_texts)

# WARNING: the TF-IDF vectorizer is fitted on train AND test texts, so test
# data leaks into training. This is kept on purpose to replicate the paper's
# implementation.
print('Fitting and transforming TF-IDF vectorizer.')
all_texts = all_train_texts + test_headlines + test_bodies
# NOTE: despite the variable name this is a TfidfVectorizer; it is only
# fitted here, nothing is transformed yet.
tfidf_transformer = TfidfVectorizer(
    max_features=VOCABULARY_SIZE,
    stop_words=STOP_WORDS,
)
_ = tfidf_transformer.fit(all_texts)

Fitting and transforming TF vectorizer.


  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


Fitting and transforming TF-IDF vectorizer.


In [270]:

def build_vector(tf_head, tf_body, similarity):
    """Joins the headline TF row, the body TF row and the similarity into one vector.

    tf_head and tf_body must expose `toarray()`; the three pieces are
    concatenated along axis 1 and the first row of the result is returned.
    """
    pieces = (tf_head.toarray(), tf_body.toarray(), similarity)
    joined = np.concatenate(pieces, axis=1)
    return joined[0]


# Building the feature vectors for samples.
# Each vector is the headline TF row, the body TF row and the cosine
# similarity between the TF-IDF rows of the same headline and body.
#
# The TF-IDF rows are computed once per unique headline/body rather than by
# calling `transform` twice for every sample: a text always maps to the same
# row, and there are far fewer unique texts than samples.
train_headlines_tfidf = tfidf_transformer.transform(train_headlines)
train_bodies_tfidf = tfidf_transformer.transform(train_bodies)

feature_samples = list()
for sample in train_samples:
    headline_id = sample['headline']
    body_id     = sample['body_id']

    # Body ids from the csv are not list positions; map them to the index
    # of the body text in `train_bodies`.
    body_index = train_bodies_map[body_id]

    # `all_tfs` holds the headline rows first; body rows start at `first_headline`.
    tf_headline = all_tfs[headline_id]
    tf_body     = all_tfs[first_headline + body_index]

    tf_idf_head = train_headlines_tfidf[headline_id]
    tf_idf_body = train_bodies_tfidf[body_index]

    s = cosine_similarity(tf_idf_head, tf_idf_body, dense_output=True)

    feature_samples.append(build_vector(tf_headline, tf_body, s))


  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
