In [1]:
import numpy as np
import pandas as pd
import re
import string

In [2]:
# Lowercase English stop words; words() discards any token found in this set.
# Kept as a frozenset so lookups are hash-based and the collection is immutable.
# NOTE(review): the entries (including the "amoungst" spelling) look like
# scikit-learn's built-in English stop list -- confirm before editing them.
ENGLISH_STOP_WORDS = frozenset([
    "a", "about", "above", "across", "after", "afterwards", "again", "against",
    "all", "almost", "alone", "along", "already", "also", "although", "always",
    "am", "among", "amongst", "amoungst", "amount", "an", "and", "another",
    "any", "anyhow", "anyone", "anything", "anyway", "anywhere", "are",
    "around", "as", "at", "back", "be", "became", "because", "become",
    "becomes", "becoming", "been", "before", "beforehand", "behind", "being",
    "below", "beside", "besides", "between", "beyond", "bill", "both",
    "bottom", "but", "by", "call", "can", "cannot", "cant", "co", "con",
    "could", "couldnt", "cry", "de", "describe", "detail", "do", "done",
    "down", "due", "during", "each", "eg", "eight", "either", "eleven", "else",
    "elsewhere", "empty", "enough", "etc", "even", "ever", "every", "everyone",
    "everything", "everywhere", "except", "few", "fifteen", "fifty", "fill",
    "find", "fire", "first", "five", "for", "former", "formerly", "forty",
    "found", "four", "from", "front", "full", "further", "get", "give", "go",
    "had", "has", "hasnt", "have", "he", "hence", "her", "here", "hereafter",
    "hereby", "herein", "hereupon", "hers", "herself", "him", "himself", "his",
    "how", "however", "hundred", "i", "ie", "if", "in", "inc", "indeed",
    "interest", "into", "is", "it", "its", "itself", "keep", "last", "latter",
    "latterly", "least", "less", "ltd", "made", "many", "may", "me",
    "meanwhile", "might", "mill", "mine", "more", "moreover", "most", "mostly",
    "move", "much", "must", "my", "myself", "name", "namely", "neither",
    "never", "nevertheless", "next", "nine", "no", "nobody", "none", "noone",
    "nor", "not", "nothing", "now", "nowhere", "of", "off", "often", "on",
    "once", "one", "only", "onto", "or", "other", "others", "otherwise", "our",
    "ours", "ourselves", "out", "over", "own", "part", "per", "perhaps",
    "please", "put", "rather", "re", "same", "see", "seem", "seemed",
    "seeming", "seems", "serious", "several", "she", "should", "show", "side",
    "since", "sincere", "six", "sixty", "so", "some", "somehow", "someone",
    "something", "sometime", "sometimes", "somewhere", "still", "such",
    "system", "take", "ten", "than", "that", "the", "their", "them",
    "themselves", "then", "thence", "there", "thereafter", "thereby",
    "therefore", "therein", "thereupon", "these", "they", "thick", "thin",
    "third", "this", "those", "though", "three", "through", "throughout",
    "thru", "thus", "to", "together", "too", "top", "toward", "towards",
    "twelve", "twenty", "two", "un", "under", "until", "up", "upon", "us",
    "very", "via", "was", "we", "well", "were", "what", "whatever", "when",
    "whence", "whenever", "where", "whereafter", "whereas", "whereby",
    "wherein", "whereupon", "wherever", "whether", "which", "while", "whither",
    "who", "whoever", "whole", "whom", "whose", "why", "will", "with",
    "within", "without", "would", "yet", "you", "your", "yours", "yourself",
    "yourselves"])


In [3]:
# Build the word -> vector lookup table from the GloVe text file.
gloves = {}
# The GloVe distribution is UTF-8 text; name the encoding explicitly so the
# cell does not depend on the platform's default locale encoding.
with open('data/glove.6B.300d.txt', encoding='utf-8') as file:
    # Iterate the handle directly (not readlines()) so the raw text of the
    # whole file is never held in memory next to the parsed vectors.
    for line in file:
        elem = line.split()
        if not elem:
            # Blank line: there is no word to index.
            continue
        # First token is the word, the remaining tokens are its components.
        gloves[elem[0]] = np.array(elem[1:], dtype=float)

In [111]:
def words(text):
    """
    Given a string, return a list of normalized words.

    The text is lowercased, then every punctuation character (all of
    string.punctuation), digit, carriage return, tab and newline is replaced
    by a space. The result is split on whitespace, words shorter than
    3 characters are dropped, and English stop words are removed.

    :param text: the raw text to tokenize
    :return: list of lowercase word strings, in order of appearance
    """
    # string.punctuation must be escaped before being placed in a character
    # class: unescaped, its backslash pairs with the following ']' and is read
    # as an escaped bracket, so backslashes in the text were never replaced.
    regex = "[" + re.escape(string.punctuation) + "0-9\\r\\t\\n]"
    cleaned = re.sub(regex, " ", text.lower())
    return [
        token
        for token in cleaned.split()
        if len(token) > 2 and token not in ENGLISH_STOP_WORDS
    ]

In [119]:
from doc2vec import get_text, filelist

def load_articles(articles_dirname, gloves):
    """
    Load every file found under articles_dirname and return a list of
    (filename, title, text, centroid) tuples.

    The first line of each file is taken as the title; the remaining lines,
    with leading and trailing newlines stripped, form the text. The centroid
    is the mean of the GloVe vectors of the words of the text that are
    present in gloves (the title is not included). Files whose name ends in
    README.TXT are skipped.

    :param articles_dirname: root directory of the corpus
    :param gloves: dict mapping word -> numpy vector
    :return: list of (filename, title, text, centroid) tuples
    """
    article_info = []
    for file in filelist(articles_dirname):
        if file.endswith('README.TXT'):
            # Corpus documentation, not an article.
            continue
        title, *body_lines = get_text(file).split('\n')
        # Strip surrounding newlines rather than slicing [1:-1]: slicing
        # silently dropped the last line of files without a trailing newline.
        text = '\n'.join(body_lines).strip('\n')
        word_vecs = np.array([gloves[word] for word in words(text) if word in gloves])
        centroid = np.mean(word_vecs, axis=0)
        article_info.append((file, title, text, centroid))
    return article_info

In [18]:
import codecs
import os
import re
import string

def load_glove(filename):
    """
    Read the indicated GloVe file and return a dictionary mapping
    word:vector where vectors are of numpy `array` type.
    GloVe file lines are of the form:

    the 0.418 0.24968 -0.41242 0.1217 ...

    Each line is split on whitespace; the first element is the word and the
    remaining elements are the vector components. Vectors of any length are
    accepted. Blank lines are skipped.

    :param filename: path to a UTF-8 encoded GloVe text file
    :return: dict mapping each word to a float numpy array
    """
    gloves = {}
    # Explicit UTF-8: otherwise open() falls back to the locale's encoding,
    # which is not UTF-8 on every platform.
    with open(filename, encoding="utf-8") as file:
        # Stream the handle line by line; readlines() would materialize the
        # entire file as a list of strings first.
        for line in file:
            elem = line.split()
            if not elem:
                # Nothing to index on a blank line (elem[0] would raise).
                continue
            gloves[elem[0]] = np.array(elem[1:], dtype=float)
    return gloves


def filelist(root):
    """
    Return the path of every file found anywhere below root.

    Each path is the walked directory joined with the file name, so it starts
    with root exactly as it was passed in. Order follows os.walk.
    """
    return [
        os.path.join(dirpath, fname)
        for dirpath, _dirnames, fnames in os.walk(root)
        for fname in fnames
    ]


def get_text(filename):
    """
    Load and return the text of a text file, assuming latin-1 encoding as that
    is what the BBC corpus uses.  Uses codecs.open() rather than open().

    :param filename: path of the file to read
    :return: the full decoded contents as a single string
    """
    # The with-block closes the handle even when read() raises; the previous
    # explicit close() was skipped on error.
    with codecs.open(filename, encoding="latin-1", mode="r") as f:
        return f.read()


def words(text):
    """
    Given a string, return a list of normalized words.

    The text is lowercased, then every punctuation character (all of
    string.punctuation), digit, carriage return, tab and newline is replaced
    by a space. The result is split on whitespace, words shorter than
    3 characters are dropped, and English stop words are removed.

    :param text: the raw text to tokenize
    :return: list of lowercase word strings, in order of appearance
    """
    # Escape the punctuation before building the character class. Left raw,
    # the backslash in string.punctuation escapes the ']' that follows it, so
    # the backslash itself was missing from the set and survived in tokens.
    regex = "[" + re.escape(string.punctuation) + "0-9\\r\\t\\n]"
    cleaned = re.sub(regex, " ", text.lower())
    candidates = [w for w in cleaned.split() if len(w) > 2]
    return [w for w in candidates if w not in ENGLISH_STOP_WORDS]


def doc2vec(text, gloves):
    """
    Return the word vector centroid for the text: the mean of the vectors of
    every word of the text that appears in gloves. Words not in gloves are
    ignored.

    When no word of the text is in gloves, a zero vector with the same shape
    as the vectors in gloves is returned (an empty array if gloves itself is
    empty), so callers always receive an array instead of NaN.

    :param text: the document text
    :param gloves: dict mapping word -> numpy vector
    :return: numpy array holding the centroid
    """
    word_vecs = [gloves[w] for w in words(text) if w in gloves]
    if not word_vecs:
        # np.mean over an empty array yields a scalar NaN plus a
        # RuntimeWarning; NaN distances then make any sort by distance
        # unreliable. Fall back to a zero vector of the embedding shape.
        sample = next(iter(gloves.values()), None)
        return np.zeros(0) if sample is None else np.zeros(np.shape(sample))
    return np.mean(np.array(word_vecs), axis=0)


def load_articles(articles_dirname, gloves):
    """
    Load all .txt files under articles_dirname and return a table (list of tuples)
    where each record is:

      (filename, title, article-text-minus-title, wordvec-centroid-for-article-text)

    We use the gloves parameter to compute the word vectors and centroid.

    The filename is the path produced by filelist(), i.e. it includes
    articles_dirname as its prefix.

    The first line of a file is its title. When computing the vector for each
    document, use just the text, not the text and title.

    Files that do not end in .txt (any letter case) and README.TXT files are
    skipped.
    """
    article_info = []
    for file in filelist(articles_dirname):
        name = file.lower()
        # Only .txt files are articles; without this check stray files in the
        # corpus tree (e.g. OS metadata files) were parsed as articles too.
        if not name.endswith(".txt") or name.endswith("readme.txt"):
            continue
        title, *body_lines = get_text(file).split("\n")
        text = "\n".join(body_lines).strip("\n")
        centroid = doc2vec(text, gloves)
        article_info.append((file, title, text, centroid))
    return article_info


def distances(article, articles):
    """
    Measure how far article is from each of the other records in articles.

    Records are tuples whose element 0 is the filename and element 3 is the
    centroid vector. Any record sharing article's filename is left out.
    Returns a list of (euclidean_distance, record) tuples in the order the
    records appear in articles.
    """
    query_name = article[0]
    query_vec = article[3]
    return [
        (np.linalg.norm(candidate[3] - query_vec), candidate)
        for candidate in articles
        if candidate[0] != query_name
    ]


def recommended(article, articles, n):
    """
    Pick the n records of articles nearest to article.

    Nearness is the distance reported by distances(); the result holds the
    records themselves (filename, title, etc...), closest first. The article
    is expected to be one of the tuples from the articles list.
    """
    ranked = list(distances(article, articles))
    # Order by the distance component only; records are never compared.
    ranked.sort(key=lambda pair: pair[0])
    return [pair[1] for pair in ranked[:n]]


In [5]:
gloves = load_glove('data/glove.6B.300d.txt')

In [6]:
articles = load_articles('data/bbc', gloves)

In [14]:
articles[1863][1]

'Google launches TV search service'

In [20]:
# Centroid of the query article (index 1863 in `articles`).
original_centroid = articles[1863][-1]


def comp_dist(centroid):
    """Euclidean distance from `centroid` to the query article's centroid."""
    return np.linalg.norm(centroid - original_centroid)


# (filename, title, distance rounded to 3 places) for the ten nearest articles.
[(rec[0], rec[1], round(comp_dist(rec[-1]), 3))
 for rec in recommended(articles[1863], articles, 10)]

[('data/bbc/tech/010.txt', "Google's toolbar sparks concern", 1.042),
 ('data/bbc/tech/011.txt', 'UK net users leading TV downloads', 1.054),
 ('data/bbc/tech/053.txt', 'Microsoft launches its own search', 1.104),
 ('data/bbc/tech/213.txt', 'Search sites get closer to users', 1.133),
 ('data/bbc/tech/267.txt', 'The year search became personal', 1.151),
 ('data/bbc/tech/108.txt', 'Search wars hit desktop PCs', 1.166),
 ('data/bbc/tech/269.txt', 'Yahoo moves into desktop search', 1.18),
 ('data/bbc/tech/186.txt', "China 'blocks Google news site'", 1.189),
 ('data/bbc/tech/394.txt', "TV's future down the phone line", 1.192),
 ('data/bbc/tech/216.txt', 'TV future in the hands of viewers', 1.2)]

In [11]:
# Print the position in `articles` of every record whose filename (element 0)
# is 'data/bbc/tech/076.txt'.
# NOTE(review): positions follow the order filelist() returned the files in,
# so a printed index is presumably only valid for this particular load.
for i, article in enumerate(articles):
    if article[0] == 'data/bbc/tech/076.txt':
        print(i)

1863


In [10]:
articles[0][0]

'data/bbc/entertainment/289.txt'