In [12]:
import numpy as np
import random
import matplotlib.pyplot as plt
import re
%matplotlib inline

# Read documents from collection
The documents in the Reuters collection are in SGML format. So first, we need to be able to read SGML files.

In [442]:
class TT:
    """Tag Tuple index names"""
    TAG = 0
    META = 1
    SUBTAGS = 2
    CONTENT = 3

def ReadDocumentsFromFile(path, preview=0):
    global parts
    global partsIndex

    def ParseSGML(returnTag=None, allowDuplicates=True, root=True):
        """Parses *parts* until returnTag or the end of *parts* reached; returns parsed content as a map"""
        global parts
        global partsIndex

        # Force allowDuplicates "True" since "False" isn't fully implemented
        allowDuplicates = True

        subTags = dict()
        content = ""

        while partsIndex < numParts:
            part = parts[partsIndex]
            partsIndex += 1

            # Remove excess whitespace
            part = " ".join(part.split())
            if len(part) == 0:
                continue

            # Process tags
            if part[0] == "<":
                assert(part[-1] == ">")

                # Ignore comments
                if part[1] == "!":
                    continue

                # Return contents when *returnTag* is reached
                if part[1] == "/":
                    tag = part[2:-1]
                    assert(tag == returnTag)
                    return (subTags, content)

                # Add new tags to the stack
                tagAndMeta = part[1:-1].split()
                tag = tagAndMeta[0]

                # Store key-value pairs associated with the tag (formatted "KEY1=VALUE1 KEY2=VALUE2 ...")
                tagMeta = dict([x.split("=") for x in tagAndMeta[1::]])

                # Collect content and tags within the current tag
                (tagSubTags, tagContent) = ParseSGML(
                    returnTag=tag,
                    allowDuplicates=False,
                    root=False
                )

                tagTuple = (
                    tag,
                    tagMeta,
                    tagSubTags,
                    tagContent
                )

                if allowDuplicates:
                    if tag not in subTags:
                        subTags[tag] = list()
                    subTags[tag].append(tagTuple)
                else:
                    subTags[tag] = tagTuple

            # Process text
            else:
                content += (" " if len(content) > 0 else "") + part

        return (subTags, content)


    # Create one big string
    document = ""
    with open(path, "r") as file:
        for i, line in enumerate(file):
            document += line.replace('\n', ' ')

            # Show the first several lines of the file
            if i < preview:
                print('%d: %s' % (i, line), end="")

    # Show the last line as well
    if preview:
        print("...\n%d: %s" % (i, line))
    
    parts = re.split("(<.*?>)", document)
    numParts = len(parts)
    partsIndex = 0
    allData = ParseSGML()
    
    return allData[0]["REUTERS"]

Let's look through the data directory for .sgm files and extract their contents. Each file is supposed to contain 1000 documents.

In [216]:
def GetDataFilePaths(directory):
    from os import listdir
    from os.path import isfile, join
    paths = [filePath for filePath in listdir(directory) if isfile(join(directory, filePath)) and filePath[-4:] == ".sgm"]
    return paths

In [443]:
sgmlDocuments = []
previewLines = 20

filePaths = GetDataFilePaths("reuters21578/")
for filePath in filePaths:
    sgmlDocuments += ReadDocumentsFromFile("reuters21578/reut2-000.sgm", previewLines)
    previewLines = 0

0: <!DOCTYPE lewis SYSTEM "lewis.dtd">
1: <REUTERS TOPICS="YES" LEWISSPLIT="TRAIN" CGISPLIT="TRAINING-SET" OLDID="5544" NEWID="1">
2: <DATE>26-FEB-1987 15:01:01.79</DATE>
3: <TOPICS><D>cocoa</D></TOPICS>
4: <PLACES><D>el-salvador</D><D>usa</D><D>uruguay</D></PLACES>
5: <PEOPLE></PEOPLE>
6: <ORGS></ORGS>
7: <EXCHANGES></EXCHANGES>
8: <COMPANIES></COMPANIES>
9: <UNKNOWN> 
10: &#5;&#5;&#5;C T
11: &#22;&#22;&#1;f0704&#31;reute
12: u f BC-BAHIA-COCOA-REVIEW   02-26 0105</UNKNOWN>
13: <TEXT>&#2;
14: <TITLE>BAHIA COCOA REVIEW</TITLE>
15: <DATELINE>    SALVADOR, Feb 26 - </DATELINE><BODY>Showers continued throughout the week in
16: the Bahia cocoa zone, alleviating the drought since early
17: January and improving prospects for the coming temporao,
18: although normal humidity levels have not been restored,
19: Comissaria Smith said in its weekly review.
...
32720: </REUTERS>



In [377]:
numDocuments = len(sgmlDocuments)
print("Extracted %s documents from %d files" % ("{:,}".format(numDocuments), len(filePaths)))

Extracted 22,000 documents from 22 files


# Make an index
P115: Scoring algorithm
  1. "store N/df_t  at the head of the postings for t" to compute idf_t
  1. "store the term frequency tf_t,d for each postings entry"
  1. Use a heap as a priority queue for document scores

## Build an index

In [398]:
from collections import OrderedDict 

def ContentTokens(content):
    # Replace symbols and numbers with spaces
    content = "".join([(c if c.isalpha() else " ") for c in content if c != "'"])
    allTokens = content.split()
    return [x.lower() for x in allTokens if len(x) > 1 and x.replace("'", "a").isalpha()]

def CombinedTermCounts(a, b):
    for term, count in b.items():
        a[term] = (a[term] if term in a else 0) + count
    return a

def GetTermCounts(tagTuple):
    # Get term counts for this tag's content
    tokens = ContentTokens(tagTuple[TT.CONTENT])
    terms = set(tokens)
    termCounts = dict([(x, tokens.count(x)) for x in terms])

    # Accumulate term counts from sub tags
    for _, subTagTuples in tagTuple[TT.SUBTAGS].items():
        for subTagTuple in subTagTuples:
            subTagTermCounts = GetTermCounts(subTagTuple)
            termCounts = CombinedTermCounts(termCounts, subTagTermCounts)

    return termCounts

class TermPostings:
    """Postings list with metrics for a single term"""
    def __init__(self):
        self.postingsList = dict()
        self.cf = 0
        self.df = 0
        self.idf = 0
        self.log_idf = 0

In [394]:
# For each term, count occurrences in all documents
postings = dict()
documentTermCounts = [dict()] * numDocuments
for i, doc in enumerate(sgmlDocuments):
    termCounts = GetTermCounts(doc)
    documentTermCounts[i] = termCounts

    for term, count in termCounts.items():
        if term not in postings:
            postings[term] = TermPostings()
        postings[term].postingsList[i] = (count)

# Calculate frequency metrics
for _, termPostings in postings.items():
    termPostings.cf = np.sum([x for x in index['tin'].postingsList.values()])
    termPostings.idf = numDocuments / len(termPostings.postingsList)
    termPostings.log_idf = np.log(termPostings.idf)

# Sort by term
index = OrderedDict([(x, postings[x]) for x in sorted(postings)])

## Build document vectors
To make vector scoring possible, we create a normalized vector representation for each document. Indexing and general performance improvements for document vector usage are not covered until chapter 7, so for now we're using brute force methods.

Document vectors can be created by normalizing the document term count vectors -- we carry over the (term, count) format.

In [559]:
class WeightsVector:
    """Vector representation of a document or query"""
    def __init__(self, termWeights):
        self.termWeights = termWeights
        self.length = 1

    def CosineSimilarity(self, vector):
        # It's more efficient if the shorter vector does the calculations
        if len(self.termWeights) > len(vector.termWeights):
            return vector.CosineSimilarity(self)

        # Dot product
        dot = np.sum([self.termWeights[x] * vector.termWeights[x] for x in self.termWeights if x in vector.termWeights])
        return dot / (self.length * vector.length)

    @staticmethod
    def FromString(string):
        # Extract tokens and count terms
        tokens = ContentTokens(string)
        terms = set(tokens)
        termCounts = dict([(x, tokens.count(x)) for x in terms])
        return WeightsVector.FromTermCounts(termCounts)        

    @staticmethod
    def FromTermCounts(termCounts):
        # Normalize term counts
        vectorLength = np.sqrt(np.sum([np.square(x) for x in termCounts.values()]))
        termWeights = dict([
            (termCount[0], termCount[1] / vectorLength)
            for termCount in termCounts.items()
        ])
        return WeightsVector(termWeights)


In [501]:
documentVectors = [None] * numDocuments
for i, counts in enumerate(documentTermCounts):
    documentVectors[i] = WeightsVector.FromTermCounts(counts)

## Run some queries using vector space scoring

In [585]:
def GetSimilarVectors(weightsVector):
    # Score each document vector
    scores = dict()
    for i, documentVector in enumerate(documentVectors):
        scores[i] = queryVector.CosineSimilarity(documentVector)
    
    # Sort by highest score
    return sorted(scores.items(), key=lambda x: x[1], reverse=True)

def GetQueryVector(query):
    return WeightsVector.FromString(query)

def GetQueryScores(query):
    queryVector = GetQueryVector(query)
    return GetSimilarVectors(queryVector)

def GetBestResult(query):
    bestMatch = GetQueryScores(query)[0]
    if (bestMatch[1] > 0):
        return sgmlDocuments[bestMatch[0]]
    else:
        print("No documents contain any of the queried terms " + str(ContentTokens(query)))
        return None

In [573]:
GetBestResult("soup kitchen")

No documents contain any of the queried terms ['soup', 'kitchen']


In [582]:
GetBestResult("presidential election")

('REUTERS',
 {'CGISPLIT': '"TRAINING-SET"',
  'LEWISSPLIT': '"TRAIN"',
  'NEWID': '"158"',
  'OLDID': '"5701"',
  'TOPICS': '"NO"'},
 {'COMPANIES': [('COMPANIES', {}, {}, '')],
  'DATE': [('DATE', {}, {}, '26-FEB-1987 17:41:08.82')],
  'EXCHANGES': [('EXCHANGES', {}, {}, '')],
  'ORGS': [('ORGS', {}, {}, '')],
  'PEOPLE': [('PEOPLE', {}, {}, '')],
  'PLACES': [('PLACES', {}, {'D': [('D', {}, {}, 'usa')]}, '')],
  'TEXT': [('TEXT',
    {},
    {'BODY': [('BODY',
       {},
       {},
       'Presidential Airways Inc said its joint marketing and services agreement with Texas Air Corp\'s &lt;TXN> Continental Airlines unit was approved by the U.S. Department of Justice. According to the agreement, Presidential Airways will operate scheduled service under the name "Continental Express." The company, however, will remain independent. Reuter &#3;')],
     'DATELINE': [('DATELINE', {}, {}, 'WASHINGTON, FEB 26 -')],
     'TITLE': [('TITLE',
       {},
       {},
       'PRESIDENTIAL AIRWAYS &lt