# An Inefficient Vector Space Model

In [1]:
# Import defaultdict from the collections module.
# defaultdict is a dictionary-like object which provides a default value for a key that does not exist.
# so there is no error if the key does not exist
from collections import defaultdict

# Import log and sqrt functions from the math module.
# log is used for calculating logarithms and sqrt for calculating square roots.
from math import log, sqrt

# Import the re module for regular expression operations.
# This module provides support for regular expressions (pattern matching in strings).
import re


The dataset is the TIME dataset, available at http://ir.dcs.gla.ac.uk/resources/test_collections/time/

In [26]:
def import_dataset(path='data/TIME.ALL'):
    """
    Load the TIME corpus and tokenize it into one word list per article.

    The file is read line by line. A line starting with ``*TEXT`` marks the
    beginning of a new article and is itself discarded. Every other line has
    all characters that are neither ASCII letters nor whitespace removed and
    is then split on whitespace; the resulting words are appended to the
    current article.

    Args:
        path: location of the corpus file. Defaults to ``'data/TIME.ALL'``,
            the path this notebook has always used.

    Returns:
        list[list[str]]: one list of words per article, in file order.
        Articles that end up with no words are skipped.

    Notes:
        * Words found before the first ``*TEXT`` line are kept as an article
          of their own.
        * NOTE(review): only ``*TEXT`` is treated as a control line. Any other
          ``*``-prefixed line (e.g. an end-of-file marker, if the data file
          has one) is tokenized like ordinary text -- confirm against the
          data file.
    """
    articles = []
    # Words collected so far for the article currently being read.
    current = []

    with open(path, 'r') as f:
        for row in f:
            if row.startswith("*TEXT"):
                # Header of the next article: flush the previous one, if any.
                if current:
                    articles.append(current)
                current = []
            else:
                # Drop digits and punctuation, keep letters and whitespace,
                # then split into words.
                current.extend(re.sub(r'[^a-zA-Z\s]+', '', row).split())

    # The last article has no following '*TEXT' line to flush it.
    if current:
        articles.append(current)

    return articles

In [4]:
def make_inverted_index(articles):
    """
    Build a plain (non-positional) inverted index.

    A document's ID is its position in ``articles``. The index maps each
    term to the set of IDs of the documents in which the term occurs.

    Args:
        articles: iterable of documents, each an iterable of terms.

    Returns:
        defaultdict(set): term -> set of docIDs. Because the result is a
        defaultdict, looking up an unknown term yields an empty set instead
        of raising KeyError.
    """
    postings = defaultdict(set)

    # Flatten the corpus into (docID, term) pairs and record each pair.
    # Sets absorb repeated occurrences of a term within one document.
    occurrences = (
        (doc_number, token)
        for doc_number, tokens in enumerate(articles)
        for token in tokens
    )
    for doc_number, token in occurrences:
        postings[token].add(doc_number)

    return postings

In [46]:
# Scratch demo: a defaultdict whose missing keys default to an empty set.
index = defaultdict(set)

In [47]:
# Accessing a key that does not exist yet creates its empty set, so .add()
# works without initializing the key first.
index["home"].add(1)
index["home"].add(2)
index["yes"].add(2)
index["more"].add(3)

In [48]:
# Display the demo index.
index

defaultdict(set, {'home': {1, 2}, 'yes': {2}, 'more': {3}})

In [49]:
def make_positional_index(articles):
    """
    Build a positional inverted index.

    A document's ID is its position in ``articles`` and a term's position is
    its 0-based offset inside the document. For every term the index holds a
    dictionary mapping each docID that contains the term to the list of
    positions (in increasing order) at which the term occurs there.

    Args:
        articles: iterable of documents, each an iterable of terms.

    Returns:
        defaultdict(dict): term -> {docID: [position, ...]}.
    """
    positional = defaultdict(dict)

    for doc_number, tokens in enumerate(articles):
        for offset, token in enumerate(tokens):
            # setdefault creates the position list the first time this term
            # is seen in this document and returns the existing list after.
            positional[token].setdefault(doc_number, []).append(offset)

    return positional

In [51]:
# Toy corpus of two tokenized documents; "no" and "yes" occur in both.
articles = [["home","no","yes", "home"],[ "more", "advanced", "version","no","yes"]]

In [52]:
# Positional index of the current `articles`: term -> {docid: [positions]}.
make_positional_index(articles)

defaultdict(dict,
            {'home': {0: [0, 3]},
             'no': {0: [1], 1: [3]},
             'yes': {0: [2], 1: [4]},
             'more': {1: [0]},
             'advanced': {1: [1]},
             'version': {1: [2]}})

In [55]:
def documents_as_vectors(articles):
    """
    Represent every document as a dense TF-IDF vector.

    The weight of a term in a document is ``tf * idf`` where ``tf`` is the
    number of occurrences of the term in that document and
    ``idf = ln(N / df)``, with ``N`` the number of documents and ``df`` the
    number of documents containing the term. Terms absent from a document
    get weight 0.

    Every vector has an entry for every term of the collection, so the
    result takes O(#documents x #terms) space and is only suitable for small
    collections.

    Args:
        articles: list of documents, each a list of terms.

    Returns:
        list[dict]: one ``{term: tf-idf}`` dictionary per document, in the
        same order as ``articles``.
    """
    positional = make_positional_index(articles)
    total_docs = len(articles)

    # Inverse document frequency; len(postings) is the number of documents
    # in which the term appears.
    idf = {
        term: log(total_docs / len(postings))
        for term, postings in positional.items()
    }

    vectors = []
    for docid in range(total_docs):
        # Term frequency is the length of the term's position list for this
        # document; a document missing from the postings contributes 0.
        vectors.append({
            term: (len(postings[docid]) * idf[term] if docid in postings else 0)
            for term, postings in positional.items()
        })

    return vectors

In [56]:
# Build the positional index of the current `articles` and list its terms.
p_index = make_positional_index(articles)
p_index.keys()

dict_keys(['home', 'no', 'yes', 'more', 'advanced', 'version'])

In [57]:
# Term frequency = number of recorded positions: 'home' and 'no' in document 0.
len(p_index['home'][0]), len(p_index['no'][0])

(2, 1)

In [58]:
def show_document_vector(v, docid):
    """
    Print the non-zero TF-IDF weights of one document.

    Terms are listed from the highest to the lowest weight. Each line shows
    the term, its raw weight, and the weight divided by the Euclidean norm
    of the document's non-zero weights.

    Args:
        v: list of ``{term: weight}`` dictionaries, one per document.
        docid: index in ``v`` of the document to show.

    Returns:
        None. The output is written with ``print``.
    """
    doc_vector = v[docid]

    # Keep strictly positive weights only, heaviest first. sorted() is
    # stable, so equal weights keep their order from the dictionary.
    ranked = sorted(
        ((term, weight) for term, weight in doc_vector.items() if weight > 0),
        key=lambda pair: pair[1],
        reverse=True,
    )

    # Euclidean norm of the displayed weights.
    norm = sqrt(sum(weight**2 for _, weight in ranked))

    # With no positive weight the loop body never runs, so the zero norm is
    # never used as a divisor.
    for term, tfidf in ranked:
        print(f"{term}:\t{tfidf}\t(normalized: {tfidf/norm})")


In [59]:
# Example of usage
articles = import_dataset()
vectors = documents_as_vectors(articles)

In [60]:
# Number of articles loaded.
len(articles)

423

In [61]:
# Show the tokens of the first article joined back into one string.
" ".join(articles[0])

'THE ALLIES AFTER NASSAU IN DECEMBER THE US FIRST PROPOSED TO HELP NATO DEVELOP ITS OWN NUCLEAR STRIKE FORCE BUT EUROPE MADE NO ATTEMPT TO DEVISE A PLAN LAST WEEK AS THEY STUDIED THE NASSAU ACCORD BETWEEN PRESIDENT KENNEDY AND PRIME MINISTER MACMILLAN EUROPEANS SAW EMERGING THE FIRST OUTLINES OF THE NUCLEAR NATO THAT THE US WANTS AND WILL SUPPORT IT ALL SPRANG FROM THE ANGLOUS CRISIS OVER CANCELLATION OF THE BUGRIDDEN SKYBOLT MISSILE AND THE US OFFER TO SUPPLY BRITAIN AND FRANCE WITH THE PROVED POLARIS TIME DEC THE ONE ALLIED LEADER WHO UNRESERVEDLY WELCOMED THE POLARIS OFFER WAS HAROLD MACMILLAN WHO BY THUS KEEPING A SEPARATE NUCLEAR DETERRENT FOR BRITAIN HAD SAVED HIS OWN NECK BACK FROM NASSAU THE PRIME MINISTER BEAMED THAT BRITAIN NOW HAD A WEAPON THAT WILL LAST A GENERATION THE TERMS ARE VERY GOOD MANY OTHER BRITONS WERE NOT SO SURE THOUGH THE GOVERNMENT WILL SHOULDER NONE OF THE MILLION DEVELOPMENT COST OF POLARIS IT HAS ALREADY POURED MILLION INTO SKYBOLT AND WILL HAVE TO SPEND P

In [38]:
# Dense vector of article 0: one entry per term of the collection.
vectors[0]

{'THE': 0.0,
 'ALLIES': 5.042023308860233,
 'AFTER': 1.9206012227584774,
 'NASSAU': 16.40584811996386,
 'IN': 0.02366865010266244,
 'DECEMBER': 3.4083148494310196,
 'US': 11.311010314826785,
 'FIRST': 3.3297456857491694,
 'PROPOSED': 2.789275641024796,
 'TO': 0.06390535527718859,
 'HELP': 5.618954727451922,
 'NATO': 14.00280067405182,
 'DEVELOP': 7.9358612747328845,
 'ITS': 3.433808821427182,
 'OWN': 3.8391033752555757,
 'NUCLEAR': 22.63182545128017,
 'STRIKE': 5.2267699491222634,
 'FORCE': 13.730252417975137,
 'BUT': 0.4506549342286122,
 'EUROPE': 5.7607133820035585,
 'MADE': 1.1798377285906956,
 'NO': 3.024772342622422,
 'ATTEMPT': 2.4920241175568645,
 'DEVISE': 5.354224998486333,
 'A': 0.0,
 'PLAN': 2.1972245773362196,
 'LAST': 0.38691850583420445,
 'WEEK': 0.4671839016615207,
 'AS': 0.6673567705823108,
 'THEY': 1.3803705619380855,
 'STUDIED': 3.9679306373664422,
 'ACCORD': 4.437934266612178,
 'BETWEEN': 1.4029812799049053,
 'PRESIDENT': 2.254782506436306,
 'KENNEDY': 9.335200449367

In [62]:
# Print the non-zero weights of article 0, highest weight first.
show_document_vector(vectors, 0)

POLARIS:	23.375253845608476	(normalized: 0.23610685962782985)
NUCLEAR:	22.63182545128017	(normalized: 0.2285976986705899)
DE:	19.326040116876754	(normalized: 0.19520689149198198)
OFFER:	16.735653846148775	(normalized: 0.16904212889114578)
NASSAU:	16.40584811996386	(normalized: 0.16571085408184888)
GAULLE:	15.088863166549562	(normalized: 0.1524083597610793)
NATO:	14.00280067405182	(normalized: 0.1414383482199473)
FORCE:	13.730252417975137	(normalized: 0.13868541499987036)
DETERRENT:	13.633259397724078	(normalized: 0.13770571580308075)
SKYBOLT:	13.313802799836534	(normalized: 0.13447897462573138)
GOITALONE:	12.094744358092555	(normalized: 0.1221656084358262)
FRAPPE:	11.550442805130174	(normalized: 0.11666777165470212)
BRITAINS:	11.425424715928472	(normalized: 0.11540499912469891)
MACMILLAN:	11.31398541671231	(normalized: 0.11427938213029792)
US:	11.311010314826785	(normalized: 0.11424933146355849)
BRITAIN:	10.911150665198967	(normalized: 0.11021046169174444)
FLEET:	10.860670675484297	(no

In [29]:
# Dense vector of article 3; terms it does not contain have weight 0.
vectors[3]

{'THE': 0.0,
 'ALLIES': 0,
 'AFTER': 0,
 'NASSAU': 0,
 'IN': 0.004733730020532488,
 'DECEMBER': 0,
 'US': 2.2622020629653568,
 'FIRST': 0.8324364214372924,
 'PROPOSED': 0,
 'TO': 0.014201190061597466,
 'HELP': 0,
 'NATO': 0,
 'DEVELOP': 0,
 'ITS': 0,
 'OWN': 0,
 'NUCLEAR': 0,
 'STRIKE': 0,
 'FORCE': 0,
 'BUT': 0,
 'EUROPE': 0,
 'MADE': 0,
 'NO': 0,
 'ATTEMPT': 0,
 'DEVISE': 0,
 'A': 0.0,
 'PLAN': 0,
 'LAST': 0,
 'WEEK': 0,
 'AS': 0.3336783852911554,
 'THEY': 0.920247041292057,
 'STUDIED': 0,
 'ACCORD': 0,
 'BETWEEN': 0,
 'PRESIDENT': 0,
 'KENNEDY': 0,
 'AND': 0.02846987107545582,
 'PRIME': 0,
 'MINISTER': 0,
 'MACMILLAN': 0,
 'EUROPEANS': 0,
 'SAW': 0,
 'EMERGING': 0,
 'OUTLINES': 0,
 'OF': 0.0,
 'THAT': 0,
 'WANTS': 0,
 'WILL': 0,
 'SUPPORT': 0,
 'IT': 0.2154897017627612,
 'ALL': 0,
 'SPRANG': 0,
 'FROM': 0.5089091917999167,
 'ANGLOUS': 0,
 'CRISIS': 0,
 'OVER': 0,
 'CANCELLATION': 0,
 'BUGRIDDEN': 0,
 'SKYBOLT': 0,
 'MISSILE': 0,
 'OFFER': 0,
 'SUPPLY': 0,
 'BRITAIN': 0,
 'FRANCE': 0

In [30]:
# One vector per article.
len(vectors)

423

In [33]:
# Size of a single vector: one entry per distinct term in the collection.
len(vectors[4])

22496

In [43]:
def cosine_similarity(vec1, vec2):
    """
    Cosine of the angle between two term-weight vectors.

    Both vectors are ``{term: weight}`` dictionaries. A term missing from a
    vector is treated as having weight 0, so sparse vectors (e.g. a short
    query) and dense vectors can be mixed freely and in either order.

    Args:
        vec1: first vector as a ``{term: weight}`` mapping.
        vec2: second vector as a ``{term: weight}`` mapping.

    Returns:
        float: ``dot(vec1, vec2) / (|vec1| * |vec2|)``, or ``0.0`` when
        either vector has zero length (including an empty dictionary).
    """
    # Euclidean norm (magnitude) of each vector.
    norm1 = sqrt(sum(value**2 for value in vec1.values()))
    norm2 = sqrt(sum(value**2 for value in vec2.values()))

    # Avoid division by zero: a zero vector is similar to nothing.
    if norm1 == 0 or norm2 == 0:
        return 0.0

    # Only terms present in both vectors contribute to the dot product, so
    # walk the smaller dictionary and look each term up in the larger one.
    # .get(term, 0) keeps a term that is absent from the other vector from
    # raising KeyError.
    if len(vec1) <= len(vec2):
        smaller, larger = vec1, vec2
    else:
        smaller, larger = vec2, vec1
    dot_product = sum(weight * larger.get(term, 0) for term, weight in smaller.items())

    return dot_product / (norm1 * norm2)


In [63]:
# Build the TF-IDF vectors and compare a one-term query against article 0.
# NOTE(review): `vectors` is also computed in an earlier cell; this
# recomputation looks redundant -- confirm before removing.
vectors = documents_as_vectors(articles)
# The query is a sparse one-term vector; article 0's vector has an entry for
# every term of the collection.
sim = cosine_similarity({'ALLIES':1}, vectors[0])
print("Cosine similarity:", sim)


Cosine similarity: 0.050928058257171005
