In [1]:
import math
import string
import sys
import re
from collections import Counter


def read_file(filename):
    """Count the words in a text file.

    Every line is split on each character that is not an ASCII letter or
    digit; empty fragments are dropped and the remaining words are
    lower-cased before being counted.

    Args:
        filename: Path of the text file to read.

    Returns:
        A dictionary mapping each unique word to its number of occurrences in
        the file. Keys are inserted in ascending (alphabetical) order, so
        iterating over the dictionary yields the words sorted.

    Note:
        If the file cannot be opened or read (IOError), the error and the
        offending filename are printed and the interpreter exits through
        sys.exit().
    """
    word_counts = Counter()
    try:
        # The context manager closes the handle even when reading fails
        # part-way through; a bare open() leaked it.
        with open(filename) as fp:
            # Iterate over the file lazily rather than materialising every
            # line with readlines().
            for line in fp:
                word_counts.update(
                    x.lower() for x in re.split("[^A-Za-z0-9]", line) if x
                )
    except IOError as excObj:
        print(str(excObj))
        print("Error opening or reading input file: " + str(filename))
        sys.exit()
    # Dictionaries keep insertion order, so inserting the keys in sorted
    # order produces the "sorted dictionary" callers expect.
    return {word: word_counts[word] for word in sorted(word_counts)}


def inner_product(D1,D2):
    """Calculate the inner product between two vectors,
    in this case two documents represented as word-count dictionaries.

    Only words present in both dictionaries contribute to the result; each
    contributes the product of its two counts. The dictionaries may be in any
    key order: words are matched by lookup, so the result does not depend on
    the keys being sorted.

    Args:
        D1: Dictionary mapping word -> count for the first document.
        D2: Dictionary mapping word -> count for the second document.

    Returns:
        The inner product as a float (0.0 when the documents share no words
        or either dictionary is empty).
    """
    # Named `total` rather than `sum` so the builtin is not shadowed.
    total = 0.0
    for word, count in D1.items():
        if word in D2:
            # both vectors have this word
            total += count * D2[word]
    return total

def vector_angle(D1,D2):
    """Return the cosine of the angle between two document vectors.

    Despite its name, this function returns the cosine, not the angle itself:
    inner_product(D1, D2) divided by the product of the two vector norms.
    Apply math.acos to the result to obtain the angle in radians.

    Args:
        D1: Word-count dictionary of the first document.
        D2: Word-count dictionary of the second document.

    Returns:
        The cosine as a float, limited to the interval [-1.0, 1.0].

    Raises:
        ZeroDivisionError: If either vector has zero norm (for example an
            empty document), for which the cosine is undefined.
    """
    numerator = inner_product(D1,D2)
    denominator = math.sqrt(inner_product(D1,D1)*inner_product(D2,D2))
    if denominator == 0:
        # Same exception type the bare division raised, with a message that
        # says what actually went wrong.
        raise ZeroDivisionError(
            "cosine is undefined: at least one document vector has zero norm"
        )
    # Floating-point rounding can leave the quotient marginally outside
    # [-1, 1], which math.acos rejects; clamp it back into the valid domain.
    return max(min(numerator/denominator, 1.0), -1.0)

def document_similarity(filename_1, filename_2, verbose=True):
    """Compare two text files by the angle between their word-count vectors.

    Both files are read with read_file and the cosine between the resulting
    word-count dictionaries is computed with vector_angle.

    Args:
        filename_1: Path of the first text file.
        filename_2: Path of the second text file.
        verbose: When true, print the cosine and the corresponding angle in
            radians and in degrees.

    Returns:
        The cosine between the two documents as a float. It is returned
        whether or not verbose is set, so the result is usable when printing
        is switched off.
    """
    sorted_word_1 = read_file(filename_1)
    sorted_word_2 = read_file(filename_2)
    cosine = vector_angle(sorted_word_1,sorted_word_2)
    # Use f-strings; see https://realpython.com/python-f-strings/ for more information
    if verbose:
        # Compute the angle once. The cosine is limited to [-1, 1] first
        # because rounding error could otherwise make math.acos raise a
        # domain error.
        angle = math.acos(max(min(cosine, 1.0), -1.0))
        print(f"The cosine between the documents is {cosine : 0.6f}.")
        print(f"The angle between the documents is {angle : 0.6f} radians or {math.degrees(angle) : .0f} degrees.")
    return cosine

In [2]:
# Compare the two sample texts and print the similarity report.
document_similarity(
    filename_1='data/t5.churchill.txt',
    filename_2='data/t8.shakespeare.txt',
)

The cosine between the documents is  0.895120.
The angle between the documents is  0.462095 radians or  26 degrees.


In [5]:
# Time the whole pipeline: reading both files plus the cosine computation.
# verbose=False skips the printed report, so no printing is included in the timing.
%timeit document_similarity('data/t5.churchill.txt','data/t8.shakespeare.txt',verbose=False)

3.11 s ± 277 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
