# Simple bag of words searching

Exercise from a data science course.
The point of the exercise is to create a tool that finds the file best matching a sentence provided by the user.
The similarity between the sentence and a text file is calculated by comparing their bag-of-words vectors using the dot product (normalised by the vector lengths, i.e. cosine similarity).

In [1]:
# Necessary tools for opening, creating and saving files, parsing strings and performing computation

import numpy as np
import pickle
import string
import os

Let's list the contents of the 'gutenberg' directory, which contains several different text files.

In [2]:
# Show the names of the entries in the 'gutenberg' directory.
os.listdir(path='gutenberg')

['austen-emma.txt',
 'austen-persuasion.txt',
 'austen-sense.txt',
 'bible-kjv.txt',
 'blake-poems.txt',
 'bryant-stories.txt',
 'burgess-busterbrown.txt',
 'carroll-alice.txt',
 'chesterton-ball.txt',
 'chesterton-brown.txt',
 'chesterton-thursday.txt',
 'edgeworth-parents.txt',
 'melville-moby_dick.txt',
 'milton-paradise.txt',
 'shakespeare-caesar.txt',
 'shakespeare-hamlet.txt',
 'shakespeare-macbeth.txt',
 'whitman-leaves.txt']

We'll need to count all the distinct words and build a collection of them that lets us tell whether a given word appears in any of the texts.

In [3]:
# Translation table that deletes every ASCII punctuation character;
# pass it to str.translate() to strip punctuation from a text.
translator = str.maketrans('', '', string.punctuation)

In [4]:
# Build the corpus vocabulary: strip punctuation from every file and record
# each distinct lower-cased word as a key. A dict (rather than a set) is used
# because its values are later replaced by each word's vector index.
unique_words = {}
directory = 'gutenberg'
for file in os.listdir(directory):
    # `with` guarantees the file is closed even if reading raises.
    with open(os.path.join(directory, file)) as fp:
        text = fp.read()
    text = text.translate(translator).split()
    for word in text:
        # 0 is only a placeholder value at this stage.
        unique_words[word.lower()] = 0


In [5]:
# Number of distinct words across all text files; this is also the length of
# every bag-of-words vector. len() works on the dict directly, so no
# intermediate list of keys is needed.
no_words = len(unique_words)
no_words

57369

Now we'll create a bag-of-words vector representation of each text.

In [6]:
# Replace each placeholder value with the word's position in the dictionary,
# so that every word owns one slot of the numpy bag-of-words vectors.
vocabulary = tuple(unique_words)
unique_words.update(zip(vocabulary, range(len(vocabulary))))

In [7]:
# Serialize the word -> index dictionary to 'words.pickle'.
with open('words.pickle', mode='wb') as index_file:
    pickle.dump(unique_words, index_file)

In [8]:
# Build one bag-of-words count vector per text file and store the vectors in
# a dictionary keyed by file name.
docs = {}
for file in os.listdir(directory):
    # `with` guarantees the file is closed even if reading raises.
    with open(os.path.join(directory, file)) as fp:
        text = fp.read()
    text = text.translate(translator).split()
    # Fill a local vector and store it once, instead of looking up
    # docs[file] again for every single word.
    counts = np.zeros(no_words)
    for word in text:
        counts[unique_words[word.lower()]] += 1
    docs[file] = counts


In [9]:
# Serialize the file name -> vector dictionary to 'docs.pickle'.
with open('docs.pickle', mode='wb') as vectors_file:
    pickle.dump(docs, vectors_file)

In [10]:
# now let's create a function that will return the best matching title
def bow_search(text):
    """Return the name of the document most similar to ``text``.

    The query is stripped of punctuation, lower-cased and turned into a
    bag-of-words count vector over the module-level ``unique_words`` index.
    Every vector in the module-level ``docs`` dictionary is then scored
    against it by cosine similarity (dot product divided by the product of
    the two norms).

    Parameters
    ----------
    text : str
        Free-form query sentence.

    Returns
    -------
    str or None
        Key of the best-scoring entry in ``docs``; the first one wins on a
        tie. ``None`` when no query word is in the vocabulary, or when there
        is no document with a non-zero vector to compare against.
    """
    vector = np.zeros(no_words)
    for word in text.translate(translator).split():
        # Vocabulary keys are lower-cased, so normalise before the lookup;
        # otherwise capitalised query words would be silently ignored.
        index = unique_words.get(word.lower())
        if index is not None:
            vector[index] += 1

    # The query norm does not depend on the document: compute it once.
    query_norm = np.linalg.norm(vector)
    if query_norm == 0:
        return None

    best_title = None
    best_value = -1
    for title, doc_vector in docs.items():
        doc_norm = np.linalg.norm(doc_vector)
        if doc_norm == 0:
            # An all-zero document vector would give 0/0; skip it.
            continue
        similarity = np.dot(vector, doc_vector) / (query_norm * doc_norm)
        if similarity > best_value:
            best_title = title
            best_value = similarity
    return best_title

In [11]:
# Prompt for a query and print the name of the best-matching file
# (None is printed when bow_search finds no match).
text = input("enter text: ")
best_match = bow_search(text)
print(best_match)

enter text: paul
edgeworth-parents.txt
