# Question 3 - Text Document Similarity

## Program code

In [2]:
# imports
import numpy as np
import math

In [3]:
# define the Corpus class used to run the text document similarity program
class Corpus:
    """
    Class to compute to the document similarity between different documents.
    """

    def __init__(self, name, lowercase=True, remove_punctuation=True, remove_numbers=True):
        """
        Initiates a new Corpus object.
        Also defines what string cleaning processes should be done on the Corpus documents and search document.
        
        Args:
            name: Name of the new corpus, can be of any type (string, float etc.)
            lowercase: Whether to transform the documents to lowercase (default: True) 
            remove_punctuation: Whether to remove punctuation from the documents (default: True)
            remove_numbers: Whether to remove numbers from the documents (default: True)
            
        """
        self.name = name
        self.documents = []
        self.word_dict = {}
        self.array_all_vectors = None
        self.lowercase = lowercase
        self.remove_punctuation = remove_punctuation 
        self.remove_numbers = remove_numbers


    def add_doc(self, document):
        """
        Funtion to add new documents to the corpus.
        When the document has been removed, the word vectors and word dictionary is recalculated for the corpus.
        
        Args:
            document: Document to be added to the corpus, can be a list of documents or a single document. 

        Returns:
            None  
        """
        
        # add a list of documents to the corpus
        # checks if input is a list of strings
        if ( isinstance(document, list) ) and ( all(isinstance(item, str) for item in document) ):
            # add the list of documents to the corpus
            self.documents.extend(document)
            
            # re-calculate word dictionary and word vectors for new Corpus
            self.create_vector_representation()
        
        # add one single document to the corpus
        # checks if input is a string
        elif isinstance(document, str):
            # add the document to the corpus
            self.documents.append(document)
            
            # re-calculate word dictionary and word vectors for new Corpus
            self.create_vector_representation()
            
        # print an error if input is neither string or list of strings
        else:
            print(f"Error: Input document(s) must be a string or a list of strings. No new documents added to the {self.name} corpus, please try again.")
            
        return

            
    def remove_doc(self, document):
        """
        Funtion to remove a document from the corpus. 
        When the document has been removed, the word vectors and word dictionary is recalculated for the corpus.
        
        Args:
            document: Document to be added to the corpus, can be a list of documents or a single document. 

        Returns:
            None
        """
        
        # remove documents from the corpus
        try:
            self.documents.remove(document)
            
            # re-calculate word dictionary and word vectors for new Corpus
            self.create_vector_representation()
        
        # try/except is appropriate 
        except ValueError:
            if type(document) == str:
                print(f"Error: Input document does not exist in the {self.name} corpus, please try again.")
            elif type(document) == list:
                print(f"Error: Only one document can be removed at a time, please try again.")
            else:
                print("Error: Invalid input entered. The input must be a single document, please try again.")
        
        return
    
    
    def show_docs(self):
        """ 
        Function to print out a list of all documents in the Corpus
        
        Args:
            None
            
        Return:
            None
            
        Print to screen:
            List of all documents in the Corpus
        """
        
        nl = '\n -> '

        print(f"All documents in the {self.name} Corpus: {nl}{nl.join(self.documents)}\n")

        return
            
    def create_vector_representation(self):
        """
        Helper function to compute a dictionary of the corpus and 
        create a vector represention of the documents in the corpus. 
        
        Args:
            None

        Returns:
            Vector representation of the documents in the corpus.
        """

        # create dictionary of words with unique words as key and count as values
        for document in self.documents:
            
            # make the text lowercase to not differenciate between upper and lowercase words
            if self.lowercase == True:
                document = document.lower()
                        
            if self.remove_punctuation == True:

                # remove puntuation
                punctuation_all = "!#$%&\'()*+,-./:;<=>?@[\]^_`{|}~"

                # replace all punctuation with a empty space
                for punctuation in punctuation_all:
                    document = document.replace(punctuation, ' ')
                
            if self.remove_numbers == True:
                
                # remove numbers from the document
                numbers = ['0123456789']
                
                for number in numbers:
                    document = document.replace(number, ' ')
            
            # Split the document into single words and iterate over the words to check if it appears in the corpus
            for word in document.split(): # split makes spaces to commas by default 
                
                # Count how many times the word appears in the corpus and set it as a value of the dictioary.
                if word in self.word_dict:
                    self.word_dict[word] += 1
                
                else:
                    self.word_dict[word] = 1

        # create an empty list to store the vectors from the next step
        list_all_vectors = []
        
        # create vector out of given dictionary of words
        for document in self.documents:
            
            if self.lowercase == True:
                document = document.lower()
                        
            if self.remove_punctuation == True:

                # remove puntuation
                punctuation_all = "!#$%&\'()*+,-./:;<=>?@[\]^_`{|}~"

                # replace all punctuation with a empty space
                for punctuation in punctuation_all:
                    document = document.replace(punctuation, ' ')
                
            if self.remove_numbers == True:
                
                # remove numbers from the document
                numbers = ['0123456789']
                
                for number in numbers:
                    document = document.replace(number, ' ')
            
            vector_list = []
            # creates a vector of 0 and 1's for every word in the dict. If the word was in the text, then 1, if not, then 0
            for word in self.word_dict:

                if word in document:
                    vector_list.append(1)

                else:
                    vector_list.append(0) 
            
            # append the vector representation of each text to the list of all vectors
            list_all_vectors.append(vector_list)

        # create numpy array out of the list of lists (vector)
        self.array_all_vectors = np.array(list_all_vectors)
        
        return self.array_all_vectors # statement could be removed (can be used to print the vector representation)

    
    def create_vector_search_document(self, search_document):
        """
        Helper function to create vector representation out of the search document.
        
        Args:
            search_document: Document that should be compared with the documents in the corpus.

        Returns:
            Vector representation of the search document. 
        """
        vector_search_document = []
        
        # make the text lowercase to not differenciate between upper and lowercase words
        if self.lowercase == True:
            search_document = search_document.lower()
                
        # remove punctuation
        if self.remove_punctuation == True:
            
            punctuation_all = "!#$%&\'()*+,-./:;<=>?@[\]^_`{|}~"
            
            # replace all punctuation with a empty space
            for punctuation in punctuation_all:
                document = search_document.replace(punctuation, ' ')
                
        if self.remove_numbers == True:
                
            # remove numbers from the document
            numbers = ['0123456789']
            
            for number in numbers:
                document = document.replace(number, ' ')
        
        for word in self.word_dict:
            
            # if a word from the corpus exists in the document add a 1 to the vector
            if word in search_document:
                vector_search_document.append(1)
            
            # if a word from the corpus does not exists in the document add a 0 to the vector
            else:
                vector_search_document.append(0)
        
        return vector_search_document
        
    
    def similarity_euc_distance(self, search_document):
        """
        Helper function to calculate the euclidean distance between the words in the corpus and the search document.
        
        Args:
            search_document: Document that should be compared with the documents in the corpus.

        Returns:
            A list of the documents in the corpus, ordered by similarity to the search document.
        """
        
        vector_search_document = self.create_vector_search_document(search_document)

        # Euclidean distance, low distance means documents are similar - using NumPy
        #euc_dist_numpy = np.linalg.norm(self.array_all_vectors - vector_search_document, axis=1)

        # Euclidean distance, low distance means documents are similar - using math.dist
        euc_dist = np.array([])
        for i in range(len(self.array_all_vectors)):
            euc_dist = np.append(euc_dist, math.dist(self.array_all_vectors[i], vector_search_document))

        # create a sorted list of tuples containing the number of words in common and the corresponding document
        similarity_list = sorted(zip(euc_dist, self.documents), reverse=False) #ascending order 

        # extract the second element of the tuples from the list above 
        similar_documents = [x[1] for x in similarity_list]

        return similar_documents

        
    def similarity_dot_product(self, search_document):
        """
        Helper function to calculate the dot product between the words in the corpus and the search document.
        
        Args:
            search_document: Document that should be compared with the documents in the corpus.

        Returns:
            A list of the documents in the corpus, ordered by similarity to the search document.
        """
        
        vector_search_document = self.create_vector_search_document(search_document)

        # calculate the dot product between the two vectors, high value means documents are similar
        dot_product = np.dot(self.array_all_vectors, vector_search_document)

        # create a sorted list of tuples containing the number of words in common and the corresponding document
        similarity_list = sorted(zip(dot_product, self.documents), reverse=True)

        # extract the second element of the tuples from the list above 
        similar_documents = [x[1] for x in similarity_list]

        return similar_documents
        
    
    def show_similar_documents(self, similarity_type, search_document, top_n=False):
        """
        Function to print a list similar documents to the search document, in descending order of similarity.
        
        Args:
            similarity_type: Which distance/similarity measure to use in the similarity calculation
                           "euc" will find the Euclidean distance
                           "dot" will find the dot product
            search_document: Document that should be compared with the documents in the corpus.
            top_n: Number of similar documents to show, if False will show all documetns (Default: False)

        Returns:
            None
            
        Prints to screen:
            A list of the documents in the corpus, ordered by similarity to the search document.
        """
        
        nl = '\n -> '
        
        # check for invalid input
        if type(search_document) == str:
            
            # to show all results results:
            if top_n == False:

                # check which similarity calculation is to be used
                if similarity_type == "euc":
                    print(f"Similar documents using Euclidean distance (descending similarity, all documents shown):{nl}{nl.join(self.similarity_euc_distance(search_document))}\n")

                elif similarity_type == "dot":
                    print(f"Similar documents using dot product (descending similarity, all documents shown):{nl}{nl.join(self.similarity_dot_product(search_document))}\n")

                # give error if neither "euc" or "dot" given as input to similarity_type
                else:
                    print("Error: Invald input for similarity_type. Choose either \"dot\" for dot product or \"euc\" for Euclidean distance.")

            # show a specific number of results
            elif type(top_n) == int:

                # check that top_n is not bigger than number of documents
                # if bigger then make top_n equal to number of documents
                if top_n > len(self.documents):
                    top_n = len(self.documents)


                # check which similarity calculation is to be used
                if similarity_type == "euc":
                    print(f"The {top_n} most similar documents using Euclidean distance (descending similarity):{nl}{nl.join(self.similarity_euc_distance(search_document)[:top_n])}\n")

                elif similarity_type == "dot":
                    print(f"The {top_n} most similar documents using dot product (descending similarity):{nl}{nl.join(self.similarity_dot_product(search_document)[:top_n])}\n")

                # give error if neither "euc" or "dot" given as input to similarity_type
                else:
                    print("Error: Invald input for similarity_type. Choose either \"dot\" for dot product or \"euc\" for Euclidean distance.")

            else:
                print("Error: Invalid input for top_n given. Please enter an integer value and try again.")
        
        else:
            print('Error: The input given for search document is invalid. Please give a document or string and try again.')
            
        return
    
    
    def get_most_used_words(self, top_n=10): # returns it (help the next function)
        """
        Helper function to get the most used words the corpus.
        
        Args:
            top_n: Number of results (Default: 10)

        Returns:
            A list of the most used words in the corpus, sorted by frequency in descending order.
        """
        
        if type(top_n) == int:
            # sort words based on the frequency they appear in the corpus in descending order
            # the second element is the number of times the word appears therefore X[1]
            sorted_words = sorted(self.word_dict.items(), key=lambda x: x[1], reverse=True) #????

            # get the top_n words from the sorted words
            top_words = [word[0] for word in sorted_words][:top_n]

            return top_words
        
        else:
            print('Error: Invalid input given for top_n. Please enter an integer value and try again.')
            return 

    def show_most_used_words(self, top_n=10): # showing the nice list - prints to the output
        """
        Function to show the most used words the corpus to the user.
        
        Args:
            top_n: Number of results (Default: 10)

        Returns:
            None
            
        Prints to screen:
            A list of the most used words in the corpus, sorted by frequency in descending order.
        """
        
        if type(top_n) == int:
            print(f"The {top_n} most common words used in the corpus:")
            print(f"{self.get_most_used_words(top_n)}\n")

        else:
            print('Error: Invalid input given for top_n. Please enter an integer value and try again.')
            
        return         
    
    
    def __str__(self):
        """
        Function to overide the string representation for an object of the Corpus class, when the user uses print()
        """
        return f"Name of the Corpus:\n{self.name}\n\nNumber of documents in the Corpus:\n{len(self.documents)}\n\nPlease use the .show_docs() method to see a list of all documents in this Corpus.\n"


## Testing the program

In [4]:
#### Creating a Corpus ####

# create some test documents
document1 = "Hello this is an Example sentence."
document2 = "The sun is shining today."
document3 = "Hello! The sun is shining for example."
document4 = "This text has almost nothing to do with the other ones."
document5 = "This really long sentence has many words and is longer than the above sentences" 

# create search document
search_document = "Hello this is a good example for a sun."

# initiate a corpus
Corpus1 = Corpus('Newspaper')

# add either a list of documents or a single document to the corpus
Corpus1.add_doc([document1, document2, document3, document5])
Corpus1.add_doc(document4)

# get information on the Corpus created
print(Corpus1)
Corpus1.show_docs()
print(f"The search document is:\n{search_document}\n")


#### Running the similarity program on the new Corpus ####

# Find the most used words in a corpus
Corpus1.show_most_used_words(5)

# compare the search document with the documents contained in the corpus - dot product
Corpus1.show_similar_documents("dot", search_document)

# compare the search document with the documents contained in the corpus - euclidean, only top 3 results
Corpus1.show_similar_documents("euc", search_document, top_n=3)

Name of the Corpus:
Newspaper

Number of documents in the Corpus:
5

Please use the .show_docs() method to see a list of all documents in this Corpus.

All documents in the Newspaper Corpus: 
 -> Hello this is an Example sentence.
 -> The sun is shining today.
 -> Hello! The sun is shining for example.
 -> This really long sentence has many words and is longer than the above sentences
 -> This text has almost nothing to do with the other ones.

The search document is:
Hello this is a good example for a sun.

The 5 most common words used in the corpus:
['is', 'the', 'this', 'hello', 'example']

Similar documents using dot product (descending similarity, all documents shown):
 -> Hello! The sun is shining for example.
 -> Hello this is an Example sentence.
 -> This text has almost nothing to do with the other ones.
 -> This really long sentence has many words and is longer than the above sentences
 -> The sun is shining today.

The 3 most similar documents using Euclidean distance (desce

## Testing the error-handling

In [5]:
# testing error-handling 
Corpus1.add_doc(3)
print()

Corpus1.remove_doc("hi")
Corpus1.remove_doc(["hi", "hello"])
Corpus1.remove_doc(34)
print()

Corpus1.show_similar_documents("dot", 24, top_n=False)
Corpus1.show_similar_documents("dot", search_document, top_n="five")
Corpus1.show_similar_documents("manhattan", search_document, top_n=False)
print()

Corpus1.show_most_used_words("five")

Error: Input document(s) must be a string or a list of strings. No new documents added to the Newspaper corpus, please try again.

Error: Input document does not exist in the Newspaper corpus, please try again.
Error: Only one document can be removed at a time, please try again.
Error: Invalid input entered. The input must be a single document, please try again.

Error: The input given for search document is invalid. Please give a document or string and try again.
Error: Invalid input for top_n given. Please enter an integer value and try again.
Error: Invald input for similarity_type. Choose either "dot" for dot product or "euc" for Euclidean distance.

Error: Invalid input given for top_n. Please enter an integer value and try again.


## Testing with longer text document examples

In [6]:
# import some test documents
docs = []
for i in range(1,7):
    with open(f'DS_FinalProject_Question3_Documents/Doc{i}') as f:
        doc = f.read()
        docs.append(doc)

# import search document
with open('DS_FinalProject_Question3_Documents/Doc1') as f:
    search_doc = f.read()
    
# initialise new corpus
Corpus2 = Corpus('Stories')

# add documents to corpus
Corpus2.add_doc(docs)

# search document was also included when intially loaded, so remove from corpus
Corpus2.remove_doc(search_doc)

# see the Corpus
print(Corpus2)

# create vector representation of corpus
Corpus2.create_vector_representation()

# show top 2 similar documents, using dot product calculation
Corpus2.show_similar_documents("euc", search_document, top_n=2)

# show 7 most used words
Corpus2.show_most_used_words(7)

Name of the Corpus:
Stories

Number of documents in the Corpus:
5

Please use the .show_docs() method to see a list of all documents in this Corpus.

The 2 most similar documents using Euclidean distance (descending similarity):
 -> April seriously wondered about her sleeping partner choices. She looked at her bed and what a mess it had become. How did she get to the point in her life where she had two dogs, three cats, and a raccoon sleeping with her every night?
 -> He stepped away from the mic. This was the best take he had done so far, but something seemed missing. Then it struck him all at once. Visuals ran in front of his eyes and music rang in his ears. His eager fingers went to work in an attempt to capture his thoughts hoping the results would produce something that was at least half their glory.


The 7 most common words used in the corpus:
['the', 'a', 'i', 'to', 'and', 'my', 'out']

