# A Simple Boolean Retrieval System

In [1]:
from functools import total_ordering, reduce  
import csv  # Import the csv module for CSV file parsing
import re  # Import the re module for regular expression operations


### Postings

In [2]:
@total_ordering  # Derives <, <= and >= from the __eq__ and __gt__ defined below.
class Posting:
    """A single entry of a posting list: a reference to one document by its ID.

    Postings compare, order and hash by their document ID only.
    """

    def __init__(self, docID):
        """Store the ID of the document this posting refers to."""
        self._docID = docID

    def get_from_corpus(self, corpus):
        """Return ``corpus[docID]``, the document this posting points at.

        Any lookup error raised by ``corpus`` propagates unchanged.
        """
        return corpus[self._docID]

    def __eq__(self, other):
        """Return True when both postings carry the same document ID."""
        if not isinstance(other, Posting):
            # Let Python try the reflected operation (and finally fall back to
            # identity) instead of failing with AttributeError on other._docID.
            return NotImplemented
        return self._docID == other._docID

    def __gt__(self, other):
        """Return True when this posting's document ID is the greater one."""
        if not isinstance(other, Posting):
            return NotImplemented
        return self._docID > other._docID

    def __hash__(self):
        # Defining __eq__ alone sets __hash__ to None; restore hashing in a way
        # that is consistent with equality so postings can be used in sets.
        return hash(self._docID)

    def __repr__(self):
        """Return the document ID rendered as a string."""
        return str(self._docID)


In [16]:
# NOTE: both docIDs are strings here, so this compares "123" with "3333"
# lexicographically (character by character), not numerically.
Posting("123")<Posting("3333")

True

### Posting Lists

In [4]:
class PostingList:
    """An ordered collection of postings supporting merge, AND and OR.

    ``intersection`` and ``union`` are linear two-pointer walks, so they only
    produce correct set results when both lists are sorted in ascending order.
    """

    def __init__(self):
        """Create an empty posting list."""
        self._postings = []

    @classmethod
    def from_docID(cls, docID):
        """Build a posting list holding a single ``Posting`` for ``docID``."""
        plist = cls()
        plist._postings = [Posting(docID)]
        return plist

    @classmethod
    def from_posting_list(cls, postingList):
        """Wrap an existing list of postings.

        The list is stored as-is (not copied), so later in-place changes made
        through either reference are visible through the other.
        """
        plist = cls()
        plist._postings = postingList
        return plist

    def merge(self, other):
        """Append the postings of ``other`` to this list, in place.

        Leading postings of ``other`` that equal this list's current last
        posting are skipped, which avoids a duplicate when the same document
        is merged several times in a row. No other de-duplication or
        re-sorting is done.
        """
        if not self._postings:
            # Nothing to de-duplicate against; indexing [-1] here would raise
            # IndexError on an empty list.
            self._postings.extend(other._postings)
            return
        last = self._postings[-1]
        start = 0
        while start < len(other._postings) and last == other._postings[start]:
            start += 1
        self._postings.extend(other._postings[start:])

    def intersection(self, other):
        """Return a new ``PostingList`` with the postings present in both lists."""
        mine, theirs = self._postings, other._postings
        common = []
        i = j = 0
        while i < len(mine) and j < len(theirs):
            if mine[i] == theirs[j]:
                common.append(mine[i])
                i += 1
                j += 1
            elif mine[i] < theirs[j]:
                # Advance whichever side currently holds the smaller posting.
                i += 1
            else:
                j += 1
        return PostingList.from_posting_list(common)

    def union(self, other):
        """Return a new ``PostingList`` with the postings present in either list."""
        mine, theirs = self._postings, other._postings
        combined = []
        i = j = 0
        while i < len(mine) and j < len(theirs):
            if mine[i] == theirs[j]:
                # Shared posting: emit it once and advance both sides.
                combined.append(mine[i])
                i += 1
                j += 1
            elif mine[i] < theirs[j]:
                combined.append(mine[i])
                i += 1
            else:
                combined.append(theirs[j])
                j += 1
        # At most one of the two tails is non-empty at this point.
        combined.extend(mine[i:])
        combined.extend(theirs[j:])
        return PostingList.from_posting_list(combined)

    def get_from_corpus(self, corpus):
        """Return the list of documents obtained by resolving every posting in ``corpus``."""
        return [posting.get_from_corpus(corpus) for posting in self._postings]

    def __repr__(self):
        """Return the postings as a comma-separated string."""
        return ", ".join(map(str, self._postings))

### Terms

In [5]:
# Dedicated exception type so callers can tell a failed merge apart from other errors.
class ImpossibleMergeError(Exception):
    """Raised when a merge operation cannot be carried out."""

# total_ordering derives <, <= and >= from the __eq__ and __gt__ defined below.
@total_ordering
class Term:
    """A dictionary entry: a term string together with its posting list.

    Terms compare and order by their ``term`` string only; the posting list
    plays no part in comparisons.
    """

    def __init__(self, term, docID):
        """Create a term whose posting list initially holds only ``docID``."""
        self.term = term
        self.posting_list = PostingList.from_docID(docID)

    def merge(self, other):
        """Merge the posting list of ``other`` into this term, in place.

        Raises:
            ImpossibleMergeError: if the two objects represent different terms.
        """
        if self.term != other.term:
            raise ImpossibleMergeError(
                "cannot merge term {!r} into term {!r}".format(other.term, self.term)
            )
        self.posting_list.merge(other.posting_list)

    def __eq__(self, other):
        """Return True when both objects carry the same term string."""
        if not isinstance(other, Term):
            # Defer to Python's fallback instead of failing with
            # AttributeError on other.term.
            return NotImplemented
        return self.term == other.term

    def __gt__(self, other):
        """Return True when this term string sorts after the other one."""
        if not isinstance(other, Term):
            return NotImplemented
        return self.term > other.term

    def __repr__(self):
        """Return ``"<term>: <posting list>"``."""
        return self.term + ": " + repr(self.posting_list)

### Inverted Index

In [None]:
# Normalize text: strip punctuation (hyphens are kept) and convert to lowercase.
def normalize(text):
    """Return ``text`` lower-cased and with punctuation removed.

    Every character that is not a word character (letter, digit or
    underscore), whitespace or a hyphen is deleted. Hyphens are kept so that
    tokens such as "r2-d2" stay in one piece.
    """
    # The hyphen sits last in the class so it is literal. The previous pattern
    # had a second '^' inside the class, which is a literal caret there and
    # made '^' survive normalization.
    no_punctuation = re.sub(r'[^\w\s-]', '', text)
    return no_punctuation.lower()

# Turn a movie's description into the list of normalized words it contains.
def tokenize(movie):
    """Return the whitespace-separated tokens of ``movie.description``.

    The description is passed through ``normalize`` first, so the tokens are
    already in normalized form.
    """
    normalized = normalize(movie.description)
    return normalized.split()

# An inverted index: maps each term to the posting list of documents containing it.
class InvertedIndex:
    """Inverted index stored as a list of ``Term`` objects sorted by term string.

    Keeping ``_dictionary`` sorted is an invariant: ``__getitem__`` relies on
    it to locate terms with a binary search.
    """

    def __init__(self):
        """Create an empty index (an empty, trivially sorted list of terms)."""
        self._dictionary = []

    @classmethod
    def from_corpus(cls, corpus):
        """Build an index over ``corpus``.

        Each document's position in ``corpus`` (as produced by ``enumerate``)
        is used as its document ID. Progress is printed for every 1000th
        document.
        """
        # token -> Term accumulated so far; sorted into a list at the end.
        intermediate_dict = {}
        for docID, document in enumerate(corpus):
            for token in tokenize(document):
                term = Term(token, docID)
                existing = intermediate_dict.get(token)
                if existing is None:
                    intermediate_dict[token] = term
                else:
                    # Term.merge skips the posting if this document was
                    # already the last one recorded for the token.
                    existing.merge(term)
            if docID % 1000 == 0:
                print("ID: " + str(docID))
        idx = cls()
        idx._dictionary = sorted(intermediate_dict.values(), key=lambda term: term.term)
        return idx

    def __getitem__(self, key):
        """Return the posting list stored for the term ``key``.

        Raises:
            KeyError: if ``key`` is not a term of the index.
        """
        entries = self._dictionary
        lo, hi = 0, len(entries)
        try:
            # Binary search for the leftmost entry whose term is >= key.
            while lo < hi:
                mid = (lo + hi) // 2
                if entries[mid].term < key:
                    lo = mid + 1
                else:
                    hi = mid
        except TypeError:
            # A key that cannot be ordered against the stored terms cannot be
            # one of them; report it the same way as any other missing key.
            raise KeyError(key) from None
        if lo < len(entries) and entries[lo].term == key:
            return entries[lo].posting_list
        raise KeyError(key)

    def __repr__(self):
        """Return a short summary stating how many terms the index holds."""
        return "A dictionary with " + str(len(self._dictionary)) + " terms"


### Reading the Corpus

In [7]:
# A corpus document: one movie, identified by its title and carrying its description.
class MovieDescription:
    """Pairs a movie title with the text of its description."""

    def __init__(self, title, description):
        """Store ``title`` and ``description`` as public attributes."""
        self.description = description
        self.title = title

    def __repr__(self):
        """Represent the movie by its title alone."""
        return self.title
    
# Read plot summaries and attach to each one the title found in the metadata file.
def read_movie_descriptions(filename, movie_names_file):
    """Build the corpus as a list of ``MovieDescription`` objects.

    Both files are read as UTF-8, tab-separated text.

    Args:
        filename: path of the plot-summary file; column 0 holds the movie ID
            and column 1 the description.
        movie_names_file: path of the metadata file; column 0 holds the movie
            ID and column 2 the title.

    Returns:
        One ``MovieDescription`` per summary whose movie ID also appears in
        the metadata file, in the order the summaries are read. Summaries
        with an unknown ID, and rows with too few columns, are skipped.
    """
    # newline='' lets the csv module do its own newline handling, as its
    # documentation requires for file objects passed to csv.reader.
    names_table = {}
    with open(movie_names_file, 'r', encoding="utf8", newline='') as csv_file:
        for name in csv.reader(csv_file, delimiter='\t'):
            if len(name) < 3:
                # Blank or truncated row: it has no title column to read.
                continue
            names_table[name[0]] = name[2]

    corpus = []
    with open(filename, 'r', encoding="utf8", newline='') as csv_file:
        for desc in csv.reader(csv_file, delimiter='\t'):
            if len(desc) < 2:
                continue
            movie_id = desc[0]
            if movie_id not in names_table:
                # No title known for this summary; leave it out of the corpus.
                continue
            corpus.append(MovieDescription(names_table[movie_id], desc[1]))
    return corpus


### Putting it all together

In [None]:
# A Boolean (AND-only) information retrieval system over a corpus of documents.
class IRsystem:
    """Couples a corpus with an index over it and answers conjunctive queries."""

    def __init__(self, corpus, index):
        """Store the corpus and an index that was built over it."""
        self._corpus = corpus
        self._index = index

    @classmethod
    def from_corpus(cls, corpus):
        """Build an ``InvertedIndex`` for ``corpus`` and return the resulting system."""
        index = InvertedIndex.from_corpus(corpus)
        return cls(corpus, index)

    def answer_query(self, words):
        """Return the documents that contain every word in ``words``.

        Each word is passed through ``normalize`` before it is looked up, so
        it is matched against the index in normalized form.

        Args:
            words: iterable of query words, e.g. ``['cat', 'batman']``.

        Returns:
            The list of matching documents from the corpus; an empty list
            when ``words`` is empty.

        Raises:
            KeyError: propagated from the index lookup when a word is not
                present in the index.
        """
        words = list(words)
        if not words:
            # reduce() without an initial value raises TypeError on an empty
            # sequence; an empty query simply matches nothing.
            return []
        posting_lists = (self._index[normalize(word)] for word in words)
        # AND semantics: intersect the posting lists of all query words.
        matches = reduce(lambda acc, plist: acc.intersection(plist), posting_lists)
        return matches.get_from_corpus(self._corpus)
    
# Run a free-text query against an IR system and print the matches.
def query(ir, text):
    """Split ``text`` on whitespace, query ``ir`` and print one result per line.

    Nothing is returned; the matches are only written to standard output.
    """
    for hit in ir.answer_query(text.split()):
        print(hit)


In [17]:
# Load the corpus: plot summaries joined with the titles from the metadata file.
# Both paths are relative, so they resolve against the current working directory.
corpus = read_movie_descriptions(filename = 'data/plot_summaries.txt', movie_names_file = 'data/movie.metadata.tsv') 

In [18]:
# Build the inverted index over the whole corpus; a progress line is printed
# for every 1000th document ID.
ir = IRsystem.from_corpus(corpus)

ID: 0
ID: 1000
ID: 2000
ID: 3000
ID: 4000
ID: 5000
ID: 6000
ID: 7000
ID: 8000
ID: 9000
ID: 10000
ID: 11000
ID: 12000
ID: 13000
ID: 14000
ID: 15000
ID: 16000
ID: 17000
ID: 18000
ID: 19000
ID: 20000
ID: 21000
ID: 22000
ID: 23000
ID: 24000
ID: 25000
ID: 26000
ID: 27000
ID: 28000
ID: 29000
ID: 30000
ID: 31000
ID: 32000
ID: 33000
ID: 34000
ID: 35000
ID: 36000
ID: 37000
ID: 38000
ID: 39000
ID: 40000
ID: 41000
ID: 42000


In [19]:
# Single-word query: prints every movie whose description contains the word.
query(ir, "yoda")

Star Wars Episode V: The Empire Strikes Back
Star Wars Episode II: Attack of the Clones
George Lucas in Love
Something, Something, Something Dark Side
Return of the Ewok
Aliens in the Wild, Wild West
Star Wars Episode III: Revenge of the Sith
Star Wars Episode VI: Return of the Jedi
Star Wars: The Clone Wars
Gulliver's Travels
Lego Star Wars: The Quest for R2-D2
It's a Trap!
LEGO Star Wars: Revenge of the Brick


In [20]:
# Multi-word queries are conjunctive (AND): a movie is printed only if its
# description contains every word.
query(ir, "frodo ring")

The Lord of the Rings: The Fellowship of the Ring
The Lord of the Rings
The Hunt for Gollum
The Return of the King
Date Movie
The Lord of the Rings: The Two Towers
The Lord of the Rings: The Return of the King


In [28]:
# Adding a further word can only narrow the result, since all words must match.
query(ir, "frodo ring lord")

The Lord of the Rings: The Fellowship of the Ring
The Lord of the Rings
The Hunt for Gollum
The Lord of the Rings: The Two Towers
