In [1]:
import os
from words import get_text, words

In [2]:
def filelist(root):
    """Recursively collect the absolute paths of all files below ``root``.

    Directories are descended into as they are encountered; entries that
    are neither regular files nor directories are ignored.
    """
    collected = []
    for entry in os.listdir(root):
        full_path = os.path.abspath(os.path.join(root, entry))
        if os.path.isdir(full_path):
            # Splice the subtree's files in at this entry's position.
            collected.extend(filelist(full_path))
        elif os.path.isfile(full_path):
            collected.append(full_path)
    return collected

In [3]:
# Scratch inputs named after linear_search's parameters: every file found
# under data/berlitz1, plus a two-term query.
root = 'data/berlitz1'
files = filelist(root)
terms = ['make', 'kitchens']

def linear_search(files, terms):
    """
    Scan the given files one at a time and return, in input order, those
    whose contents contain every term.

    ``files`` is a list of fully-qualified filenames and ``terms`` a list
    of strings. Each file's text is read with get_text() and normalized
    with words() before the membership checks.
    """
    matches = []
    for path in files:
        vocabulary = words(get_text(path))
        # all() stops at the first missing term, like the original break.
        if all(term in vocabulary for term in terms):
            matches.append(path)
    return matches

# get_text('data/berlitz1/HandRHawaii.txt').lower()

In [4]:
import re
import string

def clean_text(text):
    """Split ``text`` into words.

    Punctuation, digits, carriage returns, tabs and newlines are each
    replaced by a space, then the text is split on single spaces and the
    empty pieces are discarded. Case is left untouched.
    """
    separators = re.compile("[" + re.escape(string.punctuation) + "0-9\\r\\t\\n]")
    spaced = separators.sub(" ", text)
    # filter(None, ...) drops the empty strings produced by runs of spaces.
    return list(filter(None, spaced.split(" ")))

In [5]:
def get_search_result_string(file, terms, num_words):
    """ Function to get the HTML formatted search result string
        for a given file and search terms. Returns a slice of the text
        sized by num_words. The slice that is chosen is centred on the
        group of positions containing the larger number of search terms.

        NOTE(review): this is an early draft; a later definition with the
        same name further down the file replaces it once that cell runs.

        Raises ValueError if a term does not occur in the lower-cased
        text, and IndexError if terms is empty.
    """
    # Find the position indices of the search terms within the text.
    # NOTE(review): get_text(...).lower() is presumably a single str, in
    # which case these are character offsets, not word indices - confirm.
    text_words = get_text(file).lower() 
    word_pos_idxs = sorted([text_words.index(term) for term in terms])
    # Find the group of positions that contains the most search terms.
    # A position joins the current group when it lies within num_words of
    # the group's first position. Otherwise, if the current group is larger
    # than the best so far, it becomes the best and a new group starts at idx.
    # NOTE(review): when the current group is NOT larger than the best, idx is
    # dropped and the current group is not restarted - likely unintended.
    current_idx_group = [word_pos_idxs[0]]
    max_idx_group = []
    for idx in word_pos_idxs[1:]:
        if abs(current_idx_group[0] - idx) < num_words:
            current_idx_group.append(idx)
        elif len(current_idx_group) > len(max_idx_group):
            current_idx_group, max_idx_group = [idx], current_idx_group
    # The still-open group competes too; ties keep the earlier group.
    max_idx_group = max(max_idx_group, current_idx_group, key=len)
    # Centre the output window on the mean position of the winning group.
    midpoint = int(sum(max_idx_group) / len(max_idx_group))
    # NOTE(review): start goes negative when the match is near the beginning,
    # which makes the slice below count from the end of the text.
    start, end = midpoint - num_words // 2, midpoint + num_words // 2
    # Format the search string in HTML to bold search terms.
    # NOTE(review): if text_words is a str this iterates single characters,
    # so `word in terms` only matches one-character terms - confirm intent.
    saerch_result_words = [f"<b>{word}</b>" if word in terms else word 
                            for word in text_words[start : end]]
    return ' '.join(saerch_result_words)

In [6]:
def get_file_search_result_HTML(file, terms, num_words=20):
    """ Build the HTML fragment for one matching file: a ``file://`` link
        to the file followed by its search snippet and two line breaks.
    """
    link = f"<a href=\"file://{file}\">{file}</a><br>"
    snippet = get_search_result_string(file, terms, num_words)
    return link + snippet + "<br><br>"
    

In [7]:
def results(docs, terms):
    """
    Given a list of fully-qualified filenames, return an HTML page (as a
    string) that lists each file with a snippet of its text containing
    the search terms.

    Args:
        docs: list of fully-qualified filenames that matched the search.
        terms: list of string search terms, shown in the page heading and
            highlighted in each snippet.

    Returns:
        The HTML document as a single string. The heading reports the
        total number of matching files; at most 100 of them are rendered.
    """
    max_results = 100  # cap promised by the contract: at most 100 results
    parts = [
        "<html>\n\t<body>\n\t<h2>Search results for <b>",
        f"{' '.join(terms)}</b> in {len(docs)} files </h2>",
    ]
    for doc in docs[:max_results]:
        # Render the loop variable, not whatever global `file` happens to be.
        parts.append(f"\n\t\t{get_file_search_result_HTML(doc, terms, num_words=20)}")
    parts.append("\n</body>\n</html>")
    return "".join(parts)

In [8]:
# Run a linear search over every file under data/slate for the query 'holiday'.
root = 'data/slate'
files = filelist(root)
terms = words('holiday')  # query passed through words() for normalization

docs = linear_search(files, terms)  # files whose contents contain every term

In [57]:
# Scratch values, presumably for trying the snippet helpers below on the
# first match. Raises IndexError if the search above returned no documents.
file = docs[0]
num_words = 20

def html_format_string(search_result_words, terms):
    """ Join the words with single spaces, wrapping in ``<b>`` tags every
        word that contains at least one of the terms as a substring.
    """
    def emphasise(token):
        # Substring match, so "holidays," is bolded for the term "holiday".
        is_hit = any(term in token for term in terms)
        return f"<b>{token}</b>" if is_hit else token

    return ' '.join(map(emphasise, search_result_words))


def get_search_result_string(file, terms, num_words):
    """ Function to get the HTML formatted search result string
        for a given file and search terms.

        The lower-cased text of the file is searched for the first
        occurrence of each term. Occurrences are grouped into windows of
        roughly num_words words, and the snippet is built around the first
        term of the window that holds the most terms: up to num_words // 2
        words before it, the word containing it, and up to num_words // 2
        words after it.

        Args:
            file: path handed to get_text().
            terms: list of search-term strings; terms that do not occur
                in the text are ignored.
            num_words: approximate number of context words in the snippet.

        Returns:
            The snippet as produced by html_format_string(), i.e. words
            joined by spaces with term-bearing words in <b> tags. If no
            term occurs in the text, the first num_words words are used.
    """
    text = get_text(file).lower()
    half = num_words // 2

    # First character offset of every term that occurs in the text.
    # str.find returns -1 for an absent term instead of raising like
    # str.index; empty terms are skipped because they match everywhere.
    term_at = {}
    for term in terms:
        offset = text.find(term) if term else -1
        if offset != -1:
            term_at[offset] = term
    if not term_at:
        # Nothing to centre on: fall back to the start of the document.
        return html_format_string(text.split()[:num_words], terms)

    # Group the offsets into windows anchored at their first offset. The
    # window is measured in characters; 4.5 is kept from the original code
    # (presumably an average word length).
    window = num_words * 4.5
    offsets = sorted(term_at)
    best_group, current_group = [], [offsets[0]]
    for offset in offsets[1:]:
        if offset - current_group[0] < window:
            current_group.append(offset)
            continue
        # The window is closed: keep it if it beats the best so far, and
        # always start a new window so that no offset is dropped.
        if len(current_group) > len(best_group):
            best_group = current_group
        current_group = [offset]
    if len(current_group) > len(best_group):
        best_group = current_group  # ties keep the earlier window

    anchor = best_group[0]
    anchor_term = term_at[anchor]

    # Widen the match to whole-word boundaries so that the word holding
    # the term is not split into fragments such as "holiday" + "s".
    word_start = anchor
    while word_start > 0 and not text[word_start - 1].isspace():
        word_start -= 1
    word_end = anchor + len(anchor_term)
    while word_end < len(text) and not text[word_end].isspace():
        word_end += 1

    # Construct the HTML search result string.
    words_before = text[:word_start].split()
    words_after = text[word_end:].split()
    search_result_words = (
        # [-0:] would select the whole list, so guard the half == 0 case.
        (words_before[-half:] if half > 0 else [])
        + [text[word_start:word_end]]
        + words_after[:max(half, 0)]
    )
    return html_format_string(search_result_words, terms)



In [65]:
from collections import defaultdict

# Collect every file under the top-level data directory.
root = 'data'
files = filelist(root)

def create_index(files):
    """
    Build an inverted index over the given fully-qualified filenames.

    A document ID is the position of a file in ``files`` (counted from 0).
    Every word that words() extracts from a file's text is mapped to the
    set of document IDs of the files containing it.

    Returns a defaultdict mapping each word to a set (not a list) of
    document IDs.
    """
    postings = defaultdict(set)
    for doc_id, path in enumerate(files):
        tokens = words(get_text(path))
        for token in tokens:
            postings[token].add(doc_id)
    return postings


In [290]:
# Query 'wheeler', passed through words() for normalization.
terms = words('wheeler')

def index_search(files, index, terms):
    """
    Given an index and a list of fully-qualified filenames, return a list of
    filenames whose file contents has all words in terms parameter as normalized
    by your words() function. Only the index is consulted; the files are
    never opened.

    Args:
        files: list of fully-qualified filenames; a document ID is a
            position in this list.
        index: mapping from word to the set of document IDs containing it.
        terms: list of search-term strings.

    Returns:
        The matching filenames in their original order. An empty ``terms``
        matches every file (as in linear_search); a term that is absent
        from the index matches none.
    """
    if not terms:
        return list(files)

    matching = None
    for term in terms:
        # .get() neither raises on a plain dict nor inserts an empty set
        # into a defaultdict when the term is unknown.
        doc_ids = index.get(term)
        if not doc_ids:
            return []
        if matching is None:
            matching = set(doc_ids)  # copy so the index is never mutated
        else:
            matching.intersection_update(doc_ids)
        if not matching:
            return []  # no document can satisfy the remaining terms
    return [file for idx, file in enumerate(files) if idx in matching]


In [312]:
"""
A hashtable represented as a list of lists with open hashing.
Each bucket is a list of (key,value) tuples
"""


def htable(nbuckets):
    """Create an empty table: a list holding nbuckets independent empty lists"""
    buckets = []
    for _ in range(nbuckets):
        buckets.append([])  # a fresh list each time, so buckets never alias
    return buckets


def hashcode(o):
    """
    Return a hashcode for strings and integers; all others return None.
    For integers, just return the integer value.
    For strings, start from h = 0 and perform h = h*31 + ord(c) for every
    character in the string, so the empty string hashes to 0.
    """
    if isinstance(o, int):
        return o
    if isinstance(o, str):
        # Starting at 0 gives the same result as seeding with ord(o[0])
        # for non-empty strings, and avoids indexing into an empty one.
        h = 0
        for c in o:
            h = h * 31 + ord(c)
        return h
    return None


def bucket_indexof(table, key):
    """
    Locate ``key`` inside its bucket.

    The bucket is table[hashcode(key) % len(table)]; it is scanned
    linearly for the first (key, value) tuple whose key equals ``key``.
    Returns that tuple's position within the bucket, or None when the key
    is not present.
    """
    bucket = table[hashcode(key) % len(table)]
    hits = (pos for pos, (stored_key, _) in enumerate(bucket) if key == stored_key)
    return next(hits, None)


def htable_put(table, key, value):
    """
    Perform the equivalent of table[key] = value.

    The bucket for ``key`` is table[hashcode(key) % len(table)]. If that
    bucket already holds a tuple with this key, the tuple is replaced in
    place by (key, value); otherwise (key, value) is appended to the end of
    the bucket. ``value`` may be of any type.
    """
    bucket = table[hashcode(key) % len(table)]
    for slot, (stored_key, _) in enumerate(bucket):
        if key == stored_key:
            # Overwrite in place so the key keeps its position in the bucket.
            bucket[slot] = (key, value)
            return
    bucket.append((key, value))


def htable_get(table, key):
    """
    Return the equivalent of table[key].

    Looks through the bucket table[hashcode(key) % len(table)] for the
    first association whose key equals ``key`` and returns its value (not
    the key and not the tuple). Returns None if the key is not found.
    """
    bucket = table[hashcode(key) % len(table)]
    return next((stored for candidate, stored in bucket if key == candidate), None)


def htable_buckets_str(table):
    """
    Render the table one bucket per line.

    Each line is the bucket number zero-padded to four digits, then "->",
    then the bucket's associations as key:value separated by ", ", e.g.:
        0000->
        0001->
        0002->
        0003->parrt:99
        0004->
    where parrt:99 indicates an association of (parrt,99) in bucket 3.
    Every line, including the last, ends with a newline.
    """
    rendered = []
    for number, bucket in enumerate(table):
        pairs = ", ".join(f"{key}:{val}" for key, val in bucket)
        rendered.append(f"{number:04d}->{pairs}\n")
    return "".join(rendered)


def htable_str(table):
    """
    Return a dict-like rendering of the table such as {parrt:99}.

    Associations appear as key:value separated by ", ", in bucket order
    and then in insertion order within each bucket (guaranteed because
    htable_put() appends to the buckets). An empty table renders as {}.
    """
    # join() places separators only between pairs, so nothing has to be
    # stripped afterwards and values ending in ',' or ' ' stay intact.
    pairs = [f"{key}:{value}" for bucket in table for key, value in bucket]
    return "{" + ", ".join(pairs) + "}"
