### Important Note on EMR Version
The EMR release must be 6.5.0; this code will not work on other versions.

In [None]:
import re
from pyspark.sql import SQLContext
from pyspark.sql.types import StructField, StructType, StringType, LongType, FloatType, IntegerType, Row
from pyspark.sql.functions import udf, col, column
from pyspark.sql.functions import explode
from pyspark.sql.functions import split
from pyspark.sql.functions import round

In [None]:
# Input locations on S3 -- edit both values to point at your own buckets.
#   stopwords_file:  a text file listing the stopwords, one per line
#   books_directory: the directory containing the text corpus

stopwords_file = "s3://cpsc5330s23/stopwords.txt"
books_directory = "s3://5330books/"

In [None]:
# Load the stopword file into a set; termify() consults it to filter terms.
# collect() brings every line of the file back to the driver before the set is built.

stopwords = set(sc.textFile(stopwords_file).collect())
  

In [None]:
# Same versions of termify and get_docid as previous labs

from pyspark.sql.functions import udf
from pyspark.sql.types import ArrayType, StringType

def termify(line):
    """Split a line of text into index terms.

    Words are maximal runs of word characters other than underscore.  Each
    word is lower-cased and kept only if it is longer than one character,
    is not in the module-level ``stopwords`` set, and is not made up
    entirely of digits.  Terms are returned in order of appearance, with
    duplicates preserved.
    """
    def is_term(candidate):
        # Cheapest test first; the stopword lookup and the regex only run
        # for words that are long enough to matter.
        if len(candidate) <= 1:
            return False
        if candidate in stopwords:
            return False
        return re.search(r'^\d*$', candidate) is None

    lowered_words = (word.lower() for word in re.findall(r'[^\W_]+', line))
    return [candidate for candidate in lowered_words if is_term(candidate)]

def get_docid(filepath):
    """Return the document id for a file path: the base name without its extension.

    The path is split on '/', and everything after the last '.' in the final
    component is removed.  For the usual '<name>.txt' corpus files this gives
    the same id as chopping the last four characters, but it no longer mangles
    names that have no extension or an extension of a different length.

    Args:
        filepath: a '/'-separated path or URI, e.g. 's3://bucket/moby_dick.txt'.

    Returns:
        The file name with its extension removed.  A name with no '.', or one
        whose only '.' is the leading character, is returned unchanged.
    """
    filename = filepath.rsplit('/', 1)[-1]
    stem, dot, _extension = filename.rpartition('.')
    # rpartition yields an empty separator when there is no '.', and an empty
    # stem for dot-files; in both cases the whole name is the id.
    return stem if dot and stem else filename

# Spark SQL can only apply Python functions that have been wrapped as
# user-defined functions (UDFs), and each UDF must declare its return type:
# termify returns an array of strings, get_docid returns a string.

termifyUDF = udf(lambda text: termify(text), returnType=ArrayType(StringType()))
getDocidUDF = udf(lambda path: get_docid(path), returnType=StringType())

#### The Indexing Phase

All operations are packaged together into a single function, indexDocuments
*  Accepts:  directory containing the text documents
*  Returns:  a Data Frame with columns (term, docid, tfidf) and schema (string, string, float)

In [None]:
def indexDocuments(path):
    """Build the TF-IDF index for every text document under ``path``.

    For each (term, document) pair the value computed is

        tfidf = 1000000.0 * (occurrences of term in document / document size)
                / (number of documents containing term)

    where the document size is the number of terms termify() kept for it.

    Args:
        path: directory containing the text documents.

    Returns:
        A DataFrame with columns (term, docid, tfidf) of types
        (string, string, float).

    Relies on the notebook-level names ``sc``, ``termifyUDF`` and
    ``getDocidUDF``.
    """
    # Create RDD to read the text files: one (filepath, full_document_text) pair per file
    f = sc.wholeTextFiles(path)

    # Get the SQLContext to create a data frame from an RDD.
    sq = SQLContext(sc)

    # The dataframe will have columns 'filename' and 'document', both are strings
    schema = StructType([
        StructField('filename', StringType()),
        StructField('document', StringType()),
    ])
    documents = sq.createDataFrame(f, schema)

    # Run termify udf on the document.  Produces a new column 'terms' which is a list of strings.
    # The raw text is dropped right away so it is not copied onto every exploded row below.
    d2 = documents.withColumn('terms', termifyUDF(col('document'))).drop('document')

    # Explode the terms to get one row per term.  New data frame has columns 'filename' and 'term'
    d3 = d2.select('filename', explode(col('terms')).alias('term'))

    # Convert the 'filename' column to a 'docid'.  Drop the old column 'filename'
    d4 = d3.withColumn('docid', getDocidUDF(col('filename'))).drop('filename')

    # Term frequency is a data frame where we group by 'docid' and 'term' and count within each group.
    # Its columns are (docid, term, count)
    tf = d4.groupBy('docid', 'term').count()

    # Doc Size is a dataframe where we group by 'docid' and sum the term counts.  It has
    # columns 'docid' and 'size'.  The sum aggregation names its column 'sum(count)',
    # which is renamed to 'size'.
    docSize = tf.groupBy('docid')\
        .sum('count')\
        .withColumnRenamed('sum(count)', 'size')

    # Tfnormed is normalized term frequency -- it is all of the tfidf formula except for dividing by
    #  document frequency.  In other words, it is 1000000.0 * term-count-per-document/document-size.
    # It keeps 'term' alongside (docid, tfnormed) because the document-frequency join below needs it.
    tfnormed = tf.join(docSize, 'docid')\
        .withColumn('tfnormed', 1000000.0 * col('count') / col('size'))\
        .select('docid', 'term', 'tfnormed')

    # To get document frequency, start with term frequency, group by term, and count the number of docids.
    #  tf holds one row per (docid, term), so the row count per term is the number of documents.
    #  This dataframe has columns (term, count)
    df = tf.groupBy('term').count()

    # To get the final data frame, join tfnormed with df, and divide.
    #  tfnormed no longer carries a 'count' column, so 'count' here is the document frequency.
    #  This final dataframe has columns (term, docid, tfidf) with types (string, string, float)
    tfidf = tfnormed.join(df, 'term')\
        .withColumn('tfidf', (col('tfnormed') / col('count')).cast('float'))\
        .select('term', 'docid', 'tfidf')

    return tfidf


In [None]:
# The previous cell only defined how to build the index data frame; this cell builds it.
# The call to cache() tells Spark to hold the data frame contents in memory if possible,
# so the rows can be re-used by later cells instead of being recomputed.

index = indexDocuments(books_directory)
index.cache()

In [None]:
# Sanity check: print the first five rows to confirm the frame has the right
# columns, the right data types, and reasonable values.
index.show(n=5)

#### Relevance Calculation

Relevance is a relationship between a "query" (a string of words) and the index.
The query is parsed (termified) using the same function that was used to index the documents.
Then, in Spark, we sum the TFIDF values of the query terms for each document and select the N documents with the highest totals.


In [None]:
# Inputs:
#    The query string (a string)
#    The index (a Data Frame in the form produced by indexDocuments)
#    Num_results (N) -- the number of search results to return
#
# Output:
#    A list of N tuples of the form (docid, relevance) which are the N most relevant documents
#     for the query, along with the relevance value for each document

#  Note that this output is a Python list, not a Spark object


def relevance(query, tfidf, num_results=5):
    """Rank the indexed documents against a free-text query.

    Args:
        query: the query string; it is termified exactly like the documents.
        tfidf: the index DataFrame with columns (term, docid, tfidf), in the
            form produced by indexDocuments.
        num_results: maximum number of results to return.

    Returns:
        A Python list of (docid, score) tuples, highest score first, where
        score is the document's summed TFIDF over the query terms, truncated
        to an int.  The list is shorter than num_results (possibly empty)
        when fewer documents match.
    """
    # Create a list of terms by running termify on the query string
    queryTerms = termify(query)

    # Create a DataFrame from the list of query terms.  The resulting DataFrame has a
    #  single column 'term' which is a string.  The explicit schema lets this work even
    #  when the query yields no terms at all.
    rows = [Row(t) for t in queryTerms]
    queryFrame = SQLContext(sc).createDataFrame(rows, StructType([StructField('term', StringType())]))

    # Join the query frame with the TFIDF frame on 'term'.  Group by docid and sum, which
    #  is the total TFIDF of the query terms for that document.  Rename the sum of tfidf
    #  to be 'score', and truncate score to an int (as per the formula).
    #  The resulting DataFrame has columns (docid, score) of type (string, int)
    j = queryFrame.join(tfidf, 'term')\
        .drop('term')\
        .groupBy("docid")\
        .sum('tfidf')\
        .withColumnRenamed('sum(tfidf)', 'score')\
        .withColumn('score', col('score').cast('int'))

    # Sort the frame in descending order of score (docid breaks ties so the result is
    #  repeatable), take the first N rows, and return them as a list of (docid, score) tuples
    top = j.orderBy(col('score').desc(), col('docid').asc())\
        .limit(num_results)\
        .collect()
    return [(row['docid'], row['score']) for row in top]


In [None]:
## Test it out!
# Each query is printed, followed by its top results as (docid, score) tuples.
# Every string must be followed by a comma: without one, Python silently
# concatenates adjacent string literals into a single query.
for query in ["buster, whale, king, and alice the rabbit!",
              "Take a whale to lunch this week!",
              "What would Jesus do about that?",
              "My name is Buster.  Deal with it.",
              "Bodice ripper?",
              "Why does it have to be sense OR sensibility, why can't it be sense AND sensibility",
              "What are leaves of grass anyway?",
              "??!?"]:
    print(query)
    for result in relevance(query, index):
        print("    " + str(result))