In [1]:

from pyspark import SparkConf, SparkContext
from pyspark.mllib.feature import HashingTF
from pyspark.mllib.feature import IDF

In [2]:

# Build the Spark configuration: master 'local', application name 'tfidf_search'.
conf = SparkConf().setMaster('local').setAppName('tfidf_search')
# Use getOrCreate rather than the constructor so that re-running this cell in a
# live kernel reuses the existing context instead of raising
# "Cannot run multiple SparkContexts at once". Note that when a context already
# exists, it is returned as-is and `conf` is not re-applied.
sc = SparkContext.getOrCreate(conf=conf)

In [3]:

# Read the corpus file as an RDD of text lines.
raw_data = sc.textFile('DataScience-Python3/subset-small.tsv')
# Break every line into its tab-delimited columns.
fields = raw_data.map(lambda line: line.split('\t'))
# Tokenize the column at index 3 on single spaces to get each document's word
# list (presumably the document body -- confirm against the TSV layout).
documents = fields.map(lambda columns: columns[3].split(' '))

In [4]:

# Keep the column at index 1 of every record as that document's name
# (presumably the title -- confirm against the TSV layout).
document_names = fields.map(lambda columns: columns[1])

In [5]:

# Map every word of every document to a hashed index and count occurrences,
# using a feature space of 100000 hash buckets.
hashing_tf = HashingTF(numFeatures=100000)
tf = hashing_tf.transform(documents)

In [6]:

# `tf` is now an RDD with one sparse vector per document; each entry holds the
# term frequency recorded for one hashed term.

# Next step: weight those term frequencies by inverse document frequency.
# `tf` is cached first because it is consumed twice below (fit, then transform).
tf.cache()
idf_estimator = IDF(minDocFreq=2)
idf = idf_estimator.fit(tf)
tfidf = idf.transform(tf)

In [7]:

# Every element of `tfidf` now carries the TFxIDF score of each hashed term for one document.
# Goal: find the document that best matches 'Gettysburg', where Lincoln gave a famous speech.
# The query word is hashed with the same HashingTF instance so that its index
# refers to the same bucket as in the corpus vectors.
gettysburg_TF = hashing_tf.transform(['Gettysburg'])
# The query contains a single term, so the first stored index is its hash bucket.
gettysburg_hash_val = gettysburg_TF.indices[0]

In [8]:

# Pull the TFxIDF score stored at the 'Gettysburg' hash bucket out of each document vector.
gettysburg_relevance = tfidf.map(lambda doc_vector: doc_vector[gettysburg_hash_val])
# Pair each score with its document name; both RDDs trace back to `fields`
# through map-only steps.
zipped_results = gettysburg_relevance.zip(document_names)

In [12]:

# Take the largest (score, name) pair; tuples compare by score first.
best_match = zipped_results.max()
print("Best document for Gettysburg is: ")
print(best_match)

SyntaxError: invalid syntax (<ipython-input-12-b8e44f5936c0>, line 3)