##### Assignment 2: TF-IDF Using DataFrames

Nina Nguyen  
CPSC 5330   
3/15/2020

In [2]:
# Evaluate the SparkContext so the notebook displays it (sanity check that `sc` is defined).
sc

In [3]:
%fs ls FileStore/tables/documents

path,name,size
dbfs:/FileStore/tables/documents/bible_kjv-ef3b8.txt,bible_kjv-ef3b8.txt,4332554
dbfs:/FileStore/tables/documents/carroll_alice-ac78c.txt,carroll_alice-ac78c.txt,144395
dbfs:/FileStore/tables/documents/melville_moby_dick-7006e.txt,melville_moby_dick-7006e.txt,1265914
dbfs:/FileStore/tables/documents/shakespeare_macbeth-08fa9.txt,shakespeare_macbeth-08fa9.txt,100351
dbfs:/FileStore/tables/documents/whitman_leaves-013de.txt,whitman_leaves-013de.txt,711215


In [4]:
from pyspark.sql.types import StringType
from pyspark.sql.types import ArrayType
from pyspark.sql.functions import udf, col, lit
from pyspark.sql import SQLContext
from pyspark.sql.functions import explode
from pyspark.sql.functions import regexp_extract
import pyspark.sql.functions as func
from pyspark.sql.functions import length
from pyspark.sql.functions import min, max
from pyspark.sql.types import StringType
from pyspark.sql.functions import desc
from pyspark.sql.types import *

In [5]:
def splitRow(row):
  """Tokenize a string into a list of whitespace-delimited words."""
  # split() with no separator already ignores leading and trailing whitespace.
  words = row.split()
  return words
# Spark UDF wrapper around splitRow; the result column is an array of strings.
splitRowUDF = udf(
  lambda text: splitRow(text),
  ArrayType(StringType()),
)

In [6]:
def termify(word):
  """Normalize a word into a term: lowercase it and keep only the letters a-z."""
  kept = []
  for ch in word.lower():
    # 'a'..'z' is the same range as code points 97..122.
    if 'a' <= ch <= 'z':
      kept.append(ch)
  return ''.join(kept)
# Spark UDF wrapper around termify; the result column is a string.
termifyUDF = udf(
  lambda token: termify(token),
  StringType(),
)

In [7]:

def index(indir, outfile):
  """Build a TF-IDF index for every text file in ``indir`` and save it as parquet.

  Each document is split on whitespace with ``splitRowUDF``, every word is
  normalized with ``termifyUDF`` (empty terms are dropped), and one row per
  (term, docid) pair is scored as:

      raw   = (termInDocCount / totalTermsInDoc) / termInDocFreq
      tfidf = raw / (largest raw value in the corpus) * 100

  Args:
    indir: Directory of text documents, read with ``sc.wholeTextFiles``.
    outfile: Path of the parquet output. Anything already stored at this
      path is removed before the index is computed.

  The written table has the columns ``term``, ``docid`` (the part of the
  document path after the last "/") and ``tfidf``.
  """
  dbutils.fs.rm(outfile, True)

  # wholeTextFiles yields one (path, content) pair per document.
  docs = spark.createDataFrame(sc.wholeTextFiles(indir), ["pathname", "text"])

  # Raw string: "/" needs no escaping, and "\/" is an invalid Python escape.
  filename_regex = r"/([^/]+)$"
  docs = docs.withColumn("docid", regexp_extract("pathname", filename_regex, 1))

  # One row per word occurrence, then normalize each word into a term.
  words = docs.withColumn("words", splitRowUDF(col("text")))
  words = words.select("docid", explode("words").alias("word"))
  terms = words.select("docid", termifyUDF(col("word")).alias("term"))
  terms = terms.filter(length(col("term")) > 0)  # Make sure the term has at least 1 character

  # Occurrences of each term inside each document.
  termDocCount = terms.groupBy("term", "docid").count().withColumnRenamed("count", "termInDocCount")
  # Total number of terms in each document.
  docLength = terms.groupBy("docid").count().withColumnRenamed("count", "totalTermsInDoc")
  # Number of documents containing each term. termDocCount already holds exactly
  # one row per (term, docid), so counting its rows per term gives that number.
  termDocFreq = termDocCount.groupBy("term").count().withColumnRenamed("count", "termInDocFreq")

  raw = (
    termDocCount
    .join(docLength, on=["docid"])
    .join(termDocFreq, on=["term"])
    .withColumn("temp", (col("termInDocCount") / col("totalTermsInDoc")) / col("termInDocFreq"))
    .cache()  # used twice below: once for the maximum, once for the write
  )
  try:
    # func.max is the Spark aggregate; spelled out so it cannot be confused
    # with Python's builtin max.
    maxRaw = raw.select(func.max("temp")).collect()[0][0]
    scored = raw.withColumn("tfidf", col("temp") / maxRaw * 100)
    # Write to output file in parquet format
    scored.select("term", "docid", "tfidf").write.save(outfile, format='parquet')
  finally:
    raw.unpersist()
  

In [8]:
def scoreQueryLine(queryLine, tfidfTable):
  """Score every document against a single query line.

  The line is split on any whitespace and each word is normalized with
  ``termify``, which is the same tokenization ``index`` applies to the
  documents. A document's score is the sum of the ``tfidf`` values of the
  query terms it contains; a term that appears several times in the query
  contributes once per appearance.

  Args:
    queryLine: Query text.
    tfidfTable: DataFrame with ``term``, ``docid`` and ``tfidf`` columns.

  Returns:
    DataFrame with the columns ``docid`` and ``score``, highest score first.
    It is empty when no query term occurs in ``tfidfTable``.
  """
  # The query is tiny, so normalize it on the driver instead of through a UDF.
  normalized = [termify(word) for word in queryLine.split()]
  termRows = [(term,) for term in normalized if term]  # drop empty terms

  # Explicit schema so an empty query still produces a valid (empty) DataFrame.
  termSchema = StructType([StructField("term", StringType(), True)])
  queryTerms = spark.createDataFrame(termRows, termSchema)

  matches = queryTerms.join(tfidfTable, on=["term"])
  return (
    matches
    .groupBy("docid")
    .agg(func.sum("tfidf").alias("score"))  # sum only tfidf, under a stable column name
    .sort(desc("score"))
  )


In [9]:
def scoreQueryFile(filename, tfidfFileName='FileStore/tfidf.parquet'):
  """Find the best-scoring document for every line of a query file.

  Each line of ``filename`` is scored with ``scoreQueryLine`` against the
  TF-IDF table stored at ``tfidfFileName``; only the top document per line is
  kept. Lines that match no document produce no row. The scoring jobs run
  inside this call, so the returned DataFrame is already materialized.

  Args:
    filename: Text file with one query per line.
    tfidfFileName: Parquet file holding ``term``, ``docid`` and ``tfidf``.
      NOTE(review): the default has no leading slash, unlike the path the
      driver cell passes ('/FileStore/tfidf.parquet'); confirm both resolve
      to the same location.

  Returns:
    DataFrame with the columns ``query``, ``docid`` and ``score``, in the
    order the queries appear in the file.
  """
  # score is a sum of tfidf values, which Spark produces as doubles.
  schema = StructType([
    StructField("query", StringType(), True),
    StructField("docid", StringType(), True),
    StructField("score", DoubleType(), True),
  ])

  # The table is joined once per query, so keep it in memory for the loop.
  tfidf = spark.read.parquet(tfidfFileName).cache()
  bestMatches = []
  try:
    for line in sc.textFile(filename).collect():
      top = scoreQueryLine(line, tfidf).first()  # None when nothing matched
      if top is not None:
        bestMatches.append((line, top["docid"], top["score"]))
  finally:
    tfidf.unpersist()

  # Build the result once instead of growing it with a union per query.
  return spark.createDataFrame(bestMatches, schema)


In [10]:
# Driver cell: index the corpus, then print the top document for each query.
indexFileName = '/FileStore/tfidf.parquet'
index(indir='/FileStore/tables/documents', outfile=indexFileName)
scoreQueryFile(
  filename='/FileStore/tables/queries.txt',
  tfidfFileName=indexFileName,
).show()