In [4]:
from pyspark.ml.feature import HashingTF, IDF, Tokenizer
from pyspark.sql import SparkSession
 
# TF-IDF walkthrough: tokenize sentences, hash the tokens into term-frequency
# vectors, then rescale those vectors with inverse document frequency.

# Get (or create) the Spark session used by every step below.
spark = (
    SparkSession.builder
    .appName("TfIdf Example")
    .getOrCreate()
)

# Toy corpus: each row is a (label, sentence) pair.
sentenceData = spark.createDataFrame(
    [
        (0.0, "Welcome to TutorialKart."),
        (0.0, "Learn Spark at TutorialKart."),
        (1.0, "Spark Mllib has TF-IDF."),
    ],
    ["label", "sentence"],
)

# Step 1: turn the "sentence" column into a "words" column.
tokenizer = Tokenizer(inputCol="sentence", outputCol="words")
wordsData = tokenizer.transform(sentenceData)

# Step 2: map the words into 20-dimensional raw term-frequency vectors.
# CountVectorizer is an alternative way to obtain term-frequency vectors.
hashingTF = HashingTF(
    inputCol="words",
    outputCol="rawFeatures",
    numFeatures=20,
)
featurizedData = hashingTF.transform(wordsData)

# Step 3: fit the IDF weights on the raw vectors, then apply them to the
# same data to produce the rescaled "features" column.
idf = IDF(inputCol="rawFeatures", outputCol="features")
idfModel = idf.fit(featurizedData)
rescaledData = idfModel.transform(featurizedData)

# Print the label next to the raw and the IDF-rescaled vectors.
rescaledData.select("label", "rawFeatures", "features").show()

# NOTE: spark.stop() is not called here, so the session stays active after
# this code runs; call spark.stop() once the session is no longer needed.

+-----+--------------------+--------------------+
|label|         rawFeatures|            features|
+-----+--------------------+--------------------+
|  0.0|(20,[4,8,9],[1.0,...|(20,[4,8,9],[0.28...|
|  0.0|(20,[4,5,15,16],[...|(20,[4,5,15,16],[...|
|  1.0|(20,[0,1,5,14],[1...|(20,[0,1,5,14],[0...|
+-----+--------------------+--------------------+

