In [None]:
# Install Spark

In [5]:
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!!wget -q https://archive.apache.org/dist/spark/spark-3.0.1/spark-3.0.1-bin-hadoop2.7.tgz
!tar xf spark-3.0.1-bin-hadoop2.7.tgz

import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-3.0.1-bin-hadoop2.7"

!pip install -q findspark
import findspark
findspark.init()

In [None]:
# Start Spark App

In [15]:
from pyspark.sql import SparkSession

spark = SparkSession.builder\
        .master("local")\
        .appName("Colab")\
        .config('spark.ui.port', '4050')\
        .getOrCreate()

In [49]:
from pyspark.sql import functions as F
from pyspark.sql.types import IntegerType

In [None]:
# Read data

In [98]:
df = spark.read.option("header", "true").option("inferSchema", "true").csv("tripadvisor_hotel_reviews.csv")
n_docs = df.count()
print(f"Total number of documents: {n_docs}")

Total number of documents: 20491


In [99]:
df = df.select(F.lower(F.col('Review')).alias('Review'))
df = df.withColumn("Review", F.regexp_replace("Review", r"[^A-Za-z0-9 ]+", ""))
df = df.select(F.split(F.col("Review")," ").alias("Review"))

In [None]:
# Calculate IDF

In [100]:
idf = df.withColumn("disctinct_review", F.array_distinct(F.col("Review"))).drop("Review")
idf = idf.withColumn("word_in_doc_count", F.lit("1").cast(IntegerType()))
idf = idf.withColumn("word", F.explode(F.col("disctinct_review"))).drop("disctinct_review")
idf = idf.filter(F.col("word")!="")

In [106]:
idf = idf.groupby(F.col("word")).agg(F.sum("word_in_doc_count").alias("word_in_doc_count"))
idf = idf.orderBy(F.col("word_in_doc_count").desc())
idf = idf.limit(100)
idf = idf.withColumn("idf", F.log10(n_docs/F.col("word_in_doc_count")))
idf = idf.select("word", "idf")

In [151]:
TOP_WORDS = [row.word for row in idf.select("word").collect()]

In [152]:
idf.show(5)

+---------+-------------------+
|     word|                idf|
+---------+-------------------+
|    hotel| 0.0988163885358417|
|     room|0.16379410705976583|
|      not| 0.2279530481576024|
|    staff|0.25003528242940504|
|    great|0.26938155880414666|
|     stay| 0.3074568300402548|
|     good|0.34415559672244006|
|   stayed| 0.3796478362386668|
|       nt|0.38837096289924544|
|    rooms|0.39050126252963435|
| location| 0.3996069642472259|
|     just|0.42304669224496766|
|    clean| 0.4279584923976205|
|     nice|0.44139343218347227|
|      did| 0.4539894491724172|
|breakfast| 0.4596324746796454|
|       no| 0.4784798191415699|
|    night| 0.5100908397984418|
|  service| 0.5172145494238303|
|     time| 0.5223350960525778|
+---------+-------------------+
only showing top 20 rows



In [None]:
# Calculate TF

In [157]:
df = df.withColumn("id", F.monotonically_increasing_id())
tf = df.withColumn("word_count", F.lit("1").cast(IntegerType()))
tf = tf.withColumn("word", F.explode(F.col("Review"))).drop("Review")
tf = tf.filter(F.col("word").isin(TOP_WORDS))
word_count_in_doc = tf.groupby(F.col("id"), F.col("word")).agg(F.sum("word_count").alias("word_count"))
doc_len = tf.groupby("id").agg(F.count("word").alias("doc_len"))
tf = word_count_in_doc.join(doc_len, on=["id"])
tf = tf.withColumn("tf", F.col("word_count")/F.col("doc_len"))
tf = tf.select("id", "word", "tf")

In [177]:
tf.show(5)

+---+--------+--------------------+
| id|    word|                  tf|
+---+--------+--------------------+
| 26|   staff|0.058823529411764705|
| 26|   loved|0.058823529411764705|
| 26|    days|0.058823529411764705|
| 26|bathroom|0.058823529411764705|
| 26|    stay|0.058823529411764705|
+---+--------+--------------------+
only showing top 5 rows



In [None]:
# Merge and get result

In [174]:
joined = tf.join(idf, on=["word"])
joined = joined.withColumn("tf_idf", F.col("tf")*F.col("idf"))
joined = joined.select("id","word","tf_idf")

In [175]:
tf_idf = joined.groupBy("id").pivot("word").agg(F.first(F.col("tf_idf")))
tf_idf = tf_idf.fillna(0.0)

In [176]:
tf_idf.show()

+----+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+-------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+-------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+---------