In [1]:
import org.apache.spark.sql._
import org.apache.spark.sql.types._
import org.apache.spark.sql.functions._

import org.apache.spark.rdd.RDD

In [2]:
val spark = SparkSession.builder()
    // адрес мастера
    .master("local[*]")
    // имя приложения в интерфейсе спарка
    .appName("made-demo")
//     .config("spark.executor.memory",  "2g")
//     .config("spark.executor.cores", "2")
//     .config("spark.driver.memory", "2g")
    .getOrCreate()

import spark.implicits._

spark = org.apache.spark.sql.SparkSession@8487697


org.apache.spark.sql.SparkSession@8487697

# Прочитаем датасет

In [3]:
val df = spark.read
    .option("header", "true")
    .option("inferSchema", "true")
    .csv("tripadvisor_hotel_reviews.csv")
    .withColumn("review_id",monotonicallyIncreasingId)

df = [Review: string, Rating: int ... 1 more field]




[Review: string, Rating: int ... 1 more field]

Почистим текст

In [4]:
val df_cleaned = df
    .withColumn("Review", regexp_replace(col("Review"), "[,?!.*'0-9]", ""))
    .withColumn("Review", regexp_replace(col("Review"), "\\s+", " "))
    .withColumn("Review", lower(col("Review")))
    .withColumn("review", split(col("Review"), " "))
    .select(col("review"), col("review_id"))

df_cleaned = [review: array<string>, review_id: bigint]


[review: array<string>, review_id: bigint]

In [5]:
df_cleaned.show

+--------------------+---------+
|              review|review_id|
+--------------------+---------+
|[nice, hotel, exp...|        0|
|[ok, nothing, spe...|        1|
|[nice, rooms, not...|        2|
|[unique, great, s...|        3|
|[great, stay, gre...|        4|
|[love, monaco, st...|        5|
|[cozy, stay, rain...|        6|
|[excellent, staff...|        7|
|[hotel, stayed, h...|        8|
|[excellent, staye...|        9|
|[poor, value, sta...|       10|
|[nice, value, sea...|       11|
|[nice, hotel, goo...|       12|
|[nice, hotel, not...|       13|
|[great, hotel, ni...|       14|
|[horrible, custom...|       15|
|[disappointed, sa...|       16|
|[fantastic, stay,...|       17|
|[good, choice, ho...|       18|
|[hmmmmm, say, rea...|       19|
+--------------------+---------+
only showing top 20 rows



# Найдем 100 самых частых слов

In [6]:
val freq_words = df_cleaned
    .select(col("review"))
    .as[Array[String]]
    .flatMap(x => x)
    .groupBy(col("value"))
    .count()
    .orderBy(desc("count"))
    .limit(100)
    .select(col("value").as("word"))

freq_words = [word: string]


[word: string]

In [7]:
freq_words.show

+--------+
|    word|
+--------+
|   hotel|
|    room|
|     not|
|   great|
|        |
|      nt|
|    good|
|   staff|
|    stay|
|     did|
|    just|
|    nice|
|   rooms|
|      no|
|location|
|  stayed|
| service|
|    time|
|   beach|
|   night|
+--------+
only showing top 20 rows



# Найдем число вхождений частых слов в соответсвующий отзыв

In [8]:
val word2review_id_and_count = df_cleaned
    .select(col("review"), col("review_id"))
    .as[(Array[String], BigInt)]
    .flatMap { 
      case (x1, x2) => x1.map((_, x2))
    }
    .select(col("_1").as("word"), col("_2").as("review_id"))
    .join(freq_words, Seq("word"), "inner")
    .groupBy(col("word"), col("review_id"))
    .count()
//     .show

word2review_id_and_count = [word: string, review_id: decimal(38,0) ... 1 more field]


[word: string, review_id: decimal(38,0) ... 1 more field]

In [9]:
word2review_id_and_count.show

+----------+---------+-----+
|      word|review_id|count|
+----------+---------+-----+
|    stayed|        9|    1|
|      area|       22|    1|
|experience|       23|    1|
|      days|       47|    1|
|        no|       68|    1|
|     rooms|       71|    2|
|     great|       78|    2|
|      away|       79|    2|
|       bed|       80|    1|
|       bed|       94|    1|
|   service|       98|    1|
|        nt|      110|    3|
|     hotel|      125|    3|
|     close|      137|    1|
|       did|      139|    1|
|  location|      161|    1|
|       bit|      172|    1|
|     place|      185|    1|
|     staff|      196|    5|
|        nt|      222|    1|
+----------+---------+-----+
only showing top 20 rows



# Посчитаем tf

In [10]:
val tf = word2review_id_and_count
    .groupBy("review_id")
    .agg(
        sum(col("count")).as("n_words")
    )
    .join(word2review_id_and_count, Seq("review_id"), "right")
    .select(col("review_id"), col("word"), (col("count") / col("n_words")).as("tf"))

tf = [review_id: decimal(38,0), word: string ... 1 more field]


[review_id: decimal(38,0), word: string ... 1 more field]

In [11]:
tf.show

+---------+-----------+-----+
|review_id|       word|   tf|
+---------+-----------+-----+
|       75|        not|0.025|
|       75|       stay|0.025|
|       75|  breakfast|0.025|
|       75|      place| 0.05|
|       75|       walk|0.025|
|       75|     street|0.075|
|       75|      rooms| 0.05|
|       75|         no|0.025|
|       75|    morning|0.025|
|       75|restaurants|0.025|
|       75|           |0.025|
|       75|       best|0.025|
|       75|      hotel|  0.1|
|       75|     people|0.025|
|       75|       make|0.025|
|       75|       room| 0.05|
|       75| experience|0.025|
|       75|      great|0.075|
|       75|      price|0.025|
|       75|       want|  0.1|
+---------+-----------+-----+
only showing top 20 rows



# Посчитаем idf

In [12]:
val idf = word2review_id_and_count
    .groupBy(col("word"))
    .agg(
        count("*").as("n_doc_with_word")
    )
    .select(col("word"), (log(lit(df_cleaned.count) / col("n_doc_with_word"))).as("idf"))

idf = [word: string, idf: double]


[word: string, idf: double]

In [13]:
idf.show

+---------+-------------------+
|     word|                idf|
+---------+-------------------+
|    staff| 0.5758143082987789|
|   nights|  1.663635280607877|
|      day| 1.2847966476150348|
|      did| 1.0450717530287938|
|      got| 1.5939900373832527|
|  perfect|  2.165144995796763|
|    lobby| 2.0890034847375505|
|       nt| 0.8941378508607037|
|    rooms|  0.900242364401492|
|      not| 0.5247988061591388|
|fantastic| 2.1732594968664496|
|   hotels| 1.8444124355504574|
|      new| 2.0549048693111085|
|   lovely| 2.1962490150911482|
|     stay| 0.7087382992819972|
|    hotel|0.22783954389938352|
|   resort| 1.9975348376521498|
|      bit| 1.9351343919366228|
|   buffet| 2.0890034847375505|
|     area| 1.5978417513411076|
+---------+-------------------+
only showing top 20 rows



In [14]:
println(df_cleaned.count)
println(tf.count)
println(idf.count)

20491
461607
100


# Посчитаем tf-idf

In [15]:
val tf_idf = tf
    .join(idf, Seq("word"), "left")
    .select(col("review_id"), col("word"), (col("tf") * col("idf")).as("tf_idf"))

tf_idf = [review_id: decimal(38,0), word: string ... 1 more field]


[review_id: decimal(38,0), word: string ... 1 more field]

In [16]:
println(tf_idf.count)

461607


In [17]:
tf_idf.show

+-----------+------+--------------------+
|  review_id|  word|              tf_idf|
+-----------+------+--------------------+
|        215|nights|0.046212091127996575|
|        325|nights| 0.19572179771857376|
|       1171|nights|  0.0723319687220816|
|       1715|nights| 0.04377987580547044|
|       2252|nights| 0.03868919257227621|
|       3595|nights|0.035023900644376355|
| 8589936533|nights|0.055454509353595895|
| 8589936635|nights|0.050413190321450814|
| 8589937283|nights| 0.04377987580547044|
| 8589938265|nights|0.055454509353595895|
| 8589938790|nights| 0.06161612150399544|
| 8589939380|nights|  0.0594155457359956|
| 8589939437|nights|0.055454509353595895|
|17179872137|nights| 0.04893044942964344|
|17179872944|nights| 0.04496311569210478|
|        182|nights| 0.08755975161094089|
|        487|nights|0.009786089885928688|
|       1115|nights| 0.06654541122431508|
|       2299|nights| 0.15123957096435245|
|       2777|nights|0.030247914192870488|
+-----------+------+--------------

# Построим pivot таблицу

Честно говоря, я раньше никогда не работал с pivot таблицами и не до конца понимаю, что именно надо сделать. Надеюсь это то, что нужно.

In [18]:
val pivot_tf_idf = tf_idf
    .groupBy("review_id")
    .pivot("word")
    .agg(first(col("tf_idf")))

pivot_tf_idf = [review_id: decimal(38,0), : double ... 99 more fields]


[review_id: decimal(38,0), : double ... 99 more fields]

Она очень долго строится и ядро часто умирает в процессе, так что без вывода.

In [None]:
pivot_tf_idf.show