In [3]:
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q https://archive.apache.org/dist/spark/spark-3.1.2/spark-3.1.2-bin-hadoop3.2.tgz
!tar xf spark-3.1.2-bin-hadoop3.2.tgz
!pip install -q findspark

In [4]:
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-3.1.2-bin-hadoop3.2"

import findspark
findspark.init()

In [5]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import explode, split, regexp_replace, monotonically_increasing_id, collect_list
spark = SparkSession.builder.master("local").appName("word count").getOrCreate()

In [29]:
rdd = spark.sparkContext.textFile("cleaned_*.txt")

In [30]:
rdd_after_map = rdd.flatMap(lambda text: [(word, 1) for word in text.split()]).combineByKey(lambda x: x, lambda count1, count2: count1+count2, lambda acc1, acc2: acc1+acc2)
print(rdd_after_map.collect())



In [31]:
rdd_after_reduce = rdd_after_map.reduceByKey(lambda x, y: x + y)
print(rdd_after_reduce.collect())



In [32]:
top_20_words = rdd_after_reduce.sortBy(lambda x: x[1], ascending=False).take(20)
for word, count in top_20_words:
  print(f"{word}: {count}")

the: 6800
of: 3593
are: 3180
and: 3089
to: 3039
a: 2760
in: 2064
is: 1924
that: 1586
no: 1532
all: 1358
i: 1332
it: 1258
x: 1246
y: 1129
some: 1098
with: 1082
be: 1073
you: 1003
not: 913


In [39]:
#MapReduce for wordCount after remove stop words
rdd_2 = spark.sparkContext.textFile("stopwords_cleaned_*.txt")

In [40]:
rdd_2_after_map = rdd_2.flatMap(lambda text: [(word, 1) for word in text.split()]).combineByKey(lambda count:count, lambda count1, count2: count1+count2, lambda acc1, acc2: acc1+acc2)
print(rdd_2_after_map.collect())



In [41]:
rdd_2_after_reduce = rdd_2_after_map.reduceByKey(lambda x, y: x+y)
print(rdd_2_after_reduce.collect())



In [42]:
top_20_words = rdd_2_after_reduce.sortBy(lambda x: x[1], ascending=False).take(20)
for word, count in top_20_words:
  print(f"{word}: {count}")

one: 614
project: 430
may: 415
ie: 362
two: 357
said: 299
proposition: 299
would: 293
gutenbergtm: 280
work: 275
propositions: 268
exist: 251
xy: 250
us: 243
old: 224
conclusion: 223
like: 217
must: 215
things: 214
univ: 213


### Inverted index

In [None]:
file_paths = ["stopwords_cleaned_cleaned_01.txt", "stopwords_cleaned_cleaned_02.txt", "stopwords_cleaned_cleaned_03.txt", "stopwords_cleaned_cleaned_04.txt", "stopwords_cleaned_cleaned_05.txt"]

In [None]:
documents_df = spark.read.text(file_paths).withColumnRenamed("value", "text")
documents_df = documents_df.withColumn("doc_id", monotonically_increasing_id())

words_df = documents_df.select("doc_id", explode(split(regexp_replace("text", "[^\w\s]", ""), "\s+")).alias("word"))
inverted_index_df = words_df.groupBy("word").agg(collect_list("doc_id").alias("doc_ids"))

In [None]:
inverted_index_df.show()

+-------------+--------------------+
|         word|             doc_ids|
+-------------+--------------------+
|   abruptness|                 [1]|
|    arguments|        [0, 3, 3, 3]|
|          art|[0, 0, 0, 0, 1, 1...|
|     bowsprit|  [2, 2, 2, 4, 4, 4]|
|     brackets|                 [1]|
|     briefest|           [0, 0, 0]|
|     cautious|              [1, 3]|
|   concluding|                 [1]|
|    connected|     [1, 1, 2, 2, 4]|
|       doubts|                 [0]|
|      flashed|                 [1]|
|    forgetful|                 [0]|
|      fritter|              [2, 4]|
|halfpensioner|                 [1]|
|       harder|[0, 2, 2, 2, 3, 3...|
|         hope|[0, 0, 0, 0, 0, 0...|
|        hynas|[0, 0, 0, 0, 0, 0...|
|    imitation|                 [2]|
|      inanity|              [2, 4]|
|        inner|[0, 0, 0, 0, 0, 0...|
+-------------+--------------------+
only showing top 20 rows



In [None]:
logic_index = inverted_index_df.filter(inverted_index_df.word == 'logic')

# Show the inverted index for the word 'book'
logic_index.show(truncate=False)

+-----+------------------------------------------------------------------------------------------------------------------------------------------------------------+
|word |doc_ids                                                                                                                                                     |
+-----+------------------------------------------------------------------------------------------------------------------------------------------------------------+
|logic|[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 0, 0, 0, 0, 0]|
+-----+------------------------------------------------------------------------------------------------------------------------------------------------------------+

