In [34]:
!pip install pyspark



In [35]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import explode, regexp_replace, split, col, count, collect_list, struct

# Initialize a Spark session
spark = SparkSession.builder.appName("InvertedIndex").getOrCreate()

# Assuming you have a DataFrame with columns DocumentID and text
# Replace 'your_documents.csv' with the actual path to your data
documents = spark.read.option("header", "true" )\
                                              .option("inferSchema", "true" )\
                                              .option('delimiter',"\t")\
                                              .csv('/content/document.csv', header=True)

In [36]:
documents.show()

+----------+--------------------+
|DocumentID|                text|
+----------+--------------------+
|         1|Are you yearning ...|
|         2|In the fast-paced...|
|         3|A holistic approa...|
|         4|The urgency of en...|
|         5|Whether you're a ...|
|         6|Education is a po...|
|         7|Embark on a culin...|
|         8|Creativity knows ...|
|         9|Building strong a...|
|        10|For those who cra...|
+----------+--------------------+



In [37]:
word = documents.select("DocumentID", explode(split(regexp_replace(col("text"), "[^\w\s]", ""), "\\s+")).alias("word"))
word.show()

+----------+----------+
|DocumentID|      word|
+----------+----------+
|         1|       Are|
|         1|       you|
|         1|  yearning|
|         1|       for|
|         1| adventure|
|         1|       and|
|         1|         a|
|         1|     break|
|         1|      from|
|         1|       the|
|         1|   mundane|
|         1|    Embark|
|         1|        on|
|         1|         a|
|         1|   journey|
|         1|        to|
|         1|  discover|
|         1|       the|
|         1|enchanting|
|         1|    beauty|
+----------+----------+
only showing top 20 rows



In [38]:
count=word.groupBy("DocumentID","word").count()
count.show()
index=word.join(count, count.word==word.word, 'inner').drop(word.word,word.DocumentID).distinct
index.show()

+----------+-------------+-----+
|DocumentID|         word|count|
+----------+-------------+-----+
|         4|        about|    1|
|        10|          For|    1|
|         9|         time|    1|
|         9|relationships|    1|
|        10|thrillseeking|    1|
|         5| professional|    3|
|        10|       limits|    1|
|         2|         tech|    1|
|         3|            a|    3|
|         3|      extreme|    1|
|        10|      outdoor|    1|
|         7|       global|    1|
|         2|      connect|    1|
|         8|      Embrace|    1|
|        10|   activities|    1|
|         3|         diet|    1|
|         4|      urgency|    1|
|         7|           to|    2|
|         9|      virtual|    1|
|         3|           on|    1|
+----------+-------------+-----+
only showing top 20 rows

+----------+-------------+-----+
|DocumentID|         word|count|
+----------+-------------+-----+
|         4|        about|    1|
|        10|          For|    1|
|         9|     

In [39]:
inverted_index_df = index.groupBy("word").agg(collect_list(struct("DocumentID", "count")).alias("documents"))
inverted_index_df.show()

+----------+--------------------+
|      word|           documents|
+----------+--------------------+
|         A|            [{3, 1}]|
|       Are|            [{1, 1}]|
|  Building|            [{9, 1}]|
|Creativity|            [{8, 1}]|
|     Delve|            [{6, 1}]|
|  Discover|[{8, 1}, {8, 1}, ...|
| Education|            [{6, 1}]|
|    Embark|[{1, 1}, {1, 1}, ...|
|   Embrace|[{8, 1}, {8, 1}, ...|
|   Explore|[{9, 1}, {9, 1}, ...|
|       For|           [{10, 1}]|
|      From|[{10, 1}, {10, 1}...|
|        In|            [{2, 1}]|
|     Learn|            [{4, 1}]|
|      Stay|            [{2, 1}]|
|       The|            [{4, 1}]|
|  Together|            [{4, 1}]|
|   Whether|            [{5, 1}]|
|         a|[{4, 1}, {4, 1}, ...|
|    abound|            [{2, 1}]|
+----------+--------------------+
only showing top 20 rows

