In [1]:
# Set the PySpark environment variables
import os
os.environ['SPARK_HOME'] = "/home/rajesh/CSL7100/PySpark/spark-3.4.2-bin-hadoop3"
os.environ['PYSPARK_DRIVER_PYTHON'] = 'jupyter'
os.environ['PYSPARK_DRIVER_PYTHON_OPTS'] = 'lab'
os.environ['PYSPARK_PYTHON'] = 'python'

In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import desc

In [3]:
spark = SparkSession.builder \
    .appName("DataFrame-Gutenberg-Similarity") \
    .master("local[6]") \
    .config("spark.executor.memory", "2g") \
    .config("spark.driver.memory", "2g") \
    .config("spark.sql.shuffle.partitions", "24") \
    .getOrCreate()

26/02/13 14:26:30 WARN Utils: Your hostname, rajesh-pc resolves to a loopback address: 127.0.1.1; using 192.168.0.39 instead (on interface wlp1s0)
26/02/13 14:26:30 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
26/02/13 14:26:31 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
26/02/13 14:26:33 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


### Using RDDs

In [4]:
from pyspark.sql.functions import regexp_extract

books_df = (
    spark.sparkContext
         .wholeTextFiles("/home/rajesh/CSL7100/Assignment1/data/D184MB/*.txt") #reads the whole text files from specified path
         .toDF(["file_path", "text"]) #convert RDD to data frame
         .withColumn(
             "file_name",  # add a new column calle file_name and extract it using regular expression
             regexp_extract("file_path", r"([^/]+$)", 1)
         )
         .select("file_name", "text") #keep file_name and text column
)



                                                                                

In [5]:
books_df.printSchema()


root
 |-- file_name: string (nullable = true)
 |-- text: string (nullable = true)



In [6]:
row = books_df.first()

print("File name:", row.file_name)
print("Text preview:\n", row.text[:100])   # first 1000 chars only

[Stage 1:>                                                          (0 + 1) / 1]

File name: 10.txt
Text preview:
 The Project Gutenberg EBook of The King James Bible

This eBook is for the use of anyone anywhere 


26/02/13 14:26:50 WARN PythonRunner: Detected deadlock while completing task 0.0 in stage 1 (TID 1): Attempting to kill Python Worker
                                                                                

In [7]:
#import builtin function for cleaning and formating the text
from pyspark.sql.functions import (
    regexp_replace,
    lower,
    col,
    split,
)
from pyspark.ml.feature import StopWordsRemover
from pyspark.sql.functions import split, expr

In [8]:
from pyspark.ml.feature import StopWordsRemover
from pyspark.sql.functions import split, expr

books_clean = (
    books_df
    .withColumn(
        "clean_text",
        regexp_replace(                  #remove the header
            col("text"),
            r"(?is)^.*?\*\*\*\s*START OF.*?\*\*\*",
            ""
        )
    )
    .withColumn(
        "clean_text",
        regexp_replace(                 #remove the footer
            col("clean_text"),
            r"(?is)\*\*\*\s*END OF.*?\*\*\*.*$",
            ""
        )
    )
    .withColumn("clean_text", lower(col("clean_text")))         #convert the text to lower case
    .withColumn(
        "clean_text",
        regexp_replace(col("clean_text"), r"[^a-z\s]", " ")     #remove the punctuations
    )
    .withColumn("words", split(col("clean_text"), r"\s+"))      #tokenize into words array
)

# Remove stop words
remover = StopWordsRemover(inputCol="words", outputCol="tokens")
books_clean = remover.transform(books_clean)

books_clean = books_clean.select("file_name", "tokens")  # keep file_name and tokenized words


In [9]:
print("book count = ", books_clean.count())
books_clean.show(5, truncate=100)

                                                                                

book count =  425


26/02/13 14:27:00 WARN PythonRunner: Detected deadlock while completing task 0.0 in stage 5 (TID 5): Attempting to kill Python Worker
                                                                                

+---------+----------------------------------------------------------------------------------------------------+
|file_name|                                                                                              tokens|
+---------+----------------------------------------------------------------------------------------------------+
|   10.txt|[, old, testament, king, james, version, bible, first, book, moses, called, genesis, beginning, g...|
|  101.txt|[, hacker, crackdown, law, disorder, electronic, frontier, bruce, sterling, contents, preface, el...|
|  102.txt|[, produced, anonymous, volunteer, tragedy, pudd, nhead, wilson, mark, twain, whisper, reader, ch...|
|  103.txt|[, around, world, eighty, days, contents, chapter, phileas, fogg, passepartout, accept, one, mast...|
|  104.txt|[, inaugural, address, franklin, delano, roosevelt, given, washington, d, c, march, th, president...|
+---------+-------------------------------------------------------------------------------------

In [10]:
books_df.unpersist()

DataFrame[file_name: string, text: string]

In [12]:
books_df.unpersist(blocking=True) if books_df.is_cached else None
del books_df


In [13]:
spark.catalog.clearCache()


In [14]:
books_clean = books_clean.repartition(6, "file_name").cache()
books_clean.count()   # materialize cache


                                                                                

425

2. TF- IDF calculation

In [15]:
from pyspark.ml.feature import HashingTF, IDF
from pyspark.sql.functions import udf, col, lit, explode
from pyspark.sql.types import DoubleType
import numpy as np
from pyspark.sql.functions import explode, countDistinct, log, lit, col, count, size


### Calculate the Term Frequency (TF) of each word in each book

In [16]:
from pyspark.sql.functions import explode, col, count

tf_df = (
    books_clean
    .select("file_name", explode(col("tokens")).alias("word"))
    .groupBy("file_name", "word")
    .agg(count("*").alias("term_count"))
)


In [20]:
from pyspark.sql.window import Window
from pyspark.sql.functions import row_number

window_spec = Window.partitionBy("file_name").orderBy(desc("term_count"))

top10_per_book = (
    tf_df
    .withColumn("rank", row_number().over(window_spec))  # assign rank per book
    .filter("rank <= 10")  # keep top 10
    .drop("rank")  # remove helper column
)

top10_per_book.show(truncate=False)


+---------+---------+----------+
|file_name|word     |term_count|
+---------+---------+----------+
|102.txt  |en       |344       |
|102.txt  |de       |298       |
|102.txt  |tom      |294       |
|102.txt  |said     |252       |
|102.txt  |wilson   |223       |
|102.txt  |one      |202       |
|102.txt  |dat      |156       |
|102.txt  |got      |136       |
|102.txt  |man      |117       |
|102.txt  |old      |103       |
|112.txt  |one      |92        |
|112.txt  |gretchen |84        |
|112.txt  |jurgen   |76        |
|112.txt  |professor|65        |
|112.txt  |back     |54        |
|112.txt  |said     |48        |
|112.txt  |like     |46        |
|112.txt  |time     |46        |
|112.txt  |hand     |45        |
|112.txt  |viola    |42        |
+---------+---------+----------+
only showing top 20 rows



                                                                                

### Calculate the Inverse Document Frequency (IDF) for each word across all books.

In [21]:
from pyspark.sql.functions import countDistinct

N = tf_df.select("file_name").distinct().count()

print("Total files counts N ", N)

Total files counts N  425


In [22]:
#Number of files in which a word appeared

df_word = (
    tf_df            #term frquency df
    .groupBy("word")  #group by word
    .agg(countDistinct("file_name").alias("doc_freq"))  #count unique documents
)

df_word.show(truncate=False)

[Stage 23:>                                                         (0 + 6) / 6]

+------------+--------+
|word        |doc_freq|
+------------+--------+
|spirit      |339     |
|doubt       |342     |
|matters     |299     |
|every       |395     |
|rewritten   |15      |
|eye         |347     |
|ago         |352     |
|still       |387     |
|del         |44      |
|build       |245     |
|old         |385     |
|honeysuckles|4       |
|palings     |21      |
|stood       |358     |
|boxes       |169     |
|grew        |324     |
|made        |396     |
|symbol      |98      |
|chief       |302     |
|lofty       |168     |
+------------+--------+
only showing top 20 rows



                                                                                

In [24]:
#compute IDF 
from pyspark.sql.functions import log

idf_df = df_word.withColumn(
    "idf",
    log(lit(N) / (col("doc_freq") + lit(1)))    # compute log(N/df)
)


In [28]:
idf_df.show(truncate=False)

[Stage 28:>                                                         (0 + 6) / 6]

+------------+--------+-------------------+
|word        |doc_freq|idf                |
+------------+--------+-------------------+
|spirit      |339     |0.22314355131420976|
|doubt       |342     |0.21435872175847698|
|matters     |299     |0.3483066942682158 |
|every       |395     |0.07067495766993637|
|rewritten   |15      |3.2795004466846356 |
|eye         |347     |0.19988668914994248|
|ago         |352     |0.1856211119911201 |
|still       |387     |0.09108382930114349|
|del         |44      |2.245426679154097  |
|build       |245     |0.5467576329920539 |
|old         |385     |0.0962517994595859 |
|honeysuckles|4       |4.442651256490317  |
|palings     |21      |2.961046715566101  |
|stood       |358     |0.16876678043613805|
|boxes       |169     |0.9162907318741551 |
|grew        |324     |0.26826398659467937|
|made        |396     |0.0681528882372264 |
|symbol      |98      |1.456969318789827  |
|chief       |302     |0.3383563634150477 |
|lofty       |168     |0.9221904

                                                                                

### Compute the TF-IDF score for each word in each book (TF * IDF)

In [30]:
from pyspark.sql.functions import col

tf_idf_df = (
    tf_df     # term frequency data
    .join(idf_df.select("word", "idf"),  # join with IDF
          on="word", how="inner")  # join on word
    .withColumn("tf_idf", # create TF-IDF column
                col("term_count") * col("idf"))   # multiply TF and IDF
    .select("file_name", "word", "tf_idf")  # select final columns
)


In [31]:
tf_idf_df.show(truncate=False)



+---------+-----------+------------------+
|file_name|word       |tf_idf            |
+---------+-----------+------------------+
|38.txt   |aaargh     |10.717883976728944|
|200.txt  |aad        |5.358941988364472 |
|80.txt   |aag        |5.358941988364472 |
|200.txt  |aak        |5.358941988364472 |
|124.txt  |abadias    |5.358941988364472 |
|14.txt   |abaiang    |4.260329699696362 |
|25.txt   |abaiang    |4.260329699696362 |
|48.txt   |abaiang    |4.260329699696362 |
|180.txt  |abaiang    |4.260329699696362 |
|87.txt   |abaiang    |4.260329699696362 |
|200.txt  |abailard   |4.953476880256307 |
|267.txt  |abailard   |4.953476880256307 |
|200.txt  |abaissement|5.358941988364472 |
|24.txt   |abandonedly|5.358941988364472 |
|200.txt  |abaris     |5.358941988364472 |
|228.txt  |abas       |24.767384401281536|
|227.txt  |abas       |9.906953760512614 |
|102.txt  |abashed    |1.7346010553881064|
|172.txt  |abashed    |1.7346010553881064|
|224.txt  |abashed    |3.4692021107762128|
+---------+

                                                                                

In [12]:
df_tf = (
    books_clean
    .select("file_name", explode(col("tokens")).alias("word"))
    .groupBy("file_name", "word")
    .agg(count("*").alias("tf"))
)

df_tf.show(10)




+---------+--------+----+
|file_name|    word|  tf|
+---------+--------+----+
|   10.txt|     one|1969|
|   10.txt|    make|1056|
|   10.txt|    rain| 102|
|   10.txt| watered|  11|
|   10.txt|breathed|   4|
|   10.txt|   river| 179|
|   10.txt|  parted|  12|
|   10.txt|  freely|  17|
|   10.txt|    gave| 465|
|   10.txt|   naked|  47|
+---------+--------+----+
only showing top 10 rows



                                                                                

In [13]:
print("\n=== TF Scores for book '10.txt' (Top 20) ===")
tf_df.filter(col("file_name") == "10.txt").orderBy(col("tf").desc()).show(20, truncate=False)


=== TF Scores for book '10.txt' (Top 20) ===


NameError: name 'tf_df' is not defined

### Step 2: Calculate IDF for each word across all books

In [19]:
print("\n=== Calculating IDF (Inverse Document Frequency) ===")
total_books = books_clean.count()
print(f"Total number of books: {total_books}")

idf_df = (
    books_clean
    .select("file_name", explode(col("tokens")).alias("word"))
    .groupBy("word")
    .agg(countDistinct("file_name").alias("num_books"))
    .withColumn("idf", log(lit(total_books) / col("num_books")))
)



=== Calculating IDF (Inverse Document Frequency) ===
Total number of books: 425


In [20]:
print("\n=== IDF Scores (Top 20 rarest words) ===")
idf_df.select("word", "num_books", "idf").orderBy(col("idf").desc()).show(20, truncate=False)



=== IDF Scores (Top 20 rarest words) ===




+----------------+---------+-----------------+
|word            |num_books|idf              |
+----------------+---------+-----------------+
|choijilsurengiyn|1        |6.052089168924417|
|susimilkie      |1        |6.052089168924417|
|hydrogene       |1        |6.052089168924417|
|gutzkow         |1        |6.052089168924417|
|christines      |1        |6.052089168924417|
|weatherboards   |1        |6.052089168924417|
|baldi           |1        |6.052089168924417|
|prosecutorial   |1        |6.052089168924417|
|isoceles        |1        |6.052089168924417|
|trampish        |1        |6.052089168924417|
|unsexual        |1        |6.052089168924417|
|deadlit         |1        |6.052089168924417|
|nola            |1        |6.052089168924417|
|undertrained    |1        |6.052089168924417|
|quintessentially|1        |6.052089168924417|
|warriorship     |1        |6.052089168924417|
|agnibilckrou    |1        |6.052089168924417|
|chailey         |1        |6.052089168924417|
|suddarth    

                                                                                

In [21]:
print("\n=== IDF Scores (Top 20 most common words) ===")
idf_df.select("word", "num_books", "idf").orderBy(col("idf").asc()).show(20, truncate=False)



=== IDF Scores (Top 20 most common words) ===


[Stage 31:>                                                         (0 + 2) / 2]

+---------+---------+--------------------+
|word     |num_books|idf                 |
+---------+---------+--------------------+
|         |425      |0.0                 |
|end      |423      |0.004716989878138867|
|project  |405      |0.048202101817877686|
|gutenberg|402      |0.05563708030539568 |
|time     |401      |0.058127741617847686|
|one      |401      |0.058127741617847686|
|may      |400      |0.06062462181643484 |
|first    |398      |0.06563716363997904 |
|made     |396      |0.07067495766993637 |
|well     |395      |0.07320340402329492 |
|every    |395      |0.07320340402329492 |
|even     |394      |0.07573825962648298 |
|many     |394      |0.07573825962648298 |
|two      |394      |0.07573825962648298 |
|make     |394      |0.07573825962648298 |
|way      |393      |0.07827955705515566 |
|long     |392      |0.08082732913395425 |
|come     |392      |0.08082732913395425 |
|place    |392      |0.08082732913395425 |
|must     |392      |0.08082732913395425 |
+---------+

                                                                                

# Step 3: Calculate TF-IDF = TF Ã— IDF

In [22]:
print("\n=== Calculating TF-IDF ===")
tfidf_df = (
    tf_df
    .join(idf_df, "word")
    .withColumn("tfidf", col("tf") * col("idf"))
    .select("file_name", "word", "word_count", "total_words", "tf", "num_books", "idf", "tfidf")
)


=== Calculating TF-IDF ===


In [23]:
print("\n=== TF, IDF, TF-IDF Together (Top 20 highest TF-IDF - all books) ===")
tfidf_df.select("file_name", "word", "tf", "idf", "tfidf").orderBy(col("tfidf").desc()).show(20, truncate=False)


=== TF, IDF, TF-IDF Together (Top 20 highest TF-IDF - all books) ===


26/02/13 12:25:17 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
26/02/13 12:25:17 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
26/02/13 12:25:18 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.

+---------+---------+--------------------+------------------+-------------------+
|file_name|word     |tf                  |idf               |tfidf              |
+---------+---------+--------------------+------------------+-------------------+
|302.txt  |digits   |0.9631782945736435  |3.48713981146288  |3.3587373765446733 |
|212.txt  |digits   |0.4372623574144487  |3.48713981146288  |1.524794974594035  |
|115.txt  |byear    |0.1945780498469611  |6.052089168924417 |1.1776037079892285 |
|115.txt  |ayear    |0.1945780498469611  |6.052089168924417 |1.1776037079892285 |
|88.txt   |est      |0.6373504652193177  |1.6094379124341003|1.0257760022314812 |
|15.txt   |moby     |0.13195098963242224 |4.442651256490317 |0.5862122298856214 |
|255.txt  |qread    |0.08196721311475409 |6.052089168924417 |0.4960728826987227 |
|239.txt  |png      |0.07142857142857142 |6.052089168924417 |0.4322920834946012 |
|239.txt  |radar    |0.10714285714285714 |3.8548645915881976|0.41302120624159255|
|276.txt  |zurfl

                                                                                

print("\n=== TF, IDF, TF-IDF for book '10.txt' (Top 20) ===")
tfidf_df.filter(col("file_name") == "10.txt").select("word", "tf", "idf", "tfidf").orderBy(col("tfidf").desc()).show(20, truncate=False)


In [25]:
print("\n=== Complete Details for book '10.txt' (Top 20) ===")
tfidf_df.filter(col("file_name") == "10.txt").orderBy(col("tfidf").desc()).show(20, truncate=False)


=== Complete Details for book '10.txt' (Top 20) ===




+---------+---------+----------+-----------+---------------------+---------+-------------------+---------------------+
|file_name|word     |word_count|total_words|tf                   |num_books|idf                |tfidf                |
+---------+---------+----------+-----------+---------------------+---------+-------------------+---------------------+
|10.txt   |unto     |8997      |375131     |0.02398362172147863  |166      |0.9401013805678735 |0.0225470358913797   |
|10.txt   |israel   |2575      |375131     |0.006864268748783758 |68       |1.8325814637483102 |0.01257933167120792  |
|10.txt   |thou     |5474      |375131     |0.01459223577896788  |188      |0.8156472060944675 |0.011902116343786876 |
|10.txt   |thy      |4600      |375131     |0.012262383007536034 |178      |0.8703056186323317 |0.01067202082928024  |
|10.txt   |thee     |3827      |375131     |0.010201769515182696 |171      |0.910425612421757  |0.009287952258645817 |
|10.txt   |saith    |1262      |375131     |0.00

                                                                                

## Step 4 Summary statistics

In [26]:
# Step 4: Summary Statistics
print("\n=== Summary Statistics ===")
print("TF Statistics:")
tf_df.select("tf").describe().show()

print("IDF Statistics:")
idf_df.select("idf").describe().show()

print("TF-IDF Statistics:")
tfidf_df.select("tfidf").describe().show()


=== Summary Statistics ===
TF Statistics:


                                                                                

+-------+--------------------+
|summary|                  tf|
+-------+--------------------+
|  count|             2409637|
|   mean|1.763751137619456...|
| stddev|0.001165423237910...|
|    min|1.412114245690933...|
|    max|  0.9631782945736435|
+-------+--------------------+

IDF Statistics:


                                                                                

+-------+------------------+
|summary|               idf|
+-------+------------------+
|  count|            210899|
|   mean| 5.129266120670784|
| stddev|1.3494211538327372|
|    min|               0.0|
|    max| 6.052089168924417|
+-------+------------------+

TF-IDF Statistics:


26/02/13 12:27:37 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
26/02/13 12:27:37 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.

+-------+--------------------+
|summary|               tfidf|
+-------+--------------------+
|  count|             2409637|
|   mean|1.881274890776004E-4|
| stddev|0.003196774853135...|
|    min|                 0.0|
|    max|  3.3587373765446733|
+-------+--------------------+



                                                                                

In [14]:
spark.stop()