In [1]:
# Install PySpark and Java (Colab needs this for Apache Spark)
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!pip install pyspark



In [2]:
# Import necessary libraries and start Spark session
from pyspark import SparkContext, SparkConf
from google.colab import drive

In [3]:
# Set up SparkContext
conf = SparkConf().setAppName("Word Count").setMaster("local[*]")
sc = SparkContext(conf=conf)

In [4]:
# Mount Google Drive to access your files
drive.mount('/content/drive')

Mounted at /content/drive


# ***Q1. Using “q1_dataset.txt” as the input file to run the Spark program.***

## **Q1A. Count the number of occurrences for each word (word count).**

In [5]:
# Read the text file from Google Drive.
inp = sc.textFile("/content/drive/MyDrive/ColabNotebooks/CS4371_HW3_Natthiya/q1_dataset.txt")

print(inp.collect())  # To see the content of the file (optional)

['The Project Gutenberg eBook of Frankenstein; Or, The Modern Prometheus', '    ', 'This ebook is for the use of anyone anywhere in the United States and', 'most other parts of the world at no cost and with almost no restrictions', 'whatsoever. You may copy it, give it away or re-use it under the terms', 'of the Project Gutenberg License included with this ebook or online', 'at www.gutenberg.org. If you are not located in the United States,', 'you will have to check the laws of the country where you are located', 'before using this eBook.', '', 'Title: Frankenstein; Or, The Modern Prometheus', '', 'Author: Mary Wollstonecraft Shelley', '', 'Release date: October 1, 1993 [eBook #84]', '                Most recently updated: September 8, 2025', '', 'Language: English', '', 'Credits: Judith Boss, Christy Phillips, Lynn Hanninen and David Meltzer. HTML version by Al Haines.', '        Further corrections by Menno de Leeuw.', '', '', '*** START OF THE PROJECT GUTENBERG EBOOK FRANKENSTEIN; O

In [6]:
# Perform Word Count Operation
counts = inp.flatMap(lambda line: line.split(" ")) \
            .map(lambda word: (word, 1)) \
            .reduceByKey(lambda x, y: x + y)

In [7]:
# Collect and print the word count
result = counts.collect()
print(result)

[('Gutenberg', 22), ('eBook', 4), ('of', 2745), ('Frankenstein;', 4), ('Or,', 2), ('Modern', 3), ('', 1383), ('for', 485), ('use', 18), ('anyone', 7), ('United', 15), ('States', 7), ('and', 2968), ('most', 91), ('other', 85), ('world', 25), ('at', 302), ('no', 154), ('cost', 2), ('with', 700), ('almost', 39), ('restrictions', 2), ('give', 23), ('it', 372), ('re-use', 2), ('under', 29), ('License', 8), ('this', 337), ('online', 4), ('www.gutenberg.org.', 4), ('If', 48), ('you', 455), ('are', 187), ('States,', 4), ('will', 192), ('have', 362), ('to', 2144), ('country', 22), ('where', 79), ('before', 125), ('using', 7), ('Author:', 1), ('Mary', 2), ('Wollstonecraft', 2), ('date:', 1), ('October', 1), ('1,', 1), ('[eBook', 1), ('#84]', 1), ('September', 7), ('Credits:', 1), ('Judith', 1), ('Boss,', 1), ('Christy', 1), ('Lynn', 1), ('HTML', 1), ('version', 2), ('by', 467), ('Al', 1), ('Haines.', 1), ('Further', 1), ('corrections', 1), ('Leeuw.', 1), ('***', 4), ('START', 1), ('PROJECT', 4),

In [11]:
print(counts.getNumPartitions()) #How many partitions?

2


In [19]:
# Save the word count results in single file to Google Drive
counts.coalesce(1).saveAsTextFile("/content/drive/MyDrive/ColabNotebooks/CS4371_HW3_Natthiya/q1A_output")

## **Q1B. Find the word count of these specific words (case insensitive)** :
a. “frankenstein”, “monster”, “life”
b. Please perform necessary data preprocessing.

In [20]:
#Counts for selected words
targets = {"frankenstein", "monster", "life"}

In [21]:
#Build a dict: word -> count (0 if absent)
selected_counts = (
    counts.filter(lambda kv: kv[0] in targets)
           .collect()
)


In [22]:
# make sure all targets are included even if missing
result_B = {w: 0 for w in targets}
for w, c in selected_counts:
    result_B[w] = c

print("Q1B Results:")
for w, c in result_B.items():
    print(f"{w}: {c}")

Q1B Results:
frankenstein: 0
monster: 21
life: 65


In [23]:
# save to text file
sc.parallelize([(w, result_B[w]) for w in targets]) \
  .map(lambda kv: f"{kv[0]}\t{kv[1]}") \
  .coalesce(1).saveAsTextFile("/content/drive/MyDrive/ColabNotebooks/CS4371_HW3_Natthiya/q1B_output")

# **Q1C. Find the top 20 words with highest number of occurrences.**

In [26]:
#Top 20 most frequent words
top20 = counts.takeOrdered(20, key=lambda kv: -kv[1]) # sort desc by count
for i,(w,c) in enumerate(top20,1):
    print(f"{i:2d}. {w:15s} {c}")

 1. the             4066
 2. and             2968
 3. of              2745
 4. I               2719
 5. to              2144
 6. my              1631
 7. a               1394
 8.                 1383
 9. in              1129
10. was             994
11. that            986
12. with            700
13. had             679
14. which           547
15. but             541
16. me              529
17. not             505
18. his             500
19. as              486
20. for             485


In [28]:
(sc.parallelize(top20)
  .map(lambda kv: f"{kv[0]}\t{kv[1]}")
  .coalesce(1).saveAsTextFile("/content/drive/MyDrive/ColabNotebooks/CS4371_HW3_Natthiya/q1C_output"))