In [None]:
# Word Count Program using Apache Spark

# This notebook demonstrates how to perform a simple word count using Apache Spark. 
# The program reads a text file, 
# splits the lines into words, 
# counts the occurrences of each word, 
# and displays the results.

# Note: run '%pip install pyspark' in its own cell first if pyspark is not installed.

In [None]:
import os

# Guarantee that the demo input is available before it is read.
# The folder is created unconditionally (a no-op when it is already there);
# the sample text is written only when nothing exists at the target path, so
# a file that is already present is left untouched.
sample_file_path = "resources/samplefile_pyspark_wordcount.txt"
os.makedirs("resources", exist_ok=True)
sample_missing = not os.path.exists(sample_file_path)
if sample_missing:
    with open(sample_file_path, "w") as sample_file:
        sample_file.write("hello world\nhello spark\nhello world\n")



from pyspark import SparkContext

# Initialize SparkContext
# SparkContext is the entry point to any Spark functionality.
# It allows your Python program to connect to the Spark cluster and create
# RDDs (Resilient Distributed Datasets). getOrCreate() reuses a context that
# is already running instead of trying to start a second one, which keeps
# this cell safe to re-run.
sc = SparkContext.getOrCreate()

# Check that the input file exists and is readable before handing it to Spark.
# Spark evaluates lazily, so without this guard a bad path would only surface
# later, at collect(), as a long JVM stack trace. sample_file_path is the path
# defined earlier in this cell; it is deliberately not re-assigned here so the
# location is spelled out in exactly one place.
# NOTE(review): this is a local-filesystem check; it assumes Spark resolves the
# relative path against the local filesystem (default local mode) - confirm if
# this notebook is ever pointed at a cluster.
if not (os.path.isfile(sample_file_path) and os.access(sample_file_path, os.R_OK)):
    raise FileNotFoundError(
        f"Input file is missing or not readable: {sample_file_path}"
    )

# Read the text file: one RDD element per line.
text_file = sc.textFile(sample_file_path)

# Split each line into words. split() with no argument splits on any run of
# whitespace and never yields empty strings. No case folding or punctuation
# stripping is done, so "Sample" and "sample" are counted as different words.
words = text_file.flatMap(lambda line: line.split())

# Map each word to a (word, 1) pair
word_pairs = words.map(lambda word: (word, 1))

# Reduce by key to count occurrences
word_counts = word_pairs.reduceByKey(lambda a, b: a + b)

# Collect the (word, count) pairs to the driver and display them.
# collect() pulls the whole result into local memory, which is fine for this
# small sample but should be replaced by take()/saveAsTextFile() on big inputs.
for word, count in word_counts.collect():
    print(f"{word}: {count}")

file: 4
for: 2
word: 1
Pyspark: 1
Programs.ipynb: 1
just: 1
Sample: 2
pyspark: 1
count: 1
program: 1
Refer: 1
is: 1
a: 1
reference: 1
