In [None]:
# Word Count Program using Apache Spark

# This notebook demonstrates how to perform a simple word count using Apache Spark. 
# The program reads a text file, 
# splits the lines into words, 
# counts the occurrences of each word, 
# and displays the results.

from pyspark.sql import SparkSession

# Create a SparkSession
spark = SparkSession.builder.appName("PysparkExample").getOrCreate()

# Example: Create a DataFrame
data = [(1, "Alice"), (2, "Bob"), (3, "Charlie")]
df = spark.createDataFrame(data, ["id", "name"])
df.show()

# You can still access the underlying SparkContext if needed:
sc = spark.sparkContext
print(sc)

include '%pip install pyspark' if pyspark is not installed

In [1]:
import os

# Create the resources directory and sample file if they don't exist
os.makedirs("resources", exist_ok=True)
sample_file_path = "resources/samplefile_pyspark_wordcount.txt"
if not os.path.exists(sample_file_path):
    with open(sample_file_path, "w") as f:
        f.write("hello world\nhello spark\nhello world\n")

from pyspark.sql import SparkSession

# Initialize SparkSession
# SparkSession is the entry point to programming Spark with the Dataset and DataFrame API.
# It allows you to create DataFrames, register DataFrames as tables, and execute SQL over tables.
spark = SparkSession.builder \
    .appName("WordCount") \
    .getOrCreate()

# Read the text file
# Check if the file exists and is accessible before reading
sample_file_path = "resources/samplefile_pyspark_wordcount.txt"

# Using SparkContext to read the text file
sc = spark.sparkContext
text_file = sc.textFile(sample_file_path)

# Split each line into words
words = text_file.flatMap(lambda line: line.split())

# Map each word to a (word, 1) pair
word_pairs = words.map(lambda word: (word, 1))

# Reduce by key to count occurrences
word_counts = word_pairs.reduceByKey(lambda a, b: a + b)

# Collect and display the results
for word, count in word_counts.collect():
    print(f"{word}: {count}")

# Example: Create a DataFrame from a list of tuples
sample_data = [(101, 'Math', 90), (102, 'Science', 85), (103, 'English', 88)]
df = spark.createDataFrame(sample_data, ["student_id", "subject", "marks"])
df.show()

# Example: Perform a simple transformation
# Filter students with marks greater than 85
filtered_df = df.filter(df.marks > 85)
filtered_df.show()

Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
25/06/20 11:31:46 WARN Utils: Your hostname, Nikhils-MacBook-Pro.local, resolves to a loopback address: 127.0.0.1; using 192.168.1.37 instead (on interface en0)
25/06/20 11:31:46 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
25/06/20 11:31:46 WARN Utils: Your hostname, Nikhils-MacBook-Pro.local, resolves to a loopback address: 127.0.0.1; using 192.168.1.37 instead (on interface en0)
25/06/20 11:31:46 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
Setting default log level to "WARN".
To adjust l

file: 4
for: 2
word: 1
Pyspark: 1
Programs.ipynb: 1
just: 1
Sample: 2
pyspark: 1
count: 1
program: 1
Refer: 1
is: 1
a: 1
reference: 1
+----------+-------+-----+
|student_id|subject|marks|
+----------+-------+-----+
|       101|   Math|   90|
|       102|Science|   85|
|       103|English|   88|
+----------+-------+-----+

+----------+-------+-----+
|student_id|subject|marks|
+----------+-------+-----+
|       101|   Math|   90|
|       102|Science|   85|
|       103|English|   88|
+----------+-------+-----+

+----------+-------+-----+
|student_id|subject|marks|
+----------+-------+-----+
|       101|   Math|   90|
|       103|English|   88|
+----------+-------+-----+

+----------+-------+-----+
|student_id|subject|marks|
+----------+-------+-----+
|       101|   Math|   90|
|       103|English|   88|
+----------+-------+-----+

