In [3]:
from pyspark.sql import SparkSession

# Obtain the SparkSession for this notebook (getOrCreate reuses a running one)
spark = (
    SparkSession.builder
    .appName("Word Count Example")
    .getOrCreate()
)

Before Optimization

In [4]:
# Load the CSV: the first line is the header and column types are inferred
df = spark.read.csv(
    "Employee.csv",
    header=True,
    inferSchema=True,
)

# Turn each Row into a single space-separated line of text
rdd = df.rdd.map(lambda record: " ".join(str(field) for field in record))

# Split lines on single spaces, emit (token, 1) pairs, and sum the 1s per token
word_counts = (
    rdd.flatMap(lambda text: text.split(" "))
       .map(lambda token: (token, 1))
       .reduceByKey(lambda left, right: left + right)
)

# Bring the (token, count) pairs to the driver and print them
print(word_counts.collect())

[('1', 1), ('Alice', 1), ('Sales', 3), ('5000', 1), ('200', 1), ('2', 1), ('Bob', 1), ('Finance', 3), ('6000', 1), ('300', 1), ('3', 1), ('Charlie', 1), ('5500', 1), ('250', 1), ('4', 1), ('David', 1), ('7000', 1), ('400', 1), ('5', 1), ('Eva', 1), ('HR', 2), ('4500', 1), ('150', 1), ('6', 1), ('Frank', 1), ('4800', 1), ('NULL', 1), ('7', 1), ('Grace', 1), ('5200', 1), ('220', 1), ('8', 1), ('Hank', 1), ('6500', 1), ('350', 1)]


After Optimization

In [6]:
# Read a csv file
df = spark.read.csv("Employee.csv", header=True, inferSchema=True)

# Convert the DataFrame loaded above into an RDD of space-joined lines, then
# repartition for better parallelism.
# The RDD is built from `df` in this cell (instead of `rdd = rdd.repartition(...)`)
# so the cell does not depend on an `rdd` left over from an earlier cell and does
# not stack another repartition on top of the previous one each time it is re-run.
# NOTE(review): 100 partitions looks large for an input this small - confirm the
# value against the real data volume.
rdd = df.rdd.map(lambda row: " ".join(str(item) for item in row)) \
            .repartition(100)

# Count word occurrences efficiently
word_counts = rdd.flatMap(lambda line: line.split(" ")) \
                 .map(lambda word: (word, 1)) \
                 .reduceByKey(lambda x, y: x + y)  # Use reduceByKey instead of groupByKey

# Mark the result for caching; it is materialised by the first action below
word_counts.cache()

# Show the results (last expression of the cell is displayed)
word_counts.collect()

[('Sales', 3),
 ('4', 1),
 ('Bob', 1),
 ('7', 1),
 ('Frank', 1),
 ('David', 1),
 ('Alice', 1),
 ('150', 1),
 ('6000', 1),
 ('6', 1),
 ('6500', 1),
 ('300', 1),
 ('4500', 1),
 ('4800', 1),
 ('7000', 1),
 ('Finance', 3),
 ('NULL', 1),
 ('200', 1),
 ('5200', 1),
 ('400', 1),
 ('350', 1),
 ('8', 1),
 ('HR', 2),
 ('Eva', 1),
 ('Hank', 1),
 ('Charlie', 1),
 ('5000', 1),
 ('1', 1),
 ('2', 1),
 ('250', 1),
 ('Grace', 1),
 ('3', 1),
 ('5', 1),
 ('5500', 1),
 ('220', 1)]

In [7]:
# Collect a second time: `word_counts` was marked with cache() in the previous
# cell, so Spark can serve this action from the stored partitions.
word_counts.collect()

[('Sales', 3),
 ('4', 1),
 ('Bob', 1),
 ('7', 1),
 ('Frank', 1),
 ('David', 1),
 ('Alice', 1),
 ('150', 1),
 ('6000', 1),
 ('6', 1),
 ('6500', 1),
 ('300', 1),
 ('4500', 1),
 ('4800', 1),
 ('7000', 1),
 ('Finance', 3),
 ('NULL', 1),
 ('200', 1),
 ('5200', 1),
 ('400', 1),
 ('350', 1),
 ('8', 1),
 ('HR', 2),
 ('Eva', 1),
 ('Hank', 1),
 ('Charlie', 1),
 ('5000', 1),
 ('1', 1),
 ('2', 1),
 ('250', 1),
 ('Grace', 1),
 ('3', 1),
 ('5', 1),
 ('5500', 1),
 ('220', 1)]