In [1]:
import sys
import random
import csv

In [2]:
def get_random_choice(lst):
    """Return one element of ``lst`` picked by ``random.choice``.

    Args:
        lst: A non-empty sequence to draw from.

    Returns:
        A single element of ``lst``.

    Raises:
        IndexError: Propagated from ``random.choice`` when ``lst`` is empty.
    """
    picked = random.choice(lst)
    return picked

In [3]:
# Candidate values and column headers for the synthetic M&M dataset.
states = ["CA", "WA", "TX", "NV", "CO", "OR", "AZ", "WY", "NM", "UT"]
colors = ["Brown", "Blue", "Orange", "Yellow", "Green", "Red"]
fieldnames = ['State', 'Color', 'Count']


# Total number of lines written to the CSV file, header row included.
entries = 10000
dataset_fn = "mnm_dataset.csv"

# newline='' is what the csv module asks for when opening the target file:
# the writer emits its own line terminator, and without this flag platforms
# that translate "\n" on write end up with "\r\r\n" row endings (a blank line
# between records).
with open(dataset_fn, mode='w', newline='') as dataset_file:
    dataset_writer = csv.writer(dataset_file, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
    dataset_writer.writerow(fieldnames)
    # One header line plus (entries - 1) random rows gives `entries` lines in
    # total, which is what the message below reports.
    for i in range(1, entries):
        # Each row: random state, random color, random count in [10, 100].
        dataset_writer.writerow([get_random_choice(states), get_random_choice(colors), random.randint(10, 100)])
print("Wrote %d lines in %s file" % (entries, dataset_fn))

Wrote 10000 lines in mnm_dataset.csv file


In [4]:
from pyspark.sql import SparkSession

In [5]:
# Obtain the Spark session for this notebook; getOrCreate() hands back an
# already-running session if there is one instead of building a second.
session_builder = SparkSession.builder.appName("PythonMnMCount")
spark = session_builder.getOrCreate()


# Load the generated CSV into a Spark DataFrame. The first line supplies the
# column names and Spark infers the column types from the data.
csv_reader = spark.read.format("csv")
csv_reader = csv_reader.option("header", "true")
csv_reader = csv_reader.option("inferSchema", "true")
mnm_df = csv_reader.load(dataset_fn)

# Preview the first five rows without truncating cell contents.
mnm_df.show(n=5, truncate=False)

# Sum the counts for every (State, Color) pair, largest total first.
grouped_by_state_color = (
    mnm_df
    .select("State", "Color", "Count")
    .groupBy("State", "Color")
)
count_mnm_df = (
    grouped_by_state_color
    .sum("Count")
    .orderBy("sum(Count)", ascending=False)
)

# Show the aggregated totals for the states and colors (up to 60 rows),
# then report how many aggregated rows there are.
count_mnm_df.show(n=60, truncate=False)
total_rows = count_mnm_df.count()
print("Total Rows = %d" % total_rows)

# Restrict to California rows, then total the counts per color,
# largest total first.
ca_rows_df = mnm_df.select("*").where(mnm_df["State"] == "CA")
ca_count_mnm_df = (
    ca_rows_df
    .groupBy("State", "Color")
    .sum("Count")
    .orderBy("sum(Count)", ascending=False)
)

# Display the California totals (at most 10 rows, untruncated).
ca_count_mnm_df.show(n=10, truncate=False)

22/11/25 15:15:49 WARN Utils: Your hostname, Pauls-MacBook-Pro.local resolves to a loopback address: 127.0.0.1; using 192.168.4.90 instead (on interface en0)
22/11/25 15:15:49 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


22/11/25 15:15:49 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
+-----+------+-----+
|State|Color |Count|
+-----+------+-----+
|AZ   |Red   |64   |
|WY   |Red   |79   |
|CO   |Yellow|91   |
|NV   |Brown |76   |
|CO   |Yellow|91   |
+-----+------+-----+
only showing top 5 rows

+-----+------+----------+
|State|Color |sum(Count)|
+-----+------+----------+
|WY   |Blue  |11612     |
|CO   |Blue  |11226     |
|NM   |Yellow|11028     |
|OR   |Orange|10853     |
|UT   |Green |10835     |
|UT   |Red   |10830     |
|AZ   |Brown |10395     |
|NM   |Green |10215     |
|OR   |Blue  |10187     |
|CO   |Orange|10147     |
|NV   |Orange|10107     |
|WY   |Brown |10086     |
|NV   |Red   |10066     |
|WA   |Yellow|9990      |
|NM   |Orange|9985      |
|NM   |Blue  |9831      |
|AZ   |Red   |9811      |
|TX   |Yellow|9750      |
|WY   |Yellow|9691      |
|WY   |Orange|9682      |
|WA   |Orange|9679      |
|CO   |Green |9657 