In [1]:
from pyspark.sql import SparkSession

# Obtain (or reuse) a session attached to the standalone master on node-2.
# The builder chain is wrapped in parentheses so no line continuations are needed.
spark = (
    SparkSession.builder
    .master("spark://node-2:7077")
    .appName("Test Spark Cluster")
    .getOrCreate()
)

# Report what the session is actually connected to, one fact per line.
print(
    f"Spark Version: {spark.version}",
    f"Master: {spark.sparkContext.master}",
    f"App Name: {spark.sparkContext.appName}",
    sep="\n",
)

Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/12/20 16:10:21 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


Spark Version: 4.1.0
Master: spark://node-2:7077
App Name: Test Spark Cluster


In [2]:
# Smoke-test basic Spark functionality end to end: build a small DataFrame,
# run a filter, an aggregation and a groupBy on it, and check that the collected
# results match what plain Python computes from the same rows. The success
# message at the bottom is printed only if every check passes.
data = [
    ("Alice", 34, "Data Engineer"),
    ("Bob", 45, "Data Scientist"),
    ("Charlie", 29, "Developer"),
    ("Diana", 31, "Analyst")
]

df = spark.createDataFrame(data, ["name", "age", "role"])

# Show the DataFrame
print("Original DataFrame:")
df.show()

# Test transformations
print("\nFiltered DataFrame (age > 30):")
over_30 = df.filter(df.age > 30)
over_30.show()

# Test aggregations
print("\nAverage age:")
avg_age = df.agg({"age": "avg"})
avg_age.show()

# Test groupBy
print("\nCount by role:")
role_counts = df.groupBy("role").count()
role_counts.show()

# Verify the results instead of assuming them: expectations are derived locally
# from `data`, so a wrong answer from Spark fails the cell.
expected_names = sorted(row[0] for row in data if row[1] > 30)
expected_avg = sum(row[1] for row in data) / len(data)
all_roles = [row[2] for row in data]
expected_role_counts = {role: all_roles.count(role) for role in set(all_roles)}

assert df.count() == len(data), "row count differs from the input data"
assert sorted(row["name"] for row in over_30.collect()) == expected_names, \
    "filter (age > 30) returned unexpected rows"
assert abs(avg_age.first()[0] - expected_avg) < 1e-9, \
    "average age differs from the locally computed value"
# Use row["count"], not row.count: Row is a tuple, so .count is the tuple method.
assert {row["role"]: row["count"] for row in role_counts.collect()} == expected_role_counts, \
    "groupBy counts differ from the locally computed values"

print("\n✓ Spark cluster is working correctly!")


Original DataFrame:


                                                                                

+-------+---+--------------+
|   name|age|          role|
+-------+---+--------------+
|  Alice| 34| Data Engineer|
|    Bob| 45|Data Scientist|
|Charlie| 29|     Developer|
|  Diana| 31|       Analyst|
+-------+---+--------------+


Filtered DataFrame (age > 30):
+-----+---+--------------+
| name|age|          role|
+-----+---+--------------+
|Alice| 34| Data Engineer|
|  Bob| 45|Data Scientist|
|Diana| 31|       Analyst|
+-----+---+--------------+


Average age:


                                                                                

+--------+
|avg(age)|
+--------+
|   34.75|
+--------+


Count by role:


[Stage 7:>                                                          (0 + 2) / 2]

+--------------+-----+
|          role|count|
+--------------+-----+
|Data Scientist|    1|
| Data Engineer|    1|
|     Developer|    1|
|       Analyst|    1|
+--------------+-----+


✓ Spark cluster is working correctly!


                                                                                