Creating an RDD in Spark Core

In [3]:
from pyspark.sql import SparkSession

# Initialize Spark Session
spark = SparkSession.builder.appName("SparkCoreExample").getOrCreate()

# Create an RDD
rdd = spark.sparkContext.parallelize(["Apache", "Spark", "Core", "Example"])

# Perform an action
result = rdd.collect()
print(result)

['Apache', 'Spark', 'Core', 'Example']


PySpark Example: Running SQL Queries in Spark

In [2]:
# Create a DataFrame
data = [(1, "John", 28), (2, "Mike", 35), (3, "Sara", 22)]
df = spark.createDataFrame(data, ["ID", "Name", "Age"])

# Register DataFrame as a temporary table
df.createOrReplaceTempView("people")

# Run an SQL query
result = spark.sql("SELECT * FROM people WHERE Age > 25")
result.show()

+---+----+---+
| ID|Name|Age|
+---+----+---+
|  1|John| 28|
|  2|Mike| 35|
+---+----+---+



PySpark Example: Real-time Data Processing

In [None]:
from pyspark.streaming import StreamingContext

# Initialize Streaming Context
ssc = StreamingContext(spark.sparkContext, batchDuration=5)

# Create a DStream (Simulating Real-time Data)
dstream = ssc.socketTextStream("localhost", 9999)

# Process Data
dstream.pprint()

# Start Streaming
ssc.start()
ssc.awaitTermination()



-------------------------------------------
Time: 2025-03-05 16:54:00
-------------------------------------------

-------------------------------------------
Time: 2025-03-05 16:54:05
-------------------------------------------

-------------------------------------------
Time: 2025-03-05 16:54:10
-------------------------------------------

-------------------------------------------
Time: 2025-03-05 16:54:15
-------------------------------------------

-------------------------------------------
Time: 2025-03-05 16:54:20
-------------------------------------------

-------------------------------------------
Time: 2025-03-05 16:54:25
-------------------------------------------

-------------------------------------------
Time: 2025-03-05 16:54:30
-------------------------------------------

-------------------------------------------
Time: 2025-03-05 16:54:35
-------------------------------------------

-------------------------------------------
Time: 2025-03-05 16:54:40
----------

PySpark Example: Logistic Regression with MLlib

In [3]:
from pyspark.sql import SparkSession

# Initialize Spark Session
spark = SparkSession.builder.appName("SparkCoreExample").getOrCreate()

from pyspark.ml.classification import LogisticRegression
from pyspark.ml.feature import VectorAssembler

# Sample Data
data = [(1.0, 2.0, 3.0, 1), (2.0, 3.0, 4.0, 0), (3.0, 4.0, 5.0, 1)]
df = spark.createDataFrame(data, ["feature1", "feature2", "feature3", "label"])

# Assemble Features
assembler = VectorAssembler(inputCols=["feature1", "feature2", "feature3"], outputCol="features")
assembled_df = assembler.transform(df)

# Train Model
lr = LogisticRegression(featuresCol="features", labelCol="label")
model = lr.fit(assembled_df)

# Print Model Coefficients
print(model.coefficients)

[0.0,0.0,0.0]


Creating a Simple Graph in GraphX

In [None]:
from pyspark import SparkContext
from graphframes import GraphFrame

from pyspark import SparkContext
# Initialize SparkContext



# Create vertices and edges
vertices = sc.parallelize([(1, "Alice"), (2, "Bob"), (3, "Charlie")])
edges = sc.parallelize([(1, 2), (2, 3), (3, 1)])

# Create a GraphFrame
graph = GraphFrame(vertices, edges)

# Run PageRank
results = graph.pageRank(resetProbability=0.15, maxIter=10)

# Show the results
results.vertices.show()

NameError: name 'sc' is not defined

In [6]:
from pyspark.sql import Row

# Install graphframes package
#%pip install graphframes

# Add graphframes package to Spark session
spark = SparkSession.builder \
	.appName("SparkCoreExample") \
	.config("spark.jars.packages", "graphframes:graphframes:0.8.2-spark3.0-s_2.12") \
	.getOrCreate()

from graphframes import GraphFrame

# Define Vertices and Edges
vertices = spark.createDataFrame([("1", "Alice"), ("2", "Bob"), ("3", "Charlie")], ["id", "name"])
edges = spark.createDataFrame([("1", "2"), ("2", "3")], ["src", "dst"])

# Create Graph
graph = GraphFrame(vertices, edges)
graph.vertices.show()

Py4JJavaError: An error occurred while calling o262.loadClass.
: java.lang.ClassNotFoundException: org.graphframes.GraphFramePythonAPI
	at java.base/java.net.URLClassLoader.findClass(URLClassLoader.java:476)
	at java.base/java.lang.ClassLoader.loadClass(ClassLoader.java:589)
	at java.base/java.lang.ClassLoader.loadClass(ClassLoader.java:522)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.base/java.lang.reflect.Method.invoke(Method.java:566)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:374)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
	at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
	at java.base/java.lang.Thread.run(Thread.java:829)
