In [4]:
from pyspark.sql import SparkSession

# Obtain a Spark session for the SQL examples (an already-running one is reused).
spark = (
    SparkSession.builder
    .appName("PySpark SQL Example")
    .getOrCreate()
)

# Small in-memory dataset: one (name, age) tuple per person.
columns = ["Name", "Age"]
data = [
    ("Alice", 25),
    ("Bob", 30),
    ("Charlie", 35),
]
df = spark.createDataFrame(data, schema=columns)

# Expose the DataFrame to Spark SQL under the view name "people".
df.createOrReplaceTempView("people")

# Query the view back in full and print it as a text table.
all_people = spark.sql("SELECT * FROM people")
all_people.show()

+-------+---+
|   Name|Age|
+-------+---+
|  Alice| 25|
|    Bob| 30|
|Charlie| 35|
+-------+---+



In [5]:
# Select the rows of the "people" temp view whose Age is strictly greater than 30.
age_filter_query = "SELECT * FROM people WHERE Age > 30"
result = spark.sql(age_filter_query)

# Print the matching rows as a text table.
result.show()

+-------+---+
|   Name|Age|
+-------+---+
|Charlie| 35|
+-------+---+



In [6]:
from pyspark.ml.linalg import Vectors
from pyspark.ml.regression import LinearRegression
from pyspark.sql import SparkSession

# Obtain a Spark session. getOrCreate() hands back a session that is already
# running in this kernel if there is one, so the app name below is only
# applied when this cell is the first to start Spark.
spark = (
    SparkSession.builder
    .appName("MLlib Example")
    .getOrCreate()
)

# Training set: single-feature dense vectors with label = 2 * feature,
# i.e. (1.0 -> 2.0), (2.0 -> 4.0), (3.0 -> 6.0).
columns = ["features", "label"]
data = [(Vectors.dense([x]), 2.0 * x) for x in (1.0, 2.0, 3.0)]
df = spark.createDataFrame(data, schema=columns)

# Elastic-net regularised linear regression. With regParam > 0 the fit is
# penalised, so the learned slope is not expected to equal the exact value 2.
lr_params = {"maxIter": 10, "regParam": 0.3, "elasticNetParam": 0.8}
lr = LinearRegression(**lr_params)

# Train on the toy dataset.
model = lr.fit(df)

# Report the learned parameters.
print("Coefficients:", model.coefficients)
print("Intercept:", model.intercept)

Coefficients: [1.6455980960121552]
Intercept: 0.7088038079756894


In [None]:
from pyspark import SparkConf, SparkContext
from pyspark.streaming import StreamingContext

# Reuse the SparkContext that may already be running in this kernel (for
# example the one behind an existing SparkSession). Constructing a second
# SparkContext in the same process raises
# "ValueError: Cannot run multiple SparkContexts at once".
# The master / app-name settings only take effect when no context exists yet.
# NOTE: a socket receiver occupies one thread, so the context needs at least
# two local threads ("local[2]" or more) for batches to be processed.
conf = SparkConf().setMaster("local[2]").setAppName("PySpark Streaming Example")
sc = SparkContext.getOrCreate(conf)
ssc = StreamingContext(sc, 5)  # 5-second batch interval

# Create a DStream from a TCP source: text lines read from localhost:9999.
lines = ssc.socketTextStream("localhost", 9999)

# Split each line into words. split() with no argument splits on any run of
# whitespace, so blank lines and repeated spaces do not produce empty-string
# "words" that would otherwise be counted.
words = lines.flatMap(lambda line: line.split())

# Count occurrences of each word within every batch.
word_counts = words.map(lambda word: (word, 1)).reduceByKey(lambda x, y: x + y)

# Print the first elements of each batch's word counts.
word_counts.pprint()

# Start the streaming computation.
ssc.start()

# Block until the stream terminates or the kernel is interrupted. Always stop
# the streaming context afterwards so it does not keep running in the
# background; the SparkContext itself is left alive because it may be shared
# with other cells in this notebook.
try:
    ssc.awaitTermination()
finally:
    ssc.stop(stopSparkContext=False)



-------------------------------------------
Time: 2025-03-09 07:57:20
-------------------------------------------

-------------------------------------------
Time: 2025-03-09 07:57:25
-------------------------------------------

-------------------------------------------
Time: 2025-03-09 07:57:30
-------------------------------------------

-------------------------------------------
Time: 2025-03-09 07:57:35
-------------------------------------------

-------------------------------------------
Time: 2025-03-09 07:57:40
-------------------------------------------

-------------------------------------------
Time: 2025-03-09 07:57:45
-------------------------------------------
('Hello', 2)
('world', 1)
('PySpark', 1)

-------------------------------------------
Time: 2025-03-09 07:57:50
-------------------------------------------

-------------------------------------------
Time: 2025-03-09 07:57:55
-------------------------------------------

---------------------------------------