In [27]:
!pip install pyspark



In [28]:
from pyspark.sql import SparkSession
from pyspark import SparkContext, SparkConf

# Entry point for everything below: build the session named "SparkTutorial",
# or pick up the existing session if one has already been created.
spark = (
    SparkSession.builder
    .appName("SparkTutorial")
    .getOrCreate()
)

In [29]:
# Turn a local list holding the integers 1 through 10 into an RDD.
data = list(range(1, 11))
rdd = spark.sparkContext.parallelize(data)

# Reduce: fold every element into a single total by pairwise addition.
sum_result = rdd.reduce(lambda acc, value: acc + value)

# Filter: split the elements by parity (remainder 0 -> even, otherwise odd).
even_rdd = rdd.filter(lambda n: n % 2 == 0)
odd_rdd = rdd.filter(lambda n: n % 2 != 0)

# Collect: bring the filtered elements back as ordinary Python lists.
even_numbers, odd_numbers = even_rdd.collect(), odd_rdd.collect()

# Show the input next to each derived result.
print(f'Data: {data}')
print("Even Numbers:", even_numbers)
print("Odd Numbers:", odd_numbers)
print("Sum of all numbers:", sum_result)


Data: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
Even Numbers: [2, 4, 6, 8, 10]
Odd Numbers: [1, 3, 5, 7, 9]
Sum of all numbers: 55


In [30]:
# Option 1: a list of row tuples plus an explicit list of column names.
columns = ["Name", "Age"]
data = [
    ("Alice", 34),
    ("Bob", 45),
    ("Catherine", 29),
]
df = spark.createDataFrame(data, schema=columns)
df.show()

# Option 2: a list of dictionaries, one per row; the column names are taken
# from the dictionary keys. In the recorded output the columns appear as
# Age, Name rather than in the order the keys were written.
data_dict = [
    {"Name": "Alice", "Age": 34},
    {"Name": "Bob", "Age": 45},
    {"Name": "Catherine", "Age": 29},
]
df_dict = spark.createDataFrame(data_dict)
df_dict.show()

# Option 3 (left disabled): read a CSV file, using its first line as the
# header and letting Spark infer the column types.
# df_csv = spark.read.csv("file.csv", header=True, inferSchema=True)
# df_csv.show()


+---------+---+
|     Name|Age|
+---------+---+
|    Alice| 34|
|      Bob| 45|
|Catherine| 29|
+---------+---+

+---+---------+
|Age|     Name|
+---+---------+
| 34|    Alice|
| 45|      Bob|
| 29|Catherine|
+---+---------+



In [31]:
# Projection: keep only the listed columns.
df.select(["Name", "Age"]).show()

# Row filter: keep the people whose Age is greater than 30.
is_over_30 = df["Age"] > 30
df.filter(is_over_30).show()

# Aggregation: count how many rows share each distinct Age.
df.groupBy("Age").count().show()

# Derived column: append "Age in 5 years", computed as Age + 5.
age_in_5_years = df["Age"] + 5
df.withColumn("Age in 5 years", age_in_5_years).show()


+---------+---+
|     Name|Age|
+---------+---+
|    Alice| 34|
|      Bob| 45|
|Catherine| 29|
+---------+---+

+-----+---+
| Name|Age|
+-----+---+
|Alice| 34|
|  Bob| 45|
+-----+---+

+---+-----+
|Age|count|
+---+-----+
| 34|    1|
| 29|    1|
| 45|    1|
+---+-----+

+---------+---+--------------+
|     Name|Age|Age in 5 years|
+---------+---+--------------+
|    Alice| 34|            39|
|      Bob| 45|            50|
|Catherine| 29|            34|
+---------+---+--------------+



In [32]:
# Second table for the join: one (Name, Department) row per person.
columns2 = ["Name", "Department"]
data2 = [
    ("Alice", "HR"),
    ("Bob", "Engineering"),
    ("Catherine", "Marketing"),
]
df2 = spark.createDataFrame(data2, schema=columns2)

# Make both DataFrames queryable from Spark SQL under temporary view names.
df.createOrReplaceTempView("people")
df2.createOrReplaceTempView("departments")

# Single-view query: the people older than 30.
sql_df = spark.sql("SELECT Name, Age FROM people WHERE Age > 30")
sql_df.show()

# Two-view query: attach each person's department by matching on Name.
join_df = spark.sql(
    "SELECT p.Name, p.Age, d.Department "
    "FROM people p JOIN departments d ON p.Name = d.Name"
)
join_df.show()


+-----+---+
| Name|Age|
+-----+---+
|Alice| 34|
|  Bob| 45|
+-----+---+

+---------+---+-----------+
|     Name|Age| Department|
+---------+---+-----------+
|    Alice| 34|         HR|
|      Bob| 45|Engineering|
|Catherine| 29|  Marketing|
+---------+---+-----------+

