<a href="https://colab.research.google.com/github/mayureshpawashe/spark/blob/main/spark_practise.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install pyspark

## Checking Spark Installation & Creating SparkSession

In [None]:
from pyspark.sql import SparkSession

# Configure the builder first, then obtain the session from it.
session_builder = SparkSession.builder.appName("SparkArchitecture")
spark = session_builder.getOrCreate()

# Reading the version from the live session confirms the installation works.
print(f"Spark Version: {spark.version}")

Spark Version: 3.5.5


## Creating SparkSession (Driver Side)

In [None]:
from pyspark.sql import SparkSession

# The SparkSession lives in the driver process (this notebook kernel).
# NOTE(review): getOrCreate() is documented to return an already-existing session
# when there is one, so the app name given here may not take effect on re-use.
spark = (
    SparkSession.builder
    .appName("DriverExample")
    .getOrCreate()
)

print("Driver is running and managing tasks.")

Driver is running and managing tasks.


## Running tasks on Executors

In [None]:
from pyspark import SparkContext

sc = SparkContext.getOrCreate()

# Distribute a small Python list as an RDD.
rdd = sc.parallelize([1, 2, 3, 4, 5])

# map() is a transformation: it only describes the work, nothing is executed yet.
squared_rdd = rdd.map(lambda n: n ** 2)

# collect() is the action that runs the job and returns the results to the driver.
squared_values = squared_rdd.collect()
print("RDD processed by Executors:", squared_values)

RDD processed by Executors: [1, 4, 9, 16, 25]


## Demonstrating Parallel Execution

In [None]:
from pyspark import SparkContext

sc = SparkContext.getOrCreate()

# Split the numbers 1..10 into exactly two partitions.
# NOTE(review): each partition is expected to become one task, and tasks can only
# run in parallel when enough cores are available - confirm for the target runtime.
rdd = sc.parallelize(range(1, 11), 2)

# Pair every number with its square (lazy until an action is called).
tasks = rdd.map(lambda n: (n, n ** 2))

# collect() triggers the computation and gathers the pairs on the driver.
collected_pairs = tasks.collect()
print("Tasks executed on Executors:", collected_pairs)


Tasks executed on Executors: [(1, 1), (2, 4), (3, 9), (4, 16), (5, 25), (6, 36), (7, 49), (8, 64), (9, 81), (10, 100)]


## Running Spark in Local Mode

In [None]:
from pyspark.sql import SparkSession

# "local[*]" requests a single-process Spark using all available cores.
# getOrCreate() returns an already-existing session when there is one, in which case
# the master requested here may not be applied. The message below is therefore based
# on the master the running SparkContext reports, not on what was requested.
spark = SparkSession.builder.master("local[*]").appName("LocalMode").getOrCreate()

active_master = spark.sparkContext.master
if active_master.startswith("local"):
    print("Spark is running in Local Mode")
else:
    print("Spark is not running in Local Mode; active master:", active_master)


Spark is running in Local Mode


## Creating and Processing an RDD

In [None]:
from pyspark import SparkContext

sc = SparkContext.getOrCreate()

# Build the RDD from a Python list; note that the value 5 appears twice.
rdd = sc.parallelize([1, 2, 3, 4, 5, 5])

# Transformations: each one derives a new RDD from `rdd`.
squared_rdd = rdd.map(lambda n: n ** 2)                  # square every element
filtered_rdd = rdd.filter(lambda n: n % 2 == 0)          # keep even numbers only
mapped_rdd = rdd.map(lambda n: (n, n ** 3))              # (x, x^3) pairs
distinct_rdd = rdd.flatMap(lambda n: (n, n)).distinct()  # emit each value twice, then de-duplicate

# reduce() is an action: it returns a plain Python value, not an RDD.
reduced_value = rdd.reduce(lambda total, n: total + n)

transformation_results = [
    ("RDD elements squared:", squared_rdd.collect()),
    ("Filtered (even numbers):", filtered_rdd.collect()),
    ("Mapped (x, x^3):", mapped_rdd.collect()),
    ("Sum of elements (reduce):", reduced_value),
    ("Distinct elements (after flatMap and distinct):", distinct_rdd.collect()),
]
for label, value in transformation_results:
    print(label, value)


# More built-in actions that summarise the RDD.
count = rdd.count()          # number of elements
first_element = rdd.first()  # first element
rdd_sum = rdd.sum()          # sum of all elements
rdd_max = rdd.max()          # largest element
rdd_min = rdd.min()          # smallest element

summary_results = [
    ("Count of elements:", count),
    ("First element:", first_element),
    ("Sum of RDD elements:", rdd_sum),
    ("Max element:", rdd_max),
    ("Min element:", rdd_min),
]
for label, value in summary_results:
    print(label, value)


RDD elements squared: [1, 4, 9, 16, 25, 25]
Filtered (even numbers): [2, 4]
Mapped (x, x^3): [(1, 1), (2, 8), (3, 27), (4, 64), (5, 125), (5, 125)]
Sum of elements (reduce): 20
Distinct elements (after flatMap and distinct): [2, 4, 1, 3, 5]
Count of elements: 6
First element: 1
Sum of RDD elements: 20
Max element: 5
Min element: 1


## Creating and Displaying a DataFrame

In [None]:
from pyspark.sql import SparkSession

spark = (
    SparkSession.builder
    .appName("DataFrameExample")
    .getOrCreate()
)

# Column names for the DataFrame, followed by one tuple per row.
columns = ["Name", "Age"]
data = [
    ("Mayuresh", 15),
    ("Onkar", 30),
    ("Rohit", 55),
]

df = spark.createDataFrame(data, schema=columns)

# Render the rows as a text table.
df.show()


+--------+---+
|    Name|Age|
+--------+---+
|Mayuresh| 15|
|   Onkar| 30|
|   Rohit| 55|
+--------+---+



## Basic DataFrame Operations

In [None]:
from pyspark.sql import SparkSession

spark = (
    SparkSession.builder
    .appName("DataFrameOperations")
    .getOrCreate()
)

# Only column names are supplied here; the column types are not declared explicitly.
columns = ["ID", "Name", "Age", "Profession"]

# One tuple per row, in the same order as `columns`.
data = [
    (1, "Mayuresh", 25, "Engineer"),
    (2, "Onkar", 30, "Doctor"),
    (3, "Rohit", 35, "Teacher"),
    (4, "Arya", 28, "Artist"),
]

# Build the DataFrame that the following cells operate on, then display it.
df = spark.createDataFrame(data, schema=columns)
df.show()


+---+--------+---+----------+
| ID|    Name|Age|Profession|
+---+--------+---+----------+
|  1|Mayuresh| 25|  Engineer|
|  2|   Onkar| 30|    Doctor|
|  3|   Rohit| 35|   Teacher|
|  4|    Arya| 28|    Artist|
+---+--------+---+----------+



In [None]:
# Print the schema of df as a tree: each column's name, type and nullability.
df.printSchema()

root
 |-- ID: long (nullable = true)
 |-- Name: string (nullable = true)
 |-- Age: long (nullable = true)
 |-- Profession: string (nullable = true)



In [None]:
# Project the DataFrame down to the two columns of interest and display them.
wanted_columns = ["Name", "Age"]
df.select(*wanted_columns).show()

+--------+---+
|    Name|Age|
+--------+---+
|Mayuresh| 25|
|   Onkar| 30|
|   Rohit| 35|
|    Arya| 28|
+--------+---+



In [None]:
# Keep only the rows whose Age is strictly greater than 28.
older_than_28 = df["Age"] > 28
df.filter(older_than_28).show()

+---+-----+---+----------+
| ID| Name|Age|Profession|
+---+-----+---+----------+
|  2|Onkar| 30|    Doctor|
|  3|Rohit| 35|   Teacher|
+---+-----+---+----------+



In [None]:
# Keep rows whose Profession is either "Doctor" or "Teacher".
# Column conditions are combined with `|` (not `or`), and each comparison needs its own parentheses.
profession = df["Profession"]
is_doctor_or_teacher = (profession == "Doctor") | (profession == "Teacher")
df.filter(is_doctor_or_teacher).show()

+---+-----+---+----------+
| ID| Name|Age|Profession|
+---+-----+---+----------+
|  2|Onkar| 30|    Doctor|
|  3|Rohit| 35|   Teacher|
+---+-----+---+----------+



In [None]:
# Count the rows of df; as the last expression in the cell, the number is displayed as the cell output.
df.count()

4

In [None]:
# Count how many rows fall into each Profession and display the result.
profession_counts = df.groupBy("Profession").count()
profession_counts.show()

+----------+-----+
|Profession|count|
+----------+-----+
|    Doctor|    1|
|  Engineer|    1|
|   Teacher|    1|
|    Artist|    1|
+----------+-----+



In [27]:
from pyspark.sql.functions import col

# Derive a new column from Age; `df` is rebound to the DataFrame that includes it.
age_in_five_years = col("Age") + 5
df = df.withColumn("AgeAfter5Years", age_in_five_years)

df.show()


+---+--------+---+----------+--------------+
| ID|    Name|Age|Profession|AgeAfter5Years|
+---+--------+---+----------+--------------+
|  1|Mayuresh| 25|  Engineer|            30|
|  2|   Onkar| 30|    Doctor|            35|
|  3|   Rohit| 35|   Teacher|            40|
|  4|    Arya| 28|    Artist|            33|
+---+--------+---+----------+--------------+



In [28]:
# Rename the Profession column to Job.
# NOTE: `df` is rebound here, so earlier cells that refer to "Profession" will no
# longer work if re-run after this cell without recreating `df` first.
old_name, new_name = "Profession", "Job"
df = df.withColumnRenamed(old_name, new_name)

df.show()


+---+--------+---+--------+--------------+
| ID|    Name|Age|     Job|AgeAfter5Years|
+---+--------+---+--------+--------------+
|  1|Mayuresh| 25|Engineer|            30|
|  2|   Onkar| 30|  Doctor|            35|
|  3|   Rohit| 35| Teacher|            40|
|  4|    Arya| 28|  Artist|            33|
+---+--------+---+--------+--------------+



In [29]:
# Sort by Age, youngest first (ascending order spelled out explicitly).
df.orderBy("Age", ascending=True).show()


+---+--------+---+--------+--------------+
| ID|    Name|Age|     Job|AgeAfter5Years|
+---+--------+---+--------+--------------+
|  1|Mayuresh| 25|Engineer|            30|
|  4|    Arya| 28|  Artist|            33|
|  2|   Onkar| 30|  Doctor|            35|
|  3|   Rohit| 35| Teacher|            40|
+---+--------+---+--------+--------------+



In [30]:
# Sort by Age, oldest first.
df.orderBy("Age", ascending=False).show()


+---+--------+---+--------+--------------+
| ID|    Name|Age|     Job|AgeAfter5Years|
+---+--------+---+--------+--------------+
|  3|   Rohit| 35| Teacher|            40|
|  2|   Onkar| 30|  Doctor|            35|
|  4|    Arya| 28|  Artist|            33|
|  1|Mayuresh| 25|Engineer|            30|
+---+--------+---+--------+--------------+



In [31]:
# Register df under the name "people" so that SQL queries run through spark.sql can refer to it.
df.createOrReplaceTempView("people")


In [32]:
# Query the "people" view with SQL: names and ages of everyone older than 28.
query = "SELECT Name, Age FROM people WHERE Age > 28"
result = spark.sql(query)

result.show()


+-----+---+
| Name|Age|
+-----+---+
|Onkar| 30|
|Rohit| 35|
+-----+---+

