# Spark DataFrame Operations

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, IntegerType, StringType

# Start (or reuse) the Spark session shared by every cell in this notebook
spark = (
    SparkSession.builder
    .appName("DataFrameOps")
    .getOrCreate()
)

# Three sample people as (id, name, age) rows
data = [(1, "Alice", 25), (2, "Bob", 30), (3, "Charlie", 35)]

# Explicit schema: every column is declared non-nullable
schema = (
    StructType()
    .add("id", IntegerType(), False)
    .add("name", StringType(), False)
    .add("age", IntegerType(), False)
)

# Build the DataFrame from the rows and the schema
df = spark.createDataFrame(data, schema)

# Print its contents
df.show()


25/02/04 09:22:16 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.

+---+-------+---+
| id|   name|age|
+---+-------+---+
|  1|  Alice| 25|
|  2|    Bob| 30|
|  3|Charlie| 35|
+---+-------+---+



                                                                                

In [2]:
# Preview only the first two rows
df.show(n=2)

+---+-----+---+
| id| name|age|
+---+-----+---+
|  1|Alice| 25|
|  2|  Bob| 30|
+---+-----+---+
only showing top 2 rows



In [3]:
# Print the schema as a tree: each column's name, type and nullability
df.printSchema()

root
 |-- id: integer (nullable = false)
 |-- name: string (nullable = false)
 |-- age: integer (nullable = false)



In [4]:
# List the column names (a plain Python list of strings)
df.columns

['id', 'name', 'age']

In [5]:
# Summary statistics (count, mean, stddev, min, max) for each column;
# mean and stddev come back as null for the string column
df.describe().show()



+-------+---+-------+----+
|summary| id|   name| age|
+-------+---+-------+----+
|  count|  3|      3|   3|
|   mean|2.0|   null|30.0|
| stddev|1.0|   null| 5.0|
|    min|  1|  Alice|  25|
|    max|  3|Charlie|  35|
+-------+---+-------+----+



                                                                                

In [6]:
# Selecting and Filtering Data Use Cases

In [8]:
# Project only the name and age columns
df.select(['name', 'age']).show()

+-------+---+
|   name|age|
+-------+---+
|  Alice| 25|
|    Bob| 30|
|Charlie| 35|
+-------+---+



In [10]:
# Keep only the rows whose age is strictly greater than 25
df.filter(df['age'] > 25).show()

+---+-------+---+
| id|   name|age|
+---+-------+---+
|  2|    Bob| 30|
|  3|Charlie| 35|
+---+-------+---+



In [11]:
# where() is used here the same way as filter(): keep rows whose name is exactly 'Alice'
df.where(df['name'] == 'Alice').show()

+---+-----+---+
| id| name|age|
+---+-----+---+
|  1|Alice| 25|
+---+-----+---+



In [12]:
# Drop duplicate rows; the sample data has none, so all three rows remain
df.distinct().show()

+---+-------+---+
| id|   name|age|
+---+-------+---+
|  1|  Alice| 25|
|  2|    Bob| 30|
|  3|Charlie| 35|
+---+-------+---+



In [None]:
# Sorting and Ordering

In [13]:
# Sort by age, youngest first
df.orderBy('age', ascending=True).show()

+---+-------+---+
| id|   name|age|
+---+-------+---+
|  1|  Alice| 25|
|  2|    Bob| 30|
|  3|Charlie| 35|
+---+-------+---+



In [14]:
# Sort by age, oldest first
df.orderBy('age', ascending=False).show()

+---+-------+---+
| id|   name|age|
+---+-------+---+
|  3|Charlie| 35|
|  2|    Bob| 30|
|  1|  Alice| 25|
+---+-------+---+



In [15]:
# Adding and Dropping Columns

In [16]:
# Show df with an extra new_age column equal to age + 5 (result is not assigned back to df)
df.withColumn('new_age', df['age'] + 5).show()

+---+-------+---+-------+
| id|   name|age|new_age|
+---+-------+---+-------+
|  1|  Alice| 25|     30|
|  2|    Bob| 30|     35|
|  3|Charlie| 35|     40|
+---+-------+---+-------+



In [17]:
# Show the DataFrame without the age column (the result is not assigned back to df)
df.drop('age').show()

+---+-------+
| id|   name|
+---+-------+
|  1|  Alice|
|  2|    Bob|
|  3|Charlie|
+---+-------+



In [18]:
# Rebuild df from the original rows and schema as a baseline for the next examples.
# The withColumn()/drop() calls above were only displayed, never assigned back to df.
df = spark.createDataFrame(data=data, schema=schema)

# Print the rows
df.show()

+---+-------+---+
| id|   name|age|
+---+-------+---+
|  1|  Alice| 25|
|  2|    Bob| 30|
|  3|Charlie| 35|
+---+-------+---+



In [19]:
# Aggregation and Grouping

In [20]:
# Count rows per distinct name (each name occurs once in the sample data).
# NOTE: the rows are not sorted; add an orderBy() if a stable order is needed.
df.groupBy('name').count().show()

+-------+-----+
|   name|count|
+-------+-----+
|  Alice|    1|
|Charlie|    1|
|    Bob|    1|
+-------+-----+



In [21]:
# Average age over all rows, using the {column: aggregate-function} dict form of agg()
df.agg({'age':'avg'}).show()

+--------+
|avg(age)|
+--------+
|    30.0|
+--------+



In [22]:
# Lookup rows pairing each id with a country
data2 = [
    (1, 'usa'),
    (2, 'uk'),
    (3, 'india'),
]

# Schema for the lookup table; both columns allow nulls
schema2 = (
    StructType()
    .add('id', IntegerType(), True)
    .add('country', StringType(), True)
)

In [23]:
# Build the id -> country lookup DataFrame
df2 = spark.createDataFrame(data=data2, schema=schema2)

In [24]:
# Display the id/country lookup table
df2.show()

+---+-------+
| id|country|
+---+-------+
|  1|    usa|
|  2|     uk|
|  3|  india|
+---+-------+



In [25]:
# Re-display the people DataFrame for comparison with df2 before joining
df.show()

+---+-------+---+
| id|   name|age|
+---+-------+---+
|  1|  Alice| 25|
|  2|    Bob| 30|
|  3|Charlie| 35|
+---+-------+---+



In [35]:
# Inner-join people with their countries on the shared id column
final_df = df.join(df2, on='id', how='inner')

In [36]:
# Display the joined result: id, name and age from df plus country from df2
final_df.show()

+---+-------+---+-------+
| id|   name|age|country|
+---+-------+---+-------+
|  1|  Alice| 25|    usa|
|  2|    Bob| 30|     uk|
|  3|Charlie| 35|  india|
+---+-------+---+-------+



In [37]:
# Stop the Spark session now that all examples have run
spark.stop()