In [3]:
from pyspark.sql import SparkSession
from pyspark.sql.types import *
# Create (or reuse) the notebook's SparkSession, running locally on all cores.
# NOTE(review): with a local[*] master there are presumably no separate
# executor processes, so "spark.executor.memory" may have no effect here --
# confirm whether "spark.driver.memory" was the intended setting.
spark = (
    SparkSession.builder
    .appName("df_operations")
    .master("local[*]")
    .config("spark.executor.memory", "1g")
    .getOrCreate()
)

In [4]:
# Sample rows: one (id, name, age) tuple per person
data = [
    (1, "Alice", 25),
    (2, "Bob", 30),
    (3, "Catherine", 35),
]

# Explicit schema, built field by field; every column is nullable
schema = (
    StructType()
    .add("id", IntegerType(), True)
    .add("name", StringType(), True)
    .add("age", IntegerType(), True)
)

# Build the DataFrame from the rows and the schema
df = spark.createDataFrame(data, schema=schema)

# Display its contents
df.show()

+---+---------+---+
| id|     name|age|
+---+---------+---+
|  1|    Alice| 25|
|  2|      Bob| 30|
|  3|Catherine| 35|
+---+---------+---+



In [5]:
# Display only the first two rows
df.show(n=2)

+---+-----+---+
| id| name|age|
+---+-----+---+
|  1|Alice| 25|
|  2|  Bob| 30|
+---+-----+---+
only showing top 2 rows



In [6]:
# Print each column's name, data type and nullability as a tree
df.printSchema()

root
 |-- id: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- age: integer (nullable = true)



In [7]:
# Column names as a plain Python list (displayed as the cell's result)
df.columns

['id', 'name', 'age']

In [9]:
# Summary statistics (count, mean, stddev, min, max) for every column;
# mean and stddev come out as NULL for the string column `name`
df.describe().show()

+-------+---+---------+----+
|summary| id|     name| age|
+-------+---+---------+----+
|  count|  3|        3|   3|
|   mean|2.0|     NULL|30.0|
| stddev|1.0|     NULL| 5.0|
|    min|  1|    Alice|  25|
|    max|  3|Catherine|  35|
+-------+---+---------+----+



# Selecting and Filtering Data

In [10]:
# Project the frame down to the name and age columns
df.select(df["name"], df["age"]).show()

+---------+---+
|     name|age|
+---------+---+
|    Alice| 25|
|      Bob| 30|
|Catherine| 35|
+---------+---+



In [11]:
# Keep only the rows whose age is strictly greater than 25
df.filter(df["age"] > 25).show()

+---+---------+---+
| id|     name|age|
+---+---------+---+
|  2|      Bob| 30|
|  3|Catherine| 35|
+---+---------+---+



In [12]:
# Keep only the rows whose name is exactly "Alice"
df.where(df["name"] == "Alice").show()

+---+-----+---+
| id| name|age|
+---+-----+---+
|  1|Alice| 25|
+---+-----+---+



In [14]:
# Remove duplicate rows; the sample data has none, so all three rows remain
df.distinct().show()

+---+---------+---+
| id|     name|age|
+---+---------+---+
|  1|    Alice| 25|
|  2|      Bob| 30|
|  3|Catherine| 35|
+---+---------+---+



# Sorting and Ordering

In [15]:
# Sort by age, youngest first
df.orderBy("age", ascending=True).show()

+---+---------+---+
| id|     name|age|
+---+---------+---+
|  1|    Alice| 25|
|  2|      Bob| 30|
|  3|Catherine| 35|
+---+---------+---+



In [16]:
# Sort by age, oldest first
df.orderBy("age", ascending=False).show()

+---+---------+---+
| id|     name|age|
+---+---------+---+
|  3|Catherine| 35|
|  2|      Bob| 30|
|  1|    Alice| 25|
+---+---------+---+



# Adding and Dropping Columns

In [19]:
# Add a derived column holding each person's age plus ten.
# `df` is rebound, so from here on it carries the extra column.
df = df.withColumn("new_age", df["age"] + 10)
df.show()

+---+---------+---+-------+
| id|     name|age|new_age|
+---+---------+---+-------+
|  1|    Alice| 25|     35|
|  2|      Bob| 30|     40|
|  3|Catherine| 35|     45|
+---+---------+---+-------+



In [20]:
# Remove the derived `new_age` column again. `df` is rebound, so it is back
# to the id/name/age columns for the cells that follow.
df = df.drop('new_age')
df.show()

+---+---------+---+
| id|     name|age|
+---+---------+---+
|  1|    Alice| 25|
|  2|      Bob| 30|
|  3|Catherine| 35|
+---+---------+---+



# Aggregation and Grouping

In [21]:
# Count rows per distinct name; every name occurs once in the sample data
df.groupBy('name').count().show()

+---------+-----+
|     name|count|
+---------+-----+
|    Alice|    1|
|      Bob|    1|
|Catherine|    1|
+---------+-----+



In [22]:
# Aggregate over the whole frame (no grouping): the dict maps a column name
# to the aggregate to apply, here the average of `age`
df.agg({'age':'avg'}).show()

+--------+
|avg(age)|
+--------+
|    30.0|
+--------+



# Joins

In [24]:
# Second data set: one (id, country) tuple per person, keyed by the same ids
data2 = [
    (1, "vietnam"),
    (2, "uk"),
    (3, "usa"),
]

# Matching schema, built field by field; both columns are nullable
schema2 = (
    StructType()
    .add("id", IntegerType(), True)
    .add("country", StringType(), True)
)

In [25]:
# Build the country DataFrame from the rows and schema defined above
df2 = spark.createDataFrame(data=data2, schema=schema2)

In [26]:
# Display the country frame that will be joined below
df2.show()

+---+-------+
| id|country|
+---+-------+
|  1|vietnam|
|  2|     uk|
|  3|    usa|
+---+-------+



In [27]:
# Display the people frame again for comparison before joining
df.show()

+---+---------+---+
| id|     name|age|
+---+---------+---+
|  1|    Alice| 25|
|  2|      Bob| 30|
|  3|Catherine| 35|
+---+---------+---+



In [29]:
# Inner join of people and countries on the shared "id" column; joining by
# column name keeps a single `id` column in the result
final_df = df.join(df2, on="id", how="inner")
final_df.show()

+---+---------+---+-------+
| id|     name|age|country|
+---+---------+---+-------+
|  1|    Alice| 25|vietnam|
|  2|      Bob| 30|     uk|
|  3|Catherine| 35|    usa|
+---+---------+---+-------+

