<a href="https://colab.research.google.com/github/nitiksha/PySpark_code_practice/blob/main/dataframe_functions.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
from pyspark.sql import SparkSession
import pyspark.sql.functions as F

# Start (or reuse) the Spark session that every cell in this notebook shares.
spark = (
    SparkSession.builder
    .appName("DfFunctionsPractice")
    .getOrCreate()
)

# Column names, in the same order as the fields of each row tuple below.
columns = ["id", "name", "score", "bonus"]

# Practice rows: (id, name, score, bonus). Note that id 1 / "Alice" appears twice.
data = [
    (1, "Alice", 85.5, 10),
    (2, "Bob", 90.0, 20),
    (1, "Alice", 78.0, 15),
    (4, "Charlie", 92.5, 25),
]

# Build the DataFrame used by the rest of the notebook and preview it.
df = spark.createDataFrame(data, columns)
df.show()

# The cells below use df to practice common DataFrame functions
# (conditional columns, filters, casts, and aggregations).


+---+-------+-----+-----+
| id|   name|score|bonus|
+---+-------+-----+-----+
|  1|  Alice| 85.5|   10|
|  2|    Bob| 90.0|   20|
|  1|  Alice| 78.0|   15|
|  4|Charlie| 92.5|   25|
+---+-------+-----+-----+



In [30]:
# Label each row "Yes" when its id is greater than 2, otherwise "No".
id_above_two = F.col("id") > 2
flag_col = F.when(id_above_two, "Yes").otherwise("No")
df.withColumn("flag", flag_col).show()

+---+-------+-----+-----+----+
| id|   name|score|bonus|flag|
+---+-------+-----+-----+----+
|  1|  Alice| 85.5|   10|  No|
|  2|    Bob| 90.0|   20|  No|
|  1|  Alice| 78.0|   15|  No|
|  4|Charlie| 92.5|   25| Yes|
+---+-------+-----+-----+----+



In [31]:
# Bucket rows by score: 90 and above versus everything else.
df.withColumn(
    "high_score",
    F.when(F.col("score") >= 90, "90-percentile").otherwise("low-percentile"),
).show()

+---+-------+-----+-----+--------------+
| id|   name|score|bonus|    high_score|
+---+-------+-----+-----+--------------+
|  1|  Alice| 85.5|   10|low-percentile|
|  2|    Bob| 90.0|   20| 90-percentile|
|  1|  Alice| 78.0|   15|low-percentile|
|  4|Charlie| 92.5|   25| 90-percentile|
+---+-------+-----+-----+--------------+



In [32]:
# Split the rows by whether score is missing. The sample data has no null
# scores, so the first table is empty and the second contains every row.
score_col = F.col("score")
df.filter(score_col.isNull()).show()
df.filter(score_col.isNotNull()).show()


+---+----+-----+-----+
| id|name|score|bonus|
+---+----+-----+-----+
+---+----+-----+-----+

+---+-------+-----+-----+
| id|   name|score|bonus|
+---+-------+-----+-----+
|  1|  Alice| 85.5|   10|
|  2|    Bob| 90.0|   20|
|  1|  Alice| 78.0|   15|
|  4|Charlie| 92.5|   25|
+---+-------+-----+-----+



In [33]:
name_col = F.col("name")

# Range filter: bonus between 10 and 20, both ends included.
df.filter(F.col("bonus").between(10, 20)).show()
# SQL LIKE pattern: names that start with "Al".
df.filter(name_col.like('Al%')).show()
# Regex match, case-sensitive: names containing a lowercase "c".
df.filter(name_col.rlike('c')).show()
# Same idea with the (?i) flag: matches "c" or "C", so Charlie is included too.
df.filter(name_col.rlike('(?i)C')).show()

+---+-----+-----+-----+
| id| name|score|bonus|
+---+-----+-----+-----+
|  1|Alice| 85.5|   10|
|  2|  Bob| 90.0|   20|
|  1|Alice| 78.0|   15|
+---+-----+-----+-----+

+---+-----+-----+-----+
| id| name|score|bonus|
+---+-----+-----+-----+
|  1|Alice| 85.5|   10|
|  1|Alice| 78.0|   15|
+---+-----+-----+-----+

+---+-----+-----+-----+
| id| name|score|bonus|
+---+-----+-----+-----+
|  1|Alice| 85.5|   10|
|  1|Alice| 78.0|   15|
+---+-----+-----+-----+

+---+-------+-----+-----+
| id|   name|score|bonus|
+---+-------+-----+-----+
|  1|  Alice| 85.5|   10|
|  1|  Alice| 78.0|   15|
|  4|Charlie| 92.5|   25|
+---+-------+-----+-----+



In [34]:
# Leave df untouched; df1 adds id_str, a copy of the id column cast to string.
id_as_string = F.col("id").cast("string")
df1 = df.withColumn("id_str", id_as_string)

In [35]:
# Print df1's schema to confirm the cast: id_str should be listed as a string column.
df1.printSchema()


root
 |-- id: long (nullable = true)
 |-- name: string (nullable = true)
 |-- score: double (nullable = true)
 |-- bonus: long (nullable = true)
 |-- id_str: string (nullable = true)



In [36]:
# Count the id values across the whole DataFrame (a single-row result).
id_count = F.count("id")
df.select(id_count).show()

+---------+
|count(id)|
+---------+
|        4|
+---------+



In [39]:
# Compare the sum over distinct id values with the plain sum over all rows.
df.select(
    F.sum_distinct("id"),
    F.sum("id"),
).show()

+----------------+-------+
|sum(DISTINCT id)|sum(id)|
+----------------+-------+
|               7|      8|
+----------------+-------+



In [37]:
# Exact versus approximate number of distinct id values.
# Uses the snake_case function names, matching F.sum_distinct used elsewhere in
# this notebook: countDistinct is only an alias of count_distinct, and
# approxCountDistinct is deprecated in favour of approx_count_distinct.
# The resulting column names are unchanged.
df.select(
    F.count_distinct("id"),
    F.approx_count_distinct("id"),
).show()

+------------------+-------------------------+
|count(DISTINCT id)|approx_count_distinct(id)|
+------------------+-------------------------+
|                 3|                        3|
+------------------+-------------------------+



In [4]:
# Whole-table summary of the id column in a single row.
# NOTE(review): first/last follow whatever row order Spark happens to produce;
# per the PySpark docs they are non-deterministic without an explicit ordering.
id_summary = [
    F.min("id"),
    F.max("id"),
    F.avg("id"),
    F.first("id"),
    F.last("id"),
]
df.select(*id_summary).show()

+-------+-------+-------+---------+--------+
|min(id)|max(id)|avg(id)|first(id)|last(id)|
+-------+-------+-------+---------+--------+
|      1|      4|    2.0|        1|       4|
+-------+-------+-------+---------+--------+



In [8]:
# For each name, gather every id into a list (duplicates are kept).
ids_per_name = F.collect_list("id")
df.groupBy("name").agg(ids_per_name).show()

+-------+----------------+
|   name|collect_list(id)|
+-------+----------------+
|    Bob|             [2]|
|  Alice|          [1, 1]|
|Charlie|             [4]|
+-------+----------------+



In [9]:
# For each name, gather the ids into a set, so repeated ids appear only once.
(
    df.groupBy("name")
    .agg(F.collect_set("id"))
    .show()
)

+-------+---------------+
|   name|collect_set(id)|
+-------+---------------+
|    Bob|            [2]|
|  Alice|            [1]|
|Charlie|            [4]|
+-------+---------------+

