<a href="https://colab.research.google.com/github/nitiksha/PySpark_code_practice/blob/main/string_func.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [36]:
from pyspark.sql import SparkSession
from pyspark.sql.window import Window
import pyspark.sql.functions as F


# Start (or reuse) the Spark session that the rest of the notebook relies on.
spark = (
    SparkSession.builder
    .appName("WindowFunctionPractice")
    .getOrCreate()
)

# Tiny two-column sample: four (id, name) rows, with "Alice" under ids 1 and 3.
columns = ["id", "name"]
data = [(1, "Alice"), (2, "Bob"), (3, "Alice"), (4, "Charlie")]

df = spark.createDataFrame(data, columns)
df.show()

# A single un-partitioned window ordered by id; reused by the later window-function cells.
windowSpec = Window.orderBy('id')

# The ids are all distinct, so rank, dense_rank and row_number produce the
# same 1..4 sequence here (there are no ties in the ordering column).
df.withColumns(
    {
        "rank": F.rank().over(windowSpec),
        "dense_rank": F.dense_rank().over(windowSpec),
        "row_number": F.row_number().over(windowSpec),
    }
).show()




+---+-------+
| id|   name|
+---+-------+
|  1|  Alice|
|  2|    Bob|
|  3|  Alice|
|  4|Charlie|
+---+-------+

+---+-------+----+----------+----------+
| id|   name|rank|dense_rank|row_number|
+---+-------+----+----------+----------+
|  1|  Alice|   1|         1|         1|
|  2|    Bob|   2|         2|         2|
|  3|  Alice|   3|         3|         3|
|  4|Charlie|   4|         4|         4|
+---+-------+----+----------+----------+



In [9]:
# Split the id-ordered rows into two buckets; the same ntile(2) result is
# shown twice, once under each output column name.
for ntile_col_name in ('buckets', 'ntile'):
    df.withColumn(ntile_col_name, F.ntile(2).over(windowSpec)).show()

+---+-------+-------+
| id|   name|buckets|
+---+-------+-------+
|  1|  Alice|      1|
|  2|    Bob|      1|
|  3|  Alice|      2|
|  4|Charlie|      2|
+---+-------+-------+

+---+-------+-----+
| id|   name|ntile|
+---+-------+-----+
|  1|  Alice|    1|
|  2|    Bob|    1|
|  3|  Alice|    2|
|  4|Charlie|    2|
+---+-------+-----+



In [11]:
# lead/lag read the id of the next/previous row in window order; the last and
# first rows have no such neighbour and therefore show NULL.
df.withColumns(
    {
        "lead": F.lead("id").over(windowSpec),
        "lag": F.lag("id").over(windowSpec),
        "percent_rank": F.percent_rank().over(windowSpec),
    }
).show()



+---+-------+----+----+------------------+
| id|   name|lead| lag|      percent_rank|
+---+-------+----+----+------------------+
|  1|  Alice|   2|NULL|               0.0|
|  2|    Bob|   3|   1|0.3333333333333333|
|  3|  Alice|   4|   2|0.6666666666666666|
|  4|Charlie|NULL|   3|               1.0|
+---+-------+----+----+------------------+



In [22]:
# String helpers applied to every row:
#   full_name    - concat() joins its inputs with no separator
#   full_name_ws - concat_ws() joins its inputs with the given separator (" ")
#   ltrim        - removes leading whitespace only
# NOTE(review): both "full_name" columns are built from `id`, not `name` -
# confirm that is intended.
df.withColumns(
    {
        "full_name": F.concat("id", F.lit("doe")),
        "full_name_ws": F.concat_ws(" ", "id", F.lit("doe")),
        # F.ltrim matches the column label; F.trim would also strip trailing
        # whitespace. The sample names have no padding, so the shown output
        # is the same either way.
        "ltrim": F.ltrim(df["name"]),
    }
).show()

+---+-------+---------+------------+-------+
| id|   name|full_name|full_name_ws|  ltrim|
+---+-------+---------+------------+-------+
|  1|  Alice|     1doe|       1 doe|  Alice|
|  2|    Bob|     2doe|       2 doe|    Bob|
|  3|  Alice|     3doe|       3 doe|  Alice|
|  4|Charlie|     4doe|       4 doe|Charlie|
+---+-------+---------+------------+-------+



In [28]:
# Take three characters of each name starting at position 1 (the first character).
df.withColumn(
    "substr",
    F.substring("name", 1, 3),
).show()

+---+-------+------+
| id|   name|substr|
+---+-------+------+
|  1|  Alice|   Ali|
|  2|    Bob|   Bob|
|  3|  Alice|   Ali|
|  4|Charlie|   Cha|
+---+-------+------+



In [38]:
# Keep the substring result as a separate DataFrame. withColumn returns a new
# frame, so df keeps its original two columns - the two schemas show this.
df1 = df.withColumn("substr", F.substring("name", 1, 3))

for frame in (df1, df):
    frame.printSchema()

root
 |-- id: long (nullable = true)
 |-- name: string (nullable = true)
 |-- substr: string (nullable = true)

root
 |-- id: long (nullable = true)
 |-- name: string (nullable = true)



In [39]:
# Split each 3-character prefix on "i". A value ending in "i" (e.g. "Ali")
# yields a trailing empty string; values without an "i" come back as a
# one-element array.
df1.withColumn(
    "split",
    F.split("substr", "i"),
).show(truncate=False)

+---+-------+------+------+
|id |name   |substr|split |
+---+-------+------+------+
|1  |Alice  |Ali   |[Al, ]|
|2  |Bob    |Bob   |[Bob] |
|3  |Alice  |Ali   |[Al, ]|
|4  |Charlie|Cha   |[Cha] |
+---+-------+------+------+

