In [69]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
# NOTE: the wildcard import below replaces Python builtins such as sum/min/max
# with their Spark column counterparts; prefer the explicit F.<name> form.
from pyspark.sql.functions import *
from pyspark.sql.window import Window

# Obtain the session used by every cell below (an existing one is reused if present).
spark = (
    SparkSession.builder
    .appName("PySpark-Practice")
    .getOrCreate()
)

In [3]:
# Build a two-column frame of (Name, Age) rows and print it.
data = [
    ("John", 28),
    ("Jane", 33),
    ("Mike", 45),
]
cols = ["Name", "Age"]
df = spark.createDataFrame(data, schema=cols)
df.show()

+----+---+
|Name|Age|
+----+---+
|John| 28|
|Jane| 33|
|Mike| 45|
+----+---+



In [5]:
# Project only the Name column.
df.select(df.Name).show()

+----+
|Name|
+----+
|John|
|Jane|
|Mike|
+----+



In [6]:
# Employee rows: (id, name, salary).
cols = ['id', 'name', 'salary']
data = [
    (1, 'Alice', 3000),
    (2, 'Bob', 1500),
    (3, 'Carol', 4000),
]
df = spark.createDataFrame(data, schema=cols)

In [10]:
# Keep only rows whose salary is strictly greater than 3000.
df.where(df["salary"] > 3000).show()

+---+-----+------+
| id| name|salary|
+---+-----+------+
|  3|Carol|  4000|
+---+-----+------+



In [11]:
# Append a constant "Y" column named Flag to every row.
df.select("*", lit("Y").alias("Flag")).show()

+---+-----+------+----+
| id| name|salary|Flag|
+---+-----+------+----+
|  1|Alice|  3000|   Y|
|  2|  Bob|  1500|   Y|
|  3|Carol|  4000|   Y|
+---+-----+------+----+



In [12]:
# Per-country counts: (country, count).
cols = ['country', 'count']
data = [
    ('US', 100),
    ('IN', 200),
]
df = spark.createDataFrame(data, schema=cols)

In [13]:
# Show the frame with the "count" column renamed to "total".
df.withColumnRenamed(existing="count", new="total").show()

+-------+-----+
|country|total|
+-------+-----+
|     US|  100|
|     IN|  200|
+-------+-----+



In [26]:
# The ('a', 1) row appears twice; dropDuplicates() keeps a single copy.
data = [
    ('a', 1),
    ('b', 2),
    ('a', 1),
]
cols = ['k', 'v']
df = spark.createDataFrame(data, schema=cols)
df.dropDuplicates().show()

+---+---+
|  k|  v|
+---+---+
|  a|  1|
|  b|  2|
+---+---+



In [27]:
# Single-column frame; the row count is the cell's displayed result.
cols = ['num']
data = [(1,), (2,), (3,)]
df = spark.createDataFrame(data, schema=cols)
df.count()

3

In [31]:
# Show the raw values first, then the de-duplicated set of values.
cols = ['val']
data = [('x',), ('y',), ('x',)]
df = spark.createDataFrame(data, schema=cols)
df.show()
df.select(df.val).distinct().show()

+---+
|val|
+---+
|  x|
|  y|
|  x|
+---+

+---+
|val|
+---+
|  x|
|  y|
+---+



In [33]:
# Total quantity per item.
# The aggregate is referenced as F.sum so this cell does not depend on the
# wildcard import having replaced Python's builtin sum(), which would raise
# on a Column if the builtin were the one in scope.
data = [('p', 10), ('q', 20), ('p', 5)]
cols = ['item', 'qty']
df = spark.createDataFrame(data, cols)
df.groupBy("item").agg(F.sum("qty").alias("qty_sum")).show()

+----+-------+
|item|qty_sum|
+----+-------+
|   p|     15|
|   q|     20|
+----+-------+



In [40]:
# Small key/value frame.
cols = ['k', 'v']
data = [
    ('a', 1),
    ('b', 2),
]
df = spark.createDataFrame(data, schema=cols)

In [41]:
# Convert the Spark frame to a pandas DataFrame; being the cell's last
# expression, the result is what the notebook renders.
df.toPandas()

Unnamed: 0,k,v
0,a,1
1,b,2


In [43]:
# Sort rows by score, highest first.
cols = ['id', 'score']
data = [
    ('p', 3),
    ('q', 1),
    ('r', 2),
]
df = spark.createDataFrame(data, schema=cols)
df.sort(df.score.desc()).show()

+---+-----+
| id|score|
+---+-----+
|  p|    3|
|  r|    2|
|  q|    1|
+---+-----+



In [44]:
# Two small frames that share an "id" column.
Lcols = ['id', 'name']
Rcols = ['id', 'score']
left = [(1, 'Alice'), (2, 'Bob')]
right = [(1, 100), (3, 300)]
left_df = spark.createDataFrame(left, schema=Lcols)
right_df = spark.createDataFrame(right, schema=Rcols)

In [46]:
# Inner join: only ids present in both frames are kept.
left_df.join(right_df, on="id", how='inner').show()

+---+-----+-----+
| id| name|score|
+---+-----+-----+
|  1|Alice|  100|
+---+-----+-----+



In [47]:
# Employees and a salary table that covers only some of them.
employees = [(1, 'A'), (2, 'B')]
salaries = [(1, 5000)]
emps = spark.createDataFrame(employees, schema=['id', 'name'])
sal = spark.createDataFrame(salaries, schema=['id', 'salary'])

In [49]:
# Left join keeps every employee; a missing salary is replaced with 3000.
(
    emps.join(sal, on="id", how="left")
    .fillna(3000, subset=["salary"])
    .show()
)

+---+----+------+
| id|name|salary|
+---+----+------+
|  1|   A|  5000|
|  2|   B|  3000|
+---+----+------+



In [50]:
# Each row carries an id and a list of tags.
cols = ['id', 'tags']
data = [
    (1, ['a', 'b']),
    (2, ['c']),
]
df = spark.createDataFrame(data, schema=cols)

In [52]:
# Emit one output row per element of the tags array.
df.select(df.id, explode(df.tags).alias("expd_tag")).show()

+---+--------+
| id|expd_tag|
+---+--------+
|  1|       a|
|  1|       b|
|  2|       c|
+---+--------+



In [58]:
# Pipe-delimited "first|last|age" strings in a single "raw" column.
# The frame is built from data/cols so the rows and schema are defined in one
# place instead of being repeated as literals in the createDataFrame call.
data = [('John|Doe|30',), ('Jane|Smith|25',)]
cols = ['raw']
df = spark.createDataFrame(data, cols)

In [59]:
# Print the frame's rows as a text table.
df.show()

+-------------+
|          raw|
+-------------+
|  John|Doe|30|
|Jane|Smith|25|
+-------------+



In [66]:
# Split the pipe-delimited string and expose each position as its own column.
raw_parts = split(df['raw'], '\\|')
df2 = df.select(
    "*",
    raw_parts.getItem(0).alias("first"),
    raw_parts.getItem(1).alias("last"),
    raw_parts.getItem(2).alias("age"),
)

df2.show()


+-------------+-----+-----+---+
|          raw|first| last|age|
+-------------+-----+-----+---+
|  John|Doe|30| John|  Doe| 30|
|Jane|Smith|25| Jane|Smith| 25|
+-------------+-----+-----+---+



In [67]:
# Values tagged with a group label.
cols = ['grp', 'val']
data = [
    ('A', 100),
    ('A', 50),
    ('B', 200),
]
df = spark.createDataFrame(data, schema=cols)

In [71]:
# Number the rows inside each grp, ordered by val from highest to lowest.
# The spec is called window_spec rather than window so that it does not
# overwrite the window() function brought in by the wildcard import of
# pyspark.sql.functions.
window_spec = Window.partitionBy("grp").orderBy(col("val").desc())
df.withColumn("row_number", row_number().over(window_spec)).show()

+---+---+----------+
|grp|val|row_number|
+---+---+----------+
|  A|100|         1|
|  A| 50|         2|
|  B|200|         1|
+---+---+----------+



In [72]:
# Item lookup rows and per-sale quantities, both keyed by id.
items = [(1, 'apple'), (2, 'banana')]
sales = [(1, 5), (2, 3), (1, 2)]
items_df = spark.createDataFrame(items, schema=['id', 'item'])
sales_df = spark.createDataFrame(sales, schema=['id', 'qty'])

In [73]:
# Inner join on id with sales_df marked as the broadcast side.
items_df.join(
    broadcast(sales_df),
    on="id",
    how="inner",
).show()

+---+------+---+
| id|  item|qty|
+---+------+---+
|  1| apple|  2|
|  1| apple|  5|
|  2|banana|  3|
+---+------+---+



In [74]:
# Gather all v values for each key k into a list column.
cols = ['k', 'v']
data = [
    ('x', 1),
    ('x', 2),
    ('y', 3),
]
df = spark.createDataFrame(data, schema=cols)
df.groupBy("k").agg(collect_list("v").alias("coll_list")).show()

+---+---------+
|  k|coll_list|
+---+---------+
|  x|   [1, 2]|
|  y|      [3]|
+---+---------+



In [75]:
# Pivot categories into columns, summing amt per (month, category).
# F.sum is used explicitly so the aggregation does not rely on the wildcard
# import having replaced Python's builtin sum().
data = [('2024-01', 'A', 10), ('2024-01', 'B', 5), ('2024-02', 'A', 8)]
cols = ['month', 'category', 'amt']
df = spark.createDataFrame(data, cols)
df.groupBy("month").pivot("category").agg(F.sum("amt")).show()

+-------+---+----+
|  month|  A|   B|
+-------+---+----+
|2024-02|  8|NULL|
|2024-01| 10|   5|
+-------+---+----+



In [76]:
# Parse a JSON string column into one column per JSON field.
cols = ['json_str']
data = [('{"a":1,"b":2}',)]
df = spark.createDataFrame(data, schema=cols)
# The schema is inferred from the JSON text held in the first row.
schema = schema_of_json(df.first()["json_str"])
df2 = (
    df.select(from_json(col('json_str'), schema).alias('j'))
    .select('j.*')
)
df2.show()

+---+---+
|  a|  b|
+---+---+
|  1|  2|
+---+---+



In [77]:
# Stack two frames that share the same columns.
# unionByName matches columns by name, whereas union matches them purely by
# position and would silently misalign data if the column order ever differed.
a = [(1, 'x'), (2, 'y')]
b = [(3, 'z')]
cols = ['id', 'k']
df1 = spark.createDataFrame(a, cols)
df2 = spark.createDataFrame(b, cols)
df1.unionByName(df2).show()

+---+---+
| id|  k|
+---+---+
|  1|  x|
|  2|  y|
|  3|  z|
+---+---+

