In [None]:
1. Remove duplicates based on a key column

In [None]:
winspec = Window.partitionBy("customer_id") 
df2 = df.withColumn("rn",row_number().over(winspec))
corrected_df = df2.where(F.col("rn") == 1) 

## Explode a nested column

In [1]:
data = {
  "user": 1,
  "events": [
    {"type": "click", "ts": 100},
    {"type": "purchase", "ts": 200}
  ]
}
print(data)

{'user': 1, 'events': [{'type': 'click', 'ts': 100}, {'type': 'purchase', 'ts': 200}]}


### When data is read from a dictionary to create a dataframe, it is advisable to create a pandas dataframe from the dictionary and then create a spark dataframe out of pandas data frame.

In [None]:
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, ArrayType

schema = StructType([
    StructField("user", IntegerType(), True),
    StructField(
        "events",
        ArrayType(
            StructType([
                StructField("type", StringType(), True),
                StructField("ts", IntegerType(), True)
            ])
        ),
        True
    )
])


In [5]:
from pyspark.sql import SparkSession 
spark = SparkSession.builder.appName("explode").getOrCreate()

25/11/20 21:09:07 WARN Utils: Your hostname, user-HP-Pavilion-x360-Convertible-14-dh0xxx resolves to a loopback address: 127.0.1.1; using 192.168.1.24 instead (on interface wlo1)
25/11/20 21:09:07 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/11/20 21:09:07 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [None]:

df = spark.createDataFrame(data, schema)
df.show()

In [2]:
import pandas as pd 


In [3]:
pdf = pd.DataFrame(data)
pdf

Unnamed: 0,user,events
0,1,"{'type': 'click', 'ts': 100}"
1,1,"{'type': 'purchase', 'ts': 200}"


In [13]:
df = spark.createDataFrame(pdf) 
df.show(truncate=False)

+----+-----------------------------+
|user|events                       |
+----+-----------------------------+
|1   |{type -> click, ts -> 100}   |
|1   |{type -> purchase, ts -> 200}|
+----+-----------------------------+



In [15]:
from pyspark.sql.functions import explode
df2 = df.select("user", explode("events").alias("type","ts")) 
df2.show()

+----+----+--------+
|user|type|      ts|
+----+----+--------+
|   1|type|   click|
|   1|  ts|     100|
|   1|type|purchase|
|   1|  ts|     200|
+----+----+--------+

