In [None]:
!pip install pandas
!pip install pyspark
!pip install pyarrow

In [None]:
from pyspark.sql import SparkSession
 
# Create the SparkSession for this notebook (or reuse one that already exists)
# under the application name "pandas to spark".
spark = (
    SparkSession.builder
    .appName("pandas to spark")
    .getOrCreate()
)

# Turn on the Arrow setting for PySpark on this session.
spark.conf.set("spark.sql.execution.arrow.pyspark.enabled", "true")

In [3]:
import pandas as pd

# Sample rows, one per price change: [product_id, new_price, change_date].
data = [
    [1, 20, '2019-08-14'],
    [2, 50, '2019-08-14'],
    [1, 30, '2019-08-15'],
    [1, 35, '2019-08-16'],
    [2, 65, '2019-08-17'],
    [3, 20, '2019-08-18'],
]

# Build the pandas frame and fix the column dtypes up front:
# nullable integers for the id/price columns and a datetime for the date column.
products = (
    pd.DataFrame(data, columns=['product_id', 'new_price', 'change_date'])
    .astype({
        'product_id': 'Int64',
        'new_price': 'Int64',
        'change_date': 'datetime64[ns]',
    })
)

In [4]:
# Convert the pandas frame into a Spark DataFrame; no explicit schema is passed.
# NOTE: this rebinds `products` from a pandas to a Spark DataFrame, so re-running
# this cell on its own fails; re-run the cell that builds the pandas frame first.
products = spark.createDataFrame(data=products)

# Print the converted rows.
products.show()

+----------+---------+-------------------+
|product_id|new_price|        change_date|
+----------+---------+-------------------+
|         1|       20|2019-08-14 00:00:00|
|         2|       50|2019-08-14 00:00:00|
|         1|       30|2019-08-15 00:00:00|
|         1|       35|2019-08-16 00:00:00|
|         2|       65|2019-08-17 00:00:00|
|         3|       20|2019-08-18 00:00:00|
+----------+---------+-------------------+



In [28]:
from pyspark.sql.functions import max, lit

# Report each product's price as of CUTOFF_DATE. Products with no price change
# on or before that date fall back to DEFAULT_PRICE.
CUTOFF_DATE = '2019-08-16'
DEFAULT_PRICE = 10

# Price changes that happened on or before the cutoff date.
changes_to_date = products.where(f"change_date <= '{CUTOFF_DATE}'")

# For every product, take its most recent change up to the cutoff and join back
# to `products` to recover the price that was set on that date.
# NOTE: `max` is pyspark.sql.functions.max (imported above), not the builtin.
latest_prices = (
    changes_to_date
    .groupBy('product_id')
    .agg(max('change_date').alias('change_date'))
    .join(products, ['product_id', 'change_date'], 'inner')
    .withColumnRenamed('new_price', 'price')
    .select('product_id', 'price')
)

# Products without any change up to the cutoff get the default price.
# A left-anti join keeps the work inside Spark and avoids rendering a Python
# tuple into a SQL `NOT IN (...)` list, which is invalid SQL for one id (`(1,)`)
# or for no ids (`()`). `distinct()` guarantees one output row per product even
# when a product has several changes after the cutoff.
default_prices = (
    products
    .select('product_id')
    .distinct()
    .join(changes_to_date, 'product_id', 'left_anti')
    .withColumn('price', lit(DEFAULT_PRICE))
)

latest_prices.union(default_prices).show()


+----------+-----+
|product_id|price|
+----------+-----+
|         2|   50|
|         1|   35|
|         3|   10|
+----------+-----+

