In [None]:
!pip install pandas
!pip install pyspark
!pip install pyarrow

In [None]:
from pyspark.sql import SparkSession
 
# Create (or reuse, if one is already running) the SparkSession for this
# notebook, registered under the application name "pandas to spark".
spark = (
    SparkSession.builder
    .appName("pandas to spark")
    .getOrCreate()
)

# Switch on Spark's Arrow setting for PySpark on this session; it applies to
# conversions between pandas and Spark DataFrames.
spark.conf.set("spark.sql.execution.arrow.pyspark.enabled", "true")

In [3]:
import pandas as pd

# Sample delivery records, one row per order, in the column order
# [delivery_id, customer_id, order_date, customer_pref_delivery_date].
data = [
    [1, 1, '2019-08-01', '2019-08-02'],
    [2, 2, '2019-08-02', '2019-08-02'],
    [3, 1, '2019-08-11', '2019-08-12'],
    [4, 3, '2019-08-24', '2019-08-24'],
    [5, 3, '2019-08-21', '2019-08-22'],
    [6, 2, '2019-08-11', '2019-08-13'],
    [7, 4, '2019-08-09', '2019-08-09'],
]

# Column name -> pandas dtype, listed in column order. The ids use the
# nullable Int64 dtype; both date columns are converted from strings to
# datetime64[ns].
delivery_dtypes = {
    'delivery_id': 'Int64',
    'customer_id': 'Int64',
    'order_date': 'datetime64[ns]',
    'customer_pref_delivery_date': 'datetime64[ns]',
}

# The mapping's keys double as the column labels, so names and dtypes
# cannot drift apart.
delivery = pd.DataFrame(data, columns=list(delivery_dtypes)).astype(delivery_dtypes)

In [4]:
# Convert the pandas frame into a Spark DataFrame and preview it.
# The name `delivery` is rebound from the pandas frame to the Spark one, so
# the conversion is guarded: on a second run of this cell `delivery` is
# already a Spark DataFrame, and handing that back to createDataFrame fails.
if isinstance(delivery, pd.DataFrame):
    delivery = spark.createDataFrame(delivery)
delivery.show()

+-----------+-----------+-------------------+---------------------------+
|delivery_id|customer_id|         order_date|customer_pref_delivery_date|
+-----------+-----------+-------------------+---------------------------+
|          1|          1|2019-08-01 00:00:00|        2019-08-02 00:00:00|
|          2|          2|2019-08-02 00:00:00|        2019-08-02 00:00:00|
|          3|          1|2019-08-11 00:00:00|        2019-08-12 00:00:00|
|          4|          3|2019-08-24 00:00:00|        2019-08-24 00:00:00|
|          5|          3|2019-08-21 00:00:00|        2019-08-22 00:00:00|
|          6|          2|2019-08-11 00:00:00|        2019-08-13 00:00:00|
|          7|          4|2019-08-09 00:00:00|        2019-08-09 00:00:00|
+-----------+-----------+-------------------+---------------------------+



In [10]:
from pyspark.sql.functions import when, count, sum, round, col, row_number
from pyspark.sql import Window

# Percentage of customers whose first order is "immediate", i.e. whose
# order_date equals customer_pref_delivery_date, rounded to 2 decimals.
#
# NOTE: `sum` and `round` used below are the pyspark.sql.functions versions
# imported in this cell; they shadow the Python built-ins of the same name.

# Ranks each customer's orders from earliest to latest. The partition key is
# not repeated in the ordering (it is constant inside a partition), and
# delivery_id breaks ties so that the row ranked first is deterministic when
# a customer has several orders on the same date.
first_order_window = (
    Window
    .partitionBy('customer_id')
    .orderBy('order_date', 'delivery_id')
)

# 1 when the order is immediate, 0 otherwise.
is_immediate = when(
    col('order_date') == col('customer_pref_delivery_date'), 1
).otherwise(0)

(
    delivery
    .withColumn('state', is_immediate)
    .withColumn('row_number', row_number().over(first_order_window))
    # Keep only each customer's first order.
    .where('row_number == 1')
    # Every remaining row has row_number == 1, so a global aggregate replaces
    # the former group-by on that constant column. With an empty input this
    # yields a single row holding null rather than no rows at all.
    .agg(sum('state').alias('no_of_immediate_on_first'),
         count('state').alias('total_no_of_first_orders'))
    .withColumn('immediate_percentage',
                round(col('no_of_immediate_on_first') / col('total_no_of_first_orders') * 100, 2))
    .select('immediate_percentage')
    .show()
)

+--------------------+
|immediate_percentage|
+--------------------+
|                50.0|
+--------------------+

