In [1]:
from pyspark.sql import SparkSession, functions as F
from pyspark.sql.types import *
from pyspark.sql.window import Window

In [2]:
# Reuse the active SparkSession if there is one; otherwise start a new session
# with the builder's default settings.
spark = SparkSession.builder.getOrCreate()

2023-04-06 00:45:10,212 WARN util.NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


In [15]:
# Read the raw order-status CSV from the local file system. The first line of
# the file is treated as the header; no schema is supplied or inferred here.
df = (
    spark.read
    .format("csv")
    .option("header", True)
    .load("file:///home/train/datasets/order_status.csv/")
)

In [16]:
# Preview: bring only the first five rows to the driver as a pandas DataFrame.
df.limit(5).toPandas()

Unnamed: 0,ORDER_ID,SUBSCRIBER_ID,STATUS_DATE,STATUS_TIME,STATUS
0,100159,200427,20230223,83209,ASSIGNED
1,100159,200427,20230223,74232,RETURNED
2,100159,200427,20230222,95056,CREATED
3,100410,200366,20230223,91017,ASSIGNED
4,100410,200366,20230223,30301,RETURNED


In [17]:
# Print the column names and types of the raw DataFrame.
df.printSchema()

root
 |-- ORDER_ID: string (nullable = true)
 |-- SUBSCRIBER_ID: string (nullable = true)
 |-- STATUS_DATE: string (nullable = true)
 |-- STATUS_TIME: string (nullable = true)
 |-- STATUS: string (nullable = true)



In [18]:
# Collect the distinct values of the STATUS column to the driver.
df.select('STATUS').distinct().collect()

                                                                                

[Row(STATUS='CREATED'),
 Row(STATUS='ASSIGNED'),
 Row(STATUS='RETURNED'),
 Row(STATUS='CANCELLED'),
 Row(STATUS='COMPLETED'),
 Row(STATUS='POOL')]

In [20]:
# STATUS_TIME is stored without leading zeros ("83209" means 08:32:09), so its
# width varies from row to row. Left-pad it to six digits before parsing so the
# fixed-width pattern "HHmmss" always applies; the previous "Hmmss" pattern
# needs at least five digits and could not match a shorter value (for example
# a hypothetical "503" for 00:05:03).
padded_status_time = F.lpad(F.col("STATUS_TIME"), 6, "0")

df_time_format = (
    df
    # TIME2 keeps the raw "<date> <time>" text, unchanged, for inspection.
    .withColumn("TIME2", F.concat(F.col("STATUS_DATE"), F.lit(" "), F.col("STATUS_TIME")))
    # TIME1 is the parsed timestamp built from the zero-padded time.
    .withColumn(
        "TIME1",
        F.to_timestamp(
            F.concat(F.col("STATUS_DATE"), F.lit(" "), padded_status_time),
            "yyyyMMdd HHmmss",
        ),
    )
)

df_time_format.limit(5).toPandas()

Unnamed: 0,ORDER_ID,SUBSCRIBER_ID,STATUS_DATE,STATUS_TIME,STATUS,TIME2,TIME1
0,100159,200427,20230223,83209,ASSIGNED,20230223 83209,2023-02-23 08:32:09
1,100159,200427,20230223,74232,RETURNED,20230223 74232,2023-02-23 07:42:32
2,100159,200427,20230222,95056,CREATED,20230222 95056,2023-02-22 09:50:56
3,100410,200366,20230223,91017,ASSIGNED,20230223 91017,2023-02-23 09:10:17
4,100410,200366,20230223,30301,RETURNED,20230223 30301,2023-02-23 03:03:01


In [21]:
# TIME2 was only the intermediate text used to build TIME1; remove it.
df2 = df_time_format.drop("TIME2")

df2.limit(5).toPandas()

Unnamed: 0,ORDER_ID,SUBSCRIBER_ID,STATUS_DATE,STATUS_TIME,STATUS,TIME1
0,100159,200427,20230223,83209,ASSIGNED,2023-02-23 08:32:09
1,100159,200427,20230223,74232,RETURNED,2023-02-23 07:42:32
2,100159,200427,20230222,95056,CREATED,2023-02-22 09:50:56
3,100410,200366,20230223,91017,ASSIGNED,2023-02-23 09:10:17
4,100410,200366,20230223,30301,RETURNED,2023-02-23 03:03:01


In [22]:
# The same ORDER_ID (for example 100159) appears in more than one row.

# I need to find out how many rows share each ORDER_ID.


In [24]:
# All status rows that belong to one order form a window partition.
order_window = Window.partitionBy("ORDER_ID")

# Several statuses of one order can carry the same timestamp (order 100002 has
# POOL and CREATED both at 09:37:12). Ordering by TIME1 alone leaves the
# row_number() of such tied rows arbitrary, so the "latest" row could change
# between runs. Break ties explicitly: closing statuses first, CREATED last,
# everything else in between, then alphabetically by STATUS.
# NOTE(review): this assumes CREATED is an order's first status and
# COMPLETED/CANCELLED its last -- confirm against the source system.
status_tiebreak = (
    F.when(F.col("STATUS").isin("COMPLETED", "CANCELLED"), 0)
     .when(F.col("STATUS") == "CREATED", 2)
     .otherwise(1)
)
latest_first = order_window.orderBy(F.desc("TIME1"), status_tiebreak, F.col("STATUS"))

df3 = (
    df2
    # Earliest and latest status timestamp of the order, repeated on every row.
    .withColumn("min_time", F.min("TIME1").over(order_window))
    .withColumn("max_time", F.max("TIME1").over(order_window))
    # rep_num = 1 marks the most recent status row of each order.
    .withColumn("rep_num", F.row_number().over(latest_first))
    .orderBy("ORDER_ID", "rep_num")
)

df3.limit(5).toPandas()

                                                                                

Unnamed: 0,ORDER_ID,SUBSCRIBER_ID,STATUS_DATE,STATUS_TIME,STATUS,TIME1,min_time,max_time,rep_num
0,100001,200574,20230223,81438,ASSIGNED,2023-02-23 08:14:38,2023-02-22 09:53:06,2023-02-23 08:14:38,1
1,100001,200574,20230223,74734,RETURNED,2023-02-23 07:47:34,2023-02-22 09:53:06,2023-02-23 08:14:38,2
2,100001,200574,20230222,95306,CREATED,2023-02-22 09:53:06,2023-02-22 09:53:06,2023-02-23 08:14:38,3
3,100002,200121,20230222,93712,POOL,2023-02-22 09:37:12,2023-02-22 09:37:12,2023-02-22 09:37:12,1
4,100002,200121,20230222,93712,CREATED,2023-02-22 09:37:12,2023-02-22 09:37:12,2023-02-22 09:37:12,2


In [25]:
# START_DATE: use min_time for every ORDER_ID.

# END_DATE: use max_time only when the status is "COMPLETED" or "CANCELLED".

# Keep only the row with rep_num = 1 (the latest status row of each order).

In [26]:
# True for rows whose status is one of the two closing statuses.
is_closed = F.col("STATUS").isin("COMPLETED", "CANCELLED")

df4 = (
    df3
    # Every order starts at its earliest status timestamp.
    .withColumn("START_DATE", F.col("min_time"))
    # END_DATE stays null unless the row's status is a closing one.
    .withColumn("END_DATE", F.when(is_closed, F.col("max_time")))
    # Keep one row per order: the one ranked first by descending TIME1.
    .where("rep_num = 1")
)


df4.limit(20).toPandas()

                                                                                

Unnamed: 0,ORDER_ID,SUBSCRIBER_ID,STATUS_DATE,STATUS_TIME,STATUS,TIME1,min_time,max_time,rep_num,START_DATE,END_DATE
0,100001,200574,20230223,81438,ASSIGNED,2023-02-23 08:14:38,2023-02-22 09:53:06,2023-02-23 08:14:38,1,2023-02-22 09:53:06,NaT
1,100002,200121,20230222,93712,POOL,2023-02-22 09:37:12,2023-02-22 09:37:12,2023-02-22 09:37:12,1,2023-02-22 09:37:12,NaT
2,100003,200432,20230223,82249,ASSIGNED,2023-02-23 08:22:49,2023-02-22 09:40:01,2023-02-23 08:22:49,1,2023-02-22 09:40:01,NaT
3,100004,200234,20230222,104426,COMPLETED,2023-02-22 10:44:26,2023-02-22 09:37:58,2023-02-22 10:44:26,1,2023-02-22 09:37:58,2023-02-22 10:44:26
4,100005,200546,20230222,94727,POOL,2023-02-22 09:47:27,2023-02-22 09:47:27,2023-02-22 09:47:27,1,2023-02-22 09:47:27,NaT
5,100006,200369,20230223,91333,ASSIGNED,2023-02-23 09:13:33,2023-02-22 10:54:13,2023-02-23 09:13:33,1,2023-02-22 10:54:13,NaT
6,100007,200486,20230223,91359,ASSIGNED,2023-02-23 09:13:59,2023-02-22 10:54:48,2023-02-23 09:13:59,1,2023-02-22 10:54:48,NaT
7,100008,200190,20230223,83004,ASSIGNED,2023-02-23 08:30:04,2023-02-22 09:43:31,2023-02-23 08:30:04,1,2023-02-22 09:43:31,NaT
8,100009,200058,20230222,195548,COMPLETED,2023-02-22 19:55:48,2023-02-22 09:49:51,2023-02-22 19:55:48,1,2023-02-22 09:49:51,2023-02-22 19:55:48
9,100010,200253,20230222,120725,COMPLETED,2023-02-22 12:07:25,2023-02-22 09:43:01,2023-02-22 12:07:25,1,2023-02-22 09:43:01,2023-02-22 12:07:25


In [30]:
# Casting a timestamp to long yields whole seconds, so the difference is the
# elapsed time between START_DATE and END_DATE in seconds.
elapsed_seconds = F.col("END_DATE").cast(LongType()) - F.col("START_DATE").cast(LongType())

df5 = (
    df4
    # DURATION is in hours; rows without an END_DATE get 0.
    .withColumn(
        "DURATION",
        F.when(F.col("END_DATE").isNull(), 0).otherwise(elapsed_seconds / 3600),
    )
    # Remove the helper columns that were only needed to derive the dates.
    .drop("STATUS_DATE", "STATUS_TIME", "TIME1", "min_time", "max_time", "rep_num")
)

df5.limit(5).toPandas()

                                                                                

Unnamed: 0,ORDER_ID,SUBSCRIBER_ID,STATUS,START_DATE,END_DATE,DURATION
0,100001,200574,ASSIGNED,2023-02-22 09:53:06,NaT,0.0
1,100002,200121,POOL,2023-02-22 09:37:12,NaT,0.0
2,100003,200432,ASSIGNED,2023-02-22 09:40:01,NaT,0.0
3,100004,200234,COMPLETED,2023-02-22 09:37:58,2023-02-22 10:44:26,1.107778
4,100005,200546,POOL,2023-02-22 09:47:27,NaT,0.0


In [31]:
# Print the column names and types of the final per-order DataFrame.
df5.printSchema()

root
 |-- ORDER_ID: string (nullable = true)
 |-- SUBSCRIBER_ID: string (nullable = true)
 |-- STATUS: string (nullable = true)
 |-- START_DATE: timestamp (nullable = true)
 |-- END_DATE: timestamp (nullable = true)
 |-- DURATION: double (nullable = true)

