### Inspecting Order Status RAW DATA

In [1]:
import pandas as pd
pd.set_option('display.max_columns', 30)
pd.set_option('display.max_colwidth', -1)

In [2]:
order_status = spark.read.json('s3n://ifood-data-architect-test-source/status.json.gz')
order_status.limit(5).toPandas().head()

Unnamed: 0,created_at,order_id,status_id,value
0,2019-01-25T01:05:07.000Z,0002fe02-d7dc-4232-b7ac-3394019ce240,b4298862-fa38-499a-93e2-a76930fb2bce,CONCLUDED
1,2019-01-24T23:04:27.000Z,0002fe02-d7dc-4232-b7ac-3394019ce240,7964bf63-007a-484d-a321-e9118ccc2f97,REGISTERED
2,2019-01-24T23:04:28.000Z,0002fe02-d7dc-4232-b7ac-3394019ce240,ca16b92b-db8f-4274-b165-929675541a9f,PLACED
3,2019-01-18T00:45:02.000Z,000cef8c-83c7-49eb-a0fb-404e6dc2150e,bf43cc29-c3c1-4f3a-9a6c-deb902ca286c,CONCLUDED
4,2019-01-18T00:45:02.000Z,000cef8c-83c7-49eb-a0fb-404e6dc2150e,bf43cc29-c3c1-4f3a-9a6c-deb902ca286c,CONCLUDED


In [3]:
order_status.printSchema()

root
 |-- created_at: string (nullable = true)
 |-- order_id: string (nullable = true)
 |-- status_id: string (nullable = true)
 |-- value: string (nullable = true)



### Setting Schema

In [4]:
from pyspark.sql.types import StructType, StructField
from pyspark.sql.types import DoubleType, IntegerType, StringType, TimestampType, BooleanType, ArrayType

schema = StructType([
    StructField("created_at", TimestampType()),
    StructField("order_id", StringType()),
    StructField("status_id", StringType()),
    StructField("value", StringType())
])

order_status = spark.read.json('s3n://ifood-data-architect-test-source/status.json.gz', schema=schema)
order_status.limit(5).toPandas().head()

Unnamed: 0,created_at,order_id,status_id,value
0,2019-01-25 01:05:07,0002fe02-d7dc-4232-b7ac-3394019ce240,b4298862-fa38-499a-93e2-a76930fb2bce,CONCLUDED
1,2019-01-24 23:04:27,0002fe02-d7dc-4232-b7ac-3394019ce240,7964bf63-007a-484d-a321-e9118ccc2f97,REGISTERED
2,2019-01-24 23:04:28,0002fe02-d7dc-4232-b7ac-3394019ce240,ca16b92b-db8f-4274-b165-929675541a9f,PLACED
3,2019-01-18 00:45:02,000cef8c-83c7-49eb-a0fb-404e6dc2150e,bf43cc29-c3c1-4f3a-9a6c-deb902ca286c,CONCLUDED
4,2019-01-18 00:45:02,000cef8c-83c7-49eb-a0fb-404e6dc2150e,bf43cc29-c3c1-4f3a-9a6c-deb902ca286c,CONCLUDED


### General Checks

In [5]:
from pyspark.sql.functions import isnan, when, count, col

print("Min Date: {0}".format(order_status.agg({"created_at": "min"}).collect()[0][0]))
print("Max Date: {0}".format(order_status.agg({"created_at": "max"}).collect()[0][0]))
print("Distinct value: {0}".format(', '.join([i.value for i in order_status.select('value').distinct().collect()])))
print("Max Records Per Order: {0}".format(order_status.groupBy('order_id').count().select('count').agg({"count": "max"}).collect()[0][0]))
print("Max Records Per Status: {0}".format(order_status.groupBy('status_id').count().select('count').agg({"count": "max"}).collect()[0][0]))

order_status.select([count(when(col(c).isNull(), c)).alias(c) for c in order_status.columns]).toPandas().head()

Min Date: 2019-01-01 00:00:00
Max Date: 2019-01-31 23:59:59
Distinct value: CONCLUDED, REGISTERED, CANCELLED, PLACED
Max Records Per Order: 10
Max Records Per Status: 2


Unnamed: 0,created_at,order_id,status_id,value
0,0,0,0,0


### Conclusions
* Only data from 2019 available: It seems the duplicated orders previously seen on Order Dataset (from 2018) are invalid.
* There are duplicated rows, since a order has a max of 4 states

### Saving dataframe on tempTable for next analysis

In [6]:
order_status.registerTempTable("order_status_df")

In [7]:
query = """
WITH most_entries AS (
    SELECT
        order_id,
        COUNT(*) AS duplicates
    FROM order_status_df
    GROUP BY order_id
    ORDER BY duplicates DESC
    LIMIT 1
)

SELECT
    *
FROM order_status_df
WHERE order_id IN (SELECT order_id FROM most_entries)
"""

spark.sql(query).toPandas().head(10)

Unnamed: 0,created_at,order_id,status_id,value
0,2019-01-09 23:43:32,5387d11d-2d17-4242-bd45-ed9efb6eee6c,0647b4f9-f528-4a7c-a26d-456b1ae2eb23,PLACED
1,2019-01-09 23:43:32,5387d11d-2d17-4242-bd45-ed9efb6eee6c,0647b4f9-f528-4a7c-a26d-456b1ae2eb23,PLACED
2,2019-01-09 23:43:31,5387d11d-2d17-4242-bd45-ed9efb6eee6c,f1b28506-61b8-4314-beb5-efe91f7cd0ef,REGISTERED
3,2019-01-09 23:43:31,5387d11d-2d17-4242-bd45-ed9efb6eee6c,f1b28506-61b8-4314-beb5-efe91f7cd0ef,REGISTERED
4,2019-01-11 23:43:32,5387d11d-2d17-4242-bd45-ed9efb6eee6c,a8b04a63-1e56-470c-b1c8-c7cb28aef9c2,CONCLUDED
5,2019-01-11 23:43:32,5387d11d-2d17-4242-bd45-ed9efb6eee6c,a8b04a63-1e56-470c-b1c8-c7cb28aef9c2,CONCLUDED
6,2019-01-12 23:43:32,5387d11d-2d17-4242-bd45-ed9efb6eee6c,cf71f315-c364-4864-adeb-34601d6ef6ed,CANCELLED
7,2019-01-12 23:43:32,5387d11d-2d17-4242-bd45-ed9efb6eee6c,cf71f315-c364-4864-adeb-34601d6ef6ed,CANCELLED
8,2019-01-10 23:43:32,5387d11d-2d17-4242-bd45-ed9efb6eee6c,b78a224f-88dd-47e3-91b8-04962cdf5a52,PLACED
9,2019-01-10 23:43:32,5387d11d-2d17-4242-bd45-ed9efb6eee6c,b78a224f-88dd-47e3-91b8-04962cdf5a52,PLACED


In [8]:
query = """
WITH most_entries AS (
    SELECT
        status_id,
        COUNT(*) AS duplicates
    FROM order_status_df
    GROUP BY status_id
    ORDER BY duplicates DESC
    LIMIT 1
)

SELECT
    *
FROM order_status_df
WHERE status_id IN (SELECT status_id FROM most_entries)
"""

spark.sql(query).toPandas().head()

Unnamed: 0,created_at,order_id,status_id,value
0,2019-01-18 22:50:16,05a250d9-7bae-48ed-8b5d-e375a5a1cbc2,d3a88a98-6678-433b-a328-bc329907469e,PLACED
1,2019-01-18 22:50:16,05a250d9-7bae-48ed-8b5d-e375a5a1cbc2,d3a88a98-6678-433b-a328-bc329907469e,PLACED


### Conclusions
* There are multiple status_id for a order_id_value. Needs further investigation on a status details table ? 
* Drop duplicates should fix it

In [9]:
order_status = order_status.dropDuplicates()
print("Max Records Per Order: {0}".format(order_status.groupBy('order_id').count().select('count').agg({"count": "max"}).collect()[0][0]))
print("Max Records Per Status: {0}".format(order_status.groupBy('status_id').count().select('count').agg({"count": "max"}).collect()[0][0]))

Max Records Per Order: 5
Max Records Per Status: 1
