### Inspecting Restaurant RAW DATA

In [1]:
import pandas as pd
# Show up to 30 columns so the 12-column restaurant preview is never elided.
pd.set_option('display.max_columns', 30)
# None disables cell truncation. The older spelling -1 was deprecated in
# pandas 1.0 and is rejected by later releases (None requires pandas >= 1.0).
pd.set_option('display.max_colwidth', None)

In [2]:
# First look at the file exactly as stored: no header handling and no schema,
# so Spark assigns positional column names (_c0, _c1, ...) and the header line
# comes back as the first data row.
restaurant = spark.read.csv(
    's3n://ifood-data-architect-test-source/restaurant.csv.gz'
)

# Pull only five rows to the driver before converting to pandas for display.
(
    restaurant
    .limit(5)
    .toPandas()
    .head()
)

Unnamed: 0,_c0,_c1,_c2,_c3,_c4,_c5,_c6,_c7,_c8,_c9,_c10,_c11
0,id,created_at,enabled,price_range,average_ticket,takeout_time,delivery_time,minimum_order_value,merchant_zip_code,merchant_city,merchant_state,merchant_country
1,02c94103-61f3-4906-a4a9-55611db9f28c,2017-01-23T12:52:30.910Z,false,3,60.0,0,50,30.0,14025,RIBEIRAO PRETO,SP,BR
2,15e7f5fd-090d-47b9-9f14-b6f7fce3c95d,2017-01-20T13:14:48.286Z,true,3,60.0,0,0,30.0,50180,SAO PAULO,SP,BR
3,33ca5d3d-b99f-404d-84d9-8df8f38a2261,2017-01-23T12:46:33.457Z,true,5,100.0,0,45,10.0,23090,RIO DE JANEIRO,RJ,BR
4,4927035f-a343-4a65-a9be-945818e2efff,2017-01-20T13:15:04.806Z,true,3,80.0,0,0,18.9,40255,SALVADOR,BA,BR


In [3]:
from pyspark.sql.types import StructType, StructField
from pyspark.sql.types import DoubleType, IntegerType, StringType, TimestampType, BooleanType

# (column name, Spark type) pairs, listed in the same order as the CSV columns.
_restaurant_fields = [
    ("id", StringType),
    ("created_at", TimestampType),
    ("enabled", BooleanType),
    ("price_range", IntegerType),
    ("average_ticket", DoubleType),
    ("takeout_time", IntegerType),
    ("delivery_time", IntegerType),
    ("minimum_order_value", DoubleType),
    ("merchant_zip_code", StringType),
    ("merchant_city", StringType),
    ("merchant_state", StringType),
    ("merchant_country", StringType),
]

# Every field keeps StructField's default nullability.
schema = StructType(
    [StructField(field_name, field_type()) for field_name, field_type in _restaurant_fields]
)

# Re-read the same file, this time consuming the header line and applying the
# explicit schema instead of the default all-string columns.
restaurant = spark.read.csv(
    's3n://ifood-data-architect-test-source/restaurant.csv.gz',
    header=True,
    schema=schema,
)

# Five-row preview, converted to pandas for display.
(
    restaurant
    .limit(5)
    .toPandas()
    .head()
)

Unnamed: 0,id,created_at,enabled,price_range,average_ticket,takeout_time,delivery_time,minimum_order_value,merchant_zip_code,merchant_city,merchant_state,merchant_country
0,02c94103-61f3-4906-a4a9-55611db9f28c,2017-01-23 12:52:30.910,False,3,60.0,0,50,30.0,14025,RIBEIRAO PRETO,SP,BR
1,15e7f5fd-090d-47b9-9f14-b6f7fce3c95d,2017-01-20 13:14:48.286,True,3,60.0,0,0,30.0,50180,SAO PAULO,SP,BR
2,33ca5d3d-b99f-404d-84d9-8df8f38a2261,2017-01-23 12:46:33.457,True,5,100.0,0,45,10.0,23090,RIO DE JANEIRO,RJ,BR
3,4927035f-a343-4a65-a9be-945818e2efff,2017-01-20 13:15:04.806,True,3,80.0,0,0,18.9,40255,SALVADOR,BA,BR
4,52feaad8-4961-4afc-8d60-3f29ffd0a7a7,2017-01-20 13:14:27.701,True,3,60.0,0,0,25.0,64600,BARUERI,SP,BR


### General Checks

In [4]:
from pyspark.sql.functions import isnan, when, count, col

def _distinct_values(column_name):
    """Return the distinct values of one `restaurant` column as strings.

    Each value is passed through str() so that joining the result never
    raises TypeError, even if the column holds nulls (collected as None).
    """
    return [str(row[0]) for row in restaurant.select(column_name).distinct().collect()]


# Both ends of the creation window come from a single aggregation rather than
# one Spark job per bound.
min_created_at, max_created_at = restaurant.selectExpr(
    "min(created_at)", "max(created_at)"
).first()
print("Min Date: {0}".format(min_created_at))
print("Max Date: {0}".format(max_created_at))

# Size of the largest group of rows sharing an id; 1 means every id is unique.
max_rows_per_id = restaurant.groupBy('id').count().agg({"count": "max"}).first()[0]
print("Max Duplicate Restaurant Ids: {0}".format(max_rows_per_id))

# Distinct values of the low-cardinality columns (order is whatever Spark returns).
print("Distinct price_ranges: {0}".format(', '.join(_distinct_values('price_range'))))
print("Distinct merchant_country: {0}".format(', '.join(_distinct_values('merchant_country'))))
print("Distinct merchant_state: {0}".format(', '.join(_distinct_values('merchant_state'))))

# Null count per column: count() only counts rows where when() yields a value,
# i.e. rows where the column is null.
restaurant.select(
    [count(when(col(c).isNull(), c)).alias(c) for c in restaurant.columns]
).toPandas()

Min Date: 2017-01-20 13:12:43.554000
Max Date: 2017-01-23 12:54:52.155000
Max Duplicate Restaurant Ids: 1
Distinct price_ranges: 1, 3, 5, 4, 2
Distinct merchant_country: BR
Distinct merchant_state: SC, PI, AM, GO, MT, SP, PB, ES, RS, MS, AL, MG, PA, BA, SE, PE, CE, RN, RJ, MA, AC, DF, PR


Unnamed: 0,id,created_at,enabled,price_range,average_ticket,takeout_time,delivery_time,minimum_order_value,merchant_zip_code,merchant_city,merchant_state,merchant_country
0,0,0,0,0,0,0,1,95,0,0,0,0


### Conclusions
* No duplicates
* All restaurants were created within a four-day window, between 2017-01-20 and 2017-01-23
* One restaurant without delivery_time
* 95 restaurants without minimum_order_value
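* price_range only takes the values 1 to 5, so it is really a categorical code (better handled as a string/category) rather than a numeric measure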