### Inspecting Consumer Raw Data

In [1]:
import pandas as pd
# Notebook-wide pandas display settings for the previews below:
# show up to 30 columns and never truncate the text inside a cell.
pd.set_option('display.max_columns', 30)
# None means "no width limit". The older -1 sentinel was deprecated in
# pandas 1.0 and is rejected by pandas 2.x, which would make this cell fail
# on a fresh kernel.
pd.set_option('display.max_colwidth', None)

In [2]:
# First look at the file exactly as Spark reads it with default options:
# no header parsing and no schema, so the header line shows up as a data row
# and the columns get positional names (_c0, _c1, ...).
consumer = spark.read.csv('s3n://ifood-data-architect-test-source/consumer.csv.gz')

# Pull only five rows to the driver before converting to pandas for display.
raw_preview = consumer.limit(5).toPandas()
raw_preview.head()

Unnamed: 0,_c0,_c1,_c2,_c3,_c4,_c5,_c6
0,customer_id,language,created_at,active,customer_name,customer_phone_area,customer_phone_number
1,00039466-560f-4e57-85a2-d4753cd901be,pt-br,2018-04-05T14:49:18.165Z,true,NUNO,46,816135924
2,001a1267-31a3-4f5b-a028-d7e323864b08,pt-br,2018-01-14T21:40:02.141Z,true,ADRIELLY,59,231330577
3,003ae1d5-67b8-4a04-b055-0e4e9622771a,pt-br,2018-01-07T03:47:15.554Z,true,PAULA,62,347597883
4,004629bf-c3fc-42f5-a133-fd34d2bd17fa,pt-br,2018-01-10T22:17:08.160Z,true,HELTON,13,719366842


### Setting Schema

In [3]:
from pyspark.sql.types import StructType, StructField
from pyspark.sql.types import DoubleType, IntegerType, StringType, TimestampType, BooleanType

# Column layout of consumer.csv.gz, in file order, as (name, Spark type) pairs.
# The phone area and phone number are deliberately read as strings rather than
# numbers (presumably because they are identifiers, not quantities).
consumer_columns = [
    ("customer_id", StringType()),
    ("language", StringType()),
    ("created_at", TimestampType()),
    ("active", BooleanType()),
    ("customer_name", StringType()),
    ("customer_phone_area", StringType()),
    ("customer_phone_number", StringType()),
]
schema = StructType([StructField(name, dtype) for name, dtype in consumer_columns])

# Re-read the file, this time treating the first line as a header and
# applying the explicit schema instead of Spark's defaults.
consumer = (
    spark.read
    .schema(schema)
    .option("header", True)
    .csv('s3n://ifood-data-architect-test-source/consumer.csv.gz')
)

# Preview five typed rows.
typed_preview = consumer.limit(5).toPandas()
typed_preview.head()

Unnamed: 0,customer_id,language,created_at,active,customer_name,customer_phone_area,customer_phone_number
0,00039466-560f-4e57-85a2-d4753cd901be,pt-br,2018-04-05 14:49:18.165,True,NUNO,46,816135924
1,001a1267-31a3-4f5b-a028-d7e323864b08,pt-br,2018-01-14 21:40:02.141,True,ADRIELLY,59,231330577
2,003ae1d5-67b8-4a04-b055-0e4e9622771a,pt-br,2018-01-07 03:47:15.554,True,PAULA,62,347597883
3,004629bf-c3fc-42f5-a133-fd34d2bd17fa,pt-br,2018-01-10 22:17:08.160,True,HELTON,13,719366842
4,00467336-6561-4406-b6f2-987b06e77401,pt-br,2018-04-06 00:16:20.935,True,WENDER,76,543232158


### General Checks

In [4]:
from pyspark.sql.functions import isnan, when, count, col

# Earliest and latest created_at, computed in a single aggregation so the
# data is scanned once instead of once per bound.
date_bounds = consumer.selectExpr(
    "min(created_at) AS min_date",
    "max(created_at) AS max_date",
).first()
print("Min Date: {0}".format(date_bounds["min_date"]))
print("Max Date: {0}".format(date_bounds["max_date"]))

# Largest number of rows sharing one customer_id; a value of 1 means every
# customer_id appears exactly once.
max_rows_per_id = consumer.groupBy('customer_id').count().agg({"count": "max"}).first()[0]
print("Max Duplicate Customer Ids: {0}".format(max_rows_per_id))

# distinct() returns rows in no guaranteed order and can include a null
# language, which str.join cannot handle. Drop nulls here (they are counted in
# the table produced at the end of this cell) and sort so the printed list is
# reproducible between runs.
languages = sorted(
    row.language
    for row in consumer.select('language').distinct().collect()
    if row.language is not None
)
print("Distinct languages: {0}".format(', '.join(languages)))

# Null count per column: when() yields a non-null literal only for null
# cells, and count() counts only non-null values, so each result is the
# number of nulls in that column.
null_counts = consumer.select(
    [count(when(col(c).isNull(), c)).alias(c) for c in consumer.columns]
)
null_counts.toPandas()

Min Date: 2018-01-02 18:07:23.460000
Max Date: 2018-04-06 05:11:12.946000
Max Duplicate Customer Ids: 1
Distinct languages: es-ar, pt-br


Unnamed: 0,customer_id,language,created_at,active,customer_name,customer_phone_area,customer_phone_number
0,0,0,0,0,310,0,0


### Conclusions
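* No duplicate customer ids (each `customer_id` appears exactly once)
* All users were created in early 2018 (between 2018-01-02 and 2018-04-06)
* 310 customers have no name; every other column has no nulls
* 2 languages: `es-ar` and `pt-br`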