CREATE SAMPLES BRONZE

In [8]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

# Spark session for this notebook, configured to reach S3 through the Hadoop S3A connector.
# The settings below are applied to the builder in the order they are listed.
s3a_settings = {
    # File system implementation class registered for S3A.
    "spark.hadoop.fs.s3a.impl": "org.apache.hadoop.fs.s3a.S3AFileSystem",
    # Credentials are resolved by the AWS default provider chain; no keys are set in this notebook.
    "spark.hadoop.fs.s3a.aws.credentials.provider": "com.amazonaws.auth.DefaultAWSCredentialsProviderChain",
    # Enable path-style access for S3A requests.
    "spark.hadoop.fs.s3a.path.style.access": "true",
}

session_builder = SparkSession.builder.appName("S3Test")
for setting_name, setting_value in s3a_settings.items():
    session_builder = session_builder.config(setting_name, setting_value)

# Reuses an already-running session if there is one, otherwise starts a new one.
spark = session_builder.getOrCreate()



In [9]:
# Load the bronze customers dataset from S3, then preview five rows and the schema.
bronze_customers_path = "s3a://pedro-datalake-project/bronze/customers/"
df = spark.read.parquet(bronze_customers_path)

df.show(n=5)
df.printSchema()

+--------------------+--------------------+------------------------+--------------------+--------------+
|         customer_id|  customer_unique_id|customer_zip_code_prefix|       customer_city|customer_state|
+--------------------+--------------------+------------------------+--------------------+--------------+
|06b8999e2fba1a1fb...|861eff4711a542e4b...|                   14409|              franca|            SP|
|18955e83d337fd6b2...|290c77bc529b7ac93...|                    9790|sao bernardo do c...|            SP|
|4e7b3e00288586ebd...|060e732b5b29e8181...|                    1151|           sao paulo|            SP|
|b2b6027bc5c5109e5...|259dac757896d24d7...|                    8775|     mogi das cruzes|            SP|
|4f2d8ab171c80ec83...|345ecd01c38d18a90...|                   13056|            campinas|            SP|
+--------------------+--------------------+------------------------+--------------------+--------------+
only showing top 5 rows

root
 |-- customer_id: string 

In [11]:
# Total number of rows in the bronze customers DataFrame (shown as the cell result).
bronze_row_count = df.count()
bronze_row_count


99441

In [12]:
# Per-column NULL tally: every row contributes 1 when the column is NULL and 0 otherwise,
# and the contributions are summed. Each result keeps the name of the column it describes.
null_tally_exprs = [
    F.sum(F.when(F.col(column_name).isNull(), 1).otherwise(0)).alias(column_name)
    for column_name in df.columns
]
missing_df = df.select(*null_tally_exprs)

# Vertical layout prints one "column | count" line per source column.
missing_df.show(vertical=True)


-RECORD 0-----------------------
 customer_id              | 0   
 customer_unique_id       | 0   
 customer_zip_code_prefix | 0   
 customer_city            | 0   
 customer_state           | 0   



In [13]:
# Uniqueness check on customer_id: count rows per id and keep only ids seen more than once.
rows_per_customer_id = df.groupBy("customer_id").count()
dupes = rows_per_customer_id.where(rows_per_customer_id["count"] > 1)

# Print the offending ids (if any); the cell result is how many there are.
dupes.show()
dupes.count()


+-----------+-----+
|customer_id|count|
+-----------+-----+
+-----------+-----+



0

In [15]:
# Customers per state, most populous first. show() is called without a row limit,
# so only its default number of rows is printed.
customers_per_state = df.groupBy("customer_state").count()
customers_per_state.orderBy(F.desc("count")).show()


+--------------+-----+
|customer_state|count|
+--------------+-----+
|            SP|41746|
|            RJ|12852|
|            MG|11635|
|            RS| 5466|
|            PR| 5045|
|            SC| 3637|
|            BA| 3380|
|            DF| 2140|
|            ES| 2033|
|            GO| 2020|
|            PE| 1652|
|            CE| 1336|
|            PA|  975|
|            MT|  907|
|            MA|  747|
|            MS|  715|
|            PB|  536|
|            PI|  495|
|            RN|  485|
|            AL|  413|
+--------------+-----+
only showing top 20 rows



In [16]:
# Customers per city, most populous first; print the top 20.
customers_per_city = df.groupBy("customer_city").count()
customers_per_city.orderBy(F.desc("count")).show(20)


+--------------------+-----+
|       customer_city|count|
+--------------------+-----+
|           sao paulo|15540|
|      rio de janeiro| 6882|
|      belo horizonte| 2773|
|            brasilia| 2131|
|            curitiba| 1521|
|            campinas| 1444|
|        porto alegre| 1379|
|            salvador| 1245|
|           guarulhos| 1189|
|sao bernardo do c...|  938|
|             niteroi|  849|
|         santo andre|  797|
|              osasco|  746|
|              santos|  713|
|             goiania|  692|
| sao jose dos campos|  691|
|           fortaleza|  654|
|            sorocaba|  633|
|              recife|  613|
|       florianopolis|  570|
+--------------------+-----+
only showing top 20 rows



In [17]:
# Rows whose zip code prefix is shorter than five characters, i.e. values that
# would need padding to reach a fixed five-character width.
zip_prefix_length = F.length(F.col("customer_zip_code_prefix"))
df.where(zip_prefix_length < 5).show(20)


+--------------------+--------------------+------------------------+--------------------+--------------+
|         customer_id|  customer_unique_id|customer_zip_code_prefix|       customer_city|customer_state|
+--------------------+--------------------+------------------------+--------------------+--------------+
|18955e83d337fd6b2...|290c77bc529b7ac93...|                    9790|sao bernardo do c...|            SP|
|4e7b3e00288586ebd...|060e732b5b29e8181...|                    1151|           sao paulo|            SP|
|b2b6027bc5c5109e5...|259dac757896d24d7...|                    8775|     mogi das cruzes|            SP|
|fd826e7cf63160e53...|addec96d2e059c80c...|                    4534|           sao paulo|            SP|
|eabebad39a88bb6f5...|295c05e81917928d7...|                    5704|           sao paulo|            SP|
|c5c61596a3b6bd0ce...|b6e99561fe6f34a55...|                    7124|           guarulhos|            SP|
|9b8ce803689b3562d...|7f3a72e8f988c6e73...|            

In [18]:
# Show, untruncated, every row where the id, the city or the state is NULL.
id_is_missing = F.col("customer_id").isNull()
city_is_missing = F.col("customer_city").isNull()
state_is_missing = F.col("customer_state").isNull()

df.where(id_is_missing | city_is_missing | state_is_missing).show(truncate=False)


+-----------+------------------+------------------------+-------------+--------------+
|customer_id|customer_unique_id|customer_zip_code_prefix|customer_city|customer_state|
+-----------+------------------+------------------------+-------------+--------------+
+-----------+------------------+------------------------+-------------+--------------+



In [19]:
# Build the silver customers DataFrame from the bronze one. `df` itself is left untouched.
#   customer_zip_code_prefix -> zero-padded, five-character string (NULL stays NULL)
#   customer_city            -> trimmed, first letter of each word capitalised
#   customer_state           -> trimmed, upper-cased
#   audit_timestamp          -> new column holding the current timestamp
zip_prefix_col = F.col("customer_zip_code_prefix")

df_silver = (
    df
    .withColumn(
        "customer_zip_code_prefix",
        # format_string() renders a NULL argument as the literal text "null", which would
        # turn a missing prefix into a bogus non-null value. when() without otherwise()
        # yields NULL for rows that do not match, so missing prefixes stay missing.
        F.when(zip_prefix_col.isNotNull(), F.format_string("%05d", zip_prefix_col))
    )
    .withColumn(
        "customer_city",
        F.initcap(F.trim(F.col("customer_city")))
    )
    .withColumn(
        "customer_state",
        # Trim as well as upper-case, matching the city clean-up, so that a padded
        # code such as "SP " does not survive as a separate state value.
        F.upper(F.trim(F.col("customer_state")))
    )
    .withColumn(
        "audit_timestamp",
        F.current_timestamp()
    )
)


In [20]:
# Spot-check the padded zip code prefixes in the silver DataFrame (first 10 rows).
silver_zip_prefixes = df_silver.select(F.col("customer_zip_code_prefix"))
silver_zip_prefixes.show(n=10)


+------------------------+
|customer_zip_code_prefix|
+------------------------+
|                   14409|
|                   09790|
|                   01151|
|                   08775|
|                   13056|
|                   89254|
|                   04534|
|                   35182|
|                   81560|
|                   30575|
+------------------------+
only showing top 10 rows



In [21]:
# List the distinct state codes present in the silver DataFrame.
silver_states = df_silver.select("customer_state")
silver_states.dropDuplicates().show()


+--------------+
|customer_state|
+--------------+
|            SC|
|            RO|
|            PI|
|            AM|
|            RR|
|            GO|
|            TO|
|            MT|
|            SP|
|            ES|
|            PB|
|            RS|
|            MS|
|            AL|
|            MG|
|            PA|
|            BA|
|            SE|
|            PE|
|            CE|
+--------------+
only showing top 20 rows



In [22]:
# Spot-check the normalised city names in the silver DataFrame (first 10 rows).
silver_cities = df_silver.select(F.col("customer_city"))
silver_cities.show(n=10)


+--------------------+
|       customer_city|
+--------------------+
|              Franca|
|Sao Bernardo Do C...|
|           Sao Paulo|
|     Mogi Das Cruzes|
|            Campinas|
|      Jaragua Do Sul|
|           Sao Paulo|
|             Timoteo|
|            Curitiba|
|      Belo Horizonte|
+--------------------+
only showing top 10 rows

