CREATE SAMPLES BRONZE

In [1]:
from pyspark.sql import SparkSession

def _build_s3_session():
    """Return the SparkSession named "S3Test", configured for S3 access via S3A.

    The session is obtained with ``getOrCreate()``, so an already-running
    session is reused instead of a second one being started.
    """
    # Hadoop options applied to the builder, in order:
    #   - S3AFileSystem as the implementation behind the fs.s3a settings
    #   - credentials provider set to DefaultAWSCredentialsProviderChain
    #   - path-style access switched on
    hadoop_s3a_options = (
        ("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem"),
        ("spark.hadoop.fs.s3a.aws.credentials.provider", "com.amazonaws.auth.DefaultAWSCredentialsProviderChain"),
        ("spark.hadoop.fs.s3a.path.style.access", "true"),
    )

    builder = SparkSession.builder.appName("S3Test")
    for option_name, option_value in hadoop_s3a_options:
        builder = builder.config(option_name, option_value)
    return builder.getOrCreate()


spark = _build_s3_session()



In [3]:
import os

# Bronze tables to sample; each name is a folder under BRONZE_ROOT.
tables = [
    "customers",
    "orders",
    "items",
    "products",
    "sellers",
    "geolocation",
    "payments",
    "category"
]

# Tunables kept in one place instead of being buried inside the loop.
BRONZE_ROOT = "s3a://pedro-datalake-project/bronze"
SAMPLE_ROWS = 30  # rows kept per table

# Samples are written to this folder, relative to the current working directory.
output_folder = "SAMPLES-PARQUET"
os.makedirs(output_folder, exist_ok=True)

# Resolved once: the working directory does not change while looping.
# Backslashes are normalised so a Windows path can be embedded in a file URI.
base_path = os.getcwd().replace("\\", "/")

for table in tables:
    path = f"{BRONZE_ROOT}/{table}/"

    df = spark.read.parquet(path)

    # limit() keeps at most SAMPLE_ROWS rows; it is not a random sample.
    sample_df = df.limit(SAMPLE_ROWS)

    local_path = f"file:///{base_path}/{output_folder}/{table}_sample.parquet"

    # "overwrite" makes the cell safe to re-run: an existing sample is replaced.
    sample_df.write.mode("overwrite").parquet(local_path)

    print(f"Sample salvo em: {local_path}")


Sample salvo em: file:///c:/Projects/Olist_Project_ETL_AWS/BRONZE-LAYER/Notebooks/SAMPLES-PARQUET/customers_sample.parquet
Sample salvo em: file:///c:/Projects/Olist_Project_ETL_AWS/BRONZE-LAYER/Notebooks/SAMPLES-PARQUET/orders_sample.parquet
Sample salvo em: file:///c:/Projects/Olist_Project_ETL_AWS/BRONZE-LAYER/Notebooks/SAMPLES-PARQUET/items_sample.parquet
Sample salvo em: file:///c:/Projects/Olist_Project_ETL_AWS/BRONZE-LAYER/Notebooks/SAMPLES-PARQUET/products_sample.parquet
Sample salvo em: file:///c:/Projects/Olist_Project_ETL_AWS/BRONZE-LAYER/Notebooks/SAMPLES-PARQUET/sellers_sample.parquet
Sample salvo em: file:///c:/Projects/Olist_Project_ETL_AWS/BRONZE-LAYER/Notebooks/SAMPLES-PARQUET/geolocation_sample.parquet
Sample salvo em: file:///c:/Projects/Olist_Project_ETL_AWS/BRONZE-LAYER/Notebooks/SAMPLES-PARQUET/payments_sample.parquet
Sample salvo em: file:///c:/Projects/Olist_Project_ETL_AWS/BRONZE-LAYER/Notebooks/SAMPLES-PARQUET/category_sample.parquet


In [13]:
# Load the local "category" sample and display its rows and schema.
# Raw string: the Windows path contains backslashes that would otherwise be
# parsed as (invalid) escape sequences and trigger a SyntaxWarning.
# NOTE(review): the captured output shows generic col0/col1 column names with
# the real header as the first data row -- confirm how the bronze table was written.
category = spark.read.parquet(r"C:\Projects\Olist_Project_ETL_AWS\BRONZE-LAYER\SAMPLES-PARQUET\category_sample.parquet")
category.show(truncate=False)
category.printSchema()

+---------------------------+-----------------------------+
|col0                       |col1                         |
+---------------------------+-----------------------------+
|product_category_name      |product_category_name_english|
|beleza_saude               |health_beauty                |
|informatica_acessorios     |computers_accessories        |
|automotivo                 |auto                         |
|cama_mesa_banho            |bed_bath_table               |
|moveis_decoracao           |furniture_decor              |
|esporte_lazer              |sports_leisure               |
|perfumaria                 |perfumery                    |
|utilidades_domesticas      |housewares                   |
|telefonia                  |telephony                    |
|relogios_presentes         |watches_gifts                |
|alimentos_bebidas          |food_drink                   |
|bebes                      |baby                         |
|papelaria                  |stationery 

In [12]:
# Load the local "customers" sample and display its rows and schema.
# Raw string: the Windows path contains backslashes that would otherwise be
# parsed as (invalid) escape sequences and trigger a SyntaxWarning.
customers = spark.read.parquet(r"C:\Projects\Olist_Project_ETL_AWS\BRONZE-LAYER\SAMPLES-PARQUET\customers_sample.parquet")
customers.show(truncate=False)
customers.printSchema()

+--------------------------------+--------------------------------+------------------------+---------------------+--------------+
|customer_id                     |customer_unique_id              |customer_zip_code_prefix|customer_city        |customer_state|
+--------------------------------+--------------------------------+------------------------+---------------------+--------------+
|06b8999e2fba1a1fbc88172c00ba8bc7|861eff4711a542e4b93843c6dd7febb0|14409                   |franca               |SP            |
|18955e83d337fd6b2def6b18a428ac77|290c77bc529b7ac935b93aa66c333dc3|9790                    |sao bernardo do campo|SP            |
|4e7b3e00288586ebd08712fdd0374a03|060e732b5b29e8181a18229c7b0b2b5e|1151                    |sao paulo            |SP            |
|b2b6027bc5c5109e529d4dc6358b12c3|259dac757896d24d7702b9acbbff3f3c|8775                    |mogi das cruzes      |SP            |
|4f2d8ab171c80ec8364f7c12e35b23ad|345ecd01c38d18a9036ed96c73b8d066|13056                  

In [11]:
# Load the local "geolocation" sample and display its rows and schema.
# Raw string: the Windows path contains backslashes that would otherwise be
# parsed as (invalid) escape sequences and trigger a SyntaxWarning.
geo = spark.read.parquet(r"C:\Projects\Olist_Project_ETL_AWS\BRONZE-LAYER\SAMPLES-PARQUET\geolocation_sample.parquet")
geo.show(truncate=False)
geo.printSchema()


+---------------------------+-------------------+-------------------+----------------+-----------------+
|geolocation_zip_code_prefix|geolocation_lat    |geolocation_lng    |geolocation_city|geolocation_state|
+---------------------------+-------------------+-------------------+----------------+-----------------+
|1037                       |-23.54562128115268 |-46.63929204800168 |sao paulo       |SP               |
|1046                       |-23.546081127035535|-46.64482029837157 |sao paulo       |SP               |
|1046                       |-23.54612896641469 |-46.64295148361138 |sao paulo       |SP               |
|1041                       |-23.5443921648681  |-46.63949930627844 |sao paulo       |SP               |
|1035                       |-23.541577961711493|-46.64160722329613 |sao paulo       |SP               |
|1012                       |-23.547762303364266|-46.63536053788448 |são paulo       |SP               |
|1047                       |-23.546273112412678|-46.64

In [17]:
# Load the local "items" sample and display its rows and schema.
# Raw string: the Windows path contains backslashes that would otherwise be
# parsed as (invalid) escape sequences and trigger a SyntaxWarning.
items = spark.read.parquet(r"C:\Projects\Olist_Project_ETL_AWS\BRONZE-LAYER\SAMPLES-PARQUET\items_sample.parquet")
items.show(truncate=False)
items.printSchema()

+--------------------------------+-------------+--------------------------------+--------------------------------+-------------------+------+-------------+
|order_id                        |order_item_id|product_id                      |seller_id                       |shipping_limit_date|price |freight_value|
+--------------------------------+-------------+--------------------------------+--------------------------------+-------------------+------+-------------+
|00010242fe8c5a6d1ba2dd792cb16214|1            |4244733e06e7ecb4970a6e2683c13e61|48436dade18ac8b2bce089ec2a041202|2017-09-19 09:45:35|58.9  |13.29        |
|00018f77f2f0320c557190d7a144bdd3|1            |e5f2d52b802189ee658865ca93d83a8f|dd7ddc04e1b6c2c614352b383efe2d36|2017-05-03 11:05:13|239.9 |19.93        |
|000229ec398224ef6ca0657da4fc703e|1            |c777355d18b72b67abbeef9df44fd0fd|5b51032eddd242adc84c38acab88f23d|2018-01-18 14:48:30|199.0 |17.87        |
|00024acbcdf0a6daa1e931b038114c75|1            |7634da152a4610f1

In [18]:
# Load the local "orders" sample and display its rows and schema.
# Raw string: the Windows path contains backslashes that would otherwise be
# parsed as (invalid) escape sequences and trigger a SyntaxWarning.
# NOTE(review): the captured output shows generic col0..col7 column names with
# the real header as the first data row -- confirm how the bronze table was written.
orders = spark.read.parquet(r"C:\Projects\Olist_Project_ETL_AWS\BRONZE-LAYER\SAMPLES-PARQUET\orders_sample.parquet")
orders.show(truncate=False)
orders.printSchema()

+--------------------------------+--------------------------------+------------+------------------------+-------------------+----------------------------+-----------------------------+-----------------------------+
|col0                            |col1                            |col2        |col3                    |col4               |col5                        |col6                         |col7                         |
+--------------------------------+--------------------------------+------------+------------------------+-------------------+----------------------------+-----------------------------+-----------------------------+
|order_id                        |customer_id                     |order_status|order_purchase_timestamp|order_approved_at  |order_delivered_carrier_date|order_delivered_customer_date|order_estimated_delivery_date|
|e481f51cbdc54678b7cc49136f2d6af7|9ef432eb6251297304e76186b10a928d|delivered   |2017-10-02 10:56:33     |2017-10-02 11:07:15|2017-10-04 19:5

In [19]:
# Load the local "payments" sample and display its rows and schema.
# Raw string: the Windows path contains backslashes that would otherwise be
# parsed as (invalid) escape sequences and trigger a SyntaxWarning.
payments = spark.read.parquet(r"C:\Projects\Olist_Project_ETL_AWS\BRONZE-LAYER\SAMPLES-PARQUET\payments_sample.parquet")
payments.show(truncate=False)
payments.printSchema()

+--------------------------------+------------------+------------+--------------------+-------------+
|order_id                        |payment_sequential|payment_type|payment_installments|payment_value|
+--------------------------------+------------------+------------+--------------------+-------------+
|b81ef226f3fe1789b1e8b2acac839d17|1                 |credit_card |8                   |99.33        |
|a9810da82917af2d9aefd1278f1dcfa0|1                 |credit_card |1                   |24.39        |
|25e8ea4e93396b6fa0d3dd708e76c1bd|1                 |credit_card |1                   |65.71        |
|ba78997921bbcdc1373bb41e913ab953|1                 |credit_card |8                   |107.78       |
|42fdf880ba16b47b59251dd489d4441a|1                 |credit_card |2                   |128.45       |
|298fcdf1f73eb413e4d26d01b25bc1cd|1                 |credit_card |2                   |96.12        |
|771ee386b001f06208a7419e4fc1bbd7|1                 |credit_card |1               

In [20]:
# Load the local "products" sample and display its rows and schema.
# Raw string: the Windows path contains backslashes that would otherwise be
# parsed as (invalid) escape sequences and trigger a SyntaxWarning.
products = spark.read.parquet(r"C:\Projects\Olist_Project_ETL_AWS\BRONZE-LAYER\SAMPLES-PARQUET\products_sample.parquet")
products.show(truncate=False)
products.printSchema()

+--------------------------------+--------------------------------+-------------------+--------------------------+------------------+----------------+-----------------+-----------------+----------------+
|product_id                      |product_category_name           |product_name_lenght|product_description_lenght|product_photos_qty|product_weight_g|product_length_cm|product_height_cm|product_width_cm|
+--------------------------------+--------------------------------+-------------------+--------------------------+------------------+----------------+-----------------+-----------------+----------------+
|1e9e8ef04dbcff4541ed26657ea517e5|perfumaria                      |40                 |287                       |1                 |225             |16               |10               |14              |
|3aa071139cb16b67ca9e5dea641aaa2f|artes                           |44                 |276                       |1                 |1000            |30               |18              

In [21]:
# Load the local "sellers" sample and display its rows and schema.
# Raw string: the Windows path contains backslashes that would otherwise be
# parsed as (invalid) escape sequences and trigger a SyntaxWarning.
sellers = spark.read.parquet(r"C:\Projects\Olist_Project_ETL_AWS\BRONZE-LAYER\SAMPLES-PARQUET\sellers_sample.parquet")
sellers.show(truncate=False)
sellers.printSchema()

+--------------------------------+----------------------+-----------------+------------+
|seller_id                       |seller_zip_code_prefix|seller_city      |seller_state|
+--------------------------------+----------------------+-----------------+------------+
|3442f8959a84dea7ee197c632cb2df15|13023                 |campinas         |SP          |
|d1b65fc7debc3361ea86b5f14c68d2e2|13844                 |mogi guacu       |SP          |
|ce3ad9de960102d0677a81f5d0bb7b2d|20031                 |rio de janeiro   |RJ          |
|c0f3eea2e14555b6faeea3dd58c1b1c3|4195                  |sao paulo        |SP          |
|51a04a8a6bdcb23deccc82b0b80742cf|12914                 |braganca paulista|SP          |
|c240c4061717ac1806ae6ee72be3533b|20920                 |rio de janeiro   |RJ          |
|e49c26c3edfa46d227d5121a6b6e4d37|55325                 |brejao           |PE          |
|1b938a7ec6ac5061a66a3766e0e75f90|16304                 |penapolis        |SP          |
|768a86e36ad6aae3d03e