# Stone Tech Challenge

## Library and module imports

In [2]:
import os
from datetime import date
from urllib.parse import quote

import duckdb
import polars as pl
from dotenv import load_dotenv
from google.cloud import storage

## Configuration

In [3]:
# Load the variables stored in conf.env into the process environment.
load_dotenv(dotenv_path='./config_files//conf.env')

# Expose the key.json path (resolved against the current working directory)
# through the GOOGLE_APPLICATION_CREDENTIALS environment variable.
os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = os.path.join(
    os.getcwd(),
    'config_files//key.json',
)

# Bucket name passed to download_blobs.
parquet_location = 'desafio-eng-dados'

# Blob-name prefix: the current calendar year as text.
# NOTE(review): the SQL cells further down read 'blobs/2024*' literally, so the
# two only line up while the current year is 2024 — confirm which one is intended.
prefix = f'{date.today().year}'

## Data extraction from GCS bucket

In [4]:


def download_blobs(bucket_name, prefix, client=None, dest_root=None):
  """Download every blob whose name starts with ``prefix`` to a local folder.

  Each blob is written to ``<dest_root>/<blob name>``. Folders contained in
  the blob name are created on demand, and blobs whose name ends with ``/``
  (folder placeholders) are skipped because they cannot be written as files.

  Args:
    bucket_name: Name of the bucket to read from.
    prefix: Only blobs whose name starts with this string are listed.
    client: Optional object exposing ``bucket(name)``; when omitted a new
      ``storage.Client()`` is created.
    dest_root: Optional destination folder; defaults to ``blobs/`` under the
      current working directory.

  Returns:
    None. The function is used for its side effect of writing files.
  """
  storage_client = client if client is not None else storage.Client()
  if dest_root is None:
    dest_root = os.path.join(os.getcwd(), 'blobs/')
  bucket = storage_client.bucket(bucket_name)

  for blob in bucket.list_blobs(prefix=prefix):
    blob_name = blob.name
    if blob_name.endswith('/'):
      # Placeholder object representing a folder; there is no file to write.
      continue
    blob_path = os.path.join(dest_root, blob_name)
    # download_to_filename does not create missing folders itself.
    os.makedirs(os.path.dirname(blob_path), exist_ok=True)
    blob.download_to_filename(blob_path)


# Fetch every blob of the configured bucket whose name starts with the configured prefix.
download_blobs(bucket_name=parquet_location, prefix=prefix)

In [20]:
# Test: experimental variant that sorts downloads into blobs/tabular_data and blobs/evidences


def download_blobs(bucket_name, prefix, client=None, dest_root=None):
  """Download blobs under ``prefix`` and sort them into two local folders.

  Blobs whose name ends with ``.pq`` are written below
  ``<dest_root>/tabular_data``; every other blob is written below
  ``<dest_root>/evidences``. Folders contained in the blob name are created
  on demand, and blobs whose name ends with ``/`` (folder placeholders) are
  skipped because they cannot be written as files.

  Args:
    bucket_name: Name of the bucket to read from.
    prefix: Only blobs whose name starts with this string are listed.
    client: Optional object exposing ``bucket(name)``; when omitted a new
      ``storage.Client()`` is created.
    dest_root: Optional destination folder; defaults to ``blobs`` under the
      current working directory.

  Returns:
    None. The function is used for its side effect of writing files.
  """
  storage_client = client if client is not None else storage.Client()
  if dest_root is None:
    dest_root = os.path.join(os.getcwd(), 'blobs')
  bucket = storage_client.bucket(bucket_name)

  for blob in bucket.list_blobs(prefix=prefix):
    blob_name = blob.name
    if blob_name.endswith('/'):
      # Placeholder object representing a folder; there is no file to write.
      continue
    # Match on the '.pq' extension rather than any name that merely ends in "pq".
    subfolder = 'tabular_data' if blob_name.endswith('.pq') else 'evidences'
    blob_path = os.path.join(dest_root, subfolder, blob_name)
    # download_to_filename does not create missing folders itself.
    os.makedirs(os.path.dirname(blob_path), exist_ok=True)
    blob.download_to_filename(blob_path)


# NOTE(review): tabular_data_file_location and image_data_location are not assigned
# in any cell visible in this notebook — define them (e.g. in the configuration cell)
# before running, otherwise a fresh kernel has no value for them.
download_blobs(tabular_data_file_location, prefix)
download_blobs(image_data_location, prefix)

KeyboardInterrupt: 

## Persisting data into PostgreSQL using DuckDB

### DuckDB connection

In [6]:
# In-memory DuckDB session used by the cells below to move parquet data into PostgreSQL.
conn = duckdb.connect()

# PostgreSQL connection settings, read from the process environment
# (presumably populated from conf.env by the configuration cell — confirm).
user = os.getenv('DB_USER')
password = os.getenv('DB_PASSWORD')
host = os.getenv('DB_HOST')
port = os.getenv('DB_PORT')
dbname = os.getenv('DB_NAME')

# Fail early with a readable message instead of building a URI that contains
# the literal text "None" for an unset variable.
missing_settings = [
    env_name
    for env_name, env_value in (
        ('DB_USER', user),
        ('DB_PASSWORD', password),
        ('DB_HOST', host),
        ('DB_PORT', port),
        ('DB_NAME', dbname),
    )
    if env_value is None
]
if missing_settings:
    raise RuntimeError(
        'Missing database settings in the environment: ' + ', '.join(missing_settings)
    )

# Percent-encode the free-text parts so reserved characters (@ : / ' ...) in a
# user name, password or database name cannot break the URI or the quoted SQL literal.
# The values are expected to be stored raw (not already percent-encoded).
postgres_uri = (
    f"postgresql://{quote(user, safe='')}:{quote(password, safe='')}"
    f"@{host}:{port}/{quote(dbname, safe='')}"
)

# Expose the PostgreSQL database inside DuckDB under the alias "db".
conn.execute(f"ATTACH '{postgres_uri}' AS db (TYPE POSTGRES);")

<duckdb.duckdb.DuckDBPyConnection at 0x72515022d3b0>

### stg_orders

In [7]:
# Copy the order columns of every file matching blobs/2024* into db.staging.stg_orders.
# NOTE(review): nothing visible here prevents a second run from inserting the same rows again.
stg_orders_insert_sql = """
INSERT INTO db.staging.stg_orders (
    order_number, terminal_id, customer_id, cancellation_reason, last_modified_date, arrival_date, deadline_date
)
SELECT
    order_number, terminal_id, customer_id, cancellation_reason, last_modified_date, arrival_date, deadline_date
FROM read_parquet('blobs/2024*')
"""
conn.execute(stg_orders_insert_sql)

<duckdb.duckdb.DuckDBPyConnection at 0x72515022d3b0>

### stg_terminals

In [9]:
# Insert the de-duplicated terminal rows of every file matching blobs/2024*
# into db.staging.stg_terminals.
# DISTINCT applies to the whole selected row (it is not a per-column function),
# so a terminal_id can still appear more than once when its other attributes
# differ between source rows.
# NOTE(review): nothing visible here prevents a second run from inserting the same rows again.
conn.execute("""
INSERT INTO db.staging.stg_terminals (
    terminal_id, terminal_serial_number, terminal_model, terminal_type
)
SELECT DISTINCT
    terminal_id, terminal_serial_number, terminal_model, terminal_type
FROM read_parquet('blobs/2024*')
""")

<duckdb.duckdb.DuckDBPyConnection at 0x7433bc23a770>

### stg_customers

In [8]:
# Insert the de-duplicated customer rows of every file matching blobs/2024*
# into db.staging.stg_customers.
# DISTINCT applies to the whole selected row (it is not a per-column function),
# so a customer_id can still appear more than once when its phone or address
# columns differ between source rows.
# NOTE(review): nothing visible here prevents a second run from inserting the same rows again.
conn.execute("""
INSERT INTO db.staging.stg_customers (
    customer_id, customer_phone, city, country, country_state, zip_code, street_name, complement, neighborhood
)
SELECT DISTINCT
    customer_id, customer_phone, city, country, country_state, zip_code, street_name, complement, neighborhood
FROM read_parquet('blobs/2024*')
""")

<duckdb.duckdb.DuckDBPyConnection at 0x7433bc23a770>

### stg_providers

In [39]:
# Insert each distinct provider value found in the files matching blobs/2024*
# into db.staging.stg_providers.
# NOTE(review): nothing visible here prevents a second run from inserting the same rows again.
conn.execute("""
INSERT INTO db.staging.stg_providers (
    provider
)
SELECT DISTINCT
    provider
FROM read_parquet('blobs/2024*')
""")

<duckdb.duckdb.DuckDBPyConnection at 0x72515022d3b0>

### stg_technicians

In [15]:
# Insert each distinct technician_email value found in the files matching blobs/2024*
# into db.staging.stg_technicians.
# NOTE(review): nothing visible here prevents a second run from inserting the same rows again.
conn.execute("""
INSERT INTO db.staging.stg_technicians (
    technician_email
)
SELECT DISTINCT
    technician_email
FROM read_parquet('blobs/2024*')
""")

<duckdb.duckdb.DuckDBPyConnection at 0x7433bc23a770>

### Tests

In [41]:
# Copy the customer, terminal, order, provider and technician columns of every
# file matching blobs/2024* into the single wide table db.staging.tabelao.
# NOTE(review): nothing visible here prevents a second run from inserting the same rows again.
tabelao_insert_sql = """
INSERT INTO db.staging.tabelao (
    customer_id, customer_phone, city, country, country_state, zip_code, street_name, complement, neighborhood,
             terminal_id, terminal_serial_number, terminal_model, terminal_type, order_number, 
             cancellation_reason, last_modified_date, arrival_date, deadline_date, provider, technician_email
)
SELECT
    customer_id, customer_phone, city, country, country_state, zip_code, street_name, complement, neighborhood,
             terminal_id, terminal_serial_number, terminal_model, terminal_type, order_number, 
             cancellation_reason, last_modified_date, arrival_date, deadline_date, provider, technician_email
             FROM read_parquet('blobs/2024*')
"""
conn.execute(tabelao_insert_sql)

<duckdb.duckdb.DuckDBPyConnection at 0x72515022d3b0>

In [None]:
# Count order_number values per terminal_id in blobs/2024-03-06.pq, largest count first.
# Runs through the module-level duckdb.sql helper rather than the `conn` session.
orders_per_terminal_sql = """select count(order_number) as order_number_count, terminal_id from read_parquet('blobs/2024-03-06.pq')
           group by terminal_id order by order_number_count desc"""
duckdb.sql(orders_per_terminal_sql)