# Stone Tech Challenge

## Library and module imports

In [5]:
import duckdb
import os
import polars as pl
from datetime import date
from dotenv import load_dotenv
from google.cloud import storage

## Configuration

In [6]:
# Load key/value pairs from the local env file into the process environment
# (presumably the DB_* settings read further down — confirm against conf.env).
load_dotenv('./config_files//conf.env')

# Google client libraries look up credentials through this environment
# variable; point it at the key file stored next to the notebook.
credentials_path = os.path.join(os.getcwd(), 'config_files//key.json')
os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = credentials_path

# Source bucket and the object-name prefix used to select blobs.
# NOTE(review): the prefix follows the current calendar year, while the
# staging cells below read 'files/2024*' — confirm these are meant to agree.
bucket_name = 'desafio-eng-dados'
prefix = f'{date.today().year}'

## Data extraction from GCS bucket

In [8]:


def download(bucket_name, prefix, dest_dir='blobs/'):
    """Download every object under ``prefix`` from a GCS bucket to local disk.

    Each blob is written to ``<cwd>/<dest_dir>/<blob name>``. Parent
    directories are created on demand, so the destination folder does not have
    to exist beforehand and blob names containing ``/`` are mirrored as
    sub-directories.

    Args:
        bucket_name: Name of the Cloud Storage bucket to read from.
        prefix: Only blobs whose name starts with this prefix are downloaded.
        dest_dir: Directory, relative to the current working directory, that
            receives the files. Defaults to ``'blobs/'``.

    Returns:
        None. The files on disk are the only result.

    NOTE(review): the staging cells later in this notebook read from
    ``files/...`` rather than ``blobs/...`` — confirm which folder is intended.
    """
    storage_client = storage.Client()
    bucket = storage_client.bucket(bucket_name)

    for blob in bucket.list_blobs(prefix=prefix):
        # Names ending in '/' are folder placeholders: joining them yields a
        # directory path, which cannot be opened as a download target.
        if blob.name.endswith('/'):
            continue

        local_path = os.path.join(os.getcwd(), dest_dir, blob.name)
        # The download opens local_path for writing and fails if the parent
        # directory is missing, so make sure it exists first.
        os.makedirs(os.path.dirname(local_path), exist_ok=True)
        blob.download_to_filename(local_path)


# Fetch the blobs selected by the configured bucket and prefix.
download(bucket_name=bucket_name, prefix=prefix)

## Persisting data into PostgreSQL using DuckDB

### DuckDB connection

In [9]:
# In-memory DuckDB session used as the engine that reads the parquet files
# and writes into PostgreSQL.
conn = duckdb.connect()

# PostgreSQL connection settings come from environment variables
# (presumably populated by the load_dotenv call in the configuration cell).
user = os.getenv('DB_USER')
password = os.getenv('DB_PASSWORD')
host = os.getenv('DB_HOST')
port = os.getenv('DB_PORT')
dbname = os.getenv('DB_NAME')

# os.getenv returns None for unset variables, which the f-string below would
# embed as the literal text "None". Fail fast with an actionable message.
db_settings = {
    'DB_USER': user,
    'DB_PASSWORD': password,
    'DB_HOST': host,
    'DB_PORT': port,
    'DB_NAME': dbname,
}
missing_settings = [name for name, value in db_settings.items() if value is None]
if missing_settings:
    raise RuntimeError(
        f"Missing required environment variable(s): {', '.join(missing_settings)}"
    )

# Expose the PostgreSQL database inside DuckDB under the alias `db`.
# NOTE(review): the password is interpolated as-is; characters such as '@' or
# '/' would need percent-encoding in a URI — confirm the stored value is safe.
conn.execute(f"ATTACH 'postgresql://{user}:{password}@{host}:{port}/{dbname}' AS db (TYPE POSTGRES);")

<duckdb.duckdb.DuckDBPyConnection at 0x7ce1053e8770>

### stg_orders

In [25]:
# Copy the order-level columns from the matching parquet files into the
# staging table of the attached PostgreSQL database.
stg_orders_insert_sql = """
INSERT INTO db.staging.stg_orders (
    order_number, terminal_id, customer_id, cancellation_reason, last_modified_date, arrival_date, deadline_date
)
SELECT
    order_number, terminal_id, customer_id, cancellation_reason, last_modified_date, arrival_date, deadline_date
FROM read_parquet('files/2024*')
"""
conn.execute(stg_orders_insert_sql)

<duckdb.duckdb.DuckDBPyConnection at 0x7472d9bfacf0>

### stg_terminals

In [26]:
# Copy the terminal attributes from the parquet files into staging.
# DISTINCT applies to the whole select list, not only to terminal_id: a
# terminal is inserted more than once if any of its attributes differ between
# rows. (The earlier `distinct(terminal_id), ...` spelling meant the same thing
# but read like a per-column de-duplication.)
# NOTE(review): if exactly one row per terminal is required, consider
# `SELECT DISTINCT ON (terminal_id) ...` with an explicit ORDER BY.
conn.execute("""
INSERT INTO db.staging.stg_terminals (
    terminal_id, terminal_serial_number, terminal_model, terminal_type, provider, technician_email
)
SELECT DISTINCT
    terminal_id, terminal_serial_number, terminal_model, terminal_type, provider, technician_email
FROM read_parquet('files/2024*')
""")

<duckdb.duckdb.DuckDBPyConnection at 0x7472d9bfacf0>

### stg_customers

In [27]:
# Copy the customer attributes from the parquet files into staging.
# DISTINCT applies to the whole select list, not only to customer_id: a
# customer is inserted more than once if any attribute (phone, address, ...)
# differs between rows. (The earlier `distinct(customer_id), ...` spelling
# meant the same thing but read like a per-column de-duplication.)
# NOTE(review): if exactly one row per customer is required, consider
# `SELECT DISTINCT ON (customer_id) ...` with an explicit ORDER BY.
conn.execute("""
INSERT INTO db.staging.stg_customers (
    customer_id, customer_phone, city, country, country_state, zip_code, street_name, complement, neighborhood
)
SELECT DISTINCT
    customer_id, customer_phone, city, country, country_state, zip_code, street_name, complement, neighborhood
FROM read_parquet('files/2024*')
""")

<duckdb.duckdb.DuckDBPyConnection at 0x7472d9bfacf0>

In [20]:
# Peek at the first rows of a single day's file to inspect columns and types.
# This goes through DuckDB's module-level default connection, not `conn`.
preview_query = "select * from read_parquet('files/2024-03-06.pq') limit 20"
duckdb.sql(preview_query)

┌──────────────┬─────────────┬──────────────────────┬───┬───────────────┬──────────────────────┬────────────────────┐
│ order_number │ terminal_id │ terminal_serial_nu…  │ … │ deadline_date │ cancellation_reason  │ last_modified_date │
│    int64     │    int64    │       varchar        │   │     date      │       varchar        │        date        │
├──────────────┼─────────────┼──────────────────────┼───┼───────────────┼──────────────────────┼────────────────────┤
│      7780767 │           1 │ 8c2f5ca9-0de4-48fb…  │ … │ 2024-01-04    │ Cliente não conseg…  │ 2024-01-08         │
│      5109978 │           2 │ ea77eed3-d6df-4b31…  │ … │ 2024-01-05    │ O técnico teve um …  │ 2024-01-05         │
│      4061434 │           3 │ 3e4362cf-f481-4279…  │ … │ 2024-01-05    │ NULL                 │ 2024-01-03         │
│      3520100 │           4 │ c2c19921-ddcc-459e…  │ … │ 2024-01-09    │ O técnico teve um …  │ 2024-01-08         │
│      1291689 │           5 │ 76e7f9a9-a455-4ab3…  │ … 

In [None]:
# Read back the staged customers that have no street name, straight from
# PostgreSQL into a polars DataFrame.
uri = f'postgresql://{user}:{password}@{host}:{port}/{dbname}'
query = 'SELECT * FROM staging.stg_customers WHERE street_name IS NULL'

df = pl.read_database_uri(
    query=query,
    uri=uri
)

# TODO: decide how missing street names should be handled, e.g.
# `df.with_columns(pl.col('street_name').fill_null(<value>))`.
# DataFrame.replace_column needs a column index and a Series, so it cannot be
# called without arguments. Until then, report how many rows are affected.
df.height