In [1]:
#USER_FLAG = "--user"
#!pip3 install {USER_FLAG} kfp --upgrade
#!pip3 install {USER_FLAG} google_cloud_pipeline_components --upgrade
#!pip3 install {USER_FLAG} 'apache-beam[gcp]'

In [2]:
! python3 -c "import kfp; print('KFP SDK version: {}'.format(kfp.__version__))"

KFP SDK version: 2.0.1


In [3]:
! python3 -c "from google.cloud import aiplatform; print('aiplatform SDK version: {}'.format(aiplatform.__version__))"

aiplatform SDK version: 1.25.0


# 0.0 Imports

In [4]:
from typing import NamedTuple

from kfp.v2 import dsl
from kfp.v2.dsl import (pipeline,
                        Artifact,
                        Dataset,
                        Input,
                        Model,
                        Output,
                        Metrics,
                        ClassificationMetrics,
                        component, 
                        Markdown)

from kfp.v2 import compiler


from google.cloud import aiplatform
from google.cloud.aiplatform import pipeline_jobs

from google_cloud_pipeline_components import aiplatform as gcc_aip

  This is separate from the ipykernel package so we can avoid doing imports until


ImportError: cannot import name 'aiplatform' from 'google_cloud_pipeline_components' (/home/jupyter/.local/lib/python3.7/site-packages/google_cloud_pipeline_components/__init__.py)

In [None]:
# Cloud Storage URI kept in `pipeline_root`.
# NOTE(review): presumably used as the pipeline root (artifact location) when the
# pipeline is defined/submitted in a later cell — confirm against that cell.
pipeline_root = 'gs://bucket_pipeline'

# 1.0 Data Capture

In [None]:
@component(packages_to_install=["google-cloud-bigquery","db-dtypes", "pandas"],
          base_image="python:3.10.6",
          output_component_file="captura_dados.yaml")
def captura_dados():
    """Rebuild the cleaned, date-partitioned e-commerce table in BigQuery.

    Runs one CREATE OR REPLACE TABLE statement that recreates the
    `ecommerce_cds` table (partitioned by `invoice_date`) from the
    `dados_ecommerce_raw` table, keeping only rows that:

    - have an `invoice_date` not later than the current date;
    - have a non-null `customer_id` and `description`;
    - have a `unit_price` of at least 0.04;
    - are not from the countries 'European Community' or 'Unspecified';
    - do not carry one of the excluded `stock_code` values;
    - do not belong to customer 16446.

    The component takes no parameters and returns nothing; its only effects
    are the BigQuery job and the messages it prints/logs.
    """
    # Everything the body needs is imported here: a KFP lightweight component
    # is executed from its own source, so names from the notebook's import
    # cell are not available inside it.
    import logging

    import pandas as pd
    from google.cloud import bigquery

    PROJECT_ID = "gcp-vertex"
    DATASET_ID = "gcp_bq"
    TABLE_RAW_ID = "dados_ecommerce_raw"
    TABLE_ID = "ecommerce_cds"

    # The return annotation is evaluated when this `def` runs, so it may only
    # use names imported above. The function always returns a DataFrame.
    def run_bq_query(sql: str, project_name: str) -> pd.DataFrame:
        """
        Validate a SQL statement with a dry run, execute it, and return its result.

        Args:
            sql: SQL query, as a string, to execute in BigQuery
            project_name: project ID passed to the BigQuery client
        Returns:
            df: DataFrame built from the finished job's result
        Raises:
            Any error raised by the BigQuery client (during the dry run or the
            real run) propagates to the caller unchanged.
        """

        bq_client = bigquery.Client(project=project_name)

        # Try dry run before executing query to catch any errors
        job_config = bigquery.QueryJobConfig(dry_run=True, use_query_cache=False)
        bq_client.query(sql, job_config=job_config)

        # If dry run succeeds without errors, proceed to run query
        job_config = bigquery.QueryJobConfig()
        client_result = bq_client.query(sql, job_config=job_config)

        job_id = client_result.job_id

        # Wait for query/job to finish running. then get & return data frame
        df = client_result.result().to_arrow().to_pandas()
        print(f"Finished job_id: {job_id}")

        return df


    query = f"""
                CREATE OR REPLACE TABLE
               `{PROJECT_ID}.{DATASET_ID}.{TABLE_ID}` (invoice_no STRING,
                stock_code STRING,
                description STRING,
                quantity INT64,
                invoice_date DATE,
                unit_price FLOAT64,
                customer_id FLOAT64,
                country STRING)
            PARTITION BY
              invoice_date AS (
              WITH
                not_nulls AS (
                SELECT
                  *
                FROM
                  `{PROJECT_ID}.{DATASET_ID}.{TABLE_RAW_ID}`
                WHERE
                  invoice_date <= CURRENT_DATE()
                  AND customer_id IS NOT NULL
                  AND description IS NOT NULL),
                filtering_features AS (
                SELECT
                  *
                FROM
                  not_nulls
                WHERE
                  unit_price >= 0.04
                  AND country NOT IN ('European Community',
                    'Unspecified')
                  AND stock_code NOT IN ('POST',
                    'D',
                    'DOT',
                    'M',
                    'S',
                    'AMAZONFEE',
                    'm',
                    'DCGSSBOY',
                    'DCGSSGIRL',
                    'PADS',
                    'B',
                    'CRUK')
                  AND customer_id != 16446)
              SELECT
                *
              FROM
                filtering_features);
    """

    # The statement is DDL, so the returned DataFrame is not used here.
    run_bq_query(query, project_name=PROJECT_ID)
    logging.info(f'Tabela criada: {PROJECT_ID}.{DATASET_ID}.{TABLE_ID}')