<a href="https://colab.research.google.com/github/MarianaDuartee/ProjetoFinal/blob/main/4_pubSub_final.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### INSTALANDO DEPENDÊNCIAS

In [None]:
!pip install google-cloud-pubsub
!pip install fsspec
!pip install gcsfs
!pip install apache_beam[gcp]
!pip install google-cloud-bigquery

### IMPORTANDO BIBLIOTECAS

In [2]:
import csv
import time
import os

import fsspec
import gcsfs
import pandas as pd

import apache_beam as beam
from apache_beam import window
from apache_beam.options.pipeline_options import PipelineOptions
from apache_beam import coders

from google.cloud import pubsub_v1
from google.cloud import storage
from google.cloud import bigquery

# Service-account setup: publish the key file location through the
# GOOGLE_APPLICATION_CREDENTIALS environment variable.
service_account_key = r"/content/soulcode-projeto-final-4b88bea6e07a.json"
os.environ.update(GOOGLE_APPLICATION_CREDENTIALS=service_account_key)

### PUB AND SUB (PUBLICANDO E CONSUMINDO DADOS)

Produtor

In [None]:
# Input (ingestion) topic that receives the published lines
topico = 'projects/soulcode-projeto-final/topics/ingestor_dados'
publisher = pubsub_v1.PublisherClient()

entrada = r"/content/2_temp_temp_pandas_total_pop_ano_uf.csv"

# Publish the input file one line at a time. The file is opened in binary
# mode, so each message payload is the raw bytes of one line (including its
# trailing newline).
with open(entrada, 'rb') as file:
    for row in file:
        print('Publicando no topico: ', topico)
        # publish() is asynchronous and returns a future; waiting on it makes
        # a failed publish raise here instead of being dropped silently.
        publisher.publish(topico, row).result()
        # Pause 2 seconds between messages (presumably to simulate a stream).
        time.sleep(2)

Consumidor

In [None]:
# ACK (acknowledgement) callback
def monstrar_msg(mensagem):
  """Print the received message and acknowledge it.

  Args:
    mensagem: object passed to the subscription callback; it must provide
      an ``ack()`` method.
  """
  texto = 'Mensagem: ' + format(mensagem)
  print(texto)
  mensagem.ack()

# Output subscription to consume from
subscription = 'projects/soulcode-projeto-final/subscriptions/consumidor_dados_violencia'
subscriber = pubsub_v1.SubscriberClient()

# subscribe() starts pulling in the background and returns a future that
# represents the streaming pull; keep it so errors can be observed.
streaming_pull_future = subscriber.subscribe(subscription, callback=monstrar_msg)

# Block on the future instead of sleeping in an endless loop: result() raises
# if the streaming pull fails, and the `with` block closes the client on exit.
with subscriber:
  try:
    streaming_pull_future.result()
  except KeyboardInterrupt:
    # Interrupting the cell stops the consumer cleanly.
    streaming_pull_future.cancel()   # trigger the shutdown
    streaming_pull_future.result()   # wait until the shutdown completes

### PIPELINE TO GOOGLE CLOUD STORAGE (STREAMING)

In [72]:
# Pipeline options: Dataflow runner in streaming mode. Because
# --template_location is set, running the pipeline stages a template at that
# path.
pipeline_args = ['--runner=DataflowRunner',
                 '--job_name=bq-load',
                 '--project=soulcode-projeto-final',
                 '--region=southamerica-east1',
                 '--temp_location=gs://data_lake_ingest_data/temp_process',
                 '--staging_location=gs://data_lake_ingest_data/temp_process',
                 '--template_location=gs://data_lake_ingest_data/4_templates/data_streaming',
                 '--streaming'
                 ]

# Attach the options to the pipeline
options = PipelineOptions(pipeline_args)
p1 = beam.Pipeline(options=options)


def _mostrar_e_repassar(registro):
    """Print a record and return it unchanged.

    A bare ``print`` returns None, so using it directly as a transform
    function leaves nothing for the downstream write step.
    """
    print(registro)
    return registro


# Pipeline definition: read text lines, split each on commas, window them,
# echo every record and write the result to Cloud Storage.
rows = (
    p1
    | 'Extrair os dados' >> beam.io.ReadFromText(
        'gs://data_lake_ingest_data/1_input/OcorrenciasUF.json',
        skip_header_lines=0,  # integer count of lines to skip (was False)
        coder=coders.StrUtf8Coder())
    | 'Separador' >> beam.Map(lambda record: record.split(','))
    # Sliding windows of size 10 with period 5 (seconds)
    | 'Window' >> beam.WindowInto(window.SlidingWindows(10, 5))
    | 'Saida de dados' >> beam.Map(_mostrar_e_repassar)
    # No header argument: `header` takes the header text itself, and the
    # column names are not known here, so none is written (was header=True).
    | 'Gravar resultado' >> beam.io.WriteToText('gs://data_lake_ingest_data/temp/TEST_')
)

p1.run()



<DataflowPipelineResult None at 0x7f7ce6685050>

### PIPELINE TO BIG QUERY (BATCH)

In [None]:
# Pipeline options: Dataflow runner in batch mode (no --streaming flag).
# Because --template_location is set, running the pipeline stages a template
# at that path.
pipeline_args = ['--runner=DataflowRunner',
                 '--job_name=bq-load',
                 '--project=soulcode-projeto-final',
                 '--region=southamerica-east1',
                 '--temp_location=gs://data_lake_ingest_data/temp_process',
                 '--staging_location=gs://data_lake_ingest_data/temp_process',
                 '--template_location=gs://data_lake_ingest_data/4_templates/data_batch'
                 ]

options = PipelineOptions(pipeline_args)
p1 = beam.Pipeline(options=options)

path_file = 'gs://data_lake_ingest_data/1_input/estimativa_pop_2016_2021.csv'

# CSV column name -> Python type used to convert each field of a line
esquema = {'UF': str,
           'COD. UF': int,
           'COD. MUN': int,
           'NOME DO MUNICÍPIO': str,
           'POPULAÇÃO ESTIMADA': int,
           'ANO': int}

# NOTE(review): assumes the file is UTF-8 and comma-delimited — TODO confirm
ENCODING_CSV = 'utf-8'

# Python type -> BigQuery column type
_TIPOS_BQ = {str: 'STRING', int: 'INTEGER'}


def _nome_coluna_bq(nome):
    """Turn a CSV column name into one made only of ASCII letters, digits and underscores."""
    import re
    import unicodedata
    sem_acentos = unicodedata.normalize('NFKD', nome).encode('ascii', 'ignore').decode('ascii')
    return re.sub(r'\W+', '_', sem_acentos).strip('_')


def _linha_para_registro(linha, cabecalho, colunas, tipos, encoding):
    """Parse one raw CSV line into a row dict for BigQuery.

    Args:
        linha: raw bytes of one line of the input file.
        cabecalho: list of the original CSV column names; a line whose fields
            equal this list is treated as the header and produces no output.
        colunas: list of output column names, in field order.
        tipos: list of callables that convert each field, in field order.
        encoding: text encoding used to decode ``linha``.

    Yields:
        One dict mapping each column name to its converted value.

    Raises:
        ValueError: if the number of fields differs from ``colunas`` or a
            field cannot be converted by its type.
    """
    # Imported locally so the function carries its own dependency when it is
    # serialized and executed on a worker.
    import csv
    campos = [campo.strip() for campo in next(csv.reader([linha.decode(encoding)]), [])]
    if not campos or campos == cabecalho:
        return  # blank line or header row: nothing to load
    if len(campos) != len(colunas):
        raise ValueError('Expected {} fields but got {}: {!r}'.format(len(colunas), len(campos), campos))
    yield {coluna: tipo(valor) for coluna, tipo, valor in zip(colunas, tipos, campos)}


def _imprimir_e_devolver(registro):
    """Print a record and return it unchanged so the write step still receives it."""
    print(registro)
    return registro


colunas_bq = [_nome_coluna_bq(nome) for nome in esquema]
# WriteToBigQuery takes the schema as 'name:TYPE,name:TYPE,...' (among other
# formats); a dict of Python types is not one of them.
esquema_bq = ','.join('{}:{}'.format(coluna, _TIPOS_BQ[tipo])
                      for coluna, tipo in zip(colunas_bq, esquema.values()))

rows = (
        p1

        | "Extraindo_Dados" >> beam.io.ReadFromText(
                        file_pattern=path_file,
                        coder=coders.BytesCoder(),
                        skip_header_lines=0)
        # Raw byte lines must become dicts keyed by column name before the
        # BigQuery write; the header row, if present, is dropped here.
        | "Converter_Linhas" >> beam.FlatMap(
                        _linha_para_registro,
                        cabecalho=list(esquema),
                        colunas=colunas_bq,
                        tipos=list(esquema.values()),
                        encoding=ENCODING_CSV)
        # Echo each row while passing it on (a bare print would emit None).
        | "Saida_Dados" >> beam.Map(_imprimir_e_devolver)
        | "Gravar_Resultado" >> beam.io.WriteToBigQuery(
                                   table='Teste_Beam_Apache',
                                   dataset='Teste',
                                   project='soulcode-projeto-final',
                                   custom_gcs_temp_location='gs://data_lake_ingest_data/temp_process',
                                   method='FILE_LOADS',
                                   schema=esquema_bq,
                                   create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
                                   write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND)
        )

p1.run()