<a href="https://colab.research.google.com/github/MarianaDuartee/ProjetoFinal/blob/main/1_pubSub_final.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### INSTALANDO DEPENDENCIAS

In [None]:
!pip install google-cloud-pubsub
!pip install fsspec
!pip install gcsfs
!pip install apache_beam[gcp]
!pip install google-cloud-bigquery

### IMPORTANDO BIBLIOTECAS

In [2]:
import csv
import time
import os

import fsspec
import gcsfs
import pandas as pd

import apache_beam as beam
from apache_beam import window
from apache_beam.options.pipeline_options import PipelineOptions
from apache_beam import coders

from google.cloud import pubsub_v1
from google.cloud import storage
from google.cloud import bigquery

# Configurando conta de serviço
service_account_key = r"/content/soulcode-projeto-final-4b88bea6e07a.json"
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = service_account_key

### PUB AND SUB (PUBLICANDO E CONSUMINDO DADOS)

Produtor

In [None]:
# Setando o tópico de entrada (ingestão)
topico = 'projects/soulcode-projeto-final/topics/ingestor_dados'
publisher = pubsub_v1.PublisherClient()

entrada = r"/content/2_temp_temp_pandas_total_pop_ano_uf.csv"

# Visualizando entrada dos dados
with open(entrada, 'rb') as file:
    for row in file:
        print('Publicando no topico: ', topico)
        publisher.publish(topico,row)
        time.sleep(2)

Consumidor

In [None]:
# Função de ACK (Aceite)
def monstrar_msg(mensagem):
  print(('Mensagem: {}'.format(mensagem)))
  mensagem.ack()

# Setando a subscrição de saída
subscription = 'projects/soulcode-projeto-final/subscriptions/consumidor_dados_violencia'
subscriber = pubsub_v1.SubscriberClient()

subscriber.subscribe(subscription,callback=monstrar_msg)

while True:
  time.sleep(2)

### PIPELINE TO GOOGLE CLOUD STORAGE

In [50]:
# Abrindo Pipeline
p1 = beam.Pipeline()

# realizando a leitura e tratativa dos dados (se necessário)
mkt = (
    p1
    |'Extrair os dados' >> beam.io.ReadFromText('gs://data_lake_ingest_data/1_input/OcorrenciasUF.json', skip_header_lines=False, coder=coders.StrUtf8Coder())
    |'Separador' >> beam.Map(lambda record: record.split(','))
    |'Window' >> beam.WindowInto(window.SlidingWindows(10,5))
    |'Saida de dados' >> beam.FlatMap(print)
    |'Gravar resultado' >> beam.io.WriteToText('gs://data_lake_ingest_data/temp/TEST_', header=True)
)

p1.run()



['{"UF":{"0":"Acre"', '"1":"Acre"', '"2":"Acre"', '"3":"Acre"', '"4":"Acre"', '"5":"Acre"', '"6":"Acre"', '"7":"Acre"', '"8":"Acre"', '"9":"Acre"', '"10":"Acre"', '"11":"Acre"', '"12":"Acre"', '"13":"Acre"', '"14":"Acre"', '"15":"Acre"', '"16":"Acre"', '"17":"Acre"', '"18":"Acre"', '"19":"Acre"', '"20":"Acre"', '"21":"Acre"', '"22":"Acre"', '"23":"Acre"', '"24":"Acre"', '"25":"Acre"', '"26":"Acre"', '"27":"Acre"', '"28":"Acre"', '"29":"Acre"', '"30":"Acre"', '"31":"Acre"', '"32":"Acre"', '"33":"Acre"', '"34":"Acre"', '"35":"Acre"', '"36":"Acre"', '"37":"Acre"', '"38":"Acre"', '"39":"Acre"', '"40":"Acre"', '"41":"Acre"', '"42":"Acre"', '"43":"Acre"', '"44":"Acre"', '"45":"Acre"', '"46":"Acre"', '"47":"Acre"', '"48":"Acre"', '"49":"Acre"', '"50":"Acre"', '"51":"Acre"', '"52":"Acre"', '"53":"Acre"', '"54":"Alagoas"', '"55":"Alagoas"', '"56":"Alagoas"', '"57":"Alagoas"', '"58":"Alagoas"', '"59":"Alagoas"', '"60":"Alagoas"', '"61":"Alagoas"', '"62":"Alagoas"', '"63":"Alagoas"', '"64":"Alago



<apache_beam.runners.portability.fn_api_runner.fn_runner.RunnerResult at 0x7f7ce9c04b10>

### PIPELINE TO BIG QUERY

In [None]:
path_file = 'gs://data_lake_ingest_data/1_input/estimativa_pop_2016_2021.csv'

p1 = beam.Pipeline()

esquema = {'UF':str, 
           'COD. UF':int, 
           'COD. MUN':int,
           'NOME DO MUNICÍPIO':str,
           'POPULAÇÃO ESTIMADA':int,
           'ANO':int}

data = (
    p1
    |'Extrair os dados' >> beam.io.ReadFromText(path_file, skip_header_lines=False, coder=coders.StrUtf8Coder())
    |"Window" >> beam.WindowInto(window.SlidingWindows(10,5))
    |"Separador" >> beam.Map(lambda x: x.split(','))
    |"Saida de dados" >> beam.Map(print)
    |"Gravar resultado" >> beam.io.Write(beam.io.WriteToBigQuery(
                                                table='Teste_Beam_Apache',
                                                dataset='Teste',
                                                project='soulcode-projeto-final',
                                                schema=esquema,
                                                custom_gcs_temp_location='gs://data_lake_ingest_data/temp_process',
                                                create_disposition=bigquery.CreateDisposition.CREATE_IF_NEEDED,
                                                write_disposition=bigquery.WriteDisposition.WRITE_APPEND)))

p1.run()