### Link para acessar os datasets: https://data.boston.gov/dataset/311-service-requests

#### Extraindo os dados do dataset

In [9]:
import urllib.request
import pandas as pd

In [10]:
# Helper that downloads a remote file and stores it on disk

def extract_data(url, filename):
    """Download ``url`` and save it as ``filename``.

    The payload is first written to ``<filename>.part`` and only renamed to
    ``filename`` once the transfer finished, so a failed or interrupted
    download never leaves a truncated file at the final path. The parent
    directory of ``filename`` is created when it does not exist yet.

    Errors derived from ``Exception`` are reported with ``print`` and not
    re-raised (best-effort download, same as before). Other interruptions,
    such as ``KeyboardInterrupt``, still propagate after the partial file
    has been removed.

    Args:
        url: Address of the resource to download.
        filename: Destination path of the downloaded file.

    Returns:
        None.
    """
    import os  # local import: this cell runs before the notebook imports ``os``

    partial = None
    try:
        destination = os.fspath(filename)
        partial = destination + '.part'

        parent = os.path.dirname(destination)
        if parent:
            os.makedirs(parent, exist_ok=True)

        urllib.request.urlretrieve(url, partial)
        os.replace(partial, destination)  # atomic swap into the final location

    except Exception as e:
        print("Error: ", e)

    finally:
        # Only still present when the download did not complete
        if partial is not None and os.path.exists(partial):
            os.remove(partial)

In [11]:
# Download the yearly 311 service-request CSV files (2019 through 2023).
# Each entry pairs the source URL with the local path the file is saved to.

downloads = [
    ('https://data.boston.gov/dataset/8048697b-ad64-4bfc-b090-ee00169f2323/resource/ea2e4696-4a2d-429c-9807-d02eb92e0222/download/tmpcje3ep_w.csv',
     '../data/dados_2019.csv'),
    ('https://data.boston.gov/dataset/8048697b-ad64-4bfc-b090-ee00169f2323/resource/6ff6a6fd-3141-4440-a880-6f60a37fe789/download/tmpcv_10m2s.csv',
     '../data/dados_2020.csv'),
    ('https://data.boston.gov/dataset/8048697b-ad64-4bfc-b090-ee00169f2323/resource/f53ebccd-bc61-49f9-83db-625f209c95f5/download/tmp88p9g82n.csv',
     '../data/dados_2021.csv'),
    ('https://data.boston.gov/dataset/8048697b-ad64-4bfc-b090-ee00169f2323/resource/81a7b022-f8fc-4da5-80e4-b160058ca207/download/tmpfm8veglw.csv',
     '../data/dados_2022.csv'),
    ('https://data.boston.gov/dataset/8048697b-ad64-4bfc-b090-ee00169f2323/resource/e6013a93-1321-4f2a-bf91-8d8a02f1e62f/download/tmp182oee02.csv',
     '../data/dados_2023.csv'),
]

for origem, destino in downloads:
    extract_data(origem, destino)

KeyboardInterrupt: 

In [12]:
# Paths of the downloaded CSV files, one per year from 2019 to 2023,
# used as input when the data is read into dataframes

arquivos = [f'../data/dados_{ano_arquivo}.csv' for ano_arquivo in range(2019, 2024)]

In [13]:
# Build a dictionary of dataframes, one per file, keyed by the year taken from the file name

df = dict()

for arquivo in arquivos:
    # Text after the last underscore, e.g. '2019.csv'
    sufixo = arquivo.rsplit('_', 1)[-1]
    # Everything before the first dot of that suffix is the year used as key
    ano = sufixo.split('.', 1)[0]
    df[ano] = pd.read_csv(arquivo)

  exec(code_obj, self.user_global_ns, self.user_ns)


In [14]:
# Show the first five rows of the 2019 dataframe

df['2019'].head(n=5)

Unnamed: 0,case_enquiry_id,open_dt,sla_target_dt,closed_dt,on_time,case_status,closure_reason,case_title,subject,reason,...,neighborhood,neighborhood_services_district,ward,precinct,location_street_name,location_zipcode,latitude,longitude,geom_4326,source
0,101002767874,2019-01-01 00:03:00,2019-01-03 03:30:00,2019-01-01 02:59:43,ONTIME,Closed,Case Closed. Closed date : 2019-01-01 07:59:43...,PRINTED : Knocked down/Eric Huynh,Public Works Department,Street Lights,...,Roxbury,13,Ward 14,1401.0,192 Magnolia St,2121.0,42.31199,-71.073181,0101000020E610000065AD9DFDAEC451C0051BFB45EF27...,Constituent Call
1,101002767875,2019-01-01 00:17:00,,2019-11-24 08:48:34,ONTIME,Closed,Case Closed. Closed date : 2019-11-24 13:48:34...,Fire Hydrant,Boston Water & Sewer Commission,Fire Hydrant,...,Back Bay,14,5,505.0,INTERSECTION Beaver St & Beacon St,,42.355555,-71.072049,0101000020E61000001A659F739CC451C0516121D7822D...,Constituent Call
2,101002767877,2019-01-01 00:31:51,2019-01-03 03:30:00,2019-01-01 21:27:40,ONTIME,Closed,Case Closed. Closed date : 2019-01-02 02:27:40...,Parking Enforcement,Transportation - Traffic Division,Enforcement & Abandoned Vehicles,...,Boston,3,03,,35 Fruit St,2114.0,42.362755,-71.069185,0101000020E61000009E8A6A866DC451C0F2243BBF6E2E...,Citizens Connect App
3,101002767878,2019-01-01 00:42:00,,2019-05-17 08:33:13,ONTIME,Closed,Case Closed. Closed date : 2019-05-17 12:33:13...,Police: Full Notifications,Mayor's 24 Hour Hotline,Notification,...,Dorchester,8,Ward 15,1503.0,35-37 Clarkson St,2125.0,42.30893,-71.066961,0101000020E61000007301041549C451C010DDC5008B27...,Constituent Call
4,101002767879,2019-01-01 01:09:12,2019-01-03 03:30:00,2019-01-01 21:28:11,ONTIME,Closed,Case Closed. Closed date : 2019-01-02 02:28:11...,Parking Enforcement,Transportation - Traffic Division,Enforcement & Abandoned Vehicles,...,East Boston,1,Ward 1,109.0,196 Trenton St,2128.0,42.380799,-71.03197,0101000020E6100000DAA350CD0BC251C0A6639A09BE30...,Citizens Connect App


__________________

#### Conectando com a conta de armazenamento da AWS

In [4]:
import os
from dotenv import load_dotenv

# Load variables from a .env file so they can be read from the environment below
load_dotenv()

# AWS credentials read from the environment; each is None when the variable is not set
access_key = os.environ.get('AWS_ACCESS_KEY_ID')
secret_key = os.environ.get('AWS_SECRET_ACCESS_KEY')

In [5]:
import boto3

# AWS access key and secret key (read from the environment earlier) plus the AWS region
aws_access_key_id, aws_secret_access_key = access_key, secret_key
region_name = 'us-east-1'

# Configure the boto3 default session with the access keys and the region
boto3.setup_default_session(
    region_name=region_name,
    aws_secret_access_key=aws_secret_access_key,
    aws_access_key_id=aws_access_key_id,
)

# S3 client used for the S3 calls in the rest of the notebook
s3 = boto3.client('s3')

In [6]:
# Content of the test file that will be uploaded to S3
conteudo = 'Testando colocando as chaves de acesso em variaveis de ambiente'

# Write the content to the local test file testando-novamente.txt
with open('../data/testando-novamente.txt', mode='w') as arquivo_teste:
    arquivo_teste.write(conteudo)

In [7]:
# Upload the local test file to the 'datalake-estudos' bucket
# under the key 'bronze-camada/teste-arquivo-variaveis'
s3.upload_file(
    Filename='../data/testando-novamente.txt',
    Bucket='datalake-estudos',
    Key='bronze-camada/teste-arquivo-variaveis',
)

_____

#### Convertendo os arquivos para o formato **parquet**

In [15]:
from io import BytesIO

# Convert each yearly dataframe to parquet in memory and upload it to S3.
# The loop variable must not be named `df`: rebinding `df` here would replace
# the dictionary of dataframes with the last DataFrame, so this cell could not
# be re-run and lookups such as df['2019'] would stop working afterwards.
for ano, df_ano in df.items():
    parquet_buffer = BytesIO()  # In-memory buffer that receives the parquet bytes
    df_ano.to_parquet(parquet_buffer)  # Serialize the dataframe into the buffer as parquet

    # Store the parquet bytes in the bucket under a key named after the year
    s3.put_object(Bucket='datalake-estudos',
                  Key=f'bronze-camada/dados-{ano}.parquet',
                  Body=parquet_buffer.getvalue())

In [16]:
# List the objects of the 'datalake-estudos' bucket whose key starts with 'bronze-camada/'
response = s3.list_objects(
    Prefix='bronze-camada/',
    Bucket='datalake-estudos',
)

In [18]:
# Collect the object keys from the listing response.
# 'Contents' is looked up with a default because the key is absent from the
# response when no object matches the prefix; the result is then an empty list.
keys = [content['Key'] for content in response.get('Contents', [])]
print(keys)  # Show the object keys found in the bucket

['bronze-camada/', 'bronze-camada/dados-2019.parquet', 'bronze-camada/dados-2020.parquet', 'bronze-camada/dados-2021.parquet', 'bronze-camada/dados-2022.parquet', 'bronze-camada/dados-2023.parquet', 'bronze-camada/teste-arquivo', 'bronze-camada/teste-arquivo-variaveis']
