# Setup

In [None]:
%idle_timeout 10
%timeout 10
%glue_version 4.0
%worker_type G.1X
%number_of_workers 2

In [None]:
import boto3
import os, sys
from awsglue.transforms import *
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext
from pyspark.sql.functions import *
from awsglue.context import GlueContext
from awsglue.job import Job
  
# Reuse the already-running SparkContext when there is one, otherwise create it.
sc = SparkContext.getOrCreate()
# Wrap the context for Glue and expose its SparkSession; `spark` is what the
# cells below use for every CSV read.
glueContext = GlueContext(sc)
spark = glueContext.spark_session
# NOTE(review): `job` is constructed here but no init/commit call on it is
# visible in this notebook — confirm whether it is still needed.
job = Job(glueContext)

In [None]:
# Define data type conversion functions

def cast_numeric_cols(df, numeric_cols, dtype="float"):
    """Convert comma-decimal string columns to a numeric Spark type.

    Each column named in ``numeric_cols`` is replaced under the same name:
    every ',' in its value is rewritten to '.', and the result is cast to
    ``dtype``.

    Args:
        df: DataFrame that contains the columns to convert.
        numeric_cols: Iterable of column names to convert.
        dtype: Type passed to ``Column.cast``. Defaults to ``"float"``, which
            is what this helper has always produced. ``"float"`` is Spark's
            single-precision type, so callers handling monetary amounts may
            prefer ``"double"`` or a ``"decimal(p,s)"`` type.

    Returns:
        A DataFrame with the listed columns cast to ``dtype``. When
        ``numeric_cols`` is empty, ``df`` is returned unchanged.
    """
    converted = df
    for col_name in numeric_cols:
        # Only the decimal comma is rewritten. A value that also uses '.' as a
        # thousands separator (e.g. "1.234,56") would end up with two dots and
        # would not parse as a number.
        normalized = regexp_replace(col_name, ',', '.')
        converted = converted.withColumn(col_name, normalized.cast(dtype))

    return converted


def cast_date_cols(df, date_cols, date_fmt):
    """Parse string columns into dates using a single pattern.

    Args:
        df: DataFrame that contains the columns to convert.
        date_cols: Iterable of column names to parse.
        date_fmt: Pattern handed to ``to_date`` for every listed column.

    Returns:
        A DataFrame in which each listed column has been replaced, under the
        same name, by the result of ``to_date``. When ``date_cols`` is empty,
        ``df`` is returned unchanged.
    """
    result = df
    for name in date_cols:
        parsed = to_date(name, date_fmt)
        result = result.withColumn(name, parsed)
    return result

## Taxas Tesouro

In [None]:
# Load PrecoTaxaTesouroDireto.csv from the landing bucket. The file has a
# header row and uses ';' as the field separator; no schema is supplied here.
df = (
    spark.read
    .options(header=True, sep=";")
    .csv("s3://tesouro-landing/PrecoTaxaTesouroDireto.csv")
)

In [None]:
# Convert numerical and date columns.
# NOTE(review): the numeric columns are selected by position (the last five
# columns of the file) — confirm this still holds if the source layout changes.
num_cols = df.columns[-5:]
date_cols = [name for name in df.columns if name.lower().startswith('data')]

# Build the typed frame under its own name instead of rebinding `df`: this cell
# can then be re-run safely, because it always starts from the raw CSV strings
# (a column that is already a date would no longer match the dd/MM/yyyy pattern).
taxas_typed = cast_date_cols(cast_numeric_cols(df, num_cols), date_cols, "dd/MM/yyyy")

# Write dataframe to parquet. The default save mode raises when the target path
# already exists, so overwrite explicitly to keep the notebook re-runnable.
# Any previous output under this path is replaced.
taxas_typed.write.mode("overwrite").parquet("s3://tesouro-bronze/taxa_tesouro_direto.parquet")

## Operacoes Tesouro

In [None]:
# Load OperacoesTesouroDireto.csv from the landing bucket. The file has a
# header row and uses ';' as the field separator; no schema is supplied here.
df = (
    spark.read
    .options(header=True, sep=";")
    .csv("s3://tesouro-landing/OperacoesTesouroDireto.csv")
)

In [None]:
# Convert numerical and date columns (listed explicitly by name for this file).
num_cols = ["Quantidade", "Valor do Titulo", "Valor da Operacao"]
date_cols = ["Data da Operacao", "Vencimento do Titulo"]

# Build the typed frame under its own name instead of rebinding `df`: this cell
# can then be re-run safely, because it always starts from the raw CSV strings
# (a column that is already a date would no longer match the dd/MM/yyyy pattern).
operacoes_typed = cast_date_cols(cast_numeric_cols(df, num_cols), date_cols, "dd/MM/yyyy")

# Write dataframe to parquet. The default save mode raises when the target path
# already exists, so overwrite explicitly to keep the notebook re-runnable.
# Any previous output under this path is replaced.
operacoes_typed.write.mode("overwrite").parquet("s3://tesouro-bronze/operacoes_tesouro_direto.parquet")

## Investidores Tesouro

In [None]:
s3 = boto3.client('s3')

# Define bucket and prefix
bucket_name = "tesouro-landing"
prefix = "InvestidoresTesouroDireto"

# List every object under the prefix. A single list_objects_v2 call returns at
# most 1,000 keys, so iterate over all pages instead of reading one response.
# Keys ending in '/' are skipped: they are folder placeholders rather than data
# files, and handing one to Spark as a path would read that whole "directory"
# in addition to the files listed individually.
# NOTE(review): these paths use the s3a:// scheme while the other cells in this
# notebook use s3:// — confirm that is intentional.
paginator = s3.get_paginator("list_objects_v2")
files = [
    f"s3a://{bucket_name}/{obj['Key']}"
    for page in paginator.paginate(Bucket=bucket_name, Prefix=prefix)
    for obj in page.get('Contents', [])
    if not obj['Key'].endswith('/')
]

# Fail here with a clear message rather than later inside the CSV reader.
if not files:
    raise FileNotFoundError(f"No objects found under s3://{bucket_name}/{prefix}")

In [None]:
# Load every file collected in `files` as one DataFrame. The files have a
# header row and use ';' as the field separator; no schema is supplied here.
df = (
    spark.read
    .options(header=True, sep=";")
    .csv(files)
)

In [None]:
# Convert numerical and date columns
date_cols = ["Data de Adesao"]

# Build the typed frame under its own name instead of rebinding `df`: this cell
# can then be re-run safely, because it always starts from the raw CSV strings
# (a column that is already a date would no longer match the dd/MM/yyyy pattern).
investidores_typed = (
    cast_date_cols(df, date_cols, "dd/MM/yyyy")
    .withColumn('Idade', col('Idade').cast('integer'))
)

# Write dataframe to parquet. The default save mode raises when the target path
# already exists, so overwrite explicitly to keep the notebook re-runnable.
# Any previous output under this path is replaced.
investidores_typed.write.mode("overwrite").parquet("s3://tesouro-bronze/investidores_tesouro_direto.parquet")