## Overview

This notebook shows you how to create and query a table or DataFrame loaded from data stored in AWS S3. There are two ways to establish access to S3: [IAM roles](https://docs.databricks.com/user-guide/cloud-configurations/aws/iam-roles.html) and access keys.

*We recommend using IAM roles to specify which cluster can access which buckets. Keys can show up in logs and table metadata and are therefore fundamentally insecure.* If you do use keys, you'll have to escape the `/` in your keys with `%2F`.

This is a **Python** notebook so the default cell type is Python. However, you can use different languages by using the `%LANGUAGE` magic command. Python, Scala, SQL, and R are all supported.

In [0]:
# Session-level SET commands issued before any table is created or written:
# the two Delta `properties.defaults.*` protocol versions and the schema
# auto-merge flag. Each entry becomes one `SET <key> = <value>` statement.
session_settings = {
    "spark.databricks.delta.properties.defaults.minWriterVersion": "5",
    "spark.databricks.delta.properties.defaults.minReaderVersion": "2",
    "spark.databricks.delta.schema.autoMerge.enabled": "true",
}
for setting_key, setting_value in session_settings.items():
    spark.sql(f"SET {setting_key} = {setting_value}")

Out[1]: DataFrame[key: string, value: string]

In [0]:
# Where the source file lives and which reader format to use
file_location = "s3a://uutrase/data/trase-uppsala/brazil/CNPJ_2019.csv"
file_type = "csv"

# Tunable CSV settings
infer_schema = "true"
first_row_is_header = "true"
delimiter = ";"

# All reader options in one place, applied in this order. They target CSV
# input; for other file types they will be ignored.
csv_options = {
    "inferSchema": infer_schema,
    "header": first_row_is_header,
    "sep": delimiter,
    "encoding": "UTF-8",
    "ignoreLeadingWhiteSpace": "true",
    "ignoreTrailingWhiteSpace": "true",
    "nullvalue": "NA",
    "multiLine": "true",
}

csv_reader = spark.read.format(file_type)
for option_name, option_value in csv_options.items():
    csv_reader = csv_reader.option(option_name, option_value)

df = csv_reader.load(file_location)

In [0]:
# Make the loaded DataFrame addressable from SQL as `df_table`.
df.createOrReplaceTempView("df_table")

# `WHERE 1=2` matches no rows, so this statement only creates the Delta table
# (when it does not exist yet) with the columns of `df_table`.
create_table_sql = (
    "CREATE TABLE IF NOT EXISTS trase_catalog.brazil.cnpj_2019 "
    "    USING DELTA "
    "    SELECT * from df_table "
    "    WHERE 1=2"
)
spark.sql(create_table_sql)

# Append the rows of `df` to the table.
append_writer = df.write.format("delta").mode("append")
append_writer.insertInto("trase_catalog.brazil.cnpj_2019")
#df.write.format("delta").option("mergeSchema", "true").mode("append").insertInto("trase_catalog.brazil.cnpj_2019_silver")

In [0]:
# Render the current contents of the Delta table in the notebook.
cnpj_2019_table = spark.table("trase_catalog.brazil.cnpj_2019")
display(cnpj_2019_table)

municipio,uf,tipo_de_registro,indicador,tipo_atualizacao,cnpj,identificador_matriz_filial,razao_social,nome_fantasia,situacao_cadastral,data_situacao_cadastral,motivo_situacao_cadastral,codigo_natureza_juridica,data_inicio_atividade,cnae_primary,descricao_tipo_logradouro,logradouro,numero,complemento,bairro,cep,ddd_telefone_1,ddd_telefone_2,ddd_fax,correio_eletronico,qualificacao_responsavel,capital_social_empresa,porte_empresa,opcao_pelo_simples,opcao_pelo_mei,geocodmun,uf_cod,type
ABADIA DE GOIAS,GO,1,False,,19930203000105,1,DOMINGOS RIBEIRO DE SOUZA 17309972104,SOUZA PINTURAS LTDA,2,2014-03-23,0,2135,2014-03-23,4330404,RUA,NA-1 QUADRA 07 LOTE 35,SN,,JARDIM NOVA ABADIA,75345000,62 96736246,,,lfmcontabilidade@terra.com.br,50,1.0,1,5,S,5200050,52,cnpj
ABADIA DE GOIAS,GO,1,False,,22908603000129,1,ETIENY DE MELO QUEIROZ 05084268128,PRE MOLDADOS ABADIA,8,2018-11-19,1,2135,2015-07-23,4744099,AVENIDA,Paineiras,sn,QUADRA 14 LOTE 21/22,Residencial Paineiras,75345000,62 81449970,,,luizfadm@bol.com.br,50,5000.0,1,6,S,5200050,52,cnpj
ABADIA DE GOIAS,GO,1,False,,3291465000137,1,RAVENNA LOCACAO E PARTICIPACOES LTDA,MECAT,2,2005-01-08,0,2062,1987-05-08,6810202,RODOVIA,060 KM 213 + 150 MTS,SN,BLOCO A,SETOR CENTRAL,75345000,62 5031155,,62 5031112,mecat@zaz.com.br,49,1330425.0,1,0,N,5200050,52,cnpj
ABADIA DE GOIAS,GO,1,False,,24226952000196,1,FUNERARIA APAX SERVICOS POSTUMOS EIRELI,FUNERARIA APAX,2,2016-02-19,0,2305,2016-02-19,9603304,AVENIDA,COMERCIAL,S/N,QUADRA01 LOTE 05,CENTRO,75345000,62 40169330,62 92508770,,,65,88000.0,1,5,N,5200050,52,cnpj
ABADIA DE GOIAS,GO,1,False,,18381696000109,1,ENGEAB CONSTRUCOES LTDA,ENGEAB CONSTRUCOES,8,2015-07-23,1,2062,2013-01-22,4299599,AVENIDA,COMERCIAL,SN,QUADRA 01 LOTE 1E SALA 1,PARQUE IZABEL,75345000,62 35031185,,,,49,300000.0,1,6,N,5200050,52,cnpj
ABADIA DE GOIAS,GO,1,False,,29961764000125,1,APARECIDA CRISTINA VIEIRA DE JESUS 02309315109,M&C CONFECCOES,2,2018-03-16,0,2135,2018-03-16,1412601,RUA,rua gabriel alves area 02,02,ANEXO cepac,parque izabel,75345000,62 99196637,,,marcoabruno19@gmail.com,50,6000.0,1,5,S,5200050,52,cnpj
ABADIA DE GOIAS,GO,1,False,,16138907000151,1,ELEICAO 2012 SANDRA VIEIRA DE SANTANA VEREADOR,,8,2012-12-31,1,4090,2012-07-06,9492800,RUA,JOSE FIRMINO CAMPOS,00,CHACARA BOUGANVILE,SETOR DOM FELIPE,75345000,0 0,,0 0,psdb_psb2012@hotmail.com,51,0.0,5,0,N,5200050,52,cnpj
ABADIA DE GOIAS,GO,1,False,,30568100000180,1,LIMPMIL GESTAO E TRATAMENTO DE RESIDUOS EIRELI,LIMPMIL AMBIENTAL GESTAO E TRATAMENTO DE RESIDUOS,2,2018-05-28,0,2305,2018-05-28,3822000,RUA,ALMINDA GALDINO DE OLIVEIRA,S/N,QUADRAAPM LOTE 5 B,MARIA OLIVEIRA,75345000,62 91690604,,,,65,150000.0,1,5,N,5200050,52,cnpj
ABADIA DE GOIAS,GO,1,False,,33532310000106,1,DANIEL DE OLIVEIRA COMERCIO DE BEBIDAS,ATACADAO DE BEBIDAS,2,2019-05-06,0,2135,2019-05-06,4635499,AVENIDA,COMERCIAL,SN,QUADRAA LOTE 4-A,VILA GOIANY,75345000,62 84489221,,,FOKUSCONTABILIDADE@HOTMAIL.COM,50,10000.0,1,0,N,5200050,52,cnpj
ABADIA DE GOIAS,GO,1,False,,24196090000104,1,EDSON TRANSPORTES EIRELI,EDSON TRANSPORTES,2,2016-02-17,0,2305,2016-02-17,4930202,RUA,3,0,KM CHACARA 1A SALA 02,VILA NOSSA SENHORA DA GUIA,75345000,62 39326526,62 85798775,62 39326526,CONTERDATACONTABILIDADE@GMAIL.COM,65,90000.0,1,5,N,5200050,52,cnpj


In [0]:
%sql
-- Manual clean-up of trase_catalog.brazil.cnpj_2019 before its columns are
-- re-typed. Every statement is terminated with ';' so the cell parses as a
-- sequence of statements, and the table is fully qualified like in the other
-- cells of this notebook.
-- NOTE(review): the SELECT statements exist to inspect/save the offending rows;
-- run them on their own (before the DELETE/UPDATE that follows) when their
-- output needs to be exported.
-- NOTE(review): the unquoted numeric literals drop the leading zero of
-- 08069508000239 - confirm this is intended.

-- Repair two malformed cnpj values.
UPDATE trase_catalog.brazil.cnpj_2019 SET cnpj=08069508000239 WHERE cnpj='01654467812CNPJ:08069508000239'; -- Affected rows: 1
UPDATE trase_catalog.brazil.cnpj_2019 SET cnpj=15184499000101 WHERE cnpj='151844990001*01'; -- Affected rows: 1

-- Rows whose uf_cod is not purely numeric: inspect, then remove.
SELECT * FROM trase_catalog.brazil.cnpj_2019 WHERE uf_cod NOT REGEXP '^[0-9]+$'; -- Save these 41 records for cleaning (bad escape char)
DELETE FROM trase_catalog.brazil.cnpj_2019 WHERE uf_cod NOT REGEXP '^[0-9]+$'; -- Deleted unclean records

-- Rows whose porte_empresa is not purely numeric: inspect only.
SELECT * FROM trase_catalog.brazil.cnpj_2019 WHERE porte_empresa NOT REGEXP '^[0-9]+$'; -- Save these 28 records for cleaning

-- Rows whose geocodmun is not purely numeric: inspect, then set to 0.
SELECT * FROM trase_catalog.brazil.cnpj_2019 WHERE geocodmun NOT REGEXP '^[0-9]+$'; -- Save geocodmun from exterior places
UPDATE trase_catalog.brazil.cnpj_2019 SET geocodmun=0 WHERE geocodmun NOT REGEXP '^[0-9]+$'; -- Set to 0 so the column type can be changed


In [0]:
# Needed to use col('column_name')
import pyspark.sql.functions as F

In [0]:
# Target SQL type for every column that gets re-typed. The casts are applied
# one by one in the order listed here.
column_types = {
    "tipo_de_registro": "tinyint",
    "indicador": "boolean",
    "tipo_atualizacao": "tinyint",
    "cnpj": "bigint",
    "identificador_matriz_filial": "tinyint",
    "codigo_natureza_juridica": "int",
    "cnae_primary": "int",
    "porte_empresa": "int",
    "opcao_pelo_simples": "int",
    "geocodmun": "int",
    "uf_cod": "int",
}

cnpj_typed = spark.read.table("trase_catalog.brazil.cnpj_2019")
for column_name, target_type in column_types.items():
    cnpj_typed = cnpj_typed.withColumn(column_name, F.col(column_name).cast(target_type))

# Write the re-typed frame back over the same table; `overwriteSchema` lets
# the stored column types be replaced together with the data.
table_writer = cnpj_typed.write.mode("overwrite").option("overwriteSchema", "true")
table_writer.saveAsTable("trase_catalog.brazil.cnpj_2019")

In [0]:
%sql
-- Run Delta OPTIMIZE on the table, Z-ordering its data by cnae_primary.
-- NOTE(review): presumably cnae_primary is the column most queries filter on - confirm.
OPTIMIZE trase_catalog.brazil.cnpj_2019
ZORDER BY (cnae_primary)

path,metrics
s3://uu-trase-delta/c8020c97-1eec-4c2d-9870-afd022e6c462/tables/5f7c7dad-fed5-4288-a842-61ed76ff099b,"List(47, 25, List(34388201, 83275479, 6.794106234042554E7, 47, 3193229930), List(27246098, 132062199, 1.2678199924E8, 25, 3169549981), 0, List(minCubeSize(107374182400), List(0, 0), List(25, 3169549981), 0, List(25, 3169549981), 1, null), 1, 25, 0, false, 0, 0, 1671205315592, 1671206188639, 2, 1, null)"


The same casts can also be written step by step, reassigning the DataFrame after each `withColumn` call:
```
# Read the current table, then re-type one column per statement.
df = spark.read.table("trase_catalog.brazil.cnpj_2019")

df = df.withColumn("tipo_de_registro", F.col("tipo_de_registro").cast("tinyint"))
df = df.withColumn("indicador", F.col("indicador").cast("boolean"))
df = df.withColumn("tipo_atualizacao", F.col("tipo_atualizacao").cast("tinyint"))
df = df.withColumn("cnpj", F.col("cnpj").cast("bigint"))
df = df.withColumn("identificador_matriz_filial", F.col("identificador_matriz_filial").cast("tinyint"))
df = df.withColumn("codigo_natureza_juridica", F.col("codigo_natureza_juridica").cast("int"))
df = df.withColumn("cnae_primary", F.col("cnae_primary").cast("int"))
df = df.withColumn("porte_empresa", F.col("porte_empresa").cast("int"))
df = df.withColumn("opcao_pelo_simples", F.col("opcao_pelo_simples").cast("int"))
df = df.withColumn("geocodmun", F.col("geocodmun").cast("int"))
df = df.withColumn("uf_cod", F.col("uf_cod").cast("int"))

# Overwrite the table with the updated df. saveAsTable is used instead of
# insertInto because insertInto writes into the table's existing schema, so
# the new column types would not be stored even with overwriteSchema set.
df.write.format("delta").mode("overwrite").option("overwriteSchema", "true").saveAsTable("trase_catalog.brazil.cnpj_2019")
```