In [0]:
#importar as bibliotecas necessárias
from pyspark.sql.types import IntegerType, DoubleType, BooleanType #para transformar string/integer
from pyspark.sql.functions import concat, col, to_date,lpad #para transformar datas


In [0]:
#Configurações necessárias para o acesso aos arquivos
configs = {"fs.azure.account.auth.type": "OAuth",
"fs.azure.account.oauth.provider.type": "org.apache.hadoop.fs.azurebfs.oauth2.ClientCredsTokenProvider",
"fs.azure.account.oauth2.client.id": "999ca713-67f1-4e1a-b5d2-d236fb92acff",
"fs.azure.account.oauth2.client.secret": 'Joj8Q~Yemo1g._jMUrxploH3pir3K9QtDMGyqaKW', 
"fs.azure.account.oauth2.client.endpoint": "https://login.microsoftonline.com/7977b21e-6bc1-4c8f-a881-1b47a52a27cd/oauth2/token"}



dbutils.fs.mount( source = "abfss://amazonia-data@micursode.dfs.core.windows.net", # contrainer@storageacc 
mount_point = "/mnt/amazondb", 
extra_configs = configs)


In [0]:
#verificando o acesso aos diretórios montados

dbutils.fs.ls("/mnt/amazonsp")

Out[4]: [FileInfo(path='dbfs:/mnt/amazonsp/raw-data/', name='raw-data/', size=0, modificationTime=1707695409000),
 FileInfo(path='dbfs:/mnt/amazonsp/transformed-data/', name='transformed-data/', size=0, modificationTime=1707695418000)]

In [0]:
#Carregando os arquivos. Obs: InferSchema = True faz com que na leitura, já exista alguma nível de transformação dos datatypes.

amazon_fires = spark.read.format("csv").option("header","true").option("inferSchema","true").load("/mnt/amazondb/raw-data/amazon_fires.csv") 
amazon_deforestation = spark.read.format("csv").option("header","true").option("inferSchema","true").option("dateFormat", "yyyy-MM-dd").load("/mnt/amazondb/raw-data/deforestation.csv") 
elnino = spark.read.format("csv").option("header","true").option("inferSchema","true").option("dateFormat", "yyyy-MM-dd").load("/mnt/amazondb/raw-data/elnino.csv") 

In [0]:
#Monstrando os dados carregados
amazon_fires.show()
amazon_deforestation.show()
elnino.show()

+----+-----+-----------+-------------------+-------------------+---------+
|year|month|      state|           latitude|          longitude|firespots|
+----+-----+-----------+-------------------+-------------------+---------+
|1999|    1|   AMAZONAS| -2.371113333333333| -59.89993333333334|        3|
|1999|    1|   MARANHAO| -2.257394722222222|-45.487830555555554|       36|
|1999|    1|MATO GROSSO|-12.660633333333333|-55.057988888888886|       18|
|1999|    1|       PARA| -2.474820459770115| -48.54696666666667|       87|
|1999|    1|   RONDONIA|           -12.8617|           -60.5131|        1|
|1999|    1|    RORAIMA| 3.4032246666666666| -60.62285333333333|       15|
|1999|    2|      AMAPA|             -0.155|           -52.6831|        1|
|1999|    2|   AMAZONAS| -2.763166976744186| -63.42978139534884|       43|
|1999|    2|MATO GROSSO|        -12.6199875|        -55.3753625|        8|
|1999|    2|       PARA| -2.150617438596491| -53.50991052631579|      285|
|1999|    2|   RONDONIA| 

In [0]:
#Verificando os data types
amazon_deforestation.printSchema()
amazon_fires.printSchema()
elnino.printSchema()


root
 |-- Ano/Estados: integer (nullable = true)
 |-- AC: integer (nullable = true)
 |-- AM: integer (nullable = true)
 |-- AP: integer (nullable = true)
 |-- MA: integer (nullable = true)
 |-- MT: integer (nullable = true)
 |-- PA: integer (nullable = true)
 |-- RO: integer (nullable = true)
 |-- RR: integer (nullable = true)
 |-- TO: integer (nullable = true)
 |-- AMZ LEGAL: integer (nullable = true)

root
 |-- year: integer (nullable = true)
 |-- month: integer (nullable = true)
 |-- state: string (nullable = true)
 |-- latitude: double (nullable = true)
 |-- longitude: double (nullable = true)
 |-- firespots: integer (nullable = true)

root
 |-- start year: integer (nullable = true)
 |-- end year: integer (nullable = true)
 |-- phenomenon: string (nullable = true)
 |-- severity: string (nullable = true)



In [0]:
#Procedendo com as alterações

amazon_deforestation = amazon_deforestation.withColumn("Ano/Estados", to_date(col("Ano/Estados"), "yyyy"))\
.withColumnRenamed("Ano/Estados","Year")\

amazon_deforestation = amazon_deforestation.drop("AMZ LEGAL")

amazon_fires = amazon_fires.withColumn('date', concat(amazon_fires['year'],lpad(amazon_fires['month'], 2, '0')))\
.withColumn("date", to_date(col("date"), "yyyyMM"))


elnino = elnino.withColumn("start year", to_date(col("start year"), "yyyy"))\
.withColumn("end year", to_date(col("end year"), "yyyy"))\
.withColumnRenamed("start year","start_year")\
.withColumnRenamed("end year","end_year")




In [0]:
#Verificando as alterações

amazon_deforestation.printSchema()
amazon_fires.printSchema()
elnino.printSchema()



root
 |-- Year: date (nullable = true)
 |-- AC: integer (nullable = true)
 |-- AM: integer (nullable = true)
 |-- AP: integer (nullable = true)
 |-- MA: integer (nullable = true)
 |-- MT: integer (nullable = true)
 |-- PA: integer (nullable = true)
 |-- RO: integer (nullable = true)
 |-- RR: integer (nullable = true)
 |-- TO: integer (nullable = true)

root
 |-- year: integer (nullable = true)
 |-- month: integer (nullable = true)
 |-- state: string (nullable = true)
 |-- latitude: double (nullable = true)
 |-- longitude: double (nullable = true)
 |-- firespots: integer (nullable = true)
 |-- date: date (nullable = true)

root
 |-- start_year: date (nullable = true)
 |-- end_year: date (nullable = true)
 |-- phenomenon: string (nullable = true)
 |-- severity: string (nullable = true)



In [0]:
#Monstrando os dados corrigidos
amazon_fires.show()
amazon_deforestation.show()
elnino.show()

+----+-----+-----------+-------------------+-------------------+---------+----------+
|year|month|      state|           latitude|          longitude|firespots|      date|
+----+-----+-----------+-------------------+-------------------+---------+----------+
|1999|    1|   AMAZONAS| -2.371113333333333| -59.89993333333334|        3|1999-01-01|
|1999|    1|   MARANHAO| -2.257394722222222|-45.487830555555554|       36|1999-01-01|
|1999|    1|MATO GROSSO|-12.660633333333333|-55.057988888888886|       18|1999-01-01|
|1999|    1|       PARA| -2.474820459770115| -48.54696666666667|       87|1999-01-01|
|1999|    1|   RONDONIA|           -12.8617|           -60.5131|        1|1999-01-01|
|1999|    1|    RORAIMA| 3.4032246666666666| -60.62285333333333|       15|1999-01-01|
|1999|    2|      AMAPA|             -0.155|           -52.6831|        1|1999-02-01|
|1999|    2|   AMAZONAS| -2.763166976744186| -63.42978139534884|       43|1999-02-01|
|1999|    2|MATO GROSSO|        -12.6199875|        -5

In [0]:
#Salvando os dados após as transformações
#Repartition = indica em quantas partes o arquivo deve estar dividido
elnino.repartition(1).write.mode("overwrite").option("header","true").csv("/mnt/amazondb/transformed-data/elnino")
amazon_deforestation.repartition(1).write.mode("overwrite").option("header","true").csv("/mnt/amazondb/transformed-data/amazon_deforestation")
amazon_fires.repartition(1).write.mode("overwrite").option("header","true").csv("/mnt/amazondb/transformed-data/amazon_fires")