In [1]:
from pyspark.sql import SparkSession
from pyspark.sql import Row
from delta import *
from pyspark.sql.functions import expr, round, col
from pyspark.sql.types import DoubleType, StringType, StructField, StructType, IntegerType

# Default location for managed databases and tables; passed to spark.sql.warehouse.dir below.
warehouse_location = 'hdfs://hdfs-nn:9000/warehouse'

# Session builder: local[2] master, Hive metastore support and the Delta Lake
# SQL extension/catalog. The chain is wrapped in parentheses instead of using
# backslash continuations, so it no longer depends on a blank line after a
# trailing "\" to terminate the statement.
builder = (
    SparkSession.builder
    .master("local[2]")
    .appName("Python Spark DataFrames and SQL")
    .config("spark.sql.warehouse.dir", warehouse_location)
    .config("hive.metastore.uris", "thrift://hive-metastore:9083")
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
    .config("spark.jars.packages", "io.delta:delta-core_2.12:2.4.0")
    .enableHiveSupport()
)

# Alternative kept from the original notebook (not used here):
#   spark = configure_spark_with_delta_pip(builder).getOrCreate()
spark = builder.getOrCreate()

In [2]:
# Input CSV on HDFS (under projeto/bronze) that the next cell loads with spark.read.
hdfs_path = "hdfs://hdfs-nn:9000/projeto/bronze/StatsSeries_Time.csv"

In [3]:
# Load the CSV, using its first row as the column names (header=True), then:
#   - rename DESCRIPTION to Description,
#   - drop the "_c3" column (presumably an unnamed extra column in the file -- confirm),
#   - strip the first two characters of Year (the "YR" prefix) and cast it to int.
stats = (
    spark.read.option("header", True)
    .csv(hdfs_path)
    .withColumnRenamed("DESCRIPTION", "Description")
    .drop("_c3")
    .withColumn("Year", expr("substring(Year, 3, length(Year))"))
    .withColumn("Year", col("Year").cast("int"))
)

stats.printSchema()
stats.show()
# Tip: stats.toPandas() gives a nicer preview of the table.

# Expected column names and types of the cleaned DataFrame.
customSchema = StructType([
    StructField("SeriesCode", StringType(), True),
    StructField("Year", IntegerType(), True),
    StructField("Description", StringType(), True),
])

# Fail fast if the cleaned data does not have the expected columns/types,
# so an unexpected input file cannot overwrite the existing Delta data.
expected_columns = [(field.name, field.dataType) for field in customSchema.fields]
actual_columns = [(field.name, field.dataType) for field in stats.schema.fields]
if actual_columns != expected_columns:
    raise ValueError(
        f"Unexpected schema for StatsSeries_Time: {stats.schema.simpleString()} "
        f"(expected {customSchema.simpleString()})"
    )

# Replace the Delta data stored at the table's warehouse path.
stats.write.format("delta").mode("overwrite").save(
    "hdfs://hdfs-nn:9000/warehouse/projeto.db/StatsSeries_Time"
)

root
 |-- SeriesCode: string (nullable = true)
 |-- Year: integer (nullable = true)
 |-- Description: string (nullable = true)

+-----------------+----+--------------------+
|       SeriesCode|Year|         Description|
+-----------------+----+--------------------+
|SP.DYN.LE60.MA.IN|1962|The data refer to...|
|SP.DYN.LE60.FE.IN|1962|The data refer to...|
|SP.DYN.LE60.MA.IN|1967|The data refer to...|
|SP.DYN.LE60.FE.IN|1967|The data refer to...|
|SP.DYN.LE60.MA.IN|1972|The data refer to...|
|SP.DYN.LE60.FE.IN|1972|The data refer to...|
|SP.DYN.LE60.MA.IN|1977|The data refer to...|
|SP.DYN.LE60.FE.IN|1977|The data refer to...|
|SP.DYN.LE60.MA.IN|1982|The data refer to...|
|SP.DYN.LE60.FE.IN|1982|The data refer to...|
|SP.DYN.LE60.MA.IN|1987|The data refer to...|
|SP.DYN.LE60.FE.IN|1987|The data refer to...|
|      SP.UWT.TFRT|1990|Averages for regi...|
|      SP.DYN.WFRT|1990|Averages for regi...|
|   SH.FPL.SATM.ZS|1990|Averages for regi...|
|   SH.STA.ANV4.ZS|1990|Averages for regi...

In [5]:
# Read the data back through Spark SQL and print a preview of the rows.
# NOTE(review): the visible cells only write Delta files by path; this query
# presumes projeto.StatsSeries_Time is already registered in the metastore -- confirm.
stats_table_query = """
    SELECT * FROM projeto.StatsSeries_Time
    """
spark.sql(stats_table_query).show()

+-----------------+----+--------------------+
|       SeriesCode|Year|         Description|
+-----------------+----+--------------------+
|SP.DYN.LE60.MA.IN|1962|The data refer to...|
|SP.DYN.LE60.FE.IN|1962|The data refer to...|
|SP.DYN.LE60.MA.IN|1967|The data refer to...|
|SP.DYN.LE60.FE.IN|1967|The data refer to...|
|SP.DYN.LE60.MA.IN|1972|The data refer to...|
|SP.DYN.LE60.FE.IN|1972|The data refer to...|
|SP.DYN.LE60.MA.IN|1977|The data refer to...|
|SP.DYN.LE60.FE.IN|1977|The data refer to...|
|SP.DYN.LE60.MA.IN|1982|The data refer to...|
|SP.DYN.LE60.FE.IN|1982|The data refer to...|
|SP.DYN.LE60.MA.IN|1987|The data refer to...|
|SP.DYN.LE60.FE.IN|1987|The data refer to...|
|      SP.UWT.TFRT|1990|Averages for regi...|
|      SP.DYN.WFRT|1990|Averages for regi...|
|   SH.FPL.SATM.ZS|1990|Averages for regi...|
|   SH.STA.ANV4.ZS|1990|Averages for regi...|
|   SH.STA.ANVC.ZS|1990|Averages for regi...|
|   SH.STA.BRTC.ZS|1990|Averages for regi...|
|   SP.DYN.CONM.ZS|1990|Averages f