In [1]:
from pyspark.sql import SparkSession
from pyspark.sql import Row
from delta import *
from pyspark.sql.functions import expr, round, col
from pyspark.sql.types import DoubleType, StringType, StructField, StructType, IntegerType

# Default root location for managed databases and tables; handed to Spark
# below as spark.sql.warehouse.dir.
warehouse_location = 'hdfs://hdfs-nn:9000/warehouse'

# Session builder written as one parenthesised chain (no backslash
# continuations). Options are the same ones, applied in the same order.
builder = (
    SparkSession.builder
    .master("local[2]")
    .appName("Python Spark DataFrames and SQL")
    .config("spark.sql.warehouse.dir", warehouse_location)
    .config("hive.metastore.uris", "thrift://hive-metastore:9083")
    # Delta Lake: SQL extension, catalog implementation and the package to fetch.
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
    .config("spark.jars.packages", "io.delta:delta-core_2.12:2.4.0")
    .enableHiveSupport()
)

# The session is created directly from `builder`. The alternative
# `configure_spark_with_delta_pip(builder).getOrCreate()` is intentionally
# not used here.
spark = builder.getOrCreate()

In [2]:
# Location of the bronze-layer FootNote CSV that the next cell reads.
hdfs_path = (
    "hdfs://hdfs-nn:9000/projeto/bronze/FootNote.csv"
)

In [3]:


# Target schema of the FootNote Delta table. It is declared before the load so
# the cleaned DataFrame can be checked against it before anything is
# overwritten. It cannot be passed to the CSV reader directly: the raw Year
# values carry a two-character prefix and would not parse as integers.
customSchema = StructType([
    StructField("CountryCode", StringType(), True),
    StructField("SeriesCode", StringType(), True),
    StructField("Year", IntegerType(), True),
    StructField("DESCRIPTION", StringType(), True)
])

# Read the bronze CSV, taking the column names from its first line
# (header=True), then clean it up in a single chain.
shoot = (
    spark.read
    .option("header", True)
    .csv(hdfs_path)
    # `_c4` is the unnamed trailing column, described as all-null by the
    # original author; drop() is a no-op if the column is absent.
    .drop(col("_c4"))
    # Remove the first two characters of Year (the "YR" prefix per the
    # original comment), keeping everything from position 3 onwards.
    .withColumn("Year", expr("substring(Year, 3, length(Year))"))
    # Year becomes an integer column.
    .withColumn("Year", col("Year").cast("int"))
)

shoot.printSchema()
shoot.show()

# Refuse to overwrite the table when the cleaned data does not have exactly
# the declared columns, types and nullability.
if shoot.schema != customSchema:
    raise ValueError(
        "FootNote schema mismatch: expected "
        f"{customSchema.json()}, got {shoot.schema.json()}"
    )

# Overwrite the Delta table stored under the warehouse directory.
(
    shoot.write
    .format("delta")
    .mode("overwrite")
    .save("hdfs://hdfs-nn:9000/warehouse/projeto.db/FootNote")
)

root
 |-- CountryCode: string (nullable = true)
 |-- SeriesCode: string (nullable = true)
 |-- Year: integer (nullable = true)
 |-- DESCRIPTION: string (nullable = true)

+-----------+--------------------+----+--------------------+
|CountryCode|          SeriesCode|Year|         DESCRIPTION|
+-----------+--------------------+----+--------------------+
|        ABW|         SE.SCH.LIFE|2006|      UIS Estimation|
|        ABW|         SE.PRM.TENR|2006|        UIS estimate|
|        ABW|         SE.PRM.TENR|2014|        UIS estimate|
|        ABW|   SE.PRM.NINT.FE.ZS|2005|        UIS estimate|
|        ABW|      SH.STA.BRTC.ZS|2002|Health situation ...|
|        ABW|      SE.SCH.LIFE.MA|2006|      UIS Estimation|
|        ABW|   SE.ADT.LITR.MA.ZS|2020|      UIS Estimation|
|        ABW|   SE.PRM.NINT.MA.ZS|2005|        UIS estimate|
|        ABW|   SE.ADT.LITR.FE.ZS|2020|      UIS Estimation|
|        ABW|   SE.TER.CMPL.FE.ZS|2002|      UIS Estimation|
|        ABW|   SE.TER.CMPL.FE.ZS|20

In [4]:
# Preview the FootNote table through the metastore.
# NOTE(review): this relies on `projeto.FootNote` already being registered;
# no CREATE TABLE / saveAsTable for it is visible in this part of the notebook.
footnote_query = """
    SELECT * FROM projeto.FootNote
    """
footnote_preview = spark.sql(footnote_query)
footnote_preview.show()

+-----------+-----------+----+--------------------+
|CountryCode| SeriesCode|Year|         DESCRIPTION|
+-----------+-----------+----+--------------------+
|        LUX|SH.MMR.RISK|2011|Uncertainty bound...|
|        LUX|SH.MMR.RISK|2010|Uncertainty bound...|
|        LUX|SH.MMR.RISK|2009|Uncertainty bound...|
|        LUX|SH.MMR.RISK|2008|Uncertainty bound...|
|        LUX|SH.MMR.RISK|2007|Uncertainty bound...|
|        LUX|SH.MMR.RISK|2006|Uncertainty bound...|
|        LUX|SH.MMR.RISK|2005|Uncertainty bound...|
|        LUX|SH.MMR.RISK|2004|Uncertainty bound...|
|        LUX|SH.MMR.RISK|2003|Uncertainty bound...|
|        LUX|SH.MMR.RISK|2002|Uncertainty bound...|
|        LUX|SH.MMR.RISK|2001|Uncertainty bound...|
|        LUX|SH.MMR.RISK|2000|Uncertainty bound...|
|        LUX|SH.DYN.MORT|2021|Uncertainty bound...|
|        LUX|SH.DYN.MORT|2020|Uncertainty bound...|
|        LUX|SH.DYN.MORT|2019|Uncertainty bound...|
|        LUX|SH.DYN.MORT|2018|Uncertainty bound...|
|        LUX