In [1]:
from pyspark.sql import SparkSession
from pyspark.sql import Row
from delta import *
from pyspark.sql.functions import col
from pyspark.sql.types import DoubleType, StringType, StructField, StructType, IntegerType

# warehouse_location points to the default location for managed databases and tables
warehouse_location = 'hdfs://hdfs-nn:9000/warehouse'

# Session configuration, written as a parenthesised chain: there are no
# backslash continuations, so adding or removing blank lines around it
# cannot change how the statement parses.
builder = (
    SparkSession.builder
    .master("local[2]")
    .appName("Python Spark DataFrames and SQL")
    # Managed tables live under warehouse_location; table metadata is kept
    # in the Hive metastore reached through the thrift URI below.
    .config("spark.sql.warehouse.dir", warehouse_location)
    .config("hive.metastore.uris", "thrift://hive-metastore:9083")
    # Delta Lake support: SQL extension, catalog implementation, and the
    # delta-core package resolved through spark.jars.packages.
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
    .config("spark.jars.packages", "io.delta:delta-core_2.12:2.4.0")
    .enableHiveSupport()
)

# Alternative kept for reference (not used here):
#   spark = configure_spark_with_delta_pip(builder).getOrCreate()
spark = builder.getOrCreate()

In [2]:
# HDFS location of the Country_Series CSV that the next cell reads.
hdfs_path = (
    "hdfs://hdfs-nn:9000/projeto/bronze/Country_Series.csv"
)

In [3]:


# Expected column layout of the cleaned Country_Series data (all strings).
# It is used below to validate the DataFrame before the table is overwritten.
customSchema = StructType([
    StructField("CountryCode", StringType(), True),
    StructField("SeriesCode", StringType(), True),
    StructField("DESCRIPTION", StringType(), True)
])

# Read the CSV, taking the column names from its first line (header=True).
shoot = (
    spark.read
    .option("header", True)
    .csv(hdfs_path)
)

# Drop the column Spark auto-named "_c3".
# NOTE(review): presumably an all-null column caused by a trailing delimiter
# in the file — confirm against the raw CSV.
shoot = shoot.drop("_c3")

shoot.printSchema()
shoot.show()

# Refuse to overwrite the Delta table when the columns are not the ones
# declared in customSchema (e.g. the source file layout changed).
expected_columns = customSchema.fieldNames()
if shoot.columns != expected_columns:
    raise ValueError(
        f"Unexpected columns {shoot.columns}; expected {expected_columns}"
    )

# Replace the contents of the Delta table stored at this path.
(
    shoot.write
    .format("delta")
    .mode("overwrite")
    .save("hdfs://hdfs-nn:9000/warehouse/projeto.db/Country_Series")
)

root
 |-- CountryCode: string (nullable = true)
 |-- SeriesCode: string (nullable = true)
 |-- DESCRIPTION: string (nullable = true)

+-----------+-----------------+--------------------+
|CountryCode|       SeriesCode|         DESCRIPTION|
+-----------+-----------------+--------------------+
|        ABW|   SP.DYN.LE00.IN|Data source: Unit...|
|        ABW|      SP.POP.TOTL|Data source: Unit...|
|        ABW|   SP.DYN.CDRT.IN|Data source: Unit...|
|        ABW|SG.POP.MIGR.FE.ZS|Estimates were de...|
|        ABW|SP.DYN.LE00.MA.IN|Data source: Unit...|
|        ABW|   SP.DYN.AMRT.MA|Data source: Unit...|
|        ABW|   SP.DYN.TFRT.IN|Data source: Unit...|
|        ABW|   SP.DYN.CBRT.IN|Data source: Unit...|
|        ABW|   SP.DYN.AMRT.FE|Data source: Unit...|
|        ABW|SP.DYN.LE00.FE.IN|Data source: Unit...|
|        AFG|   SP.DYN.LE00.IN|Data source: Unit...|
|        AFG|   SP.DYN.AMRT.MA|Data source: Unit...|
|        AFG|      SP.POP.TOTL|Data source: Unit...|
|        AFG|SG.PO

In [4]:
# Read the table back through the catalog and display the first rows.
# NOTE(review): this notebook never registers projeto.Country_Series; it
# assumes the table already exists in the metastore — confirm where it is
# created.
country_series_query = """
    SELECT * FROM projeto.Country_Series
"""
spark.sql(country_series_query).show()


+-----------+-----------------+--------------------+
|CountryCode|       SeriesCode|         DESCRIPTION|
+-----------+-----------------+--------------------+
|        ABW|   SP.DYN.LE00.IN|Data source: Unit...|
|        ABW|      SP.POP.TOTL|Data source: Unit...|
|        ABW|   SP.DYN.CDRT.IN|Data source: Unit...|
|        ABW|SG.POP.MIGR.FE.ZS|Estimates were de...|
|        ABW|SP.DYN.LE00.MA.IN|Data source: Unit...|
|        ABW|   SP.DYN.AMRT.MA|Data source: Unit...|
|        ABW|   SP.DYN.TFRT.IN|Data source: Unit...|
|        ABW|   SP.DYN.CBRT.IN|Data source: Unit...|
|        ABW|   SP.DYN.AMRT.FE|Data source: Unit...|
|        ABW|SP.DYN.LE00.FE.IN|Data source: Unit...|
|        AFG|   SP.DYN.LE00.IN|Data source: Unit...|
|        AFG|   SP.DYN.AMRT.MA|Data source: Unit...|
|        AFG|      SP.POP.TOTL|Data source: Unit...|
|        AFG|SG.POP.MIGR.FE.ZS|Estimates were de...|
|        AFG|SP.DYN.LE00.MA.IN|Data source: Unit...|
|        AFG|   SP.DYN.CBRT.IN|Data source: Un