In [1]:
from pyspark.sql import SparkSession
from pyspark.sql import Row
from delta import *
from pyspark.sql.functions import col, when
from pyspark.sql.types import StructType, StructField, StringType, IntegerType

# warehouse_location points to the default location for managed databases and tables
warehouse_location = 'hdfs://hdfs-nn:9000/warehouse'

builder = SparkSession \
    .builder \
    .master("local[2]") \
    .appName("Python Spark DataFrames and SQL") \
    .config("spark.sql.warehouse.dir", warehouse_location) \
    .config("hive.metastore.uris", "thrift://hive-metastore:9083") \
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension") \
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog") \
    .config("spark.jars.packages", "io.delta:delta-core_2.12:2.4.0") \
    .enableHiveSupport() \

spark = builder.getOrCreate() #spark = configure_spark_with_delta_pip(builder).getOrCreate()

In [2]:
hdfs_path = "hdfs://hdfs-nn:9000/projeto/bronze/Stats_Series.csv"

In [3]:
shoot = spark.read.option("header", True) \
                  .option("multiline", True) \
                  .csv(hdfs_path)

shoot = shoot.drop(col("Base period"))
shoot = shoot.drop(col("Other Notes"))
shoot = shoot.drop(col("Aggregation method"))
shoot = shoot.drop(col("Limitations and exceptions"))
shoot = shoot.drop(col("Notes from original source"))
shoot = shoot.drop(col("General comments"))
shoot = shoot.drop(col("Source"))
shoot = shoot.drop(col("Statistical concept and methodology"))
shoot = shoot.drop(col("Development relevance"))
shoot = shoot.drop(col("Related source links"))
shoot = shoot.drop(col("Other web links"))
shoot = shoot.drop(col("Related indicators"))
shoot = shoot.drop(col("License Type"))
shoot = shoot.drop(col("_c20"))

shoot = shoot.withColumnRenamed("Series code", "Series_code")
shoot = shoot.withColumnRenamed("Indicator Name", "Indicator_Name")
shoot = shoot.withColumnRenamed("Short definition", "Short_definition")
shoot = shoot.withColumnRenamed("Long definition", "Long_definition")
shoot = shoot.withColumnRenamed("Unit of measure", "Unit_of_measure")

relevant_columns = ["Long_definition", "Unit_of_measure", "Periodicity"]
for col_name in relevant_columns:
    shoot = shoot.na.fill("", subset=col_name)

customSchema = StructType([
    StructField("Series_code", StringType(), True),
    StructField("Topic", StringType(), True),
    StructField("Indicator_Name", StringType(), True), 
    StructField("Short_definition", StringType(), True),
    StructField("Long_definition", StringType(), True),
    StructField("Unit_of_measure", StringType(), True),
    StructField("Periodicity", StringType(), True),
])

shoot.printSchema()
shoot.show()
shoot.toPandas() 

shoot \
    .write \
    .format("delta") \
    .mode("overwrite") \
    .save("hdfs://hdfs-nn:9000/warehouse/projeto.db/Stats_Series")

root
 |-- Series_code: string (nullable = true)
 |-- Topic: string (nullable = true)
 |-- Indicator_Name: string (nullable = true)
 |-- Short_definition: string (nullable = true)
 |-- Long_definition: string (nullable = false)
 |-- Unit_of_measure: string (nullable = false)
 |-- Periodicity: string (nullable = false)

+-------------+------+--------------------+--------------------+--------------------+--------------------+--------------------+
|  Series_code| Topic|      Indicator_Name|    Short_definition|     Long_definition|     Unit_of_measure|         Periodicity|
+-------------+------+--------------------+--------------------+--------------------+--------------------+--------------------+
|  account.t.d|Assets| Account (% age 15+)|"The percentage o...|"The percentage o...|             Percent|           Triennial|
|account.t.d.1|Assets|Account, female (...|"The percentage o...| female (% age 15+)"|"The percentage o...| female (% age 15+)"|
|account.t.d.2|Assets|Account, male (% .

In [4]:
spark.sql(
    """
    SELECT * FROM projeto.Stats_Series
    """
).show()

+-------------+------+--------------------+--------------------+--------------------+--------------------+--------------------+
|  Series_code| Topic|      Indicator_Name|    Short_definition|     Long_definition|     Unit_of_measure|         Periodicity|
+-------------+------+--------------------+--------------------+--------------------+--------------------+--------------------+
|  account.t.d|Assets| Account (% age 15+)|"The percentage o...|"The percentage o...|             Percent|           Triennial|
|account.t.d.1|Assets|Account, female (...|"The percentage o...| female (% age 15+)"|"The percentage o...| female (% age 15+)"|
|account.t.d.2|Assets|Account, male (% ...|"The percentage o...|   male (% age 15+)"|"The percentage o...|   male (% age 15+)"|
|   borrow.any|Assets|Borrowed any mone...|The percentage of...|The percentage of...|             Percent|           Triennial|
| borrow.any.1|Assets|Borrowed any mone...|The percentage of...|The percentage of...|             Percen