In [1]:
from pyspark.sql import SparkSession
from pyspark.sql import Row
from delta import *
from pyspark.sql.functions import when, col

#warehouse_location points to the default location for managed databases and tables
warehouse_location = 'hdfs://hdfs-nn:9000/warehouse'

builder = SparkSession \
    .builder \
    .appName("Python Spark SQL Hive integration example") \
    .config("spark.sql.warehouse.dir", warehouse_location) \
    .config("hive.metastore.uris", "thrift://hive-metastore:9083") \
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension") \
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog") \
    .config("spark.jars.packages", "io.delta:delta-core_2.12:2.4.0") \
    .enableHiveSupport() \

spark = spark = configure_spark_with_delta_pip(builder).getOrCreate()

In [2]:
hdfs_path = "hdfs://hdfs-nn:9000/projeto/bronze/Stats_Data.csv"

In [3]:
stats = spark.read.csv(hdfs_path, header=True, inferSchema=True)

In [4]:
stats.printSchema()
stats.show()
stats.toPandas()

root
 |-- Country Name: string (nullable = true)
 |-- Country Code: string (nullable = true)
 |-- Indicator Name: string (nullable = true)
 |-- Indicator Code: string (nullable = true)
 |-- 1960: double (nullable = true)
 |-- 1961: double (nullable = true)
 |-- 1962: double (nullable = true)
 |-- 1963: double (nullable = true)
 |-- 1964: double (nullable = true)
 |-- 1965: double (nullable = true)
 |-- 1966: double (nullable = true)
 |-- 1967: double (nullable = true)
 |-- 1968: double (nullable = true)
 |-- 1969: double (nullable = true)
 |-- 1970: double (nullable = true)
 |-- 1971: double (nullable = true)
 |-- 1972: double (nullable = true)
 |-- 1973: double (nullable = true)
 |-- 1974: double (nullable = true)
 |-- 1975: double (nullable = true)
 |-- 1976: double (nullable = true)
 |-- 1977: double (nullable = true)
 |-- 1978: double (nullable = true)
 |-- 1979: double (nullable = true)
 |-- 1980: double (nullable = true)
 |-- 1981: double (nullable = true)
 |-- 1982: double (null

Unnamed: 0,Country Name,Country Code,Indicator Name,Indicator Code,1960,1961,1962,1963,1964,1965,...,2014,2015,2016,2017,2018,2019,2020,2021,2022,_c67
0,Africa Eastern and Southern,AFE,A woman can apply for a passport in the same w...,SG.APL.PSPT.EQ,,,,,,,...,,,,,,,,,,
1,Africa Eastern and Southern,AFE,A woman can be head of household in the same w...,SG.HLD.HEAD.EQ,,,,,,,...,,,,,,,,,,
2,Africa Eastern and Southern,AFE,A woman can choose where to live in the same w...,SG.LOC.LIVE.EQ,,,,,,,...,,,,,,,,,,
3,Africa Eastern and Southern,AFE,A woman can get a job in the same way as a man...,SG.GET.JOBS.EQ,,,,,,,...,,,,,,,,,,
4,Africa Eastern and Southern,AFE,A woman can obtain a judgment of divorce in th...,SG.OBT.DVRC.EQ,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
305540,Zimbabwe,ZWE,Worried about not having enough money for old ...,fin44a1.d.2,,,,,,,...,,,,,,,,68.02,,
305541,Zimbabwe,ZWE,"Youth illiterate population, 15-24 years, % fe...",UIS.LPP.AG15T24,,,,,,,...,37.14571,,,,,,,,,
305542,Zimbabwe,ZWE,"Youth illiterate population, 15-24 years, both...",UIS.LP.AG15T24,,,,,,,...,267220.00000,,,,,,,,,
305543,Zimbabwe,ZWE,"Youth illiterate population, 15-24 years, fema...",UIS.LP.AG15T24.F,,,,,,,...,99261.00000,,,,,,,,,


In [5]:
stats = stats.drop( "_c67") 

In [6]:
stats = stats.withColumnRenamed("Country Name", "Country_Name")
stats = stats.withColumnRenamed("Country Code", "Country_Code")
stats = stats.withColumnRenamed("Indicator Name", "Indicator_Name")
stats = stats.withColumnRenamed("Indicator Code", "Indicator_Code")

In [7]:
year_columns = [str(year) for year in range(1961, 2022)]

# Replace null values with "Unknown" for all year columns
for column_name in year_columns:
    stats = stats.withColumn(
        column_name,
        when(
            col(column_name).isNull(),
            "Unknown"
        ).otherwise(col(column_name))
    )

In [8]:
stats.printSchema()
stats.show()
stats.toPandas()

root
 |-- Country_Name: string (nullable = true)
 |-- Country_Code: string (nullable = true)
 |-- Indicator_Name: string (nullable = true)
 |-- Indicator_Code: string (nullable = true)
 |-- 1960: double (nullable = true)
 |-- 1961: string (nullable = true)
 |-- 1962: string (nullable = true)
 |-- 1963: string (nullable = true)
 |-- 1964: string (nullable = true)
 |-- 1965: string (nullable = true)
 |-- 1966: string (nullable = true)
 |-- 1967: string (nullable = true)
 |-- 1968: string (nullable = true)
 |-- 1969: string (nullable = true)
 |-- 1970: string (nullable = true)
 |-- 1971: string (nullable = true)
 |-- 1972: string (nullable = true)
 |-- 1973: string (nullable = true)
 |-- 1974: string (nullable = true)
 |-- 1975: string (nullable = true)
 |-- 1976: string (nullable = true)
 |-- 1977: string (nullable = true)
 |-- 1978: string (nullable = true)
 |-- 1979: string (nullable = true)
 |-- 1980: string (nullable = true)
 |-- 1981: string (nullable = true)
 |-- 1982: string (null

Unnamed: 0,Country_Name,Country_Code,Indicator_Name,Indicator_Code,1960,1961,1962,1963,1964,1965,...,2013,2014,2015,2016,2017,2018,2019,2020,2021,2022
0,Africa Eastern and Southern,AFE,A woman can apply for a passport in the same w...,SG.APL.PSPT.EQ,,Unknown,Unknown,Unknown,Unknown,Unknown,...,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,
1,Africa Eastern and Southern,AFE,A woman can be head of household in the same w...,SG.HLD.HEAD.EQ,,Unknown,Unknown,Unknown,Unknown,Unknown,...,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,
2,Africa Eastern and Southern,AFE,A woman can choose where to live in the same w...,SG.LOC.LIVE.EQ,,Unknown,Unknown,Unknown,Unknown,Unknown,...,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,
3,Africa Eastern and Southern,AFE,A woman can get a job in the same way as a man...,SG.GET.JOBS.EQ,,Unknown,Unknown,Unknown,Unknown,Unknown,...,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,
4,Africa Eastern and Southern,AFE,A woman can obtain a judgment of divorce in th...,SG.OBT.DVRC.EQ,,Unknown,Unknown,Unknown,Unknown,Unknown,...,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
305540,Zimbabwe,ZWE,Worried about not having enough money for old ...,fin44a1.d.2,,Unknown,Unknown,Unknown,Unknown,Unknown,...,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,68.02,
305541,Zimbabwe,ZWE,"Youth illiterate population, 15-24 years, % fe...",UIS.LPP.AG15T24,,Unknown,Unknown,Unknown,Unknown,Unknown,...,Unknown,37.14571,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,
305542,Zimbabwe,ZWE,"Youth illiterate population, 15-24 years, both...",UIS.LP.AG15T24,,Unknown,Unknown,Unknown,Unknown,Unknown,...,Unknown,267220.0,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,
305543,Zimbabwe,ZWE,"Youth illiterate population, 15-24 years, fema...",UIS.LP.AG15T24.F,,Unknown,Unknown,Unknown,Unknown,Unknown,...,Unknown,99261.0,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,


In [9]:
stats \
    .write \
    .option("overwriteSchema", "true")\
    .format("delta") \
    .mode("overwrite") \
    .save("hdfs://hdfs-nn:9000/projeto/silver/gender.db/Stats_Data/")