In [2]:
from pyspark.sql import SparkSession
from pyspark.sql import Row
from delta import *

# Root HDFS directory for managed databases and tables; handed to Spark as
# spark.sql.warehouse.dir below.
warehouse_location = 'hdfs://hdfs-nn:9000/warehouse'

# The chain is wrapped in parentheses instead of backslash continuations:
# the original ended with a dangling "\" that only parsed because the next
# line happened to be blank.
builder = (
    SparkSession.builder
    .appName("Python Spark SQL Hive integration example")
    .config("spark.sql.warehouse.dir", warehouse_location)
    # Hive metastore used to persist database/table definitions.
    .config("hive.metastore.uris", "thrift://hive-metastore:9083")
    # Delta Lake SQL extension and catalog implementation.
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
    # NOTE(review): configure_spark_with_delta_pip() presumably sets
    # spark.jars.packages itself from the installed delta-spark version, which
    # would make this explicit pin redundant -- confirm before removing.
    .config("spark.jars.packages", "io.delta:delta-core_2.12:2.4.0")
    .enableHiveSupport()
)

# Create the session, or reuse the one already running in this kernel.
spark = configure_spark_with_delta_pip(builder).getOrCreate()

In [2]:
# Create the project database under the configured warehouse root. The path is
# derived from warehouse_location (defined in the session-setup cell) so the
# database location cannot drift from spark.sql.warehouse.dir.
spark.sql(
    f"CREATE DATABASE IF NOT EXISTS projeto Location '{warehouse_location}/projeto.db/'"
)

DataFrame[]

In [3]:
# Rebuild the external Delta table projeto.Hiv_NY: drop any existing
# definition, then declare it again with an explicit schema and location.
hiv_ny_drop_sql = """
    DROP TABLE IF EXISTS projeto.Hiv_NY
    """

hiv_ny_create_sql = """
    CREATE EXTERNAL TABLE projeto.Hiv_NY (
        Year INT,
        Borough String,
        UHF String,
        Gender String,
        Age String,
        Race String,
        HIV_diagnoses INT,
        HIV_diagnosis_rate Double,
        AIDS_diagnoses INT,
        viral_suppression_percent Double,
        Deaths INT,
        HIV_related_death_rate Double
    )
    USING DELTA
    LOCATION 'hdfs://hdfs-nn:9000/warehouse/projeto.db/Hiv_NY/'
    """

spark.sql(hiv_ny_drop_sql)
spark.sql(hiv_ny_create_sql)

DataFrame[]

In [4]:
# Rebuild the external Delta table projeto.FootNote: drop any existing
# definition, then declare it again with an explicit schema and location.
footnote_drop_sql = """
    DROP TABLE IF EXISTS projeto.FootNote
    """

footnote_create_sql = """
    CREATE EXTERNAL TABLE projeto.FootNote (
        CountryCode String,
        SeriesCode String,
        Year INT,
        DESCRIPTION String
    )
    USING DELTA
    LOCATION 'hdfs://hdfs-nn:9000/warehouse/projeto.db/FootNote/'
    """

spark.sql(footnote_drop_sql)
spark.sql(footnote_create_sql)

DataFrame[]

In [5]:
# Rebuild the external Delta table projeto.Country_Series: drop any existing
# definition, then declare it again with an explicit schema and location.
country_series_drop_sql = """
    DROP TABLE IF EXISTS projeto.Country_Series
    """

country_series_create_sql = """
    CREATE EXTERNAL TABLE projeto.Country_Series(
        CountryCode String,
        SeriesCode String,
        DESCRIPTION String
    )
    USING DELTA
    LOCATION 'hdfs://hdfs-nn:9000/warehouse/projeto.db/Country_Series/'
    """

spark.sql(country_series_drop_sql)
spark.sql(country_series_create_sql)

DataFrame[]

In [6]:
# Rebuild the external Delta table projeto.Stats_Series: drop any existing
# definition, then declare it again with an explicit schema and location.
stats_series_drop_sql = """
    DROP TABLE IF EXISTS projeto.Stats_Series
    """

stats_series_create_sql = """
    CREATE EXTERNAL TABLE projeto.Stats_Series (
        Series_code String,
        Topic String,
        Indicator_Name String,
        Short_definition String,
        Long_definition String,
        Unit_of_measure String,
        Periodicity String
    )
    USING DELTA
    LOCATION 'hdfs://hdfs-nn:9000/warehouse/projeto.db/Stats_Series/'
    """

spark.sql(stats_series_drop_sql)
spark.sql(stats_series_create_sql)

DataFrame[]

In [7]:
# Rebuild the external Delta table projeto.Education_State: drop any existing
# definition, then declare it again with an explicit schema and location.
education_state_drop_sql = """
    DROP TABLE IF EXISTS projeto.Education_State
    """

education_state_create_sql = """
    CREATE EXTERNAL TABLE projeto.Education_State (
        State String,
        Men_Less_than_High_School_Diploma float,
        Women_Less_than_High_School_Diploma float,
        Men_High_School_Diploma float,
        Women_High_School_Diploma float,
        Men_College_or_Associate_Diploma float,
        Women_College_or_Associate_Diploma float,
        Men_Bachelor_Diploma_or_Higher float,
        Women_Bachelor_Diploma_or_Higher float
    )
    USING DELTA
    LOCATION 'hdfs://hdfs-nn:9000/warehouse/projeto.db/Education_State'
    """

spark.sql(education_state_drop_sql)
spark.sql(education_state_create_sql)


DataFrame[]

In [3]:
# Rebuild the external Delta table projeto.StatsSeries_Time: drop any existing
# definition, then declare it again with an explicit schema and location.
statsseries_time_drop_sql = """
    DROP TABLE IF EXISTS projeto.StatsSeries_Time
    """

statsseries_time_create_sql = """
    CREATE EXTERNAL TABLE projeto.StatsSeries_Time (
        SeriesCode String,
        Year INT,
        Description String
    )
    USING DELTA
    LOCATION 'hdfs://hdfs-nn:9000/warehouse/projeto.db/StatsSeries_Time'
    """

spark.sql(statsseries_time_drop_sql)
spark.sql(statsseries_time_create_sql)


DataFrame[]

In [9]:
# List every table registered in the projeto database and render the result
# as a pandas DataFrame so the notebook shows it as a table.
projeto_tables = spark.sql(
    """
    SHOW TABLES FROM projeto
    """
)
projeto_tables.toPandas()

Unnamed: 0,namespace,tableName,isTemporary
0,projeto,country_series,False
1,projeto,education_ny,False
2,projeto,education_state,False
3,projeto,educationny,False
4,projeto,footnote,False
5,projeto,gender_labourmarket,False
6,projeto,gender_statscountry,False
7,projeto,gender_statsseries,False
8,projeto,gender_statsseries_time,False
9,projeto,genderandlabormarketdataset,False


In [14]:
# Rebuild the external Delta table projeto.Stats_Country: drop any existing
# definition, then declare it again with every column typed as STRING.
# NOTE(review): the "Nacional_accounts_*" column names look like a misspelling
# of "National"; left untouched because renaming would change the schema --
# confirm against whatever writes to this table.
stats_country_drop_sql = """
    DROP TABLE IF EXISTS projeto.Stats_Country
    """

stats_country_create_sql = """
    CREATE EXTERNAL TABLE projeto.Stats_Country (
         Country_Code STRING,
         Short_Name STRING,
         Table_Name STRING,
         Long_name STRING,
         2_alpha_code STRING,
         Currency_Unit STRING,
         Special_Notes STRING,
         Region STRING,
         Income_Group STRING,
         WB_2_code STRING,
         Nacional_accounts_base_year STRING,
         Nacional_accounts_reference_year STRING,  -- Modified to STRING
         SNA_price_valuation STRING,
         Lending_category STRING,
         Other_groups STRING,
         System_of_National_Accounts STRING,
         Balance_of_Payments_Manual_in_use STRING,
         External_debt_Reporting_status STRING,
         System_of_trade STRING, 
         Government_Accounting_concept STRING,
         IMF_data_dissemination_standard STRING,
         Latest_population_census STRING,
         Latest_household_survey STRING,
         Source_of_most_recent_Income_and_expenditure_data STRING,
         Vital_registration_complete STRING,
         Latest_agricultural_census STRING,
         Latest_industrial_census STRING,  -- Modified to STRING
         Latest_trade_data STRING  -- Modified to STRING
    )
    USING DELTA
    LOCATION 'hdfs://hdfs-nn:9000/warehouse/projeto.db/Stats_Country'
    """

spark.sql(stats_country_drop_sql)
spark.sql(stats_country_create_sql)


DataFrame[]

In [None]:
# Rebuild the external Delta table projeto.Stats_Data: four descriptive string
# columns followed by one double column per year.
#
# The year columns are generated rather than typed by hand. The original
# hand-written list jumped from `1996` straight to `1998`, so the table had no
# `1997` column. That looks like an accidental omission; generating the range
# makes the list contiguous. NOTE(review): confirm the source data really has
# a 1997 column before loading.
stats_data_first_year = 1960
stats_data_last_year = 2022
stats_data_year_columns = ",\n        ".join(
    f"`{year}` double"
    for year in range(stats_data_first_year, stats_data_last_year + 1)
)

spark.sql(
    """
    DROP TABLE IF EXISTS projeto.Stats_Data
    """
)

spark.sql(
    f"""
    CREATE EXTERNAL TABLE projeto.Stats_Data (
        country_name String,
        country_code String,
        indicator_name String,
        indicator_code String,
        {stats_data_year_columns}
    )
    USING DELTA
    LOCATION 'hdfs://hdfs-nn:9000/warehouse/projeto.db/Stats_Data/'
    """
)

In [3]:
# Rebuild the external Delta table projeto.EducationNY: drop any existing
# definition, then declare it again. Count columns are INT; the percentage
# columns are declared as String.
education_ny_drop_sql = """
    DROP TABLE IF EXISTS projeto.EducationNY
    """

education_ny_create_sql = """
CREATE EXTERNAL TABLE projeto.EducationNY (
    DBN String,
    SchoolName String,
    CohortYear INT,
    CohortCategory String,
    Gender String, 
    TotalCohortNum INT, 
    TotalGradsNum INT, 
    TotalGradsPctOfCohort String, 
    TotalRegentsNum INT,
    TotalRegentsPctOfCohort String,
    TotalRegentsPctOfGrads String, 
    AdvancedRegentsNum INT, 
    AdvancedRegentsPctOfCohort String,
    AdvancedRegentsPctOfGrads String,
    RegentsWoAdvancedNum INT,
    RegentsWoAdvancedPctOfCohort String,
    RegentsWoAdvancedPctOfGrads String,
    LocalNum INT,
    LocalPctOfCohort String,
    LocalPctOfGrads String,
    StillEnrolledNum INT,
    StillEnrolledPctOfCohort String,
    DroppedOutNum INT, 
    DroppedOutPctOfCohort String,
    Borough String
)
USING DELTA
    LOCATION 'hdfs://hdfs-nn:9000/warehouse/projeto.db/EducationNY/'
    """

spark.sql(education_ny_drop_sql)
spark.sql(education_ny_create_sql)

DataFrame[]

In [4]:
# Rebuild the external Delta table projeto.ShootingsNY: drop any existing
# definition, then declare it again with an explicit schema and location.
# NOTE(review): the table is named ShootingsNY but its LOCATION directory is
# "Shootings_NY"; every other table here uses a directory matching its name.
# Left as-is because changing it would move the data path -- confirm intent.
shootings_ny_drop_sql = """
    DROP TABLE IF EXISTS projeto.ShootingsNY
    """

shootings_ny_create_sql = """
CREATE EXTERNAL TABLE projeto.ShootingsNY (
    Area INT,
    Jurisdiction_Code INT,
    X_Coord_Cd INT,
    Y_Coord_Cd INT,
    Incident_Key STRING,
    Occur_Date STRING,
    Occur_Time STRING,
    Borough STRING,
    Location STRING,
    Location_Classification STRING,
    Location_Desc STRING,
    Statistical_Murder_Flag STRING,
    Perp_Age_Group STRING,
    Gender STRING,
    Perp_Race STRING,
    Vic_Age_Group STRING,
    Vic_Sex STRING,
    Vic_Race STRING,
    Longitude STRING,
    Latitude STRING,
    New_Georeferenced_Column STRING
)
USING DELTA
LOCATION 'hdfs://hdfs-nn:9000/warehouse/projeto.db/Shootings_NY/'
"""

spark.sql(shootings_ny_drop_sql)
spark.sql(shootings_ny_create_sql)

DataFrame[]

In [3]:
# Rebuild the external Delta table projeto.EarningsUS: drop any existing
# definition, then declare it again with an explicit schema and location.
earnings_us_drop_sql = """
    DROP TABLE IF EXISTS projeto.EarningsUS
    """

earnings_us_create_sql = """
    CREATE EXTERNAL TABLE projeto.EarningsUS (
        State String,
        Male Double,
        Female Double
    )
    USING DELTA
    LOCATION 'hdfs://hdfs-nn:9000/warehouse/projeto.db/EarningsUS/'
    """

spark.sql(earnings_us_drop_sql)
spark.sql(earnings_us_create_sql)

DataFrame[]