In [1]:
from pyspark.sql import SparkSession
from pyspark.sql import Row
from delta import *
from pyspark.sql.functions import expr, round, col
from pyspark.sql.types import DoubleType, StringType, StructField, StructType, IntegerType, FloatType

# Root HDFS directory handed to Spark as `spark.sql.warehouse.dir`.
warehouse_location = 'hdfs://hdfs-nn:9000/warehouse'

# Session configuration, written as a parenthesised method chain.
# Backslash continuations are avoided on purpose: a trailing "\" after the
# last call silently depends on the following line being blank, and breaks
# as soon as that blank line (or any trailing space) changes.
builder = (
    SparkSession.builder
    .master("local[2]")
    .appName("Python Spark DataFrames and SQL")
    .config("spark.sql.warehouse.dir", warehouse_location)
    .config("hive.metastore.uris", "thrift://hive-metastore:9083")
    # Delta Lake: SQL extension, catalog implementation and the jar to fetch.
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
    .config("spark.jars.packages", "io.delta:delta-core_2.12:2.4.0")
    .enableHiveSupport()
)

# Alternative kept for reference (not used here):
#   spark = configure_spark_with_delta_pip(builder).getOrCreate()
spark = builder.getOrCreate()

In [2]:
# HDFS location of the raw EducationUS CSV file; it is loaded further down
# in the notebook with spark.read.csv().
hdfs_path = "hdfs://hdfs-nn:9000/projeto/bronze/EducationUS.csv"

In [8]:
# NOTE(review): `F` is not used in this cell; the import is kept in case other
# cells rely on it. Ideally it would live in the notebook's first import cell.
from pyspark.sql import functions as F

# Source CSV header -> column name used in the stored table.
# Every column listed here is cast to float and then renamed.
education_column_names = {
    "Less than a High School Diploma (Women)": "Women_Less_than_High_School_Diploma",
    "Less than a High School Diploma (Men)": "Men_Less_than_High_School_Diploma",
    "High School Diploma or the Equivalent Only (Women)": "Women_High_School_Diploma",
    "High School Diploma or the Equivalent Only (Men)": "Men_High_School_Diploma",
    "Some College or an Associate's Degree (Women)": "Women_College_or_Associate_Diploma",
    "Some College or an Associate's Degree (Men)": "Men_College_or_Associate_Diploma",
    "Bachelor's Degree or Higher (Women)": "Women_Bachelor_Diploma_or_Higher",
    "Bachelor's Degree or Higher (Men)": "Men_Bachelor_Diploma_or_Higher",
}

# Read the CSV, treating its first line as the header row.
# The raw frame keeps its own name so re-running the transformation below
# always starts from the untouched input.
education_raw = spark.read.option("header", True).csv(hdfs_path)

# Cast each measure column to float, then give it its table-friendly name.
# withColumn / withColumnRenamed both keep the column position, so the
# resulting column order is the same as in the CSV.
education = education_raw
for source_name, target_name in education_column_names.items():
    education = (
        education
        .withColumn(source_name, col(source_name).cast("float"))
        .withColumnRenamed(source_name, target_name)
    )

education.printSchema()
education.show()
#education.toPandas()
# Use the command above for a nicer preview of the table.

# NOTE(review): this schema is defined but never passed to the reader or the
# writer in this cell. It also declares a "Country" field that does not appear
# in the schema printed for `education`. It is left unchanged in case another
# cell references `customSchema` -- confirm and remove or fix.
customSchema = StructType([
    StructField("Country", StringType(), True),
    StructField("State", StringType(), True),
    StructField("Men_Less_than_High_School_Diploma", FloatType(), True),
    StructField("Women_Less_than_High_School_Diploma", FloatType(), True),
    StructField("Men_High_School_Diploma", FloatType(), True),
    StructField("Women_High_School_Diploma", FloatType(), True),
    StructField("Men_College_or_Associate_Diploma", FloatType(), True),
    StructField("Women_College_or_Associate_Diploma", FloatType(), True),
    StructField("Men_Bachelor_Diploma_or_Higher", FloatType(), True),
    StructField("Women_Bachelor_Diploma_or_Higher", FloatType(), True),
])

# Persist the transformed frame in Delta format, replacing any data already
# stored at this path.
(
    education.write
    .format("delta")
    .mode("overwrite")
    .save("hdfs://hdfs-nn:9000/warehouse/projeto.db/Education_State")
)


root
 |-- State: string (nullable = true)
 |-- Women_Less_than_High_School_Diploma: float (nullable = true)
 |-- Men_Less_than_High_School_Diploma: float (nullable = true)
 |-- Women_High_School_Diploma: float (nullable = true)
 |-- Men_High_School_Diploma: float (nullable = true)
 |-- Women_College_or_Associate_Diploma: float (nullable = true)
 |-- Men_College_or_Associate_Diploma: float (nullable = true)
 |-- Women_Bachelor_Diploma_or_Higher: float (nullable = true)
 |-- Men_Bachelor_Diploma_or_Higher: float (nullable = true)

+--------------------+-----------------------------------+---------------------------------+-------------------------+-----------------------+----------------------------------+--------------------------------+--------------------------------+------------------------------+
|               State|Women_Less_than_High_School_Diploma|Men_Less_than_High_School_Diploma|Women_High_School_Diploma|Men_High_School_Diploma|Women_College_or_Associate_Diploma|Men_College_o

In [9]:
# Column names and types of the transformed DataFrame.
education.printSchema()

# Plain-text preview printed by Spark.
education.show()

# Collect the DataFrame to pandas; as the cell's last expression it is
# rendered by the notebook as a rich table.
education_preview = education.toPandas()
education_preview

root
 |-- State: string (nullable = true)
 |-- Women_Less_than_High_School_Diploma: float (nullable = true)
 |-- Men_Less_than_High_School_Diploma: float (nullable = true)
 |-- Women_High_School_Diploma: float (nullable = true)
 |-- Men_High_School_Diploma: float (nullable = true)
 |-- Women_College_or_Associate_Diploma: float (nullable = true)
 |-- Men_College_or_Associate_Diploma: float (nullable = true)
 |-- Women_Bachelor_Diploma_or_Higher: float (nullable = true)
 |-- Men_Bachelor_Diploma_or_Higher: float (nullable = true)

+--------------------+-----------------------------------+---------------------------------+-------------------------+-----------------------+----------------------------------+--------------------------------+--------------------------------+------------------------------+
|               State|Women_Less_than_High_School_Diploma|Men_Less_than_High_School_Diploma|Women_High_School_Diploma|Men_High_School_Diploma|Women_College_or_Associate_Diploma|Men_College_o

Unnamed: 0,State,Women_Less_than_High_School_Diploma,Men_Less_than_High_School_Diploma,Women_High_School_Diploma,Men_High_School_Diploma,Women_College_or_Associate_Diploma,Men_College_or_Associate_Diploma,Women_Bachelor_Diploma_or_Higher,Men_Bachelor_Diploma_or_Higher
0,Alabama,14.3,16.0,30.299999,32.5,31.9,28.0,23.5,23.4
1,Alaska,7.7,9.0,22.299999,30.4,36.799999,35.099998,33.200001,25.5
2,Arizona,13.8,14.5,25.1,24.4,34.299999,32.900002,26.799999,28.200001
3,Arkansas,15.0,16.1,34.299999,37.299999,30.0,26.200001,20.700001,20.5
4,California,18.0,18.6,20.5,21.299999,30.6,28.9,30.9,31.200001
5,Colorado,8.8,10.2,21.5,21.4,32.200001,30.5,37.5,37.900002
6,Connecticut,9.8,10.9,26.799999,28.299999,26.1,23.1,37.400002,37.700001
7,Delaware,11.6,12.2,30.299999,32.099998,28.700001,25.799999,29.299999,30.0
8,District of Columbia,10.0,10.1,18.9,19.4,17.5,14.9,53.5,55.599998
9,Florida,12.5,14.0,29.6,29.200001,31.200001,28.9,26.700001,27.9


In [10]:
# Query the `projeto.Education_State` table through Spark SQL and print it.
# NOTE(review): presumably this table is backed by the Delta path written
# earlier in the notebook (.../projeto.db/Education_State) -- confirm.
education_state_sql = (
    """
    SELECT * FROM projeto.Education_State
    """
)
spark.sql(education_state_sql).show()

+--------------------+---------------------------------+-----------------------------------+-----------------------+-------------------------+--------------------------------+----------------------------------+------------------------------+--------------------------------+
|               State|Men_Less_than_High_School_Diploma|Women_Less_than_High_School_Diploma|Men_High_School_Diploma|Women_High_School_Diploma|Men_College_or_Associate_Diploma|Women_College_or_Associate_Diploma|Men_Bachelor_Diploma_or_Higher|Women_Bachelor_Diploma_or_Higher|
+--------------------+---------------------------------+-----------------------------------+-----------------------+-------------------------+--------------------------------+----------------------------------+------------------------------+--------------------------------+
|             Alabama|                             16.0|                               14.3|                   32.5|                     30.3|                            28.0|