In [1]:
from pyspark.sql import SparkSession
from pyspark.sql import Row


In [2]:
# Root directory on HDFS that Spark SQL is told to use as its warehouse.
warehouse_location = 'hdfs://hdfs-nn:9000/warehouse'

# Build the session for this notebook with Hive support enabled and pointed at
# the remote metastore. Parentheses replace backslash continuations so the
# builder chain can be reordered or commented line by line.
spark = (
    SparkSession.builder
    .appName("American Crimes")
    .config("spark.sql.warehouse.dir", warehouse_location)
    .config("hive.metastore.uris", "thrift://hive-metastore:9083")
    .enableHiveSupport()
    .getOrCreate()
)

In [3]:
# List the databases visible to this Spark session before anything is changed.
databases_df = spark.sql(
    """
    SHOW DATABASES
    """
)
databases_df.show()

+--------------+
|     namespace|
+--------------+
|americancrimes|
|       default|
|          demo|
+--------------+



In [4]:
# Remove any previous copy of the database. CASCADE also drops the tables it
# contains, so this is destructive by design.
drop_database_sql = """
    DROP DATABASE IF EXISTS americancrimes CASCADE
    """
spark.sql(drop_database_sql)

DataFrame[]

In [5]:
# Create the database under the configured warehouse root. The location is
# derived from `warehouse_location` (set next to the Spark session) instead of
# repeating the HDFS URI, so the warehouse root is defined in exactly one place.
spark.sql(
    f"""
    CREATE DATABASE americancrimes LOCATION '{warehouse_location}/americancrimes.db/'
    """
)

DataFrame[]

In [6]:
# List the databases again to confirm that americancrimes is present after the
# drop/create sequence.
show_databases_sql = """
    SHOW DATABASES
    """
spark.sql(show_databases_sql).show()

+--------------+
|     namespace|
+--------------+
|americancrimes|
|       default|
|          demo|
+--------------+



In [7]:
# List the tables of the americancrimes database; none have been created yet
# at this point in the notebook.
tables_df = spark.sql(
    """
    SHOW TABLES FROM americancrimes
    """
)
tables_df.show()

+--------+---------+-----------+
|database|tableName|isTemporary|
+--------+---------+-----------+
+--------+---------+-----------+



In [8]:
# Drop any existing definition first so this cell can be re-run safely.
# DROP TABLE returns no rows, so this .show() only prints an empty frame.
spark.sql(
    """
    DROP TABLE IF EXISTS americancrimes.crime_by_race
    """
).show()

# Parquet-backed table partitioned by `year`: one row per charged offense with
# an INT column per race group (presumably counts -- confirm against the loader).
# The LOCATION is derived from `warehouse_location` (set next to the Spark
# session) rather than repeating the HDFS URI, so the warehouse root is
# configured in a single place.
spark.sql(
    f"""
    CREATE TABLE americancrimes.crime_by_race (
        offense_charged VARCHAR(50),
        white INT,
        black INT,
        native INT,
        asian_or_pacific_islander INT
    )
    STORED AS PARQUET
    PARTITIONED BY (
        year INT
    )
    LOCATION '{warehouse_location}/americancrimes.db/crime_by_race/'
    """
)

++
||
++
++



DataFrame[]

In [9]:
# Check that crime_by_race is now listed in the americancrimes database.
show_tables_sql = """
    SHOW TABLES FROM americancrimes
    """
spark.sql(show_tables_sql).show()

+--------------+-------------+-----------+
|      database|    tableName|isTemporary|
+--------------+-------------+-----------+
|americancrimes|crime_by_race|      false|
+--------------+-------------+-----------+



In [10]:
# Preview crime_by_race. The table was only just created, so this mainly
# confirms the column layout, including the `year` partition column.
crime_by_race_df = spark.sql(
    """
    SELECT *
    FROM americancrimes.crime_by_race
    """
)
crime_by_race_df.show()

+---------------+-----+-----+------+-------------------------+----+
|offense_charged|white|black|native|asian_or_pacific_islander|year|
+---------------+-----+-----+------+-------------------------+----+
+---------------+-----+-----+------+-------------------------+----+



In [11]:
# Drop any existing definition first so this cell can be re-run safely.
# DROP TABLE returns no rows, so this .show() only prints an empty frame.
spark.sql(
    """
    DROP TABLE IF EXISTS americancrimes.crime_by_age
    """
).show()

# Parquet-backed table partitioned by `year`: one row per (offense, age)
# combination with an INT `quantity`. `age` is a VARCHAR, presumably because it
# holds age-bracket labels rather than numbers -- confirm against the loader.
# The LOCATION is derived from `warehouse_location` (set next to the Spark
# session) rather than repeating the HDFS URI, so the warehouse root is
# configured in a single place.
spark.sql(
    f"""
    CREATE TABLE americancrimes.crime_by_age (
        offense_charged VARCHAR(50),
        age VARCHAR(10),
        quantity INT
    )
    STORED AS PARQUET
    PARTITIONED BY (
        year INT
    )
    LOCATION '{warehouse_location}/americancrimes.db/crime_by_age/'
    """
)

++
||
++
++



DataFrame[]

In [12]:
# Both crime_by_age and crime_by_race should now be listed.
tables_df = spark.sql(
    """
    SHOW TABLES FROM americancrimes
    """
)
tables_df.show()

+--------------+-------------+-----------+
|      database|    tableName|isTemporary|
+--------------+-------------+-----------+
|americancrimes| crime_by_age|      false|
|americancrimes|crime_by_race|      false|
+--------------+-------------+-----------+



In [13]:
# Preview crime_by_age. The table was only just created, so this mainly
# confirms the column layout, including the `year` partition column.
preview_crime_by_age_sql = """
    SELECT *
    FROM americancrimes.crime_by_age
    """
spark.sql(preview_crime_by_age_sql).show()

+---------------+---+--------+----+
|offense_charged|age|quantity|year|
+---------------+---+--------+----+
+---------------+---+--------+----+



In [14]:
# Shut down the Spark session now that the schema setup is complete; any cell
# run after this needs the session to be created again first.
spark.stop()