In [2]:
from pyspark.sql import SparkSession
from pyspark.sql import Row


In [3]:
# HDFS directory handed to Spark as its SQL warehouse directory.
warehouse_location = 'hdfs://namenode:8020/warehouse'

# Build (or reuse, via getOrCreate) a Hive-enabled session that is pointed at
# the Thrift metastore endpoint below.
spark = (
    SparkSession.builder
    .appName("American Crimes")
    .config("spark.sql.warehouse.dir", warehouse_location)
    .config("hive.metastore.uris", "thrift://hive-metastore:9083")
    .enableHiveSupport()
    .getOrCreate()
)

21/11/19 17:33:55 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


In [4]:
# List the databases visible to this session before anything is changed.
show_databases_query = """
    SHOW DATABASES
    """
spark.sql(show_databases_query).show()

+---------+
|namespace|
+---------+
|  default|
+---------+



In [15]:
# Remove the sensors database if it is present; CASCADE also drops whatever
# it contains, so the CREATE DATABASE that follows starts from scratch.
drop_sensors_query = """
    DROP DATABASE IF EXISTS sensors CASCADE
    """
spark.sql(drop_sensors_query)

DataFrame[]

In [16]:
# Create the sensors database with an explicit HDFS location.
create_sensors_query = """
    CREATE DATABASE sensors LOCATION 'hdfs://namenode:8020/warehouse/sensors.db/'
    """
spark.sql(create_sensors_query)

DataFrame[]

In [17]:
# List the databases again to confirm that sensors is now registered.
show_databases_query = """
    SHOW DATABASES
    """
spark.sql(show_databases_query).show()

+---------+
|namespace|
+---------+
|  default|
|  sensors|
+---------+



In [18]:
# List the tables currently registered in the sensors database.
show_sensors_tables_query = """
    SHOW TABLES FROM sensors
    """
spark.sql(show_sensors_tables_query).show()

+--------+---------+-----------+
|database|tableName|isTemporary|
+--------+---------+-----------+
+--------+---------+-----------+



In [8]:
# Drop the table this cell is about to create, so the cell can be re-run
# without CREATE TABLE failing because the table already exists.
# (The DROP previously targeted sensors.fails, which is unrelated to the
# table created below.)
spark.sql(
    """
    DROP TABLE IF EXISTS americancrimes.crime_by_race
    """
).show()

# Parquet-backed table holding one count column per race for each offense,
# partitioned by year.
# NOTE(review): no visible cell creates the americancrimes database, and this
# LOCATION uses hdfs-nn:9000 while the session's warehouse directory uses
# namenode:8020 -- confirm both against the target cluster.
spark.sql(
    """
    CREATE TABLE americancrimes.crime_by_race (
        offense_charged VARCHAR(50),
        white INT,
        black INT,
        native INT,
        asian_or_pacific_islander INT
    )
    STORED AS PARQUET
    PARTITIONED BY (
        year INT
    )
    LOCATION 'hdfs://hdfs-nn:9000/warehouse/americancrimes.db/crime_by_race/'
    """
)

++
||
++
++



DataFrame[]

In [9]:
# List the tables registered in the americancrimes database.
show_crime_tables_query = """
    SHOW TABLES FROM americancrimes
    """
spark.sql(show_crime_tables_query).show()

+--------------+-------------+-----------+
|      database|    tableName|isTemporary|
+--------------+-------------+-----------+
|americancrimes|crime_by_race|      false|
+--------------+-------------+-----------+



In [10]:
# Display the rows of crime_by_race (all columns) with the default show() limit.
select_crime_by_race_query = """
    SELECT *
    FROM americancrimes.crime_by_race
    """
spark.sql(select_crime_by_race_query).show()

+---------------+-----+-----+------+-------------------------+----+
|offense_charged|white|black|native|asian_or_pacific_islander|year|
+---------------+-----+-----+------+-------------------------+----+
+---------------+-----+-----+------+-------------------------+----+



In [11]:
# Remove any earlier crime_by_age table so the CREATE TABLE below can run again.
drop_crime_by_age_query = """
    DROP TABLE IF EXISTS americancrimes.crime_by_age
    """
spark.sql(drop_crime_by_age_query).show()

# Parquet-backed table with an offense, an age value and a quantity per row,
# partitioned by year.
create_crime_by_age_query = """
    CREATE TABLE americancrimes.crime_by_age (
        offense_charged VARCHAR(50),
        age VARCHAR(10),
        quantity INT
    )
    STORED AS PARQUET
    PARTITIONED BY (
        year INT
    )
    LOCATION 'hdfs://hdfs-nn:9000/warehouse/americancrimes.db/crime_by_age/'
    """
spark.sql(create_crime_by_age_query)

++
||
++
++



DataFrame[]

In [12]:
# List the americancrimes tables again after creating crime_by_age.
show_crime_tables_query = """
    SHOW TABLES FROM americancrimes
    """
spark.sql(show_crime_tables_query).show()

+--------------+-------------+-----------+
|      database|    tableName|isTemporary|
+--------------+-------------+-----------+
|americancrimes| crime_by_age|      false|
|americancrimes|crime_by_race|      false|
+--------------+-------------+-----------+



In [13]:
# Display the rows of crime_by_age (all columns) with the default show() limit.
select_crime_by_age_query = """
    SELECT *
    FROM americancrimes.crime_by_age
    """
spark.sql(select_crime_by_age_query).show()

+---------------+---+--------+----+
|offense_charged|age|quantity|year|
+---------------+---+--------+----+
+---------------+---+--------+----+



In [14]:
# Stop the Spark session; later cells would need to build a new one.
spark.stop()