In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import *

# Build (or reuse) the Hive-enabled Spark session used by this notebook.
#
# Hadoop/Hive properties are passed with the "spark.hadoop." prefix: Spark only
# copies prefixed keys into the SparkContext's Hadoop configuration, so the
# unprefixed forms do not reach RDD-level file access.
# "hive.metastore.warehouse.dir" is intentionally not set: it has been deprecated
# since Spark 2.0 in favour of "spark.sql.warehouse.dir", which is set below.
spark = (
    SparkSession.builder
    .master("spark://spark-master:7077")  # Points to the Spark cluster
    .appName("schema-test")  # Name the app
    .config("spark.hadoop.hive.metastore.uris", "thrift://hive-metastore:9083")  # External Hive Metastore
    .config("spark.sql.warehouse.dir", "hdfs://hdfs-namenode:9000/hadoop/warehouse/")  # Default warehouse dir
    .config("spark.hadoop.hive.metastore.schema.verification", "false")  # Disable metastore schema version verification
    .config("spark.hadoop.fs.defaultFS", "hdfs://hdfs-namenode:9000/")  # Default file system: the HDFS namenode
    .enableHiveSupport()
    .getOrCreate()
)

# Handle to the underlying SparkContext.
sc = spark.sparkContext

In [2]:
# Explicit schema: two nullable columns, a string `name` and an integer `age`.
person_schema = (
    StructType()
    .add("name", StringType(), True)
    .add("age", IntegerType(), True)
)

# Sample rows, given as dicts keyed by column name.
person_rows = [
    {"name": "Jhon", "age": 35},
    {"name": "Eric", "age": 31},
]

df = spark.createDataFrame(person_rows, schema=person_schema)

df.show()

+----+---+
|name|age|
+----+---+
|Jhon| 35|
|Eric| 31|
+----+---+



In [3]:
# Print the DataFrame's schema as a tree to check column types and nullability.
df.printSchema()

root
 |-- name: string (nullable = true)
 |-- age: integer (nullable = true)



In [5]:
# Persist the DataFrame as Parquet under the warehouse test directory,
# replacing whatever is already stored at that path.
(
    df.write
    .format("parquet")
    .mode("overwrite")
    .save("hdfs://hdfs-namenode:9000/hadoop/warehouse/test/")
)

In [14]:
# Register the `default.persons` table over an explicit HDFS location.
# IF NOT EXISTS keeps the cell re-runnable: without it, a second execution
# fails because the table is already registered in the metastore.
spark.sql("""
    CREATE EXTERNAL TABLE IF NOT EXISTS default.persons (name string, age int)
    USING PARQUET
    LOCATION 'hdfs://hdfs-namenode:9000/hadoop/warehouse/persons/'
""").show()

++
||
++
++



In [15]:
# List the databases (namespaces) visible through the session's catalog.
databases_df = spark.sql("show databases")
databases_df.show()

+---------+
|namespace|
+---------+
|  default|
+---------+



In [16]:
# List the tables registered in the current database.
tables_df = spark.sql("show tables")
tables_df.show()

+---------+---------+-----------+
|namespace|tableName|isTemporary|
+---------+---------+-----------+
|  default|  persons|      false|
+---------+---------+-----------+



In [10]:
# Insert the rows of `df` into the existing `default.persons` table.
# NOTE(review): per the Spark docs insertInto matches columns by position, not by
# name, and presumably appends on every run — confirm duplicates on re-run are acceptable.
df.write.insertInto("default.persons")

In [4]:
# Load the Parquet files stored in the warehouse test directory and preview them.
tt = (
    spark.read
    .format("parquet")
    .load("hdfs://hdfs-namenode:9000/hadoop/warehouse/test/")
)

tt.show()

+----+---+
|name|age|
+----+---+
|Jhon| 35|
|Eric| 31|
+----+---+



In [6]:
# Read back every row of the `default.persons` table.
# NOTE(review): the output recorded below is TABLE_OR_VIEW_NOT_FOUND. This cell's
# execution count (6) is lower than the CREATE TABLE cell's (14), so it was
# presumably run while the table was not registered — re-run the notebook top to
# bottom to confirm.
spark.sql("select * from default.persons").show()

AnalysisException: [TABLE_OR_VIEW_NOT_FOUND] The table or view `default`.`persons` cannot be found. Verify the spelling and correctness of the schema and catalog.
If you did not qualify the name with a schema, verify the current_schema() output, or qualify the name with the correct schema and catalog.
To tolerate the error on drop use DROP VIEW IF EXISTS or DROP TABLE IF EXISTS.; line 1 pos 14;
'Project [*]
+- 'UnresolvedRelation [default, persons], [], false
