# Registrar tablas en datos externos con opciones de lectura

<a href="https://docs.databricks.com/spark/latest/spark-sql/language-manual/sql-ref-syntax-ddl-create-table-using.html" target="_blank">DDL</a>

<strong>
<code>
CREATE TABLE identificador_de_tabla (nombre_col1 tipo_col1, ...)<br/>
USING fuente_de_datos<br/>
OPTIONS (clave1 = valor1, clave2 = valor2, ...)<br/>
LOCATION ruta<br/>
</code>
</strong>

Spark admite muchas <a href="https://docs.databricks.com/data/data-sources/index.html" target="_blank">fuentes de datos</a> con opciones personalizadas, y sistemas adicionales pueden tener soporte no oficial a través de <a href="https://docs.databricks.com/libraries/index.html" target="_blank">bibliotecas</a> externas.

In [None]:
%pip install pyspark

Collecting pyspark
  Downloading pyspark-3.5.0.tar.gz (316.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m316.9/316.9 MB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.5.0-py2.py3-none-any.whl size=317425345 sha256=67ae6c1a102297f772e4f04db92ced78677c2db042d8b16a03cadfac3e66fce6
  Stored in directory: /root/.cache/pip/wheels/41/4e/10/c2cf2467f71c678cfc8a6b9ac9241e5e44a01940da8fbb17fc
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.5.0


In [None]:
from pyspark.sql import SparkSession
from pyspark import SparkContext

# Start (or reuse) a Spark session running in local mode ('local[*]')
# under the application name 'ext-sources'.
spark = (
    SparkSession.builder
    .master('local[*]')
    .appName('ext-sources')
    .getOrCreate()
)

# Grab the active SparkContext (created on demand if none exists yet).
sc = SparkContext.getOrCreate()

La celda siguiente muestra cómo usar DDL de Spark SQL para crear una tabla sobre una fuente externa de archivos CSV, especificando:

* Los nombres y tipos de columna.
* El formato de archivo.
* El delimitador utilizado para separar campos.
* La presencia de un encabezado.
* La ruta donde se almacenan estos datos.

In [None]:
# Spark SQL DDL statement that registers the California housing CSV directory
# as the table `cal_house`: explicit column names/types, CSV format, a header
# row, a comma delimiter and the path where the files are stored.
# The text starts and ends with a newline.
create_california_housing = (
    '\n'
    'CREATE TABLE IF NOT EXISTS cal_house\n'
    '  (longitude double, latitude double, housing_median_age double,\n'
    '   total_rooms double,total_bedrooms double, population double,\n'
    '   households double, median_income double, median_house_value double\n'
    '  )\n'
    'USING CSV\n'
    'OPTIONS (\n'
    '  header = "true",\n'
    '  delimiter = ","\n'
    ')\n'
    'LOCATION "/content/sample_data/california_housing/"\n'
)

In [None]:
# Show the text of the DDL statement so it can be inspected.
print(create_california_housing)


CREATE TABLE IF NOT EXISTS cal_house
  (longitude double, latitude double, housing_median_age double,
   total_rooms double,total_bedrooms double, population double,
   households double, median_income double, median_house_value double
  )
USING CSV
OPTIONS (
  header = "true",
  delimiter = ","
)
LOCATION "/content/sample_data/california_housing/"



In [None]:
# Run the DDL statement held in `create_california_housing`.
spark.sql(create_california_housing)

# Then list the databases and tables in the catalog and preview three rows
# of the new table, printing each result in order.
for query in (
    'show databases',
    'show tables',
    'select * from default.cal_house limit 3',
):
    spark.sql(query).show()

+---------+
|namespace|
+---------+
|  default|
+---------+

+---------+---------+-----------+
|namespace|tableName|isTemporary|
+---------+---------+-----------+
|  default|cal_house|      false|
+---------+---------+-----------+

+---------+--------+------------------+-----------+--------------+----------+----------+-------------+------------------+
|longitude|latitude|housing_median_age|total_rooms|total_bedrooms|population|households|median_income|median_house_value|
+---------+--------+------------------+-----------+--------------+----------+----------+-------------+------------------+
|  -122.05|   37.37|              27.0|     3885.0|         661.0|    1537.0|     606.0|       6.6085|          344700.0|
|   -118.3|   34.26|              43.0|     1510.0|         310.0|     809.0|     277.0|        3.599|          176500.0|
|  -117.81|   33.78|              27.0|     3589.0|         507.0|    1484.0|     495.0|       5.7934|          270500.0|
+---------+--------+------------------+-----------+--------------+----------+----------+-------------+------------------+

In [None]:
# Directory that holds the California housing CSV files.
cal_path = '/content/sample_data/california_housing/'

# Register the same CSV data under a second table name, this time taking the
# location from a Python variable.
# NOTE: Python f-strings interpolate with `{name}` only. A `$` placed before
# the braces is kept as a literal character, which would turn the location
# into "$/content/..." and point the table at the wrong path.
spark.sql(f"""
CREATE TABLE IF NOT EXISTS cal_house2
  (longitude double, latitude double, housing_median_age double,
   total_rooms double,total_bedrooms double, population double,
   households double, median_income double, median_house_value double
  )
USING CSV
OPTIONS (
  header = "true",
  delimiter = ","
)
LOCATION "{cal_path}"
""")

DataFrame[]

In [None]:
# List the databases and tables currently known to the catalog.
for listing in ('show databases', 'show tables'):
    spark.sql(listing).show()

# Load the second table as a DataFrame and print its schema.
df = spark.sql('select * from cal_house2')
df.printSchema()

# Print the extended table description without truncating long cell values.
spark.sql('describe extended cal_house2').show(truncate=False)

+---------+
|namespace|
+---------+
|  default|
+---------+

+---------+----------+-----------+
|namespace| tableName|isTemporary|
+---------+----------+-----------+
|  default| cal_house|      false|
|  default|cal_house2|      false|
+---------+----------+-----------+

root
 |-- longitude: double (nullable = true)
 |-- latitude: double (nullable = true)
 |-- housing_median_age: double (nullable = true)
 |-- total_rooms: double (nullable = true)
 |-- total_bedrooms: double (nullable = true)
 |-- population: double (nullable = true)
 |-- households: double (nullable = true)
 |-- median_income: double (nullable = true)
 |-- median_house_value: double (nullable = true)

+----------------------------+----------------------------------------------------------------------+-------+
|col_name                    |data_type                                                             |comment|
+----------------------------+----------------------------------------------------------------------+--