In [None]:
import os
import sys

import numpy as np
import pandas as pd

# Report the environment this notebook runs in.
# NOTE(review): `spark` is not created anywhere in this notebook; it is assumed
# to be a SparkSession pre-injected by the kernel -- confirm.
# `.get` keeps the cell runnable when PYTHONPATH is not set in the environment.
print("PYTHONPATH: {}".format(os.environ.get('PYTHONPATH', '(not set)')))
print("Spark: {}".format(spark.version))
print("Python: {}".format(sys.version))
spark.sparkContext

## Show Hive Tables

In [None]:
# List the tables visible to the current session as a pandas frame
(
    spark
    .sql("show tables")
    .toPandas()
)

## Leo, guardo y consulto tabla externa Parquet

In [None]:
# Small pandas frame (a day label and a sales figure per row), then the same
# data as a Spark DataFrame; the last line displays it back as pandas.
days = pd.DataFrame(
    dict(
        day=["mon", "tue", "wed", "fri"],
        sales=[5, 1, 2, 3],
    )
)
df = spark.createDataFrame(days)
df.toPandas()

In [None]:
# Write the data as Parquet, replacing whatever is already at that path
dataDir = "/tmp/days_parquet"
df.write.mode("overwrite").parquet(dataDir)

# (Re)create the external table on top of that directory
spark.sql("DROP TABLE IF EXISTS days_ext")
spark.sql(
    "CREATE EXTERNAL TABLE days_ext (day string, sales bigint) "
    f"STORED AS PARQUET LOCATION '{dataDir}'"
)

In [None]:
# Query the table, sorted by day
(
    spark.sql("SELECT * FROM days_ext")
    .orderBy("day")
    .toPandas()
)

In [None]:
# Import only the function that is used: a wildcard import of
# pyspark.sql.functions shadows Python builtins such as sum, min, max, abs
# and round for every later cell.
from pyspark.sql.functions import col

# Same days, with the sales column doubled
df2 = df.select('day', (2 * col('sales')).alias('sales'))
df2.toPandas()

In [None]:
# Append the rows of df2 to the Parquet directory at dataDir
df2.write.mode("append").parquet(dataDir)

In [None]:
# Files were appended to the table's directory by writing to the path directly,
# not through the table. Spark caches metadata for Parquet-backed metastore
# tables, so refresh it first; otherwise the query may not see the new files.
spark.sql("REFRESH TABLE days_ext")

# Query the table again, sorted by day
spark.sql("SELECT * FROM days_ext").orderBy('day').toPandas()

In [None]:
# Drop the table definition, then list the tables that remain
spark.sql("DROP TABLE IF EXISTS days_ext")
(
    spark
    .sql("show tables")
    .toPandas()
)

In [None]:
%%bash
# Delete the /tmp/days_parquet directory from HDFS: -r recurses, -f does not
# fail when the path is missing, -skipTrash deletes without moving to trash.
# NOTE(review): presumably needed because dropping the external table above
# leaves its data files in place -- confirm.
hdfs dfs -rm -r -f -skipTrash /tmp/days_parquet

## Guardar DataFrame como tabla Hive

In [None]:
# Column names: the text before the first ':' on each line of the domain file
house_dom = (
    spark.sparkContext
    .textFile("/data/CaliforniaHousing/cal_housing.domain")
    .map(lambda line: line.split(":")[0])
)
columns = house_dom.collect()

# Read the data file as CSV with schema inference and apply those column names
house_data = (
    spark.read
    .csv("/data/CaliforniaHousing/cal_housing.data", inferSchema=True)
    .toDF(*columns)
)
house_data.printSchema()

In [None]:
# Save the DataFrame as table "housing" in ORC format, overwriting any existing one
house_data.write.saveAsTable("housing", format="orc", mode="overwrite")

In [None]:
# Show the first five rows of the saved table
(
    spark.sql("SELECT * FROM housing")
    .limit(5)
    .toPandas()
)

## Guardar Dataframe como TempView y guardar con CTAS

In [None]:
from pyspark.sql import Row

# Row factory with two fields: key and value
Record = Row("key", "value")

# Four rows (keys 1..4), registered as a temporary view so SQL can query them
records_df = spark.createDataFrame(
    [Record(key, f"val_{key}") for key in range(1, 5)]
)
records_df.createOrReplaceTempView("records_view")

(
    spark.sql("SELECT * FROM records_view")
    .toPandas()
)

In [None]:
# Remove any previous `records` table before it is recreated with CTAS
spark.sql(
    "DROP TABLE IF EXISTS records"
)

In [None]:
%%bash
# Delete the /tmp/records directory from HDFS before the table is recreated
# at that location: -r recurses, -f does not fail when the path is missing,
# -skipTrash deletes without moving to trash.
hdfs dfs -rm -r -f -skipTrash /tmp/records

In [None]:
# CTAS: create external ORC table `records` at /tmp/records from the temp view
spark.sql(
    "CREATE EXTERNAL TABLE records "
    "STORED AS orc "
    "LOCATION '/tmp/records' "
    "AS SELECT * FROM records_view"
)

In [None]:
# Print the list of tables to the cell's stdout
(
    spark
    .sql("SHOW TABLES")
    .show()
)

In [None]:
# Read back the table created by the CTAS statement
(
    spark.sql("SELECT * FROM records")
    .toPandas()
)

## saveAsTable

Recomiendo no usar `saveAsTable` porque es inestable y da error en varias situaciones.