In [1]:
from pyspark.sql.functions import expr
from pyspark.sql.functions import from_unixtime

# Load the sample event stream and derive a formatted date plus year/month
# string columns from the `time` field (interpreted by from_unixtime as a
# Unix epoch value in seconds).
events = (
    spark.read
    .option("inferSchema", "true")
    .json("/databricks-datasets/structured-streaming/events/")
    # Format the epoch value directly as yyyy-MM-dd text.
    .withColumn("date", from_unixtime("time", "yyyy-MM-dd"))
    # Characters 1-4 of the date string are the year, 6-7 the month.
    .withColumn("Anio", expr("substring(date,1,4)"))
    .withColumn("Mes", expr("substring(date,6,2)"))
    # The raw epoch column is no longer needed once `date` exists.
    .drop("time")
)

display(events)

action,date,Anio,Mes
Close,2016-07-28,2016,7
Close,2016-07-28,2016,7
Open,2016-07-28,2016,7
Close,2016-07-28,2016,7
Open,2016-07-28,2016,7
Open,2016-07-28,2016,7
Close,2016-07-28,2016,7
Close,2016-07-28,2016,7
Close,2016-07-28,2016,7
Open,2016-07-28,2016,7


In [2]:
# Persist `events` in Delta format under /delta/events/, partitioned by the
# Anio column. "overwrite" mode replaces whatever is already at that path.
(
    events.write
    .format("delta")
    .mode("overwrite")
    .partitionBy("Anio")
    .save("/delta/events/")
)


In [3]:
# List the contents of the Delta table directory.
delta_dir_listing = dbutils.fs.ls("dbfs:/delta/events/")
delta_dir_listing

In [4]:
# Destructive cleanup: recursively removes everything under
# dbfs:/delta/events/. Later cells read the Delta table from that path, so
# running the notebook top to bottom must NOT delete it at this point.
# The removal is therefore opt-in: set the flag to True only when you
# deliberately want to start from scratch, and re-run the cell that writes
# the table before continuing.
RESET_DELTA_EVENTS = False

if RESET_DELTA_EVENTS:
    dbutils.fs.rm("dbfs:/delta/events/", True)  # True -> recursive delete

In [5]:
# Read the Delta table back from storage and preview it.
delta_reader = spark.read.format("delta")
events_delta = delta_reader.load("/delta/events/")

display(events_delta)

action,date,Anio,Mes
Open,2016-07-27,2016,7
Close,2016-07-27,2016,7
Close,2016-07-27,2016,7
Open,2016-07-27,2016,7
Close,2016-07-27,2016,7
Close,2016-07-27,2016,7
Close,2016-07-27,2016,7
Close,2016-07-27,2016,7
Open,2016-07-27,2016,7
Open,2016-07-27,2016,7


In [6]:
# (Re)register the table `events` over the Delta files: drop any previous
# definition, then create the table pointing at the same storage location.
ddl_statements = (
    "DROP TABLE IF EXISTS events",
    "CREATE TABLE events USING DELTA LOCATION '/delta/events/'",
)
for statement in ddl_statements:
    display(spark.sql(statement))

In [7]:
# Total number of rows in the Delta DataFrame loaded above.
rows_loaded = events_delta.count()
rows_loaded

In [8]:
from pyspark.sql.functions import count

# Number of events per (action, date) pair, ordered by date and then action.
events_by_action_and_date = (
    events_delta
    .groupBy("action", "date")
    .agg(count("action").alias("action_count"))
    .orderBy("date", "action")
)
display(events_by_action_and_date)

action,date,action_count
Close,2016-07-26,20165
Open,2016-07-26,21176
Close,2016-07-27,24015
Open,2016-07-27,24002
Close,2016-07-28,5820
Open,2016-07-28,4822


In [9]:
# Build a "historical" variant of the sample events: the same records with
# `time` moved back by 172800 seconds (2 * 24 * 60 * 60, i.e. two days)
# before it is formatted as a yyyy-MM-dd date string.
historical_events = (
    spark.read
    .option("inferSchema", "true")
    .json("/databricks-datasets/structured-streaming/events/")
    .withColumn("date", from_unixtime(expr("time-172800"), "yyyy-MM-dd"))
    .drop("time")
    # Characters 1-4 of the date string are the year, 6-7 the month.
    .withColumn("Anio", expr("substring(date,1,4)"))
    .withColumn("Mes", expr("substring(date,6,2)"))
)
display(historical_events)

action,date,Anio,Mes
Close,2016-07-26,2016,7
Close,2016-07-26,2016,7
Open,2016-07-26,2016,7
Close,2016-07-26,2016,7
Open,2016-07-26,2016,7
Open,2016-07-26,2016,7
Close,2016-07-26,2016,7
Close,2016-07-26,2016,7
Close,2016-07-26,2016,7
Open,2016-07-26,2016,7


In [10]:
# Append the shifted records to the existing Delta data at /delta/events/,
# using the same partition column (Anio) as the initial write.
(
    historical_events.write
    .format("delta")
    .mode("append")
    .partitionBy("Anio")
    .save("/delta/events/")
)

In [11]:
# Show the table directory. Only the last expression of a cell is rendered
# automatically, so the listing is passed to display() explicitly; as a bare
# statement in the middle of the cell its result would be silently discarded.
display(dbutils.fs.ls("dbfs:/delta/events/"))

# Reload the Delta table from storage and report its current row count.
events_delta = spark.read.format("delta").load("/delta/events/")
events_delta.count()

In [12]:
# Number of events per (action, date) pair in the reloaded table, ordered by
# date and then action. `count` is the pyspark.sql.functions aggregate that
# must already be in scope when this cell runs.
per_action_per_date = events_delta.groupBy("action", "date")
action_totals = per_action_per_date.agg(count("action").alias("action_count"))
display(action_totals.orderBy("date", "action"))

action,date,action_count
Close,2016-07-24,20165
Open,2016-07-24,21176
Close,2016-07-25,24015
Open,2016-07-25,24002
Close,2016-07-26,25985
Open,2016-07-26,25998
Close,2016-07-27,24015
Open,2016-07-27,24002
Close,2016-07-28,5820
Open,2016-07-28,4822


In [13]:
# Total number of rows currently visible through `events_delta`.
rows_in_table = events_delta.count()
rows_in_table

In [14]:
# List the contents of the Delta table directory.
delta_dir_entries = dbutils.fs.ls("dbfs:/delta/events/")
delta_dir_entries

In [15]:
# Delete the data loaded into `events` (recursive removal of
# dbfs:/delta/events/).
#
# The cells that follow (OPTIMIZE / DESCRIBE on the `events` table) still need
# those files, so the deletion must not happen when the notebook is run top
# to bottom. It is therefore opt-in: set the flag to True only when you are
# finished with the table and really want its files removed.
# NOTE(review): the original used the %fs magic after a comment line; magics
# are normally expected on the first line of a cell - confirm. The
# dbutils.fs call below avoids the question.
DELETE_DELTA_EVENTS = False

if DELETE_DELTA_EVENTS:
    dbutils.fs.rm("dbfs:/delta/events/", True)  # True -> recursive delete

In [16]:
# Run Delta's OPTIMIZE command on the `events` table and show what it returns.
optimize_result = spark.sql("OPTIMIZE events")
display(optimize_result)

path
""


In [17]:
# Show the operation history recorded for the `events` table.
table_history = spark.sql("DESCRIBE HISTORY events")
display(table_history)

version,timestamp,userId,userName,operation,operationParameters,job,notebook,clusterId,readVersion,isolationLevel
2,2019-06-17T09:05:06.000+0000,4372415976254024,emunozcar@dragosolutions.com,OPTIMIZE,"Map(predicate -> [], zOrderBy -> [], batchId -> 0, auto -> false)",,List(2131966090072072),0503-100013-pins355,1.0,SnapshotIsolation
1,2019-06-17T09:01:34.000+0000,4372415976254024,emunozcar@dragosolutions.com,WRITE,"Map(mode -> Append, partitionBy -> [""Anio""])",,List(2131966090072072),0503-100013-pins355,0.0,WriteSerializable
0,2019-06-17T08:49:01.000+0000,4372415976254024,emunozcar@dragosolutions.com,WRITE,"Map(mode -> Overwrite, partitionBy -> [""Anio""])",,List(2131966090072072),0503-100013-pins355,,WriteSerializable


In [18]:
# Show the detail record that DESCRIBE DETAIL returns for the `events` table.
table_detail = spark.sql("DESCRIBE DETAIL events")
display(table_detail)

format,id,name,description,location,createdAt,lastModified,partitionColumns,numFiles,sizeInBytes,properties,minReaderVersion,minWriterVersion
delta,38206e73-7443-4958-9c05-2650abfb8bdd,,,dbfs:/delta/events,2019-06-17T08:48:56.498+0000,2019-06-17T09:05:06.000+0000,List(Anio),1,26334,Map(),1,2


In [19]:
# Show the column list and table metadata reported by DESCRIBE FORMATTED.
table_description = spark.sql("DESCRIBE FORMATTED events")
display(table_description)

col_name,data_type,comment
action,string,
date,string,
Anio,string,
Mes,string,
# Partition Information,,
# col_name,data_type,comment
Anio,string,
,,
# Detailed Table Information,,
Database,default,
