# Data format overview
## In this tutorial, we will evaluate the following data formats
1. avro (structured)
2. csv (semi-structured)
3. json (semi-structured)
4. orc (structured)
5. parquet (structured) 

## We evaluate the data formats via:
1. Disk usage
2. Read/Write latency
3. Random data lookup
4. Filtering/GroupBy(column-wise)
5. Distinct(row-wise)

In [31]:
import os
import s3fs
# S3 endpoint URL: the AWS_S3_ENDPOINT environment variable with "https://" prepended
# (so the variable is expected to hold the host without a scheme).
endpoint = "https://"+os.environ['AWS_S3_ENDPOINT']
# s3fs client pointed at that endpoint.
fs = s3fs.S3FileSystem(client_kwargs={'endpoint_url': endpoint})
# Bucket/prefix that is later passed to Spark as spark.eventLog.dir.
event_log_path="pengfei/spark-history"

# Create an empty ".keep" object under the event-log prefix
# (presumably so the prefix exists before Spark writes event logs there — confirm).
fs.touch('s3://'+event_log_path+'/.keep')
# Last expression of the cell: display the metadata of the 'pengfei/pengfei_test' prefix.
fs.info('pengfei/pengfei_test')

{'Key': 'pengfei/pengfei_test',
 'name': 'pengfei/pengfei_test',
 'type': 'directory',
 'Size': 0,
 'size': 0,
 'StorageClass': 'DIRECTORY'}

In [32]:
from pyspark.sql import SparkSession

# Build (or reuse) a Spark session whose executors run as pods on the in-cluster
# Kubernetes API server. The service account and namespace come from the environment.
# NOTE(review): the spark-avro artifact below is pinned to 3.0.1; the Avro read further
# down fails with a ClassCastException on the executors. Presumably the artifact version
# must match the Spark version of the runtime/executor image — confirm and align.
spark = (
    SparkSession.builder
    .master("k8s://https://kubernetes.default.svc:443")
    .appName("Python Spark SQL basic example")
    # Container image used for the executor pods.
    .config("spark.kubernetes.container.image", "inseefrlab/jupyter-datascience:master")
    .config("spark.kubernetes.authenticate.driver.serviceAccountName", os.environ['KUBERNETES_SERVICE_ACCOUNT'])
    .config("spark.executor.instances", "5")
    .config("spark.kubernetes.namespace", os.environ['KUBERNETES_NAMESPACE'])
    # Event logs are written to the S3 prefix prepared in the previous cell.
    .config("spark.eventLog.enabled","true")
    .config("spark.eventLog.dir","s3a://"+event_log_path)
    # Extra package providing the "avro" data source.
    .config("spark.jars.packages", "org.apache.spark:spark-avro_2.12:3.0.1")
    .getOrCreate()
)


In [33]:
# All benchmark copies of the dataset live under one S3 prefix, one object per format.
_DATA_ROOT = "s3a://pengfei/sspcloud-demo/data_format"

json_data_path = _DATA_ROOT + "/netflix.json"
parquet_data_path = _DATA_ROOT + "/netflix.parquet"
avro_data_path = _DATA_ROOT + "/netflix.avro"
orc_data_path = _DATA_ROOT + "/netflix.orc"
csv_data_path = _DATA_ROOT + "/netflix.csv"

# Some useful functions for evaluating data format

### The read function reads the source data file and converts it to a Spark DataFrame

In [35]:
# Local file that accumulates one "<format>, <operation>, <seconds>" line per measurement.
data_format_stats_path="../tmp/mystats.csv"
def write_stats(line):
    """Append one measurement line to the stats file.

    Args:
        line: Text to append; a trailing newline is added here.

    The parent directory of ``data_format_stats_path`` is created when missing,
    and the file is closed (and therefore flushed) before returning.
    """
    import os

    parent_dir = os.path.dirname(data_format_stats_path)
    if parent_dir:
        # Opening in append mode creates the file but not its directory.
        os.makedirs(parent_dir, exist_ok=True)
    # The context manager guarantees the handle is closed even if write() raises.
    with open(data_format_stats_path, "a") as stats_file:
        stats_file.write(line+"\n")

In [46]:
import time
def read(fmt):
    """Load the benchmark dataset stored in the given format and time the read.

    The elapsed time covers creating the data frame and displaying its first
    five rows, so at least part of the file is actually read.

    Args:
        fmt: One of "json", "csv", "avro", "parquet" or "orc".

    Returns:
        The Spark data frame that was loaded.

    Raises:
        ValueError: If ``fmt`` is not one of the supported formats.
    """
    start = time.time()
    if fmt == "json":
        # NOTE(review): "header" looks like a CSV option and is presumably ignored by
        # the JSON reader — confirm before removing it.
        sdf = spark.read.option("header", "true").json(json_data_path)
    elif fmt == "csv":
        sdf = spark.read.option("header", "true").csv(csv_data_path)
    elif fmt == "avro":
        sdf = spark.read.format("avro").load(avro_data_path)
    elif fmt == "parquet":
        sdf = spark.read.parquet(parquet_data_path)
    elif fmt == "orc":
        sdf = spark.read.orc(orc_data_path)
    else:
        # Fail with a clear message instead of hitting an unbound `sdf` below.
        raise ValueError(
            "Unsupported data format: {!r} (expected one of json, csv, avro, parquet, orc)".format(fmt)
        )
    sdf.show(5,False)
    stats="{}, {}, {}".format(fmt, "read", time.time() - start)
    write_stats(stats)
    print(stats)
    return sdf

### The get_shape function prints the shape (i.e. the number of rows and the number of columns) of the data frame

In [37]:
def get_shape(df,fmt):
    """Print the number of rows and columns of ``df`` and record how long it took.

    Args:
        df: Data frame to measure; it is counted with ``df.count()``.
        fmt: Label of the data format, written as the first field of the stats line.
    """
    t0 = time.time()
    n_rows, n_cols = df.count(), len(df.columns)
    # The timing is taken before any printing so it only covers the measurement.
    line = "{}, {}, {}".format(fmt, "get_shape", time.time() - t0)
    write_stats(line)
    print("The data frame has {} rows and {} columns".format(n_rows, n_cols))
    print(line)

### The stats function prints the min, the max and the number of non-null values of a column of the data frame

In [38]:
def stats(df,fmt, field="rating"):
    """Show the min, max and count aggregates of one column and time them.

    Args:
        df: Data frame to aggregate.
        fmt: Label of the data format, written as the first field of the stats line.
        field: Name of the column to aggregate.
    """
    t0 = time.time()
    # One aggregation per metric, displayed in the order min, max, count.
    for metric in ("min", "max", "count"):
        df.agg({field: metric}).show(5, False)
    line = "{}, {}, {}".format(fmt, "stats", time.time() - t0)
    write_stats(line)
    print(line)

### The random_batch function randomly selects rows from the data frame. It evaluates the ability to perform random data lookups

In [39]:
def random_batch(df,fmt, fraction=0.05, seed=None):
    """Collect a random sample of rows (without replacement) and time it.

    Args:
        df: Data frame to sample from.
        fmt: Label of the data format, written as the first field of the stats line.
        fraction: Fraction of rows to sample; defaults to the previous fixed 0.05.
        seed: Optional sampling seed. Leave as None for a different sample on each
            call (the previous behaviour), or set it to make runs repeatable.

    The sampled rows are only fetched to force the work to happen; they are
    discarded and nothing is returned.
    """
    start = time.time()
    # collect() is the action that triggers the sampling job being timed.
    df.sample(withReplacement=False, fraction=fraction, seed=seed).collect()
    stats="{}, {}, {}".format(fmt, "random_batch", time.time() - start)
    write_stats(stats)
    print(stats)

### The distinct function counts the distinct rows of the data frame

In [40]:
def distinct(df,fmt):
    """Count the distinct rows of ``df`` and record how long it took.

    Args:
        df: Data frame to de-duplicate.
        fmt: Label of the data format, written as the first field of the stats line.

    Returns:
        The number of distinct rows.
    """
    t0 = time.time()
    unique_rows = df.distinct()
    n_unique = unique_rows.count()
    line = "{}, {}, {}".format(fmt, "distinct", time.time() - t0)
    write_stats(line)
    print(line)
    return n_unique

### The group_by function groups the data frame by a specific column and counts the rows of each group

In [41]:
def group_by(df,fmt, field="rating"):
    """Group ``df`` by one column, show the per-group row counts and time it.

    Args:
        df: Data frame to group.
        fmt: Label of the data format, written as the first field of the stats line.
        field: Column to group by; defaults to "rating", the column that was
            previously hard-coded, mirroring the ``field`` argument of ``stats``.
    """
    start = time.time()
    result=df.groupBy(field).count()
    # show() is the action that triggers the grouping job being timed.
    result.show(5,False)
    stats="{}, {}, {}".format(fmt, "group_by", time.time() - start)
    write_stats(stats)
    print(stats)

### The filtering function filters the data using a specific boolean condition

In [42]:
def filtering(df, fmt, date="2005-11-15"):
    """Count the rows whose ``date`` column is greater than ``date`` and time it.

    Args:
        df: Data frame to filter; it must expose a ``date`` column.
        fmt: Label of the data format, written as the first field of the stats line.
        date: Lower bound (exclusive) compared against the ``date`` column.

    Returns:
        The number of rows that satisfy the condition.
    """
    t0 = time.time()
    is_after = df.date > date
    n_matching = df.filter(is_after).count()
    line = "{}, {}, {}".format(fmt, "filtering", time.time() - t0)
    write_stats(line)
    print(line)
    return n_matching

In [43]:
def saveCSV(df,outputPath,fileName):
    """Write ``df`` as CSV under ``outputPath + "/" + fileName``.

    The data frame is coalesced to one partition before writing and any
    existing output at the target path is overwritten.

    Args:
        df: Data frame to write.
        outputPath: Parent location of the output.
        fileName: Name appended to ``outputPath`` to form the target path.
    """
    # The previous cell body used Scala syntax, which is a SyntaxError in a
    # Python kernel; this is the same writer chain expressed in PySpark.
    (
        df.coalesce(1)
        .write.mode("overwrite")
        .option("header","true")
        .option("mapreduce.fileoutputcommitter.marksuccessfuljobs","false")
        .option("encoding", "UTF-8")
        .option("delimiter", ",")
        .csv(outputPath+"/"+fileName)
    )

# 1. Gathering stats of each data format

## 1.1 Get CSV format evaluation stats

In [44]:
csv_df=read("csv")

+-------+------+----------+
|user_id|rating|date      |
+-------+------+----------+
|1488844|3     |2005-09-06|
|822109 |5     |2005-05-13|
|885013 |4     |2005-10-19|
|30878  |4     |2005-12-26|
|823519 |3     |2004-05-03|
+-------+------+----------+
only showing top 5 rows

csv, read, 8.758424282073975


In [13]:
csv_df.printSchema()

root
 |-- user_id: string (nullable = true)
 |-- rating: string (nullable = true)
 |-- date: string (nullable = true)



In [15]:
get_shape(csv_df,"csv")

The data frame has 24058262 rows and 3 columns
csv, get_shape, 6.950009822845459


In [16]:
# get min, max and row number of column rating
stats(csv_df,"csv",field="rating")

+-----------+
|min(rating)|
+-----------+
|1          |
+-----------+

+-----------+
|max(rating)|
+-----------+
|5          |
+-----------+

+-------------+
|count(rating)|
+-------------+
|24053764     |
+-------------+

csv, stats, 22.299869537353516


In [17]:
random_batch(csv_df,"csv")

csv, random_batch, 15.357799291610718


In [18]:
distinct(csv_df,"csv")

csv, distinct, 19.20894479751587


12168704

In [19]:
group_by(csv_df,"csv")

+------+-------+
|rating|count  |
+------+-------+
|3     |6904181|
|null  |4498   |
|5     |5506583|
|1     |1118186|
|4     |8085741|
+------+-------+
only showing top 5 rows

csv, group_by, 8.280380249023438


In [20]:
filtering(csv_df,"csv")

In [28]:
# Base name of the output; ".csv" is appended below.
name="netflix"
# Write csv_df back out as CSV with a header row, replacing any previous output.
# NOTE(review): the target is a relative path, so where it lands presumably depends on
# the session's default filesystem / working directory — confirm.
csv_df.write.mode("overwrite").option("header", "true").csv("{}.csv".format(name))

## 1.2 Get Json format evaluation stats

In [98]:
json_df=read("json")

+----------+------+-------+
|date      |rating|user_id|
+----------+------+-------+
|2005-09-06|3     |1488844|
|2005-05-13|5     |822109 |
|2005-10-19|4     |885013 |
|2005-12-26|4     |30878  |
|2004-05-03|3     |823519 |
+----------+------+-------+
only showing top 5 rows

json, read, 9.966949701309204


In [99]:
json_df.printSchema()

root
 |-- date: string (nullable = true)
 |-- rating: string (nullable = true)
 |-- user_id: string (nullable = true)



In [100]:
get_shape(json_df,"json")

In [111]:
# get min, max and row number of column rating
stats(json_df,"json",field="rating")

+-----------+
|min(rating)|
+-----------+
|1          |
+-----------+

+-----------+
|max(rating)|
+-----------+
|5          |
+-----------+

+-------------+
|count(rating)|
+-------------+
|24053764     |
+-------------+

json, stats, 35.458019971847534


In [101]:
random_batch(json_df,"json")

json, random_batch, 17.13827896118164


In [102]:
distinct(json_df,"json")

json, distinct, 21.003305673599243


12168704

In [115]:
group_by(json_df,"json")

+------+-------+
|rating|count  |
+------+-------+
|3     |6904181|
|null  |4498   |
|5     |5506583|
|1     |1118186|
|4     |8085741|
+------+-------+
only showing top 5 rows

json, group_by, 12.402800798416138


DataFrame[rating: string, count: bigint]

In [103]:
filtering(json_df,"json")

json, filtering, 11.020655632019043


850269

## 1.3 Get Avro format evaluation stats

In [47]:
avro_df=read("avro")

Py4JJavaError: An error occurred while calling o196.showString.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 3.0 failed 4 times, most recent failure: Lost task 0.3 in stage 3.0 (TID 9) (10.233.118.242 executor 5): java.lang.ClassCastException: cannot assign instance of java.lang.invoke.SerializedLambda to field org.apache.spark.sql.execution.datasources.FileFormat$$anon$1.dataReader$1 of type scala.Function1 in instance of org.apache.spark.sql.execution.datasources.FileFormat$$anon$1
	at java.io.ObjectStreamClass$FieldReflector.setObjFieldValues(ObjectStreamClass.java:2301)
	at java.io.ObjectStreamClass.setObjFieldValues(ObjectStreamClass.java:1431)
	at java.io.ObjectInputStream.defaultReadFields(ObjectInputStream.java:2411)
	at java.io.ObjectInputStream.readSerialData(ObjectInputStream.java:2329)
	at java.io.ObjectInputStream.readOrdinaryObject(ObjectInputStream.java:2187)
	at java.io.ObjectInputStream.readObject0(ObjectInputStream.java:1667)
	at java.io.ObjectInputStream.defaultReadFields(ObjectInputStream.java:2405)
	at java.io.ObjectInputStream.readSerialData(ObjectInputStream.java:2329)
	at java.io.ObjectInputStream.readOrdinaryObject(ObjectInputStream.java:2187)
	at java.io.ObjectInputStream.readObject0(ObjectInputStream.java:1667)
	at java.io.ObjectInputStream.defaultReadFields(ObjectInputStream.java:2405)
	at java.io.ObjectInputStream.readSerialData(ObjectInputStream.java:2329)
	at java.io.ObjectInputStream.readOrdinaryObject(ObjectInputStream.java:2187)
	at java.io.ObjectInputStream.readObject0(ObjectInputStream.java:1667)
	at java.io.ObjectInputStream.readObject(ObjectInputStream.java:503)
	at java.io.ObjectInputStream.readObject(ObjectInputStream.java:461)
	at scala.collection.immutable.List$SerializationProxy.readObject(List.scala:488)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at java.io.ObjectStreamClass.invokeReadObject(ObjectStreamClass.java:1184)
	at java.io.ObjectInputStream.readSerialData(ObjectInputStream.java:2296)
	at java.io.ObjectInputStream.readOrdinaryObject(ObjectInputStream.java:2187)
	at java.io.ObjectInputStream.readObject0(ObjectInputStream.java:1667)
	at java.io.ObjectInputStream.defaultReadFields(ObjectInputStream.java:2405)
	at java.io.ObjectInputStream.readSerialData(ObjectInputStream.java:2329)
	at java.io.ObjectInputStream.readOrdinaryObject(ObjectInputStream.java:2187)
	at java.io.ObjectInputStream.readObject0(ObjectInputStream.java:1667)
	at java.io.ObjectInputStream.defaultReadFields(ObjectInputStream.java:2405)
	at java.io.ObjectInputStream.readSerialData(ObjectInputStream.java:2329)
	at java.io.ObjectInputStream.readOrdinaryObject(ObjectInputStream.java:2187)
	at java.io.ObjectInputStream.readObject0(ObjectInputStream.java:1667)
	at java.io.ObjectInputStream.readObject(ObjectInputStream.java:503)
	at java.io.ObjectInputStream.readObject(ObjectInputStream.java:461)
	at scala.collection.immutable.List$SerializationProxy.readObject(List.scala:488)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at java.io.ObjectStreamClass.invokeReadObject(ObjectStreamClass.java:1184)
	at java.io.ObjectInputStream.readSerialData(ObjectInputStream.java:2296)
	at java.io.ObjectInputStream.readOrdinaryObject(ObjectInputStream.java:2187)
	at java.io.ObjectInputStream.readObject0(ObjectInputStream.java:1667)
	at java.io.ObjectInputStream.defaultReadFields(ObjectInputStream.java:2405)
	at java.io.ObjectInputStream.readSerialData(ObjectInputStream.java:2329)
	at java.io.ObjectInputStream.readOrdinaryObject(ObjectInputStream.java:2187)
	at java.io.ObjectInputStream.readObject0(ObjectInputStream.java:1667)
	at java.io.ObjectInputStream.defaultReadFields(ObjectInputStream.java:2405)
	at java.io.ObjectInputStream.readSerialData(ObjectInputStream.java:2329)
	at java.io.ObjectInputStream.readOrdinaryObject(ObjectInputStream.java:2187)
	at java.io.ObjectInputStream.readObject0(ObjectInputStream.java:1667)
	at java.io.ObjectInputStream.readObject(ObjectInputStream.java:503)
	at java.io.ObjectInputStream.readObject(ObjectInputStream.java:461)
	at org.apache.spark.serializer.JavaDeserializationStream.readObject(JavaSerializer.scala:76)
	at org.apache.spark.serializer.JavaSerializerInstance.deserialize(JavaSerializer.scala:115)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:83)
	at org.apache.spark.scheduler.Task.run(Task.scala:131)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:497)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1439)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:500)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
	at java.lang.Thread.run(Thread.java:748)

Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.failJobAndIndependentStages(DAGScheduler.scala:2253)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2(DAGScheduler.scala:2202)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2$adapted(DAGScheduler.scala:2201)
	at scala.collection.mutable.ResizableArray.foreach(ResizableArray.scala:62)
	at scala.collection.mutable.ResizableArray.foreach$(ResizableArray.scala:55)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:2201)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1(DAGScheduler.scala:1078)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1$adapted(DAGScheduler.scala:1078)
	at scala.Option.foreach(Option.scala:407)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:1078)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:2440)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2382)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2371)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:868)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2202)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2223)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2242)
	at org.apache.spark.sql.execution.SparkPlan.executeTake(SparkPlan.scala:472)
	at org.apache.spark.sql.execution.SparkPlan.executeTake(SparkPlan.scala:425)
	at org.apache.spark.sql.execution.CollectLimitExec.executeCollect(limit.scala:47)
	at org.apache.spark.sql.Dataset.collectFromPlan(Dataset.scala:3696)
	at org.apache.spark.sql.Dataset.$anonfun$head$1(Dataset.scala:2722)
	at org.apache.spark.sql.Dataset.$anonfun$withAction$1(Dataset.scala:3687)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$5(SQLExecution.scala:103)
	at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:163)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$1(SQLExecution.scala:90)
	at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:772)
	at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:64)
	at org.apache.spark.sql.Dataset.withAction(Dataset.scala:3685)
	at org.apache.spark.sql.Dataset.head(Dataset.scala:2722)
	at org.apache.spark.sql.Dataset.take(Dataset.scala:2929)
	at org.apache.spark.sql.Dataset.getRows(Dataset.scala:301)
	at org.apache.spark.sql.Dataset.showString(Dataset.scala:338)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:238)
	at java.lang.Thread.run(Thread.java:748)
Caused by: java.lang.ClassCastException: cannot assign instance of java.lang.invoke.SerializedLambda to field org.apache.spark.sql.execution.datasources.FileFormat$$anon$1.dataReader$1 of type scala.Function1 in instance of org.apache.spark.sql.execution.datasources.FileFormat$$anon$1
	at java.io.ObjectStreamClass$FieldReflector.setObjFieldValues(ObjectStreamClass.java:2301)
	at java.io.ObjectStreamClass.setObjFieldValues(ObjectStreamClass.java:1431)
	at java.io.ObjectInputStream.defaultReadFields(ObjectInputStream.java:2411)
	at java.io.ObjectInputStream.readSerialData(ObjectInputStream.java:2329)
	at java.io.ObjectInputStream.readOrdinaryObject(ObjectInputStream.java:2187)
	at java.io.ObjectInputStream.readObject0(ObjectInputStream.java:1667)
	at java.io.ObjectInputStream.defaultReadFields(ObjectInputStream.java:2405)
	at java.io.ObjectInputStream.readSerialData(ObjectInputStream.java:2329)
	at java.io.ObjectInputStream.readOrdinaryObject(ObjectInputStream.java:2187)
	at java.io.ObjectInputStream.readObject0(ObjectInputStream.java:1667)
	at java.io.ObjectInputStream.defaultReadFields(ObjectInputStream.java:2405)
	at java.io.ObjectInputStream.readSerialData(ObjectInputStream.java:2329)
	at java.io.ObjectInputStream.readOrdinaryObject(ObjectInputStream.java:2187)
	at java.io.ObjectInputStream.readObject0(ObjectInputStream.java:1667)
	at java.io.ObjectInputStream.readObject(ObjectInputStream.java:503)
	at java.io.ObjectInputStream.readObject(ObjectInputStream.java:461)
	at scala.collection.immutable.List$SerializationProxy.readObject(List.scala:488)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at java.io.ObjectStreamClass.invokeReadObject(ObjectStreamClass.java:1184)
	at java.io.ObjectInputStream.readSerialData(ObjectInputStream.java:2296)
	at java.io.ObjectInputStream.readOrdinaryObject(ObjectInputStream.java:2187)
	at java.io.ObjectInputStream.readObject0(ObjectInputStream.java:1667)
	at java.io.ObjectInputStream.defaultReadFields(ObjectInputStream.java:2405)
	at java.io.ObjectInputStream.readSerialData(ObjectInputStream.java:2329)
	at java.io.ObjectInputStream.readOrdinaryObject(ObjectInputStream.java:2187)
	at java.io.ObjectInputStream.readObject0(ObjectInputStream.java:1667)
	at java.io.ObjectInputStream.defaultReadFields(ObjectInputStream.java:2405)
	at java.io.ObjectInputStream.readSerialData(ObjectInputStream.java:2329)
	at java.io.ObjectInputStream.readOrdinaryObject(ObjectInputStream.java:2187)
	at java.io.ObjectInputStream.readObject0(ObjectInputStream.java:1667)
	at java.io.ObjectInputStream.readObject(ObjectInputStream.java:503)
	at java.io.ObjectInputStream.readObject(ObjectInputStream.java:461)
	at scala.collection.immutable.List$SerializationProxy.readObject(List.scala:488)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at java.io.ObjectStreamClass.invokeReadObject(ObjectStreamClass.java:1184)
	at java.io.ObjectInputStream.readSerialData(ObjectInputStream.java:2296)
	at java.io.ObjectInputStream.readOrdinaryObject(ObjectInputStream.java:2187)
	at java.io.ObjectInputStream.readObject0(ObjectInputStream.java:1667)
	at java.io.ObjectInputStream.defaultReadFields(ObjectInputStream.java:2405)
	at java.io.ObjectInputStream.readSerialData(ObjectInputStream.java:2329)
	at java.io.ObjectInputStream.readOrdinaryObject(ObjectInputStream.java:2187)
	at java.io.ObjectInputStream.readObject0(ObjectInputStream.java:1667)
	at java.io.ObjectInputStream.defaultReadFields(ObjectInputStream.java:2405)
	at java.io.ObjectInputStream.readSerialData(ObjectInputStream.java:2329)
	at java.io.ObjectInputStream.readOrdinaryObject(ObjectInputStream.java:2187)
	at java.io.ObjectInputStream.readObject0(ObjectInputStream.java:1667)
	at java.io.ObjectInputStream.readObject(ObjectInputStream.java:503)
	at java.io.ObjectInputStream.readObject(ObjectInputStream.java:461)
	at org.apache.spark.serializer.JavaDeserializationStream.readObject(JavaSerializer.scala:76)
	at org.apache.spark.serializer.JavaSerializerInstance.deserialize(JavaSerializer.scala:115)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:83)
	at org.apache.spark.scheduler.Task.run(Task.scala:131)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:497)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1439)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:500)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
	... 1 more


## 1.4 Get Parquet format evaluation stats

In [141]:
parquet_df=read("parquet")

+-------+------+----------+
|user_id|rating|date      |
+-------+------+----------+
|1488844|3     |2005-09-06|
|822109 |5     |2005-05-13|
|885013 |4     |2005-10-19|
|30878  |4     |2005-12-26|
|823519 |3     |2004-05-03|
+-------+------+----------+
only showing top 5 rows

parquet, read, 1.6400139331817627


In [105]:
get_shape(parquet_df,"parquet")

The data frame has 24058262 rows and 3 columns
parquet, get_shape, 1.010782241821289


In [106]:
random_batch(parquet_df,"parquet")

parquet, random_batch, 6.159161567687988


In [112]:
stats(parquet_df,"parquet",field="rating")

+-----------+
|min(rating)|
+-----------+
|1          |
+-----------+

+-----------+
|max(rating)|
+-----------+
|5          |
+-----------+

+-------------+
|count(rating)|
+-------------+
|24053764     |
+-------------+

parquet, stats, 4.589794635772705


In [137]:
distinct(parquet_df,"parquet")

parquet, distinct, 164.0264663696289


12168704

In [117]:
group_by(parquet_df,"parquet")

+------+-------+
|rating|count  |
+------+-------+
|3     |6904181|
|null  |4498   |
|5     |5506583|
|1     |1118186|
|4     |8085741|
+------+-------+
only showing top 5 rows

parquet, group_by, 1.5566697120666504


In [119]:
filtering(parquet_df,"parquet")

parquet, filtering, 1.3744680881500244


850269

## 1.5 Get ORC format evaluation stats

In [130]:
orc_df=read("orc")

+-------+------+----------+
|user_id|rating|date      |
+-------+------+----------+
|1488844|3     |2005-09-06|
|822109 |5     |2005-05-13|
|885013 |4     |2005-10-19|
|30878  |4     |2005-12-26|
|823519 |3     |2004-05-03|
+-------+------+----------+
only showing top 5 rows

orc, read, 2.3085367679595947


In [132]:
get_shape(orc_df,"orc")

The data frame has 24058262 rows and 3 columns
orc, get_shape, 1.4532253742218018


In [133]:
random_batch(orc_df,"orc")

orc, random_batch, 6.4762678146362305


In [134]:
stats(orc_df,"orc",field="rating")

+-----------+
|min(rating)|
+-----------+
|1          |
+-----------+

+-----------+
|max(rating)|
+-----------+
|5          |
+-----------+

+-------------+
|count(rating)|
+-------------+
|24053764     |
+-------------+

orc, stats, 4.612210035324097


In [138]:
distinct(orc_df,"orc")

orc, distinct, 185.58755350112915


12168704

In [135]:
group_by(orc_df,"orc")

+------+-------+
|rating|count  |
+------+-------+
|3     |6904181|
|null  |4498   |
|5     |5506583|
|1     |1118186|
|4     |8085741|
+------+-------+
only showing top 5 rows

orc, group_by, 1.680478811264038


In [136]:
filtering(orc_df,"orc")

orc, filtering, 1.186652421951294


850269

# 2. Visualize the stats of different format 

## 2.1 Visualize the read latency for each format

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Load the measurements accumulated by write_stats().
# The stats file is a local file (it is appended to with the built-in open()), so it
# is read directly with pandas rather than through an S3 filesystem handle.
# Each line has the form "<format>, <operation>, <seconds>" and there is no header row,
# hence the explicit column names; skipinitialspace drops the blank after each comma.
data_format_stats = pd.read_csv(
    data_format_stats_path,
    header=None,
    names=["format", "operation", "seconds"],
    skipinitialspace=True,
)
data_format_stats.head()


# Stop the spark cluster

In [29]:
# Stop the Spark session; this also stops its underlying SparkContext, which is what
# releases the executor pods. Stopping through the session (rather than only through
# spark.sparkContext) keeps the session object and the context in a consistent state.
spark.stop()

### Check that the Spark cluster has been shut down properly. You should not see any `python-spark-sql-basic-example-...-exec-N` executor pods

In [34]:
! kubectl get pods

NAME                                                     READY   STATUS      RESTARTS   AGE
deleting-pods-with-completed-status-1615824000-2nwxn     0/1     Completed   0          58m
jupyter-1615800186-567f67779b-cbmgx                      1/1     Running     0          7h35m
python-spark-sql-basic-example-6e3a6b7836d2d444-exec-1   1/1     Running     0          21s
python-spark-sql-basic-example-6e3a6b7836d2d444-exec-2   1/1     Running     0          21s
python-spark-sql-basic-example-6e3a6b7836d2d444-exec-3   1/1     Running     0          20s
python-spark-sql-basic-example-6e3a6b7836d2d444-exec-4   1/1     Running     0          20s
python-spark-sql-basic-example-6e3a6b7836d2d444-exec-5   1/1     Running     0          20s
ubuntu-1612965548-79c9567b44-nhs9p                       1/1     Running     1          33d
