# Data format overview
## In this tutorial, we will overview evaluate the following data formats
1. avro (structured)
2. csv (semi-structured)
3. json (semi-structured)
4. orc (structured)
5. parquet (structured) 

## 
for their disk usage, 

In [63]:
import os
import s3fs
endpoint = "https://"+os.environ['AWS_S3_ENDPOINT']
fs = s3fs.S3FileSystem(client_kwargs={'endpoint_url': endpoint})
event_log_path="pengfei/spark-history"

fs.touch('s3://'+event_log_path+'/.keep')
fs.info('pengfei/pengfei_test')

{'Key': 'pengfei/pengfei_test',
 'name': 'pengfei/pengfei_test',
 'type': 'directory',
 'Size': 0,
 'size': 0,
 'StorageClass': 'DIRECTORY'}

In [62]:
from pyspark.sql import SparkSession

spark = SparkSession \
       .builder.master("k8s://https://kubernetes.default.svc:443") \
       .appName("Python Spark SQL basic example") \
       .config("spark.kubernetes.container.image", "inseefrlab/jupyter-datascience:master") \
       .config("spark.kubernetes.authenticate.driver.serviceAccountName", os.environ['KUBERNETES_SERVICE_ACCOUNT']) \
       .config("spark.executor.instances", "5") \
       .config("spark.kubernetes.namespace", os.environ['KUBERNETES_NAMESPACE']) \
       .config("spark.eventLog.enabled","true") \
       .config("spark.eventLog.dir","s3a://"+event_log_path) \
       .config("spark.jars.packages", "org.apache.spark:spark-avro_2.12:3.0.1") \
       .getOrCreate()


In [64]:
json_data_path="s3a://pengfei/sspcloud-demo/data_format/netflix.json"
parquet_data_path="s3a://pengfei/sspcloud-demo/data_format/netflix.parquet"
avro_data_path="s3a://pengfei/sspcloud-demo/data_format/netflix.avro"
orc_data_path="s3a://pengfei/sspcloud-demo/data_format/netflix.orc"
csv_data_path="s3a://pengfei/sspcloud-demo/data_format/netflix.csv"

# Some useful functions

### The read function read the source data file and convert it to a spark data frame

In [66]:
import time
def read(fmt):
    start = time.time()
    if fmt == "json":
        sdf = spark.read.option("header", "true").json(json_data_path)
    elif fmt == "csv":
        sdf = spark.read.option("header", "true").csv(csv_data_path)
    elif fmt == "avro":
        sdf = spark.read.format("avro").option("header", "true").load(avro_data_path)
    elif fmt == "parquet":
        sdf = spark.read.option("header", "true").parquet(parquet_data_path)
    elif fmt == "orc":
        sdf = spark.read.read.orc(orc_data_path)
    sdf.show(5,False)
    print("{}, {}, {}".format(fmt, "read", time.time() - start))
    return sdf

### The get_shape function prints the shape(e.g. row number and column number) of the data frame

In [41]:
def get_shape(df,fmt):
    start = time.time()
    row_num=df.count()
    col_num=len(df.columns)
    print("The data frame has {} rows and {} columns".format(row_num,col_num))
    print("{}, {}, {}".format(fmt, "get_shape", time.time() - start))

### The stats function prints the min, max and numbers of a column of the data frame

In [33]:
def stats(df,fmt, field="rating"):
    start = time.time()
    max=df.agg({field: "max"})
    min=df.agg({field: "min"})
    count=df.agg({field: "count"})
    print("{}, {}, {}".format(fmt, "random_batch", time.time() - start))
    min.show(5,False)
    max.show(5,False)
    count.show(5,False)

### The random_batch function randomly select rows from the data frame. It can evaluate the ability of random data lookup

In [23]:
def random_batch(df,fmt):
    start = time.time()
    result=df.sample(False, 0.05).collect()
    print("{}, {}, {}".format(fmt, "random_batch", time.time() - start))
   # return result

In [55]:
def distinct(df,fmt):
    start = time.time()
    result = df.distinct().count()
    print("{}, {}, {}".format(fmt, "distinct", time.time() - start))
    return result

In [38]:
def group_by(df,fmt):
    start = time.time()
    result=df.groupBy("rating").count()
    print("{}, {}, {}".format(fmt, "group_by", time.time() - start))
    result.show(5,False)
    return result

In [75]:
def filtering(df, fmt, date="2005-11-15"):
    start = time.time()
    result = df.filter(df.date > date).count()
    print("{}, {}, {}".format(fmt, "filtering", time.time() - start))
    return result

# Json format evaluation

In [69]:
json_df=read("json")

In [77]:
json_df.printSchema()

root
 |-- date: string (nullable = true)
 |-- rating: string (nullable = true)
 |-- user_id: string (nullable = true)



In [70]:
get_shape(json_df,"json")

The data frame has 24058262 rows and 3 columns
json, get_shape, 8.26310682296753


In [71]:
random_batch(json_df,"json")

json, random_batch, 15.675529956817627


In [72]:
distinct(json_df,"json")

In [76]:
filtering(json_df,"json")

json, filtering, 10.1075279712677


850269

# Avro format evaluation

In [67]:
avro_df=read("avro")

# Parquet format evaluation

In [26]:
parquet_df=read("parquet")

+-------+------+----------+
|user_id|rating|date      |
+-------+------+----------+
|1488844|3     |2005-09-06|
|822109 |5     |2005-05-13|
|885013 |4     |2005-10-19|
|30878  |4     |2005-12-26|
|823519 |3     |2004-05-03|
+-------+------+----------+
only showing top 5 rows

parquet, read, 2.3800978660583496


In [43]:
get_shape(parquet_df,"parquet")

The data frame has 24058262 rows and 3 columns
parquet, get_shape, 1.3256816864013672


In [28]:
random_batch(parquet_df,"parquet")

parquet, random_batch, 7.559308767318726


In [34]:
stats(parquet_df,"parquet")

parquet, random_batch, 0.03509926795959473
+-----------+
|min(rating)|
+-----------+
|1          |
+-----------+

+-----------+
|max(rating)|
+-----------+
|5          |
+-----------+

+-------------+
|count(rating)|
+-------------+
|24053764     |
+-------------+



In [39]:
group_by(parquet_df,"parquet")

parquet, group_by, 0.015201091766357422
+------+-------+
|rating|count  |
+------+-------+
|3     |6904181|
|null  |4498   |
|5     |5506583|
|1     |1118186|
|4     |8085741|
+------+-------+
only showing top 5 rows



DataFrame[rating: string, count: bigint]

In [59]:
# stop sparksession
spark.sparkContext.stop()

In [68]:
! kubectl get pods

NAME                                                     READY   STATUS      RESTARTS   AGE
deleting-pods-with-completed-status-1615298400-jk8c6     0/1     Completed   0          51m
jupyter-1615290850-69d9fbd8d6-rlwhs                      1/1     Running     0          177m
python-spark-sql-basic-example-ca8c007817777a02-exec-1   1/1     Running     0          84s
python-spark-sql-basic-example-ca8c007817777a02-exec-2   1/1     Running     0          84s
python-spark-sql-basic-example-ca8c007817777a02-exec-3   1/1     Running     0          84s
python-spark-sql-basic-example-ca8c007817777a02-exec-4   1/1     Running     0          84s
python-spark-sql-basic-example-ca8c007817777a02-exec-5   1/1     Running     0          84s
ubuntu-1612965548-79c9567b44-nhs9p                       1/1     Running     0          27d
