# Data format overview
## In this tutorial, we will overview evaluate the following data formats
1. avro (structured)
2. csv (semi-structured)
3. json (semi-structured)
4. orc (structured)
5. parquet (structured) 

## We evaluate the data formats via:
1. Disk usage
2. Read/Write latency
3. Random data lookup
4. Filtering/GroupBy(column-wise)
5. Distinct(row-wise)

In [63]:
import os
import s3fs
endpoint = "https://"+os.environ['AWS_S3_ENDPOINT']
fs = s3fs.S3FileSystem(client_kwargs={'endpoint_url': endpoint})
event_log_path="pengfei/spark-history"

fs.touch('s3://'+event_log_path+'/.keep')
fs.info('pengfei/pengfei_test')

{'Key': 'pengfei/pengfei_test',
 'name': 'pengfei/pengfei_test',
 'type': 'directory',
 'Size': 0,
 'size': 0,
 'StorageClass': 'DIRECTORY'}

In [62]:
from pyspark.sql import SparkSession

spark = SparkSession \
       .builder.master("k8s://https://kubernetes.default.svc:443") \
       .appName("Python Spark SQL basic example") \
       .config("spark.kubernetes.container.image", "inseefrlab/jupyter-datascience:master") \
       .config("spark.kubernetes.authenticate.driver.serviceAccountName", os.environ['KUBERNETES_SERVICE_ACCOUNT']) \
       .config("spark.executor.instances", "5") \
       .config("spark.kubernetes.namespace", os.environ['KUBERNETES_NAMESPACE']) \
       .config("spark.eventLog.enabled","true") \
       .config("spark.eventLog.dir","s3a://"+event_log_path) \
       .config("spark.jars.packages", "org.apache.spark:spark-avro_2.12:3.0.1") \
       .getOrCreate()


In [64]:
json_data_path="s3a://pengfei/sspcloud-demo/data_format/netflix.json"
parquet_data_path="s3a://pengfei/sspcloud-demo/data_format/netflix.parquet"
avro_data_path="s3a://pengfei/sspcloud-demo/data_format/netflix.avro"
orc_data_path="s3a://pengfei/sspcloud-demo/data_format/netflix.orc"
csv_data_path="s3a://pengfei/sspcloud-demo/data_format/netflix.csv"

# Some useful functions

### The read function read the source data file and convert it to a spark data frame

In [142]:
def write_stats(line):
    path="../tmp/mystats.csv"
    file1 = open(path,"a")
    file1.write(line+"\n")

In [129]:
import time
def read(fmt):
    start = time.time()
    if fmt == "json":
        sdf = spark.read.option("header", "true").json(json_data_path)
    elif fmt == "csv":
        sdf = spark.read.option("header", "true").csv(csv_data_path)
    elif fmt == "avro":
        sdf = spark.read.format("avro").option("header", "true").load(avro_data_path)
    elif fmt == "parquet":
        sdf = spark.read.option("header", "true").parquet(parquet_data_path)
    elif fmt == "orc":
        sdf = spark.read.orc(orc_data_path)
    sdf.show(5,False)
    stats="{}, {}, {}".format(fmt, "read", time.time() - start)
    write_stats(stats)
    print(stats)
    return sdf

### The get_shape function prints the shape(e.g. row number and column number) of the data frame

In [92]:
def get_shape(df,fmt):
    start = time.time()
    row_num=df.count()
    col_num=len(df.columns)
    stats="{}, {}, {}".format(fmt, "get_shape", time.time() - start)
    write_stats(stats)
    print("The data frame has {} rows and {} columns".format(row_num,col_num))
    print(stats)

### The stats function prints the min, max and numbers of a column of the data frame

In [110]:
def stats(df,fmt, field="rating"):
    start = time.time()
    max=df.agg({field: "max"})
    min=df.agg({field: "min"})
    count=df.agg({field: "count"})
    min.show(5,False)
    max.show(5,False)
    count.show(5,False)
    stats="{}, {}, {}".format(fmt, "stats", time.time() - start)
    write_stats(stats)
    print(stats)

### The random_batch function randomly select rows from the data frame. It can evaluate the ability of random data lookup

In [94]:
def random_batch(df,fmt):
    start = time.time()
    result=df.sample(False, 0.05).collect()
    stats="{}, {}, {}".format(fmt, "random_batch", time.time() - start)
    write_stats(stats)
    print(stats)
   # return result

### The distinct function count distinct rows of the data frame

In [95]:
def distinct(df,fmt):
    start = time.time()
    result = df.distinct().count()
    stats="{}, {}, {}".format(fmt, "distinct", time.time() - start)
    write_stats(stats)
    print(stats)
    return result

### The group_by function group and count the data frame by a specific column

In [116]:
def group_by(df,fmt):
    start = time.time()
    result=df.groupBy("rating").count()
    result.show(5,False)
    stats="{}, {}, {}".format(fmt, "group_by", time.time() - start)
    write_stats(stats)
    print(stats)
    #return result

### The filtering function filter data by using a specific boolean condition

In [97]:
def filtering(df, fmt, date="2005-11-15"):
    start = time.time()
    result = df.filter(df.date > date).count()
    stats="{}, {}, {}".format(fmt, "filtering", time.time() - start)
    write_stats(stats)
    print(stats)
    return result

# 1. CSV format evaluation

In [120]:
csv_df=read("csv")

+-------+------+----------+
|user_id|rating|date      |
+-------+------+----------+
|1488844|3     |2005-09-06|
|822109 |5     |2005-05-13|
|885013 |4     |2005-10-19|
|30878  |4     |2005-12-26|
|823519 |3     |2004-05-03|
+-------+------+----------+
only showing top 5 rows

csv, read, 0.8125045299530029


In [121]:
csv_df.printSchema()

root
 |-- user_id: string (nullable = true)
 |-- rating: string (nullable = true)
 |-- date: string (nullable = true)



In [122]:
get_shape(json_df,"csv")

The data frame has 24058262 rows and 3 columns
csv, get_shape, 9.196694374084473


In [123]:
# get min, max and row number of column rating
stats(json_df,"csv",field="rating")

+-----------+
|min(rating)|
+-----------+
|1          |
+-----------+

+-----------+
|max(rating)|
+-----------+
|5          |
+-----------+

+-------------+
|count(rating)|
+-------------+
|24053764     |
+-------------+

csv, stats, 36.65289235115051


In [124]:
random_batch(json_df,"csv")

csv, random_batch, 17.773473978042603


In [125]:
distinct(json_df,"csv")

csv, distinct, 21.709513187408447


12168704

In [126]:
group_by(json_df,"csv")

+------+-------+
|rating|count  |
+------+-------+
|3     |6904181|
|null  |4498   |
|5     |5506583|
|1     |1118186|
|4     |8085741|
+------+-------+
only showing top 5 rows

csv, group_by, 12.347777128219604


In [127]:
filtering(json_df,"csv")

csv, filtering, 10.43360447883606


850269

# 2. Json format evaluation

In [98]:
json_df=read("json")

+----------+------+-------+
|date      |rating|user_id|
+----------+------+-------+
|2005-09-06|3     |1488844|
|2005-05-13|5     |822109 |
|2005-10-19|4     |885013 |
|2005-12-26|4     |30878  |
|2004-05-03|3     |823519 |
+----------+------+-------+
only showing top 5 rows

json, read, 9.966949701309204


In [99]:
json_df.printSchema()

root
 |-- date: string (nullable = true)
 |-- rating: string (nullable = true)
 |-- user_id: string (nullable = true)



In [100]:
get_shape(json_df,"json")

In [111]:
# get min, max and row number of column rating
stats(json_df,"json",field="rating")

+-----------+
|min(rating)|
+-----------+
|1          |
+-----------+

+-----------+
|max(rating)|
+-----------+
|5          |
+-----------+

+-------------+
|count(rating)|
+-------------+
|24053764     |
+-------------+

json, stats, 35.458019971847534


In [101]:
random_batch(json_df,"json")

json, random_batch, 17.13827896118164


In [102]:
distinct(json_df,"json")

json, distinct, 21.003305673599243


12168704

In [115]:
group_by(json_df,"json")

+------+-------+
|rating|count  |
+------+-------+
|3     |6904181|
|null  |4498   |
|5     |5506583|
|1     |1118186|
|4     |8085741|
+------+-------+
only showing top 5 rows

json, group_by, 12.402800798416138


DataFrame[rating: string, count: bigint]

In [103]:
filtering(json_df,"json")

json, filtering, 11.020655632019043


850269

# 3. Avro format evaluation

In [67]:
avro_df=read("avro")

# 4. Parquet format evaluation

In [141]:
parquet_df=read("parquet")

+-------+------+----------+
|user_id|rating|date      |
+-------+------+----------+
|1488844|3     |2005-09-06|
|822109 |5     |2005-05-13|
|885013 |4     |2005-10-19|
|30878  |4     |2005-12-26|
|823519 |3     |2004-05-03|
+-------+------+----------+
only showing top 5 rows

parquet, read, 1.6400139331817627


In [105]:
get_shape(parquet_df,"parquet")

The data frame has 24058262 rows and 3 columns
parquet, get_shape, 1.010782241821289


In [106]:
random_batch(parquet_df,"parquet")

parquet, random_batch, 6.159161567687988


In [112]:
stats(parquet_df,"parquet",field="rating")

+-----------+
|min(rating)|
+-----------+
|1          |
+-----------+

+-----------+
|max(rating)|
+-----------+
|5          |
+-----------+

+-------------+
|count(rating)|
+-------------+
|24053764     |
+-------------+

parquet, stats, 4.589794635772705


In [137]:
distinct(parquet_df,"parquet")

parquet, distinct, 164.0264663696289


12168704

In [117]:
group_by(parquet_df,"parquet")

+------+-------+
|rating|count  |
+------+-------+
|3     |6904181|
|null  |4498   |
|5     |5506583|
|1     |1118186|
|4     |8085741|
+------+-------+
only showing top 5 rows

parquet, group_by, 1.5566697120666504


In [119]:
filtering(parquet_df,"parquet")

parquet, filtering, 1.3744680881500244


850269

# 5. ORC format evaluation

In [130]:
orc_df=read("orc")

+-------+------+----------+
|user_id|rating|date      |
+-------+------+----------+
|1488844|3     |2005-09-06|
|822109 |5     |2005-05-13|
|885013 |4     |2005-10-19|
|30878  |4     |2005-12-26|
|823519 |3     |2004-05-03|
+-------+------+----------+
only showing top 5 rows

orc, read, 2.3085367679595947


In [132]:
get_shape(orc_df,"orc")

The data frame has 24058262 rows and 3 columns
orc, get_shape, 1.4532253742218018


In [133]:
random_batch(orc_df,"orc")

orc, random_batch, 6.4762678146362305


In [134]:
stats(orc_df,"orc",field="rating")

+-----------+
|min(rating)|
+-----------+
|1          |
+-----------+

+-----------+
|max(rating)|
+-----------+
|5          |
+-----------+

+-------------+
|count(rating)|
+-------------+
|24053764     |
+-------------+

orc, stats, 4.612210035324097


In [138]:
distinct(orc_df,"orc")

orc, distinct, 185.58755350112915


12168704

In [135]:
group_by(orc_df,"orc")

+------+-------+
|rating|count  |
+------+-------+
|3     |6904181|
|null  |4498   |
|5     |5506583|
|1     |1118186|
|4     |8085741|
+------+-------+
only showing top 5 rows

orc, group_by, 1.680478811264038


In [136]:
filtering(orc_df,"orc")

orc, filtering, 1.186652421951294


850269

# Stop the spark cluster

In [59]:
# stop sparksession
spark.sparkContext.stop()

### Check if the spark cluster is well closed. You should not see any python-spark pods

In [68]:
! kubectl get pods

NAME                                                     READY   STATUS      RESTARTS   AGE
deleting-pods-with-completed-status-1615298400-jk8c6     0/1     Completed   0          51m
jupyter-1615290850-69d9fbd8d6-rlwhs                      1/1     Running     0          177m
python-spark-sql-basic-example-ca8c007817777a02-exec-1   1/1     Running     0          84s
python-spark-sql-basic-example-ca8c007817777a02-exec-2   1/1     Running     0          84s
python-spark-sql-basic-example-ca8c007817777a02-exec-3   1/1     Running     0          84s
python-spark-sql-basic-example-ca8c007817777a02-exec-4   1/1     Running     0          84s
python-spark-sql-basic-example-ca8c007817777a02-exec-5   1/1     Running     0          84s
ubuntu-1612965548-79c9567b44-nhs9p                       1/1     Running     0          27d
