# Data format optimization: Parquet

In [1]:
# Print the usage/help text of the spark-submit launcher installed under /opt/spark.
! /opt/spark/bin/spark-submit --help

Usage: spark-submit [options] <app jar | python file | R file> [app arguments]
Usage: spark-submit --kill [submission ID] --master [spark://...]
Usage: spark-submit --status [submission ID] --master [spark://...]
Usage: spark-submit run-example [options] example-class [example args]

Options:
  --master MASTER_URL         spark://host:port, mesos://host:port, yarn,
                              k8s://https://host:port, or local (Default: local[*]).
  --deploy-mode DEPLOY_MODE   Whether to launch the driver program locally ("client") or
                              on one of the worker machines inside the cluster ("cluster")
                              (Default: client).
  --class CLASS_NAME          Your application's main class (for Java / Scala apps).
  --name NAME                 A name of your application.
  --jars JARS                 Comma-separated list of jars to include on the driver
                              and executor classpaths.
  --packages                  Comma-

In [2]:
import os
import s3fs
# Build the S3 endpoint URL from the environment and open a filesystem handle on it.
endpoint = f"https://{os.environ['AWS_S3_ENDPOINT']}"
fs = s3fs.S3FileSystem(client_kwargs=dict(endpoint_url=endpoint))
event_log_path = "pengfei/spark-history"

# Touch a placeholder object under the event-log prefix
# (presumably so the prefix exists before Spark writes its event logs there).
fs.touch(f"s3://{event_log_path}/.keep")
fs.info("pengfei/pengfei_test")


{'Key': 'pengfei/pengfei_test',
 'name': 'pengfei/pengfei_test',
 'type': 'directory',
 'Size': 0,
 'size': 0,
 'StorageClass': 'DIRECTORY'}

In [5]:
from pyspark.sql import SparkSession

# Create (or reuse) a SparkSession that targets the in-cluster Kubernetes API server.
# The service account and namespace come from environment variables; `event_log_path`
# must already be defined, since the event log is written to s3a://<event_log_path>.
spark = (
    SparkSession.builder
    .master("k8s://https://kubernetes.default.svc:443")
    .appName("Python Spark SQL basic example")
    # Container image used for the executor pods.
    .config("spark.kubernetes.container.image", "inseefrlab/jupyter-datascience:master")
    .config(
        "spark.kubernetes.authenticate.driver.serviceAccountName",
        os.environ['KUBERNETES_SERVICE_ACCOUNT'],
    )
    .config("spark.executor.instances", "5")
    .config("spark.kubernetes.namespace", os.environ['KUBERNETES_NAMESPACE'])
    # Event logging is enabled and written to S3.
    .config("spark.eventLog.enabled", "true")
    .config("spark.eventLog.dir", "s3a://" + event_log_path)
    .getOrCreate()
)



In [2]:
# List pods with kubectl; the Spark executor pods (…-exec-N) show up in this listing.
! kubectl get pods

NAME                                                      READY   STATUS      RESTARTS   AGE
deleting-pods-with-completed-status-1614956400-xwz65      0/1     Completed   0          16m
jupyter-1614950554-79cc9cbb97-tvlp4                       1/1     Running     0          113m
python-spark-sql-basic-example-a6b8a9780296bb96-exec-69   0/1     Error       0          2m39s
python-spark-sql-basic-example-a6b8a9780296bb96-exec-70   0/1     Error       0          2m39s
python-spark-sql-basic-example-a6b8a9780296bb96-exec-71   1/1     Running     0          2m9s
python-spark-sql-basic-example-a6b8a9780296bb96-exec-72   1/1     Running     0          2m9s
python-spark-sql-basic-example-a6b8a9780296bb96-exec-73   1/1     Running     0          2m9s
ubuntu-1612965548-79c9567b44-nhs9p                        1/1     Running     0          23d


In [7]:
# S3 (s3a) location of the input CSV file.
data_path = (
    "s3a://pengfei/sspcloud-demo/data_format/Fire_Department_Calls_for_Service.csv"
)


In [8]:
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, BooleanType

# Explicit schema for the CSV: (column name, Spark type) pairs in file column order.
# Every column is declared nullable.
_FIRE_COLUMNS = [
    ('CallNumber', IntegerType),
    ('UnitID', StringType),
    ('IncidentNumber', IntegerType),
    ('CallType', StringType),
    ('CallDate', StringType),
    ('WatchDate', StringType),
    ('ReceivedDtTm', StringType),
    ('EntryDtTm', StringType),
    ('DispatchDtTm', StringType),
    ('ResponseDtTm', StringType),
    ('OnSceneDtTm', StringType),
    ('TransportDtTm', StringType),
    ('HospitalDtTm', StringType),
    ('CallFinalDisposition', StringType),
    ('AvailableDtTm', StringType),
    ('Address', StringType),
    ('City', StringType),
    ('ZipcodeofIncident', IntegerType),
    ('Battalion', StringType),
    ('StationArea', StringType),
    ('Box', StringType),
    ('OriginalPriority', StringType),
    ('Priority', StringType),
    ('FinalPriority', IntegerType),
    ('ALSUnit', BooleanType),
    ('CallTypeGroup', StringType),
    ('NumberofAlarms', IntegerType),
    ('UnitType', StringType),
    ('Unitsequenceincalldispatch', IntegerType),
    ('FirePreventionDistrict', StringType),
    ('SupervisorDistrict', StringType),
    ('NeighborhoodDistrict', StringType),
    ('Location', StringType),
    ('RowID', StringType),
]

# A fresh type instance is created per field, as in a hand-written field list.
fireSchema = StructType(
    [StructField(name, spark_type(), True) for name, spark_type in _FIRE_COLUMNS]
)

In [9]:
# Read the comma-delimited CSV with the explicit schema (no schema inference pass).
# NOTE(review): no `header` option is set here — confirm whether the file has a
# header row, since it would then be parsed as a data row.
df = spark.read.schema(fireSchema).option("delimiter", ",").csv(data_path)

In [15]:
# Preview the first 5 rows.
df.show(n=5)

+----------+-------+--------------+--------------------+----------+----------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------+-------------+--------------------+--------------------+--------------------+-------------+-----------------+---------+------------+----+-----------------+--------+-------------+-------+---------------+--------------+---------+--------------------------+----------------------+-------------------+--------------------+-------------+--------------------+
|CallNumber| UnitID|IncidentNumber|            CallType|  CallDate| WatchDate|        ReceivedDtTm|           EntryDtTm|        DispatchDtTm|        ResponseDtTm|         OnSceneDtTm| TransportDtTm| HospitalDtTm|CallFinalDisposition|       AvailableDtTm|             Address|         City|ZipcodeofIncident|Battalion| StationArea| Box| OriginalPriority|Priority|FinalPriority|ALSUnit|  CallTypeGroup|NumberofAlarms| UnitType|Unitsequenceincalldispa

In [16]:
# Print the DataFrame schema as a tree to check it matches the declared fireSchema.
df.printSchema()

root
 |-- CallNumber: integer (nullable = true)
 |-- UnitID: string (nullable = true)
 |-- IncidentNumber: integer (nullable = true)
 |-- CallType: string (nullable = true)
 |-- CallDate: string (nullable = true)
 |-- WatchDate: string (nullable = true)
 |-- ReceivedDtTm: string (nullable = true)
 |-- EntryDtTm: string (nullable = true)
 |-- DispatchDtTm: string (nullable = true)
 |-- ResponseDtTm: string (nullable = true)
 |-- OnSceneDtTm: string (nullable = true)
 |-- TransportDtTm: string (nullable = true)
 |-- HospitalDtTm: string (nullable = true)
 |-- CallFinalDisposition: string (nullable = true)
 |-- AvailableDtTm: string (nullable = true)
 |-- Address: string (nullable = true)
 |-- City: string (nullable = true)
 |-- ZipcodeofIncident: integer (nullable = true)
 |-- Battalion: string (nullable = true)
 |-- StationArea: string (nullable = true)
 |-- Box: string (nullable = true)
 |-- OriginalPriority: string (nullable = true)
 |-- Priority: string (nullable = true)
 |-- FinalPr

In [37]:
def get_shape(df):
    """Print the row and column counts of ``df``.

    ``df`` must provide ``count()`` (called once, to get the row count) and a
    ``columns`` sequence. Nothing is returned; the result is only printed.
    """
    # Rows first, then columns, matching the order of the placeholders below.
    dimensions = (df.count(), len(df.columns))
    print("row numbers: %d, column numbers: %d" % dimensions)

In [38]:
%%time
# Time the shape computation on the CSV-backed DataFrame (get_shape calls df.count()).
get_shape(df)

row numbers: 5500520, column numbers: 34
CPU times: user 3.17 ms, sys: 0 ns, total: 3.17 ms
Wall time: 6.38 s


In [21]:
%%time
df.select("CallType").distinct().show(35,False)

+--------------------------------------------+
|CallType                                    |
+--------------------------------------------+
|Marine Fire                                 |
|Elevator / Escalator Rescue                 |
|Aircraft Emergency                          |
|Confined Space / Structure Collapse         |
|Administrative                              |
|Alarms                                      |
|Odor (Strange / Unknown)                    |
|Lightning Strike (Investigation)            |
|Citizen Assist / Service Call               |
|HazMat                                      |
|Watercraft in Distress                      |
|Explosion                                   |
|Oil Spill                                   |
|Vehicle Fire                                |
|Suspicious Package                          |
|Train / Rail Fire                           |
|Extrication / Entrapped (Machinery, Vehicle)|
|Other                                       |
|Outside Fire

In [40]:
%%time
from pyspark.sql.functions import col, asc,desc
df.select("CallType").groupBy("CallType").count().orderBy(col("count").desc()).show(35,False)

+--------------------------------------------+-------+
|CallType                                    |count  |
+--------------------------------------------+-------+
|Medical Incident                            |3596332|
|Structure Fire                              |681179 |
|Alarms                                      |599263 |
|Traffic Collision                           |224909 |
|Other                                       |87468  |
|Citizen Assist / Service Call               |82173  |
|Outside Fire                                |68491  |
|Water Rescue                                |28253  |
|Vehicle Fire                                |25512  |
|Gas Leak (Natural and LP Gases)             |22961  |
|Electrical Hazard                           |16872  |
|Elevator / Escalator Rescue                 |14840  |
|Odor (Strange / Unknown)                    |12994  |
|Smoke Investigation (Outside)               |12552  |
|Fuel Spill                                  |6237   |
|HazMat   

In [29]:
# Number of partitions of the RDD underlying the CSV-backed DataFrame.
df.rdd.getNumPartitions()

In [32]:
# Register a 10-partition copy of df as a temp view and mark that view for caching.
fire_view_df = df.repartition(10)
fire_view_df.createOrReplaceTempView("fireServiceVIEW")
spark.catalog.cacheTable("fireServiceVIEW")
# Caching is lazy: without an action the table is not cached, so run a count() on it.
spark.table("fireServiceVIEW").count()

5500520

In [33]:
# Check that the temp view is now marked as cached in the Spark catalog.
spark.catalog.isCached("fireServiceVIEW")

True

In [35]:
# DataFrame handle on the cached temp view.
cached_df = spark.table(tableName="fireServiceVIEW")

In [39]:
%%time
# Time the same shape computation on the cached view, to compare with the CSV-backed run.
get_shape(cached_df)

row numbers: 5500520, column numbers: 34
CPU times: user 2.79 ms, sys: 0 ns, total: 2.79 ms
Wall time: 148 ms


In [None]:
%%time
cached_df.select("CallType").groupBy("CallType").count().orderBy(col("count").desc()).show(35,False)

In [None]:
%%time
parquet_output_path="s3a://pengfei/sspcloud-demo/data_format/Fire_Department_Calls_for_Service.parquet"
df.write.mode('overwrite').parquet(parquet_output_path)

In [None]:
# Stop the SparkSession. The method must actually be called: a bare `spark.stop`
# only evaluates to the bound method object and leaves the session running.
spark.stop()