In [1]:
import os.path
import shutil

from pyspark.sql import SparkSession

# Obtain the notebook's SparkSession via the builder with no extra configuration;
# getOrCreate() hands back an existing session when one is already available.
spark = SparkSession.builder.getOrCreate()
# Bare expression so the notebook renders the session object as the cell output.
spark


In [2]:
# Directory holding the exercise CSV files. Set SPARK_EXERCISE_DATA_DIR to run the
# notebook on another machine; when it is unset, the original location is used.
folder_path = os.environ.get(
    'SPARK_EXERCISE_DATA_DIR',
    '/Users/sg0218817/Downloads/Ex_Files_Spark_SQL_DataFrames/Exercise Files/Data',
)

# Read the location temperature readings. The first line of the file supplies the
# column names; no schema inference is requested for this frame.
file_path = os.path.join(folder_path, 'location_temp.csv')
df1 = spark.read.format('csv')\
    .option('header', 'true')\
    .load(file_path)

# Preview the first rows, then report the total row count as the cell result.
df1.show(10)
df1.count()


+-------------------+-----------+------------+
|         event_date|location_id|temp_celcius|
+-------------------+-----------+------------+
|03/04/2019 19:48:06|       loc0|          29|
|03/04/2019 19:53:06|       loc0|          27|
|03/04/2019 19:58:06|       loc0|          28|
|03/04/2019 20:03:06|       loc0|          30|
|03/04/2019 20:08:06|       loc0|          27|
|03/04/2019 20:13:06|       loc0|          27|
|03/04/2019 20:18:06|       loc0|          27|
|03/04/2019 20:23:06|       loc0|          29|
|03/04/2019 20:28:06|       loc0|          32|
|03/04/2019 20:33:06|       loc0|          35|
+-------------------+-----------+------------+
only showing top 10 rows



500000

In [3]:
# Load the server utilization CSV. The file is read without a header row and with
# schema inference switched on.
file_path = os.path.join(folder_path, 'utilization.csv')
df2 = (
    spark.read.format('csv')
    .option('header', 'false')
    .option('inferSchema', 'true')
    .load(file_path)
)

# Swap the positional placeholder names (_c0 .. _c4) for descriptive ones,
# one rename per column, in column order.
for placeholder, column_name in [
    ('_c0', 'event_date'),
    ('_c1', 'server_id'),
    ('_c2', 'cpu_utilization'),
    ('_c3', 'free_memory'),
    ('_c4', 'session_count'),
]:
    df2 = df2.withColumnRenamed(placeholder, column_name)

# Preview rows, print the inferred schema, and end on the row count.
df2.show(10)
df2.printSchema()
df2.count()


+-------------------+---------+---------------+-----------+-------------+
|         event_date|server_id|cpu_utilization|free_memory|session_count|
+-------------------+---------+---------------+-----------+-------------+
|03/05/2019 08:06:14|      100|           0.57|       0.51|           47|
|03/05/2019 08:11:14|      100|           0.47|       0.62|           43|
|03/05/2019 08:16:14|      100|           0.56|       0.57|           62|
|03/05/2019 08:21:14|      100|           0.57|       0.56|           50|
|03/05/2019 08:26:14|      100|           0.35|       0.46|           43|
|03/05/2019 08:31:14|      100|           0.41|       0.58|           48|
|03/05/2019 08:36:14|      100|           0.57|       0.35|           58|
|03/05/2019 08:41:14|      100|           0.41|        0.4|           58|
|03/05/2019 08:46:14|      100|           0.53|       0.35|           62|
|03/05/2019 08:51:14|      100|           0.51|        0.6|           45|
+-------------------+---------+---------------+-----------+-------------+

500000

In [4]:
# Export the utilization frame as JSON under <folder_path>/utilization.
file_path = os.path.join(folder_path, 'utilization')

# Let the writer replace any output left by an earlier run. Unlike an unconditional
# shutil.rmtree(file_path), which raises FileNotFoundError when the directory is
# missing, 'overwrite' also works on the very first run.
df2.write.format('json')\
    .mode('overwrite')\
    .save(file_path)



In [5]:
# Read back every *.json file found in the utilization export directory.
file_path = os.path.join(folder_path, 'utilization', '*.json')
df3 = spark.read.load(file_path, format='json')

# Preview the first rows of the re-loaded frame.
df3.show(10)


+---------------+-------------------+-----------+---------+-------------+
|cpu_utilization|         event_date|free_memory|server_id|session_count|
+---------------+-------------------+-----------+---------+-------------+
|           0.57|03/05/2019 08:06:14|       0.51|      100|           47|
|           0.47|03/05/2019 08:11:14|       0.62|      100|           43|
|           0.56|03/05/2019 08:16:14|       0.57|      100|           62|
|           0.57|03/05/2019 08:21:14|       0.56|      100|           50|
|           0.35|03/05/2019 08:26:14|       0.46|      100|           43|
|           0.41|03/05/2019 08:31:14|       0.58|      100|           48|
|           0.57|03/05/2019 08:36:14|       0.35|      100|           58|
|           0.41|03/05/2019 08:41:14|        0.4|      100|           58|
|           0.53|03/05/2019 08:46:14|       0.35|      100|           62|
|           0.51|03/05/2019 08:51:14|        0.6|      100|           45|
+---------------+-------------------+-----------+---------+-------------+

In [6]:
# List the column names of the JSON-loaded frame. In the recorded output they appear
# in alphabetical order rather than in the order df2 had when it was written.
df3.columns


['cpu_utilization', 'event_date', 'free_memory', 'server_id', 'session_count']

In [7]:
# Take a roughly 5% sample without replacement, keep only rows with more than 60
# sessions, and order them by CPU utilization (ascending).
# The fixed seed keeps the sample, and therefore the rows shown below, repeatable
# across re-runs over the same input files; without it every run drew different rows.
df4 = (
    df3.sample(withReplacement=False, fraction=0.05, seed=42)
    .filter(df3['session_count'] > 60)
    .sort('cpu_utilization')
)

df4.show(10)


+---------------+-------------------+-----------+---------+-------------+
|cpu_utilization|         event_date|free_memory|server_id|session_count|
+---------------+-------------------+-----------+---------+-------------+
|           0.23|03/22/2019 10:01:25|       0.66|      106|           65|
|           0.23|03/31/2019 04:16:47|       0.44|      119|           67|
|           0.23|03/13/2019 19:21:47|        0.5|      119|           65|
|           0.24|04/05/2019 15:46:56|       0.39|      124|           66|
|           0.24|03/24/2019 20:22:22|       0.59|      138|           67|
|           0.24|03/07/2019 12:01:46|       0.75|      119|           64|
|           0.24|04/03/2019 12:42:23|       0.72|      138|           68|
|           0.24|03/11/2019 05:56:47|       0.46|      119|           63|
|           0.25|04/06/2019 00:01:26|       0.39|      106|           61|
|           0.25|03/24/2019 18:07:22|       0.39|      138|           65|
+---------------+-------------------+-----------+---------+-------------+

In [8]:
# Mean temperature per location. The dict form of agg() names the result column
# avg(temp_celcius), as seen in the output.
# Rows are ordered by location_id as text, which is why the recorded output lists
# loc10 and loc100 right after loc1.
df5 = (
    df1.groupBy('location_id')
    .agg({'temp_celcius': 'mean'})
    .orderBy('location_id')
)

df5.show(10)


+-----------+-----------------+
|location_id|avg(temp_celcius)|
+-----------+-----------------+
|       loc0|           29.176|
|       loc1|           28.246|
|      loc10|           25.337|
|     loc100|           27.297|
|     loc101|           25.317|
|     loc102|           30.327|
|     loc103|           25.341|
|     loc104|           26.204|
|     loc105|           26.217|
|     loc106|           27.201|
+-----------+-----------------+
only showing top 10 rows



In [9]:
# Shell escape: print the directory the notebook kernel is running in.
! pwd


/Users/sg0218817/Private/IT/others/spark/src/main/python


In [10]:
# Shell escape: list the files in the kernel's current working directory.
! ls


_start-dataframe-api.ipynb parquet-connector.ipynb
_start-sql.ipynb           start-pi-calculation.ipynb
mongo-connector.ipynb
