In [16]:
from pyspark.sql import SparkSession

In [17]:
spark = SparkSession.builder.getOrCreate()

In [18]:
data_path = '/Users/natha/Desktop/bootcamp_repo-1/LinkedIn_Spark_SQL_DataFrames/Exercise Files/Data'
file_path_no_header = data_path + "/utilization.csv"
df = spark.read.format("csv").option("header", "false").option("inferSchema","true").load(file_path_no_header)

df_util = df.withColumnRenamed("_c0", "event_datetime") \
        .withColumnRenamed ("_c1", "server_id")       \
        .withColumnRenamed("_c2", "cpu_utilization")  \
        .withColumnRenamed("_c3", "free_memory")      \
        .withColumnRenamed("_c4", "session_count")

In [19]:
df_util.show(10)

+-------------------+---------+---------------+-----------+-------------+
|     event_datetime|server_id|cpu_utilization|free_memory|session_count|
+-------------------+---------+---------------+-----------+-------------+
|03/05/2019 08:06:14|      100|           0.57|       0.51|           47|
|03/05/2019 08:11:14|      100|           0.47|       0.62|           43|
|03/05/2019 08:16:14|      100|           0.56|       0.57|           62|
|03/05/2019 08:21:14|      100|           0.57|       0.56|           50|
|03/05/2019 08:26:14|      100|           0.35|       0.46|           43|
|03/05/2019 08:31:14|      100|           0.41|       0.58|           48|
|03/05/2019 08:36:14|      100|           0.57|       0.35|           58|
|03/05/2019 08:41:14|      100|           0.41|        0.4|           58|
|03/05/2019 08:46:14|      100|           0.53|       0.35|           62|
|03/05/2019 08:51:14|      100|           0.51|        0.6|           45|
+-------------------+---------+-------

In [20]:
df_util.createOrReplaceTempView("utilization")

In [21]:
df_util.count()

500000

In [22]:
df_util.describe().show()

+-------+-------------------+------------------+-------------------+-------------------+------------------+
|summary|     event_datetime|         server_id|    cpu_utilization|        free_memory|     session_count|
+-------+-------------------+------------------+-------------------+-------------------+------------------+
|  count|             500000|            500000|             500000|             500000|            500000|
|   mean|               null|             124.5| 0.6205177400000115|0.37912809999999625|          69.59616|
| stddev|               null|14.430884120553204|0.15875173872912818|0.15830931278376212|14.850676696352831|
|    min|03/05/2019 08:06:14|               100|               0.22|                0.0|                32|
|    max|04/09/2019 01:22:46|               149|                1.0|               0.78|               105|
+-------+-------------------+------------------+-------------------+-------------------+------------------+



In [8]:
df_util.describe('cpu_utilization','free_memory').show()

+-------+-------------------+-------------------+
|summary|    cpu_utilization|        free_memory|
+-------+-------------------+-------------------+
|  count|             500000|             500000|
|   mean| 0.6205177400000115|0.37912809999999625|
| stddev|0.15875173872912818|0.15830931278376212|
|    min|               0.22|                0.0|
|    max|                1.0|               0.78|
+-------+-------------------+-------------------+



In [23]:
df_util.stat.corr('cpu_utilization','free_memory')

-0.47047715730807216

In [24]:
df_util.stat.corr('session_count','free_memory')

-0.5008320848876588

In [25]:
df_util.stat.freqItems(('server_id','session_count')).show()

+--------------------+-----------------------+
| server_id_freqItems|session_count_freqItems|
+--------------------+-----------------------+
|[146, 137, 101, 1...|   [92, 101, 83, 104...|
+--------------------+-----------------------+



In [26]:
df_util_sample = df_util.sample(fraction=0.05, withReplacement=False)
df_util_sample.count()

24622

In [27]:
spark.sql('SELECT min(cpu_utilization), max(cpu_utilization), stddev(cpu_utilization) FROM utilization').show()

+--------------------+--------------------+-----------------------+
|min(cpu_utilization)|max(cpu_utilization)|stddev(cpu_utilization)|
+--------------------+--------------------+-----------------------+
|                0.22|                 1.0|    0.15875173872912818|
+--------------------+--------------------+-----------------------+



In [28]:
spark.sql('SELECT server_id, min(cpu_utilization), max(cpu_utilization), stddev(cpu_utilization) \
           FROM utilization \
           GROUP BY server_id').show()

+---------+--------------------+--------------------+-----------------------+
|server_id|min(cpu_utilization)|max(cpu_utilization)|stddev(cpu_utilization)|
+---------+--------------------+--------------------+-----------------------+
|      108|                0.55|                0.95|    0.11563100171171926|
|      101|                 0.6|                 1.0|    0.11651726263197697|
|      103|                0.56|                0.96|    0.11617507884178278|
|      111|                0.36|                0.76|    0.11530221569464483|
|      107|                0.45|                0.85|    0.11597417369783877|
|      100|                0.27|                0.67|     0.1152264191787964|
|      102|                0.56|                0.96|    0.11549678751286807|
|      109|                0.36|                0.76|    0.11574898623219722|
|      105|                0.29|                0.69|    0.11510721467869486|
|      110|                0.35|                0.75|    0.11533

In [29]:
spark.sql('SELECT server_id, FLOOR(cpu_utilization*100/10) bucket FROM utilization').show()

+---------+------+
|server_id|bucket|
+---------+------+
|      100|     5|
|      100|     4|
|      100|     5|
|      100|     5|
|      100|     3|
|      100|     4|
|      100|     5|
|      100|     4|
|      100|     5|
|      100|     5|
|      100|     3|
|      100|     6|
|      100|     6|
|      100|     5|
|      100|     2|
|      100|     4|
|      100|     4|
|      100|     6|
|      100|     4|
|      100|     5|
+---------+------+
only showing top 20 rows



In [31]:
spark.sql('SELECT count(*), FLOOR(cpu_utilization*100/10) bucket FROM utilization GROUP BY bucket ORDER BY bucket').show()

+--------+------+
|count(1)|bucket|
+--------+------+
|    8186|     2|
|   37029|     3|
|   68046|     4|
|  104910|     5|
|  116725|     6|
|   88242|     7|
|   56598|     8|
|   20207|     9|
|      57|    10|
+--------+------+

