In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *

In [2]:
# Start (or reuse) a local Spark session that runs on every available core.
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("Pivot")
    .getOrCreate()
)

# Leave the session as the cell's last expression so the notebook renders it.
spark

In [3]:
# Read the log export; the first line supplies the column names and the
# column types are inferred from the data.
log_df = spark.read.csv(
    path="./dataset/logdata.csv",
    inferSchema=True,
    header=True,
)
# Preview the first five rows.
log_df.show(n=5)

+---+--------------------+--------+--------+--------------------+---------+------+-------------+---------+
|_c0|           Timestamp|LogLevel| Service|             Message|RequestID|  User|     ClientIP|TimeTaken|
+---+--------------------+--------+--------+--------------------+---------+------+-------------+---------+
|  1|2023-11-20 08:40:...|   DEBUG|ServiceA|            File I/O|     8684|User17|192.168.1.219|     55ms|
|  3|2023-11-20 08:40:...|   ERROR|ServiceA|     Critical Errors|     5821| User1|192.168.1.185|     72ms|
|  4|2023-11-20 08:40:...|   ERROR|ServiceB|     Critical Errors|     7272|User64|192.168.1.194|     56ms|
+---+--------------------+--------+--------+--------------------+---------+------+-------------+---------+
only showing top 5 rows



In [4]:
# Total number of rows in the loaded log data.
log_df.count()

100000

In [5]:
# Keep only the two columns the pivot needs: the severity level and the
# event timestamp.
log_df = log_df.select("LogLevel", "Timestamp")
# Show five rows with full (untruncated) cell contents.
log_df.show(n=5, truncate=False)

+--------+--------------------------+
|LogLevel|Timestamp                 |
+--------+--------------------------+
|DEBUG   |2023-11-20 08:40:50.672154|
|ERROR   |2023-11-20 08:40:50.688973|
|ERROR   |2023-11-20 08:40:50.697002|
+--------+--------------------------+
only showing top 5 rows



In [6]:
# Derive two month columns from each event's timestamp:
#   month_num - month number, used as the chronological sort key
#   month     - full month name ("MMMM" pattern), used as the display label
df1 = (
    log_df
    .withColumn("month_num", month(col("Timestamp")))
    .withColumn("month", date_format(col("Timestamp"), "MMMM"))
)

# Sort by month number, then discard the raw timestamp.
# The column is named with the schema's exact casing ("Timestamp"): a
# differently-cased name only resolves while spark.sql.caseSensitive is false,
# and drop() silently ignores names it cannot resolve, which would leave the
# timestamp column in place without any error.
df2 = df1.orderBy("month_num").drop("Timestamp")

df2.show()

+--------+---------+--------+
|LogLevel|month_num|   month|
+--------+---------+--------+
|   FATAL|       11|November|
|   DEBUG|       11|November|
|   ERROR|       11|November|
|   ERROR|       11|November|
|   DEBUG|       11|November|
|   ERROR|       11|November|
|    INFO|       11|November|
|   FATAL|       11|November|
|   FATAL|       11|November|
|    INFO|       11|November|
|   ERROR|       11|November|
|   ERROR|       11|November|
+--------+---------+--------+
only showing top 20 rows



In [7]:
# Explicit pivot values fix the left-to-right order of the level columns in
# the result.
orders = ["INFO", "WARNING", "ERROR", "DEBUG", "FATAL"]

# Count log events per month and level, one row per month and one column per
# level. groupBy() does not keep the row order of its input, so month_num is
# carried through the aggregation as a second grouping key (it maps one-to-one
# to the month name, so the groups are unchanged), used to sort the result
# chronologically, and then dropped so the output columns stay
# month + one column per level.
pivot_df = (
    df2
    .groupBy("month_num", "month")
    .pivot("LogLevel", orders)
    .count()
    .orderBy("month_num")
    .drop("month_num")
)
pivot_df.show()

+--------+-----+-------+-----+-----+-----+
|   month| INFO|WARNING|ERROR|DEBUG|FATAL|
+--------+-----+-------+-----+-----+-----+
|November|20142|  26181|15851|25901|11925|
+--------+-----+-------+-----+-----+-----+

