## Set up Spark and import modules

In [12]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import explode
from pyspark.sql.functions import split, regexp_extract

In [13]:
# Obtain the notebook's SparkSession under the application name "LogsAnalysis".
spark = (
    SparkSession.builder
    .appName("LogsAnalysis")
    .getOrCreate()
)

# Only surface ERROR-level log messages from the Spark context.
spark.sparkContext.setLogLevel('ERROR')

## Practice

2. Create a streaming DataFrame representing the lines of the Apache access logs read from `../../mock_logs/log/apache_access`

In [14]:
# Open a streaming text source over the access-log path; the parsing cell
# further down reads each log line from the resulting `value` column.
access_lines = (
    spark.readStream
    .text("../../mock_logs/log/apache_access")
)

3. Create another DataFrame that splits each log line into columns (host, timestamp, method, endpoint, protocol, status, content size) using regular expressions

In [15]:
# Parse out the common log format to a DataFrame.
#
# Each pattern below is handed to regexp_extract, which returns the requested
# capture group of the first match found in the raw line.

# Run of digits at the very end of the line.
contentSizeExp = r'\s(\d+)$'
# Three-digit token delimited by whitespace on both sides.
statusExp = r'\s(\d{3})\s'
# Quoted request: group 1 = method, group 2 = endpoint, group 3 = protocol.
generalExp = r'\"(\S+)\s(\S+)\s*(\S*)\"'
# Bracketed timestamp. The trailing four digits may be introduced either by
# ':' (the form this notebook originally matched) or by a space plus sign, as
# in the standard Apache "%t" format "[10/Oct/2000:13:55:36 -0700]", so that
# standard-format lines no longer produce an empty timestamp.
timeExp = r'\[(\d{2}/\w{3}/\d{4}:\d{2}:\d{2}:\d{2}(?::|\s[+-])\d{4})]'
# First whitespace-delimited token of the line; it must contain a dot.
hostExp = r'(^\S+\.[\S+\.]+\S+)\s'

logsDF = access_lines.select(
    regexp_extract('value', hostExp, 1).alias('host'),
    regexp_extract('value', timeExp, 1).alias('timestamp'),
    regexp_extract('value', generalExp, 1).alias('method'),
    regexp_extract('value', generalExp, 2).alias('endpoint'),
    regexp_extract('value', generalExp, 3).alias('protocol'),
    # The numeric fields are extracted as strings and then cast to integers.
    regexp_extract('value', statusExp, 1).cast('integer').alias('status'),
    regexp_extract('value', contentSizeExp, 1).cast('integer').alias('content_size'),
)

4. Start the query that prints the running counts to the console

To stop the stream, interrupt the running query cell below, then run the `query.stop()` cell.

In [16]:
# Group the parsed log rows by HTTP method, then count the rows in each group.
rows_by_method = logsDF.groupBy("method")
testDF = rows_by_method.count()

In [17]:
# Start the streaming query: every batch prints the complete, updated table of
# per-method counts to the console.
query = testDF.writeStream.outputMode("complete").format("console").start()

try:
    # Block this cell while the stream runs.
    query.awaitTermination()
except KeyboardInterrupt:
    # Interrupting the cell is the intended way to end the demo, so do not
    # surface it as an error traceback.
    pass
finally:
    # Always stop the query so it does not keep running after the cell has
    # been interrupted or has failed.
    query.stop()

                                                                                

-------------------------------------------
Batch: 0
-------------------------------------------
+------+-----+
|method|count|
+------+-----+
| PATCH|    3|
|  POST|    5|
|DELETE|    2|
|   PUT|    3|
|   GET|    3|
+------+-----+



                                                                                

-------------------------------------------
Batch: 1
-------------------------------------------
+------+-----+
|method|count|
+------+-----+
| PATCH|   12|
|  POST|   19|
|DELETE|    8|
|   PUT|   11|
|   GET|   16|
+------+-----+



                                                                                

-------------------------------------------
Batch: 2
-------------------------------------------
+------+-----+
|method|count|
+------+-----+
| PATCH|   14|
|  POST|   19|
|DELETE|    9|
|   PUT|   11|
|   GET|   16|
+------+-----+



                                                                                

-------------------------------------------
Batch: 3
-------------------------------------------
+------+-----+
|method|count|
+------+-----+
| PATCH|   17|
|  POST|   24|
|DELETE|   15|
|   PUT|   14|
|   GET|   22|
+------+-----+



                                                                                

-------------------------------------------
Batch: 4
-------------------------------------------
+------+-----+
|method|count|
+------+-----+
| PATCH|   22|
|  POST|   27|
|DELETE|   23|
|   PUT|   17|
|   GET|   26|
+------+-----+

-------------------------------------------
Batch: 5
-------------------------------------------
+------+-----+
|method|count|
+------+-----+
| PATCH|   26|
|  POST|   35|
|DELETE|   26|
|   PUT|   24|
|   GET|   30|
+------+-----+



                                                                                

-------------------------------------------
Batch: 6
-------------------------------------------
+------+-----+
|method|count|
+------+-----+
| PATCH|   28|
|  POST|   36|
|DELETE|   29|
|   PUT|   27|
|   GET|   32|
+------+-----+



                                                                                

-------------------------------------------
Batch: 7
-------------------------------------------
+------+-----+
|method|count|
+------+-----+
| PATCH|   31|
|  POST|   37|
|DELETE|   35|
|   PUT|   33|
|   GET|   35|
+------+-----+



ERROR:root:KeyboardInterrupt while sending command.
Traceback (most recent call last):
  File "/home/mnestoro/SPARK/.venv/lib/python3.10/site-packages/py4j/java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
  File "/home/mnestoro/SPARK/.venv/lib/python3.10/site-packages/py4j/clientserver.py", line 511, in send_command
    answer = smart_decode(self.stream.readline()[:-1])
  File "/usr/lib/python3.10/socket.py", line 705, in readinto
    return self._sock.recv_into(b)
KeyboardInterrupt


KeyboardInterrupt: 

In [16]:
# Stop the streaming query held in `query`.
query.stop()

In [17]:
# Shut down the SparkSession held in `spark` now that the notebook is done.
spark.stop()