## Setup spark and install modules

In [24]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import explode
from pyspark.sql.functions import split, regexp_extract

In [25]:
spark = SparkSession.builder.appName("LogsAnalysis").getOrCreate()
spark.sparkContext.setLogLevel('ERROR')

## Practice

2. Create DataFrame representing the stream of input lines from connection to localhost:9999

In [26]:
access_lines = spark.readStream.text("../../mock_logs/log/")

3. Create another dataFrame that counts words from spliting a line

In [22]:
# Parse out the common log format to a DataFrame
contentSizeExp = r'\s(\d+)$'
statusExp = r'\s(\d{3})\s'
generalExp = r'\"(\S+)\s(\S+)\s*(\S*)\"'
timeExp = r'\[(\d{2}/\w{3}/\d{4}:\d{2}:\d{2}:\d{2}:\d{4})]'
hostExp = r'(^\S+\.[\S+\.]+\S+)\s'

logsDF = access_lines.select(regexp_extract('value', hostExp, 1).alias('host'),
                        regexp_extract('value', timeExp, 1).alias('timestamp'),
                        regexp_extract('value', generalExp, 1).alias('method'),
                        regexp_extract('value', generalExp, 2).alias('endpoint'),
                        regexp_extract('value', generalExp, 3).alias('protocol'),
                        regexp_extract('value', statusExp, 1).cast('integer').alias('status'),
                        regexp_extract('value', contentSizeExp, 1).cast('integer').alias('content_size'))

4. Start the query that prints the running counts to the console

To kill a Stream, interupt running for cell below

In [23]:
query = logsDF.writeStream.outputMode("append").format("console").start()
query.awaitTermination()

-------------------------------------------
Batch: 0
-------------------------------------------
+---------------+---------+------+-----------+--------+------+------------+
|           host|timestamp|method|   endpoint|protocol|status|content_size|
+---------------+---------+------+-----------+--------+------+------------+
| 146.45.211.240|         |  POST|    /alerts|HTTP/1.1|   404|        4602|
|   108.2.48.142|         |  POST| /fieldsets|HTTP/1.1|   400|        2679|
|   6.253.35.155|         |  POST| /playbooks|HTTP/1.1|   300|        3301|
|  19.174.46.137|         |DELETE| /fieldsets|HTTP/1.1|   401|        3186|
|  55.158.14.253|         |DELETE|     /users|HTTP/1.1|   204|        3812|
|  99.233.29.169|         |   GET|/collectors|HTTP/1.1|   403|        2647|
| 170.249.30.208|         |  POST|/collectors|HTTP/1.1|   404|        4039|
|  91.198.138.32|         | PATCH|   /parsers|HTTP/1.1|   400|        2859|
|  39.71.167.104|         |   PUT|   /parsers|HTTP/1.1|   401|     

ERROR:root:KeyboardInterrupt while sending command.
Traceback (most recent call last):
  File "/home/mnestoro/SPARK/.venv/lib/python3.10/site-packages/py4j/java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
  File "/home/mnestoro/SPARK/.venv/lib/python3.10/site-packages/py4j/clientserver.py", line 511, in send_command
    answer = smart_decode(self.stream.readline()[:-1])
  File "/usr/lib/python3.10/socket.py", line 705, in readinto
    return self._sock.recv_into(b)
KeyboardInterrupt


KeyboardInterrupt: 

In [16]:
query.stop()

In [17]:
spark.stop()