## Set up Spark and import modules

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import regexp_extract

In [None]:
# Obtain the notebook's Spark session: an existing session is reused if one is
# already active, otherwise a new one is created under the name "LogsAnalysis".
spark = (
    SparkSession.builder
    .appName("LogsAnalysis")
    .getOrCreate()
)

## Setup


### Generate data
For our practice purposes, we will generate mock Apache access logs.
>1. Open terminal and change directory to "Data" folder
>2. In terminal, run command `python log_generator apache_access_schema.yaml`, add flag `-t` to truncate previously created logs
>3. Based on the provided .yaml file, a new `log.txt.{id}` file should appear every 5 seconds, each containing 50 lines of random logs.

## Practice

1. Create a DataFrame representing the stream of raw log lines arriving in the `/log/apache_access` directory

In [None]:
# Streaming text source: every line written to a file under the watched
# directory becomes one row, exposed through the string column `value`.
access_lines = (
    spark.readStream
    .text("../../mock_logs/log/apache_access")
)

2. Parse log data to DataFrame

In [None]:
# Parse each raw access-log line into typed columns.
#
# Every pattern below is applied independently to the raw `value` column.
# regexp_extract returns an empty string when a pattern does not match, so an
# unparsable field shows up as '' (or as null after the integer casts) rather
# than raising an error.

# Trailing run of digits at the very end of the line (response size).
contentSizeExp = r'\s(\d+)$'
# First three-digit number that has whitespace on both sides (HTTP status).
statusExp = r'\s(\d{3})\s'
# Quoted request section: group 1 = method, 2 = endpoint, 3 = protocol.
generalExp = r'\"(\S+)\s(\S+)\s*(\S*)\"'
# Bracketed timestamp dd/Mon/yyyy:HH:MM:SS followed by a four-digit field.
# The separator before that last field may be ":" (the only form accepted
# previously) or a space plus sign, which is what the standard Apache "%t"
# layout produces, e.g. "[10/Oct/2000:13:55:36 -0700]". Without the second
# alternative such lines silently yield an empty timestamp.
timeExp = r'\[(\d{2}/\w{3}/\d{4}:\d{2}:\d{2}:\d{2}(?::|\s[+-])\d{4})]'
# Leading token of the line, which must contain at least one dot (IP address or
# dotted host name). Note that `[\S+\.]` is a character class, not a group: it
# matches any single non-whitespace character.
hostExp = r'(^\S+\.[\S+\.]+\S+)\s'

logsDF = access_lines.select(
    regexp_extract('value', hostExp, 1).alias('host'),
    regexp_extract('value', timeExp, 1).alias('timestamp'),
    regexp_extract('value', generalExp, 1).alias('method'),
    regexp_extract('value', generalExp, 2).alias('endpoint'),
    regexp_extract('value', generalExp, 3).alias('protocol'),
    # Numeric fields are cast so they can be aggregated and compared as integers.
    regexp_extract('value', statusExp, 1).cast('integer').alias('status'),
    regexp_extract('value', contentSizeExp, 1).cast('integer').alias('content_size'),
)

3. Count every access status code

In [None]:
# Running tally of log lines per HTTP status code; the result has the columns
# `status` and `count`.
statusCountsDF = (
    logsDF
    .groupBy("status")
    .count()
)

4. Kick off our streaming query, dumping results to the console

In [None]:
# Number of seconds this cell blocks while the stream is running.
STREAM_WATCH_SECONDS = 60

# Start the streaming query. "complete" output mode re-emits the whole
# aggregated table on every trigger, which is what a running count needs.
# NOTE(review): the console sink prints from the Spark driver process, so in a
# notebook the batches may appear in the terminal that launched Jupyter rather
# than under this cell -- confirm in your environment.
query = (
    statusCountsDF.writeStream
    .outputMode("complete")
    .format("console")
    .queryName("status_counts")
    .start()
)

# Bounded wait: awaitTermination() without a timeout blocks the kernel forever,
# so the cleanup cell could never run without a manual interrupt. With a
# timeout the call returns after STREAM_WATCH_SECONDS (True if the query ended
# in that window, False otherwise); the query itself keeps running in the
# background until it is stopped explicitly.
query.awaitTermination(STREAM_WATCH_SECONDS)

5. Classroom cleanup
IMPORTANT: Kill the running `log_generator` script in the terminal.

In [None]:
# Stop the streaming query first, then release the Spark session. The
# try/finally guarantees the session is shut down even when stopping the query
# fails (for example because the query cell was never run).
try:
    query.stop()
finally:
    spark.stop()