In [1]:
import pyspark
from pyspark.sql import Row
from pyspark.sql import SparkSession
import re

# Build the session through the builder so this cell can be re-executed safely:
# constructing a second SparkContext in the same kernel fails with
# "ValueError: Cannot run multiple SparkContexts at once", whereas
# getOrCreate() hands back the session that is already running.
spark = SparkSession.builder.master('local[*]').getOrCreate()

# Low-level context behind the session; used below for the RDD API (sc.textFile).
sc = spark.sparkContext

# example from https://community.hortonworks.com/articles/108622/pyspark-helper-function-to-parse-apache-logs.html

In [7]:

# Regular expression for one line of an Apache access log.
# Capture groups, in order:
#   1 client address   2 identd          3 user id
#   4 timestamp, e.g. "21/Jul/2014:09:55:27 -0800"
#   5 request method   6 endpoint        7 protocol
#   8 three-digit response code
#   9 response size in digits -- optional, so the group is None when the line
#     carries something other than a number there (e.g. "-")
# Written as a raw string so the backslashes reach the regex engine untouched
# and Python does not warn about invalid string escape sequences.
APACHE_ACCESS_LOG_PATTERN = r'^(\S+) (\S+) (\S+) \[([\w:/]+\s[+\-]\d{4})\] "(\S+) (\S+) (\S+)" (\d{3}) (\d+)?'
     
def parse_apache_log_line(logline):
    """Parse one Apache access-log line into a ``Row``.

    The line is matched against ``APACHE_ACCESS_LOG_PATTERN`` and its nine
    capture groups become the fields of the returned row.

    Args:
        logline: A single access-log line as text.

    Returns:
        A ``Row`` with the text fields ``ipAddress``, ``clientIdentd``,
        ``userId``, ``dateTime``, ``method``, ``endpoint`` and ``protocol``,
        and the integer fields ``responseCode`` and ``contentSize``.
        ``contentSize`` is 0 when the line carries no numeric size.

    Raises:
        ValueError: If the line does not match the expected format. Failing
            loudly is deliberate here so that inconsistent input is noticed;
            switch to skipping the line if individual records are not critical.
    """
    match = re.search(APACHE_ACCESS_LOG_PATTERN, logline)
    if match is None:
        # ValueError is still caught by any `except Exception` handler.
        raise ValueError("Invalid logline: %s" % logline)

    (ip_address, client_identd, user_id, date_time,
     method, endpoint, protocol, response_code, size) = match.group(
        1, 2, 3, 4, 5, 6, 7, 8, 9)

    # A missing size (or a literal "-") counts as an empty response body.
    content_size = 0 if size in (None, "-") else int(size)

    return Row(
        ipAddress    = ip_address,
        clientIdentd = client_identd,
        userId       = user_id,
        dateTime     = date_time,
        method       = method,
        endpoint     = endpoint,
        protocol     = protocol,
        responseCode = int(response_code),
        contentSize  = content_size
    )
     
# Read the access log as an RDD holding one text line per element.
log_files = "file:///home/jovyan/work/access.log"
raw_log_files = sc.textFile(log_files)

# Turn every line into a Row. The map is lazy: parsing only happens when an
# action runs, and a line the parser rejects makes that action fail.
parsed_log_files = raw_log_files.map(parse_apache_log_line)

# Expose the parsed rows to Spark SQL under the name "log_data".
# createOrReplaceTempView supersedes registerTempTable (deprecated since
# Spark 2.0) and can be re-run without complaining that the view exists.
parsed_log_files.toDF().createOrReplaceTempView("log_data")

# Last expression of the cell, so the notebook displays the parsed row count.
parsed_log_files.count()



914724

In [11]:
# Count requests per client address in the "log_data" view, busiest first.
# The adjacent literals are joined by Python into one single-line SQL string.
sqlDF = spark.sql(
    "SELECT count(*) as count, ipAddress "
    "FROM log_data "
    "GROUP BY ipAddress "
    "ORDER BY count DESC"
)

# show() prints the first 20 rows of the result as a text table.
sqlDF.show()

+------+---------------+
| count|      ipAddress|
+------+---------------+
|167812| 198.50.156.189|
| 97533|   149.56.83.40|
| 33302| 205.167.170.15|
| 17904| 134.249.53.185|
| 16358|  178.159.37.81|
| 13474|192.227.172.158|
| 10996| 195.154.216.79|
| 10943|    158.64.79.7|
| 10612|    158.64.79.8|
|  7705|195.154.215.241|
|  6426|   37.59.222.30|
|  5994|  14.139.208.84|
|  5664|   41.75.96.130|
|  5662| 190.57.154.101|
|  5659|   154.0.14.250|
|  4629|    54.37.1.236|
|  4435|  85.10.113.129|
|  3780|   37.1.206.196|
|  3207|  79.142.95.122|
|  2840| 78.186.191.187|
+------+---------------+
only showing top 20 rows

