In [1]:
# configure spark variables
from pyspark.context import SparkContext
from pyspark.sql.context import SQLContext
from pyspark.sql.session import SparkSession

In [35]:
from pyspark.sql.functions import regexp_extract

In [4]:
import glob

In [3]:
# load up other dependencies
import re
import pandas as pd

In [2]:
sc = SparkContext()
sqlContext = SQLContext(sc)
spark = SparkSession(sc)

In [5]:
raw_data_files = glob.glob('*.gz')
raw_data_files

['NASA_access_log_Jul95.gz', 'NASA_access_log_Aug95.gz']

In [53]:
base_df = spark.read.text("test")

In [54]:
base_df.printSchema()

root
 |-- value: string (nullable = true)



In [98]:
type(base_df)

pyspark.sql.dataframe.DataFrame

In [99]:
base_df_rdd = base_df.rdd
type(base_df_rdd)

pyspark.rdd.RDD

In [100]:
base_df.show(10, truncate=False)

+--------------------------------------------------------------------------------------------------------------------------+
|value                                                                                                                     |
+--------------------------------------------------------------------------------------------------------------------------+
|in24.inetnebr.com - - [01/Aug/1995:00:00:01 -0400] "GET /shuttle/missions/sts-68/news/sts-68-mcc-05.txt HTTP/1.0" 200 1839|
|uplherc.upl.com - - [01/Aug/1995:00:00:07 -0400] "GET / HTTP/1.0" 304 0                                                   |
|uplherc.upl.com - - [01/Aug/1995:00:00:08 -0400] "GET /images/ksclogo-medium.gif HTTP/1.0" 304 0                          |
|uplherc.upl.com - - [01/Aug/1995:00:00:08 -0400] "GET /images/MOSAIC-logosmall.gif HTTP/1.0" 304 0                        |
|uplherc.upl.com - - [01/Aug/1995:00:00:08 -0400] "GET /images/USA-logosmall.gif HTTP/1.0" 304 0                           |


In [103]:
for item in base_df.foreach():
    print(item["value"])

TypeError: foreach() missing 1 required positional argument: 'f'

In [104]:
sample_logs = [item['value'] for item in base_df.take(15)]

In [108]:
len(sample_logs)

15

In [59]:
host_pattern = r'(^\S+\.[\S+\.]+\S+)\s'
hosts = [re.search(host_pattern, item).group(1)
           if re.search(host_pattern, item)
           else 'no match'
           for item in sample_logs]

In [60]:
ts_pattern = r'\[(\d{2}/\w{3}/\d{4}:\d{2}:\d{2}:\d{2} -\d{4})]'
timestamps = [re.search(ts_pattern, item).group(1) for item in sample_logs]

In [61]:
method_uri_protocol_pattern = r'\"(\S+)\s(\S+)\s*(\S*)\"'
method_uri_protocol = [re.search(method_uri_protocol_pattern, item).groups()
               if re.search(method_uri_protocol_pattern, item)
               else 'no match'
              for item in sample_logs]

In [62]:
status_pattern = r'\s(\d{3})\s'
status = [re.search(status_pattern, item).group(1) for item in sample_logs]

In [63]:
content_size_pattern = r'\s(\d+)$'
content_size = [re.search(content_size_pattern, item).group(1) for item in sample_logs]

In [64]:
logs_df = base_df.select(regexp_extract('value', host_pattern, 1).alias('host'),
                         regexp_extract('value', ts_pattern, 1).alias('timestamp'),
                         regexp_extract('value', method_uri_protocol_pattern, 1).alias('method'),
                         regexp_extract('value', method_uri_protocol_pattern, 2).alias('endpoint'),
                         regexp_extract('value', method_uri_protocol_pattern, 3).alias('protocol'),
                         regexp_extract('value', status_pattern, 1).cast('integer').alias('status'),
                         regexp_extract('value', content_size_pattern, 1).cast('integer').alias('content_size'))

In [84]:
logs_df.select("host").distinct().show()
print("A quantidade de hosts distintos: %s" % (logs_df.select("host").distinct().count()))

+--------------------+
|                host|
+--------------------+
|         d104.aa.net|
|ix-orl2-01.ix.net...|
|    dial22.lloyd.com|
|ppptky391.asahi-n...|
|      199.120.110.21|
|smyth-pc.moorecap...|
|     205.212.115.106|
|      129.94.144.152|
|unicomp6.unicomp.net|
|  alyssa.prodigy.com|
|ppp-mia-30.shadow...|
|waters-gw.starway...|
|  net-1-141.eden.com|
|        199.72.81.55|
|  burger.letters.com|
|      205.189.154.54|
+--------------------+

A quantidade de hosts distintos: 16
