### **Schema on Read**

In [16]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import split, udf
import pandas as pd
import matplotlib.pyplot as plt
import re

In [4]:
spark = SparkSession\
        .builder\
        .appName('Schema on read')\
        .getOrCreate()

23/04/24 08:07:33 WARN Utils: Your hostname, natasha resolves to a loopback address: 127.0.1.1; using 192.168.1.188 instead (on interface wlo1)
23/04/24 08:07:33 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


23/04/24 08:07:35 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


### *Loading the dataset*

In [5]:
df_log = spark.read.text('/home/natasha/Documents/Github/Spark-and-Data-Lakes/data/NASA_access_log_Jul95.gz')
df_log.printSchema()

root
 |-- value: string (nullable = true)



In [6]:
df_log.count()

                                                                                

1891715

In [8]:
df_log.show(5, truncate=False)

+-----------------------------------------------------------------------------------------------------------------------+
|value                                                                                                                  |
+-----------------------------------------------------------------------------------------------------------------------+
|199.72.81.55 - - [01/Jul/1995:00:00:01 -0400] "GET /history/apollo/ HTTP/1.0" 200 6245                                 |
|unicomp6.unicomp.net - - [01/Jul/1995:00:00:06 -0400] "GET /shuttle/countdown/ HTTP/1.0" 200 3985                      |
|199.120.110.21 - - [01/Jul/1995:00:00:09 -0400] "GET /shuttle/missions/sts-73/mission-sts-73.html HTTP/1.0" 200 4085   |
|burger.letters.com - - [01/Jul/1995:00:00:11 -0400] "GET /shuttle/countdown/liftoff.html HTTP/1.0" 304 0               |
|199.120.110.21 - - [01/Jul/1995:00:00:11 -0400] "GET /shuttle/missions/sts-73/sts-73-patch-small.gif HTTP/1.0" 200 4179|
+-----------------------

In [12]:
pd.set_option('display.max_colwidth', None)
df_log.limit(5).toPandas()

Unnamed: 0,value
0,"199.72.81.55 - - [01/Jul/1995:00:00:01 -0400] ""GET /history/apollo/ HTTP/1.0"" 200 6245"
1,"unicomp6.unicomp.net - - [01/Jul/1995:00:00:06 -0400] ""GET /shuttle/countdown/ HTTP/1.0"" 200 3985"
2,"199.120.110.21 - - [01/Jul/1995:00:00:09 -0400] ""GET /shuttle/missions/sts-73/mission-sts-73.html HTTP/1.0"" 200 4085"
3,"burger.letters.com - - [01/Jul/1995:00:00:11 -0400] ""GET /shuttle/countdown/liftoff.html HTTP/1.0"" 304 0"
4,"199.120.110.21 - - [01/Jul/1995:00:00:11 -0400] ""GET /shuttle/missions/sts-73/sts-73-patch-small.gif HTTP/1.0"" 200 4179"


### *Parsing*

In [14]:
# trial 1
df_arrays = df_log.withColumn('tokenized', split('value', ' '))
df_arrays.limit(5).toPandas()

Unnamed: 0,value,tokenized
0,"199.72.81.55 - - [01/Jul/1995:00:00:01 -0400] ""GET /history/apollo/ HTTP/1.0"" 200 6245","[199.72.81.55, -, -, [01/Jul/1995:00:00:01, -0400], ""GET, /history/apollo/, HTTP/1.0"", 200, 6245]"
1,"unicomp6.unicomp.net - - [01/Jul/1995:00:00:06 -0400] ""GET /shuttle/countdown/ HTTP/1.0"" 200 3985","[unicomp6.unicomp.net, -, -, [01/Jul/1995:00:00:06, -0400], ""GET, /shuttle/countdown/, HTTP/1.0"", 200, 3985]"
2,"199.120.110.21 - - [01/Jul/1995:00:00:09 -0400] ""GET /shuttle/missions/sts-73/mission-sts-73.html HTTP/1.0"" 200 4085","[199.120.110.21, -, -, [01/Jul/1995:00:00:09, -0400], ""GET, /shuttle/missions/sts-73/mission-sts-73.html, HTTP/1.0"", 200, 4085]"
3,"burger.letters.com - - [01/Jul/1995:00:00:11 -0400] ""GET /shuttle/countdown/liftoff.html HTTP/1.0"" 304 0","[burger.letters.com, -, -, [01/Jul/1995:00:00:11, -0400], ""GET, /shuttle/countdown/liftoff.html, HTTP/1.0"", 304, 0]"
4,"199.120.110.21 - - [01/Jul/1995:00:00:11 -0400] ""GET /shuttle/missions/sts-73/sts-73-patch-small.gif HTTP/1.0"" 200 4179","[199.120.110.21, -, -, [01/Jul/1995:00:00:11, -0400], ""GET, /shuttle/missions/sts-73/sts-73-patch-small.gif, HTTP/1.0"", 200, 4179]"


In [17]:
# trial 2
def parsing(line):
    pattern = '^(\S+) (\S+) (\S+) \[([\w:/]+\s[+\-]\d{4})\] "(\S+) (\S+)\s*(\S*)" (\d{3}) (\S+)'
    match = re.search(pattern, line)
    if match is None:
        return (line, 0)
    size_field = match.group(9)
    if size_field == '-':
        size = 0
    else:
        size = match.group(9)
    return {
        'host': match.group(1),
        'client_identity': match.group(1),
        'username': match.group(1),
        'date_time': match.group(1),
        'method': match.group(1),
        'endpoint': match.group(1),
        'protocol': match.group(1),
        'response_code': match.group(1),
        'content_size': size
    }

parsing_udf = udf(parsing)

In [18]:
df_parsed = df_log.withColumn('parsed', parsing_udf('value'))
df_parsed.limit(5).toPandas()

                                                                                

Unnamed: 0,value,parsed
0,"199.72.81.55 - - [01/Jul/1995:00:00:01 -0400] ""GET /history/apollo/ HTTP/1.0"" 200 6245","{response_code=199.72.81.55, protocol=199.72.81.55, endpoint=199.72.81.55, content_size=6245, method=199.72.81.55, date_time=199.72.81.55, host=199.72.81.55, client_identity=199.72.81.55, username=199.72.81.55}"
1,"unicomp6.unicomp.net - - [01/Jul/1995:00:00:06 -0400] ""GET /shuttle/countdown/ HTTP/1.0"" 200 3985","{response_code=unicomp6.unicomp.net, protocol=unicomp6.unicomp.net, endpoint=unicomp6.unicomp.net, content_size=3985, method=unicomp6.unicomp.net, date_time=unicomp6.unicomp.net, host=unicomp6.unicomp.net, client_identity=unicomp6.unicomp.net, username=unicomp6.unicomp.net}"
2,"199.120.110.21 - - [01/Jul/1995:00:00:09 -0400] ""GET /shuttle/missions/sts-73/mission-sts-73.html HTTP/1.0"" 200 4085","{response_code=199.120.110.21, protocol=199.120.110.21, endpoint=199.120.110.21, content_size=4085, method=199.120.110.21, date_time=199.120.110.21, host=199.120.110.21, client_identity=199.120.110.21, username=199.120.110.21}"
3,"burger.letters.com - - [01/Jul/1995:00:00:11 -0400] ""GET /shuttle/countdown/liftoff.html HTTP/1.0"" 304 0","{response_code=burger.letters.com, protocol=burger.letters.com, endpoint=burger.letters.com, content_size=0, method=burger.letters.com, date_time=burger.letters.com, host=burger.letters.com, client_identity=burger.letters.com, username=burger.letters.com}"
4,"199.120.110.21 - - [01/Jul/1995:00:00:11 -0400] ""GET /shuttle/missions/sts-73/sts-73-patch-small.gif HTTP/1.0"" 200 4179","{response_code=199.120.110.21, protocol=199.120.110.21, endpoint=199.120.110.21, content_size=4179, method=199.120.110.21, date_time=199.120.110.21, host=199.120.110.21, client_identity=199.120.110.21, username=199.120.110.21}"
