In [0]:
#cargar el archivo como un archivo de texto normal y realizar las transformaciones pertinentes, a la hora de limpiar y estructurar nuestro dataset utilizaremos 
#expresiones regulares para recoger los campos que necesitamos 
df_logs = spark.read.text("/FileStore/tables/access_log_Aug95")
df_logs.show(truncate=False)
df_logs.printSchema()

+-------------------------------------------------------------------------------------------------------------------------------+
|value                                                                                                                          |
+-------------------------------------------------------------------------------------------------------------------------------+
|in24.inetnebr.com - - [01/Aug/1995:00:00:01 -0400] "GET /shuttle/missions/sts-68/news/sts-68-mcc-05.txt HTTP/1.0" 200 1839     |
|uplherc.upl.com - - [01/Aug/1995:00:00:07 -0400] "GET / HTTP/1.0" 304 0                                                        |
|uplherc.upl.com - - [01/Aug/1995:00:00:08 -0400] "GET /images/ksclogo-medium.gif HTTP/1.0" 304 0                               |
|uplherc.upl.com - - [01/Aug/1995:00:00:08 -0400] "GET /images/MOSAIC-logosmall.gif HTTP/1.0" 304 0                             |
|uplherc.upl.com - - [01/Aug/1995:00:00:08 -0400] "GET /images/USA-logosmall.gif HTTP/1.0"

In [0]:
#expresiones regulares para separar columnas 
from pyspark.sql.functions import regexp_extract
from pyspark.sql import functions as F
from pyspark.sql.types import IntegerType
from pyspark.sql.types import TimestampType

df_logs1 = df_logs.select(regexp_extract(F.col("value"), '(.*?)(\s+)',1).alias("Host"),
                          regexp_extract(F.col("value"), '\[(.*?)(\s+)(.*?)\](\s+)', 1).alias("Date"),
                          regexp_extract(F.col("value"), '\"(.*?)(\s+)(.*?)(\s+)(.*?)\"', 1).alias("Request Method"),
                          regexp_extract(F.col("value"), '\"(.*?)(\s+)(.*?)(\s+)(.*?)\"', 3).alias("Resource"),
                          regexp_extract(F.col("value"), '\"(.*?)(\s+)(.*?)(\s+)(.*?)\"', 5).alias("Protocol"),
                          regexp_extract(F.col("value"), '(\s+)(\d+)', 2).alias("HTTP status code").cast("int"),
                          regexp_extract(F.col("value"), '(\s+)(\d+)$', 2).alias("Size").cast("int"))


df_logs1.show(truncate=False)

+---------------------------+--------------------+--------------+---------------------------------------------------+--------+----------------+-----+
|Host                       |Date                |Request Method|Resource                                           |Protocol|HTTP status code|Size |
+---------------------------+--------------------+--------------+---------------------------------------------------+--------+----------------+-----+
|in24.inetnebr.com          |01/Aug/1995:00:00:01|GET           |/shuttle/missions/sts-68/news/sts-68-mcc-05.txt    |HTTP/1.0|200             |1839 |
|uplherc.upl.com            |01/Aug/1995:00:00:07|GET           |/                                                  |HTTP/1.0|304             |0    |
|uplherc.upl.com            |01/Aug/1995:00:00:08|GET           |/images/ksclogo-medium.gif                         |HTTP/1.0|304             |0    |
|uplherc.upl.com            |01/Aug/1995:00:00:08|GET           |/images/MOSAIC-logosmall.gif       

In [0]:
#transformar mes en número, primero hacemos un split
from pyspark.sql.functions import split
from pyspark.sql.functions import concat_ws
split_col = split(df_logs1['Date'], '/')
df2 = df_logs1.withColumn('day', split_col.getItem(0)) \
              .withColumn('month', split_col.getItem(1)) \
              .withColumn('year', split_col.getItem(2))
df3 = df2.withColumn("month1", F.when(F.col("month")=="Jan", "01")\
                                .when(F.col("month")=="Feb", "02")\
                                .when(F.col("month")=="Mar", "03")\
                                .when(F.col("month")=="Apr", "04")\
                                .when(F.col("month")=="May", "05")\
                                .when(F.col("month")=="Jun", "06")\
                                .when(F.col("month")=="Jul", "07")\
                                .when(F.col("month")=="Aug", "08")\
                                .when(F.col("month")=="Sep", "09")\
                                .when(F.col("month")=="Oct", "10")\
                                .when(F.col("month")=="Nov", "11")\
                                .when(F.col("month")=="Dec","12"))\
                                .drop("month")
df_web_server=df3.select("Host",concat_ws('/',df3.day,df3.month1,df3.year)
              .alias("Date"),"Request Method","Resource","Protocol","HTTP status code","Size")
df_web_server.show(truncate=False)

+---------------------------+-------------------+--------------+---------------------------------------------------+--------+----------------+-----+
|Host                       |Date               |Request Method|Resource                                           |Protocol|HTTP status code|Size |
+---------------------------+-------------------+--------------+---------------------------------------------------+--------+----------------+-----+
|in24.inetnebr.com          |01/08/1995:00:00:01|GET           |/shuttle/missions/sts-68/news/sts-68-mcc-05.txt    |HTTP/1.0|200             |1839 |
|uplherc.upl.com            |01/08/1995:00:00:07|GET           |/                                                  |HTTP/1.0|304             |0    |
|uplherc.upl.com            |01/08/1995:00:00:08|GET           |/images/ksclogo-medium.gif                         |HTTP/1.0|304             |0    |
|uplherc.upl.com            |01/08/1995:00:00:08|GET           |/images/MOSAIC-logosmall.gif              

In [0]:
#Guardaremos nuestro nuevo DataFrame ya estructurado en formato parquet. Y de este webleeremos para realizar nuestro análisis
df_web_server.write.parquet("df_web_server_logs")

In [0]:
#CONSULTAS
#¿Cuáles son los distintos protocolos web utilizados? Agrúpalos
df_web_server.select("Protocol").distinct().show()

+--------------------+
|            Protocol|
+--------------------+
|     Shield HTTP/1.0|
|  home.html HTTP/1.0|
|history/apollo/ap...|
|   pictures HTTP/1.0|
|40,207 89,234 HTT...|
|40,243 89,262 HTT...|
|                    |
|           HTTP/V1.0|
|            HTTP/1.0|
|images/ssbuv1.gif...|
|        /   HTTP/1.0|
|   apollo-1 HTTP/1.0|
|    HTML/1.0 headers|
|Imaging Radar-C H...|
|history/apollo/ap...|
|Island Spacefligh...|
|egress.html HTTP/1.0|
|    egress. HTTP/1.0|
|            Table of|
|           HTTP/1.0 |
+--------------------+
only showing top 20 rows



In [0]:
#¿Cuáles son los códigos de estado más comunes en la web? Agrúpalos y ordénalos para ver cuál es el más común
df_web_server.groupBy("HTTP status code").count().orderBy("count", ascending = False).show()

+----------------+-------+
|HTTP status code|  count|
+----------------+-------+
|             200|1398986|
|             304| 134146|
|             302|  26497|
|             404|  10053|
|             403|    171|
|             501|     27|
|             400|      8|
|             500|      3|
|              13|      3|
|              40|      2|
|               2|      2|
+----------------+-------+



In [0]:
#¿Y los métodos de petición (verbos) más utilizados?
df_web_server.groupBy("Request Method").count().orderBy("count", ascending = False).show()

+--------------+-------+
|Request Method|  count|
+--------------+-------+
|           GET|1564021|
|          HEAD|   3965|
|              |   1799|
|          POST|    111|
|         ���.�|      2|
+--------------+-------+



In [0]:
#¿Qué recurso tuvo la mayor transferencia de bytes de la página web?
from pyspark.sql.functions import desc
df_web_server.groupBy("Resource").agg(F.max("Size").alias("max size")).sort(desc("max size")).show()


+--------------------+--------+
|            Resource|max size|
+--------------------+--------+
|/statistics/1995/...| 3421948|
|/statistics/1995/...| 3155499|
|/statistics/1995/...| 1969293|
|/statistics/1995/...| 1767078|
|/shuttle/missions...| 1269716|
|/images/counthome...| 1239732|
|/shuttle/missions...| 1188908|
|/shuttle/missions...| 1182717|
|/statistics/1995/...| 1169007|
|/shuttle/missions...| 1121554|
|/shuttle/missions...| 1083124|
|/shuttle/missions...| 1082916|
|/shuttle/missions...| 1081049|
|/shuttle/movies/a...| 1065779|
|/shuttle/missions...| 1043093|
|/shuttle/missions...| 1030878|
|/shuttle/missions...| 1013716|
|/shuttle/missions...|  997451|
|/shuttle/missions...|  977801|
|/shuttle/missions...|  952257|
+--------------------+--------+
only showing top 20 rows



In [0]:
#Además, queremos saber que recurso de nuestra web es el que más tráfico recibe. Es decir, el recurso con más registros en nuestro log
df_web_server.groupBy("Resource").count().orderBy("count", ascending = False).show()

+--------------------+-----+
|            Resource|count|
+--------------------+-----+
|/images/NASA-logo...|97293|
|/images/KSC-logos...|75283|
|/images/MOSAIC-lo...|67356|
|/images/USA-logos...|66975|
|/images/WORLD-log...|66351|
|/images/ksclogo-m...|62670|
|           /ksc.html|43619|
|/history/apollo/i...|37806|
|/images/launch-lo...|35119|
|                   /|30123|
|/images/ksclogosm...|27789|
|/shuttle/missions...|24592|
| /shuttle/countdown/|24445|
|/shuttle/missions...|24363|
|/shuttle/missions...|23391|
|/shuttle/missions...|22429|
|/images/launchmed...|19863|
|  /htbin/cdt_main.pl|17238|
|/shuttle/countdow...|12154|
|     /icons/menu.xbm|12128|
+--------------------+-----+
only showing top 20 rows



In [0]:
# ¿Qué días la web recibió más tráfico?
split_col = split(df_web_server['Date'], ':')
df_dias = df_web_server.withColumn('day', split_col.getItem(0))
df_dias.groupBy("day").count().orderBy("count", acsending=False).show()


+----------+-----+
|       day|count|
+----------+-----+
|26/08/1995|31608|
|05/08/1995|31893|
|19/08/1995|32094|
|06/08/1995|32420|
|27/08/1995|32823|
|20/08/1995|32963|
|01/08/1995|33996|
|13/08/1995|36480|
|12/08/1995|38071|
|03/08/1995|41388|
|24/08/1995|52552|
|28/08/1995|55496|
|21/08/1995|55540|
|18/08/1995|56246|
|16/08/1995|56653|
|25/08/1995|57321|
|07/08/1995|57362|
|22/08/1995|57762|
|23/08/1995|58097|
|15/08/1995|58847|
+----------+-----+
only showing top 20 rows



In [0]:

#Cuáles son los hosts más frecuentes?
df_web_server.groupBy("Host").count().orderBy("count", ascending = False).show()

+--------------------+-----+
|                Host|count|
+--------------------+-----+
|  edams.ksc.nasa.gov| 6530|
|piweba4y.prodigy.com| 4846|
|        163.206.89.4| 4791|
|piweba5y.prodigy.com| 4607|
|piweba3y.prodigy.com| 4416|
|www-d1.proxy.aol.com| 3889|
|www-b2.proxy.aol.com| 3534|
|www-b3.proxy.aol.com| 3463|
|www-c5.proxy.aol.com| 3423|
|www-b5.proxy.aol.com| 3411|
|www-c2.proxy.aol.com| 3407|
|www-d2.proxy.aol.com| 3404|
|www-a2.proxy.aol.com| 3337|
|         news.ti.com| 3298|
|www-d3.proxy.aol.com| 3296|
|www-b4.proxy.aol.com| 3293|
|www-c3.proxy.aol.com| 3272|
|www-d4.proxy.aol.com| 3234|
|www-c1.proxy.aol.com| 3177|
|www-c4.proxy.aol.com| 3134|
+--------------------+-----+
only showing top 20 rows



In [0]:
#¿A qué horas se produce el mayor número de tráfico en la web?
df_horas = df_web_server.withColumn('hours', split_col.getItem(1))
df_horas.groupBy("hours").count().orderBy("count", acsending=False).show()

+-----+-----+
|hours|count|
+-----+-----+
|   04|26756|
|   05|27587|
|   03|29995|
|   06|31287|
|   02|32508|
|   01|38531|
|   07|47386|
|   00|47862|
|   23|54570|
|   21|57985|
|   19|59315|
|   20|59944|
|   22|60673|
|   08|65443|
|   18|66809|
|   09|78695|
|   17|80834|
|   10|88309|
|   11|95344|
|   16|99527|
+-----+-----+
only showing top 20 rows



In [0]:
#¿Cuál es el número de errores 404 que ha habido cada día?
not_found_df = df_dias.filter(F.col("HTTP status code") == 404).cache()
not_found_df.groupBy("day").count().orderBy("day").show()



+----------+-----+
|       day|count|
+----------+-----+
|01/08/1995|  243|
|03/08/1995|  304|
|04/08/1995|  346|
|05/08/1995|  236|
|06/08/1995|  373|
|07/08/1995|  537|
|08/08/1995|  391|
|09/08/1995|  279|
|10/08/1995|  315|
|11/08/1995|  263|
|12/08/1995|  196|
|13/08/1995|  216|
|14/08/1995|  287|
|15/08/1995|  327|
|16/08/1995|  259|
|17/08/1995|  271|
|18/08/1995|  256|
|19/08/1995|  209|
|20/08/1995|  312|
|21/08/1995|  305|
+----------+-----+
only showing top 20 rows

