In [0]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.types import *
import datetime

spark = (SparkSession.builder.appName("NASA logs").getOrCreate())

In [0]:
#Cargamos los datos
data_logs = spark.read.format("csv").option("header", "false").load("dbfs:/FileStore/shared_uploads/paula.roman@bosonit.com/access_log_Aug95")

data_logs.printSchema()

data_logs.show(truncate = False)

root
 |-- _c0: string (nullable = true)

+-------------------------------------------------------------------------------------------------------------------------------+
|_c0                                                                                                                            |
+-------------------------------------------------------------------------------------------------------------------------------+
|in24.inetnebr.com - - [01/Aug/1995:00:00:01 -0400] "GET /shuttle/missions/sts-68/news/sts-68-mcc-05.txt HTTP/1.0" 200 1839     |
|uplherc.upl.com - - [01/Aug/1995:00:00:07 -0400] "GET / HTTP/1.0" 304 0                                                        |
|uplherc.upl.com - - [01/Aug/1995:00:00:08 -0400] "GET /images/ksclogo-medium.gif HTTP/1.0" 304 0                               |
|uplherc.upl.com - - [01/Aug/1995:00:00:08 -0400] "GET /images/MOSAIC-logosmall.gif HTTP/1.0" 304 0                             |
|uplherc.upl.com - - [01/Aug/1995:00:00:08 -0400]

In [0]:
clean_logs = data_logs.select(F.regexp_extract("_c0",'(.*?)(\s+)',1).alias("host"), F.regexp_extract("_c0",'\[(.*?)(\s+)(.*?)\](\s+)',1).alias("date"),\
                             F.regexp_extract("_c0", '\"(.*?)(\s+)(.*?)(\s+)(.*?)\"',1).alias("request_method"), \
                             F.regexp_extract("_c0", '\"(.*?)(\s+)(.*?)(\s+)(.*?)\"',3).alias("resource"),\
                             F.regexp_extract("_c0", '\"(.*?)(\s+)(.*?)(\s+)(.*?)\"',5).alias("protocol"),\
                             F.regexp_extract("_c0", '(\s+)(\d+)',2).alias("http_status"), F.regexp_extract("_c0", '(\s+)(\d+)$',2).alias("size").cast("int"))

clean_logs.show(truncate=False)

clean_logs.printSchema()

+---------------------------+--------------------+--------------+---------------------------------------------------+--------+-----------+-----+
|host                       |date                |request_method|resource                                           |protocol|http_status|size |
+---------------------------+--------------------+--------------+---------------------------------------------------+--------+-----------+-----+
|in24.inetnebr.com          |01/Aug/1995:00:00:01|GET           |/shuttle/missions/sts-68/news/sts-68-mcc-05.txt    |HTTP/1.0|200        |1839 |
|uplherc.upl.com            |01/Aug/1995:00:00:07|GET           |/                                                  |HTTP/1.0|304        |0    |
|uplherc.upl.com            |01/Aug/1995:00:00:08|GET           |/images/ksclogo-medium.gif                         |HTTP/1.0|304        |0    |
|uplherc.upl.com            |01/Aug/1995:00:00:08|GET           |/images/MOSAIC-logosmall.gif                       |HTTP/1.0|304 

In [0]:
#¿Cuales son los distintos protocolos web?
nasa_protocols = clean_logs.select(F.col("protocol")).distinct()
nasa_protocols.show(truncate=False)

+---------------------------------------------------------------+
|protocol                                                       |
+---------------------------------------------------------------+
|Shield HTTP/1.0                                                |
|home.html HTTP/1.0                                             |
|history/apollo/apollo-13/apollo-13.html HTTP/1.0               |
|pictures HTTP/1.0                                              |
|                                                               |
|HTTP/V1.0                                                      |
|HTTP/1.0                                                       |
|images/ssbuv1.gif SRC=                                         |
|/   HTTP/1.0                                                   |
|apollo-1 HTTP/1.0                                              |
|HTML/1.0 headers                                               |
|Imaging Radar-C HTTP/1.0                                       |
|history/a

In [0]:
#Cuáles son los códigos de estado más comunes en la web? Agrúpalos y ordénalos para ver cuál es el más común.
nasa_status = clean_logs.groupBy("http_status").count().orderBy("count", ascending =False)

nasa_status.show(truncate=False)

+-----------+-------+
|http_status|count  |
+-----------+-------+
|200        |1398915|
|304        |134146 |
|           |22744  |
|404        |10041  |
|302        |3836   |
|403        |171    |
|501        |27     |
|400        |8      |
|500        |3      |
|13         |3      |
|40         |2      |
|2          |2      |
+-----------+-------+



In [0]:
#¿Y los métodos de petición (verbos) más utilizados?
nasa_requests = clean_logs.groupBy("request_method").count().orderBy("count", ascending = False)
nasa_requests.show(truncate = False)

+--------------+-------+
|request_method|count  |
+--------------+-------+
|GET           |1541307|
|              |24517  |
|HEAD          |3961   |
|POST          |111    |
|���.�         |2      |
+--------------+-------+



In [0]:
#¿Qué recurso tuvo la mayor transferencia de bytes de la página web? Además, queremos saber que recurso de nuestra web es el que más tráfico recibe. Es decir, el recurso con más registros en nuestro log.
nasa_size = clean_logs.select("resource", "size").distinct().orderBy("size", ascending = False)
nasa_size.show(100, truncate = False)

+------------------------------------------------------------+-------+
|resource                                                    |size   |
+------------------------------------------------------------+-------+
|/statistics/1995/Jul/Jul95_reverse_domains.html             |3421948|
|/statistics/1995/bkup/Mar95_full.html                       |3155499|
|/statistics/1995/bkup/Mar95_full.html                       |2981888|
|/statistics/1995/bkup/Mar95_full.html                       |2220032|
|/statistics/1995/Jul/Jul95_reverse_domains.html             |2015232|
|/statistics/1995/bkup/Apr95_full.html                       |1969293|
|/statistics/1995/Jul/Jul95_reverse_domains.html             |1943014|
|/statistics/1995/bkup/Mar95_full.html                       |1925120|
|/statistics/1995/bkup/Apr95_full.html                       |1769472|
|/statistics/1995/bkup/Feb95_full.html                       |1767078|
|/statistics/1995/bkup/Apr95_full.html                       |1613824|
|/stat

In [0]:
#¿Qué días la web recibió más tráfico?
nasa_days = clean_logs.groupBy("date").count().orderBy("count", ascending = False)
nasa_days.show(200, truncate = False)

+--------------------+-----+
|date                |count|
+--------------------+-----+
|                    |36   |
|29/Aug/1995:11:11:47|18   |
|01/Aug/1995:08:18:28|17   |
|15/Aug/1995:14:56:55|16   |
|15/Aug/1995:15:00:36|16   |
|10/Aug/1995:19:20:46|15   |
|30/Aug/1995:11:18:22|15   |
|10/Aug/1995:11:14:15|14   |
|03/Aug/1995:15:07:53|14   |
|30/Aug/1995:11:02:35|14   |
|11/Aug/1995:21:02:07|14   |
|28/Aug/1995:22:21:46|14   |
|04/Aug/1995:11:58:20|14   |
|22/Aug/1995:13:33:52|14   |
|21/Aug/1995:11:09:45|13   |
|22/Aug/1995:14:27:08|13   |
|31/Aug/1995:08:41:36|13   |
|29/Aug/1995:15:38:05|13   |
|03/Aug/1995:18:15:40|13   |
|14/Aug/1995:09:51:31|13   |
|03/Aug/1995:13:58:47|13   |
|10/Aug/1995:19:20:45|13   |
|01/Aug/1995:14:14:25|12   |
|31/Aug/1995:18:10:40|12   |
|30/Aug/1995:16:24:02|12   |
|15/Aug/1995:13:14:29|12   |
|09/Aug/1995:15:06:52|12   |
|08/Aug/1995:15:52:31|12   |
|10/Aug/1995:12:52:18|12   |
|28/Aug/1995:18:55:38|12   |
|15/Aug/1995:12:05:06|12   |
|18/Aug/1995:1

In [0]:
#¿Cuáles son los hosts son los más frecuentes?
nasa_hosts = clean_logs.select("host").groupBy("host").count().orderBy("count", ascending = False)
nasa_hosts.show(100, truncate = False)

+----------------------------------+-----+
|host                              |count|
+----------------------------------+-----+
|edams.ksc.nasa.gov                |6530 |
|piweba4y.prodigy.com              |4846 |
|163.206.89.4                      |4791 |
|piweba5y.prodigy.com              |4607 |
|piweba3y.prodigy.com              |4416 |
|www-d1.proxy.aol.com              |3889 |
|www-b2.proxy.aol.com              |3534 |
|www-b3.proxy.aol.com              |3463 |
|www-c5.proxy.aol.com              |3423 |
|www-b5.proxy.aol.com              |3411 |
|www-c2.proxy.aol.com              |3407 |
|www-d2.proxy.aol.com              |3404 |
|www-a2.proxy.aol.com              |3337 |
|news.ti.com                       |3298 |
|www-d3.proxy.aol.com              |3296 |
|www-b4.proxy.aol.com              |3293 |
|www-c3.proxy.aol.com              |3272 |
|www-d4.proxy.aol.com              |3234 |
|www-c1.proxy.aol.com              |3177 |
|www-c4.proxy.aol.com              |3134 |
|intgate.ra

In [0]:
#¿A qué horas se produce el mayor número de tráfico en la web?
split_col = F.split(clean_logs['date'], ':')
nasa_hours = clean_logs.withColumn('hours', split_col.getItem(1))
nasa_hours = nasa_hours.groupBy("hours").count().orderBy("count", ascending = False)
nasa_hours.show(truncate=False)

+-----+------+
|hours|count |
+-----+------+
|15   |109441|
|12   |105143|
|13   |104536|
|14   |101394|
|16   |99527 |
|11   |95344 |
|10   |88309 |
|17   |80834 |
|09   |78695 |
|18   |66809 |
|08   |65443 |
|22   |60673 |
|20   |59932 |
|19   |59315 |
|21   |57985 |
|23   |54570 |
|00   |47862 |
|07   |47386 |
|01   |38531 |
|02   |32508 |
+-----+------+
only showing top 20 rows



In [0]:
#¿Cuál es el número de errores 404 que ha habido cada día?
split_column = F.split(clean_logs["date"], "/")
nasa_errores = clean_logs.withColumn("dias", split_column.getItem(0))
nasa_errores = nasa_errores.select("dias", "http_status").filter("http_status == 404").groupBy("dias").count().orderBy("dias")
nasa_errores.show()

+----+-----+
|dias|count|
+----+-----+
|  01|  243|
|  03|  304|
|  04|  346|
|  05|  235|
|  06|  371|
|  07|  537|
|  08|  390|
|  09|  279|
|  10|  315|
|  11|  263|
|  12|  196|
|  13|  216|
|  14|  287|
|  15|  327|
|  16|  258|
|  17|  270|
|  18|  256|
|  19|  209|
|  20|  311|
|  21|  305|
+----+-----+
only showing top 20 rows

