### Importing libraries and initializing Spark context

In [1]:
import findspark
findspark.init('/usr/local/spark')
from pyspark.sql import SparkSession
spark = SparkSession.builder.config("spark.executor.memory","25g").config("spark.driver.memory","25g").config("spark.memory.offHeap.enabled","true").config("spark.memory.offHeap.size","32g").getOrCreate()

### Loading .csv files into individual dataframes

In [2]:
%%time
filePath_1gb = "./CSV-Files/nasa_logs_1GB.csv"
df_1gb = spark.read.format('csv').option("header","false").option("inferSchema","true").load(filePath_1gb)

CPU times: user 4.65 ms, sys: 541 µs, total: 5.2 ms
Wall time: 15.5 s


### Displaying total number of loaded records in each dataframe

In [3]:
%%time
df_1gb.count()

CPU times: user 1.69 ms, sys: 195 µs, total: 1.89 ms
Wall time: 2.26 s


13846448

### Renaming column names into meaningful names

In [4]:
df_1gb = df_1gb.withColumnRenamed("_c0","host") \
                .withColumnRenamed("_c1","method") \
                .withColumnRenamed("_c2","endpoint") \
                .withColumnRenamed("_c3","protocol") \
                .withColumnRenamed("_c4","status") \
                .withColumnRenamed("_c5","object_size") \
                .withColumnRenamed("_c6","timestamp")

### Converting dataframes into Parquet files

In [5]:
# df_1gb.write.parquet("nasa_logs_1GB.parquet")

### Loading Parquet files into dataframes to be able to query them

In [6]:
%%time
prqPath_1gb = spark.read.parquet("./Parquet-Files/nasa_logs_1GB.parquet")

CPU times: user 2 ms, sys: 238 µs, total: 2.24 ms
Wall time: 1.18 s


### Creating a view from dataframes to a meaningful name that can be used in the queries

In [7]:
prqPath_1gb.createOrReplaceTempView("http_logs_prq_1gb")

### Query 1: Count the number of records

In [8]:
%%time
query1_1gb = spark.sql("select count(*) AS TOTAL_RECORDS from http_logs_prq_1gb")
query1_1gb.show()

+-------------+
|TOTAL_RECORDS|
+-------------+
|     13846448|
+-------------+

CPU times: user 1.74 ms, sys: 0 ns, total: 1.74 ms
Wall time: 890 ms


### Query 2: 

In [9]:
%%time
query2_1gb = spark.sql("SELECT endpoint, COUNT(*) AS page_view_count FROM http_logs_prq_1gb \
                        GROUP BY endpoint \
                        ORDER BY page_view_count DESC LIMIT 5")
query2_1gb.show()

+--------------------+---------------+
|            endpoint|page_view_count|
+--------------------+---------------+
|/images/NASA-logo...|         834856|
|/images/KSC-logos...|         659880|
|/images/MOSAIC-lo...|         511632|
|/images/USA-logos...|         508296|
|/images/WORLD-log...|         503700|
+--------------------+---------------+

CPU times: user 0 ns, sys: 3.09 ms, total: 3.09 ms
Wall time: 3.34 s


### Query 3:

In [10]:
%%time
query3_1gb = spark.sql("SELECT status, count(status) AS distinct_status FROM http_logs_prq_1gb \
                        WHERE status >= '400' \
                        GROUP BY status \
                        ORDER BY distinct_status DESC")
query3_1gb.show()

+------+---------------+
|status|distinct_status|
+------+---------------+
|   404|          83596|
|   403|            900|
|   500|            260|
|   501|            164|
|   400|             60|
+------+---------------+

CPU times: user 2.69 ms, sys: 0 ns, total: 2.69 ms
Wall time: 1.23 s


### Query 4:

In [11]:
%%time
query4_1gb = spark.sql("SELECT endpoint, count(endpoint) AS count_of_requests \
                        FROM http_logs_prq_1gb WHERE status >= '400' \
                        GROUP BY endpoint \
                        ORDER BY count_of_requests DESC \
                        LIMIT 5")
query4_1gb.show()

+--------------------+-----------------+
|            endpoint|count_of_requests|
+--------------------+-----------------+
|/pub/winvn/readme...|             8016|
|/pub/winvn/releas...|             6928|
|/shuttle/missions...|             2732|
|/shuttle/missions...|             1712|
|/history/apollo/a...|             1536|
+--------------------+-----------------+

CPU times: user 2.58 ms, sys: 0 ns, total: 2.58 ms
Wall time: 1.45 s


### Query 5:

In [12]:
%%time
query5_1gb = spark.sql("SELECT DISTINCT(endpoint), timestamp, ROUND((object_size * 0.000001)) AS SIZE_IN_MB \
                        FROM http_logs_prq_1gb \
                        ORDER BY SIZE_IN_MB DESC \
                        LIMIT 20")
query5_1gb.show()

+--------------------+-------------------+----------+
|            endpoint|          timestamp|SIZE_IN_MB|
+--------------------+-------------------+----------+
|/shuttle/countdow...|1995-07-07 14:03:32|         7|
|/statistics/1995/...| 1995-07-09 9:18:44|         3|
|/statistics/1995/...|1995-08-11 11:16:21|         3|
|/statistics/1995/...|1995-08-07 18:28:57|         3|
|/statistics/1995/...|1995-08-21 14:21:16|         3|
|/mdss/ped/acs/SDP.ps|1995-07-11 17:29:34|         3|
|/statistics/1995/...|1995-08-03 15:51:23|         3|
|/statistics/1995/...|1995-08-21 10:54:33|         3|
|/statistics/1995/...|1995-07-07 10:28:56|         3|
|/statistics/1995/...|1995-07-06 10:19:00|         3|
|/statistics/1995/...| 1995-07-28 9:13:09|         3|
|/statistics/1995/...| 1995-07-05 8:57:07|         3|
|/statistics/1995/...|1995-08-17 10:13:42|         3|
|/statistics/1995/...|1995-07-06 12:11:57|         3|
|/statistics/1995/...|1995-07-05 17:21:54|         3|
|/statistics/1995/...| 1995-