### Importing libraries and initializing the Spark session

In [None]:
import findspark

# findspark.init must run before the pyspark import below; it is given the
# Spark installation directory used on this machine.
findspark.init("/usr/local/spark")

from pyspark.sql import SparkSession

# Build (or reuse) the session with explicit executor, driver and off-heap
# memory settings.
spark = (
    SparkSession.builder
    .config("spark.executor.memory", "25g")
    .config("spark.driver.memory", "25g")
    .config("spark.memory.offHeap.enabled", "true")
    .config("spark.memory.offHeap.size", "32g")
    .getOrCreate()
)

### Loading the .csv file into a dataframe

In [None]:
%%time
filePath_15gb = "../CSV-Files/nasa_logs_15GB.csv"
df_15gb = spark.read.format('csv').option("header","false").option("inferSchema","true").load(filePath_15gb)

### Displaying the total number of records loaded into the dataframe

In [None]:
%%time
# count() is an action: it makes Spark process the CSV data and returns the
# number of rows; as the last expression of the cell its value is displayed.
df_15gb.count()

### Renaming columns to meaningful names

In [None]:
# Spark's default positional column names mapped to descriptive ones, in the
# order the renames are applied.
_column_names = {
    "_c0": "host",
    "_c1": "method",
    "_c2": "endpoint",
    "_c3": "protocol",
    "_c4": "status",
    "_c5": "object_size",
    "_c6": "timestamp",
}
for _default_name, _descriptive_name in _column_names.items():
    # withColumnRenamed returns a new DataFrame, so rebind on every step.
    df_15gb = df_15gb.withColumnRenamed(_default_name, _descriptive_name)

### Converting the dataframe into an ORC file

In [None]:
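# Left commented out - presumably a one-time conversion whose output already
# exists (TODO confirm). Uncomment to produce the ORC data read by the next cell.
# df_15gb.write.orc("nasa_logs_15GB.orc")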

### Loading the ORC file into a dataframe so that it can be queried

In [None]:
%%time
orcPath_15gb = spark.read.orc("./nasa_logs_15GB.orc")

### Creating a temporary view of the dataframe under a meaningful name that the queries can reference

In [None]:
# Register the ORC-backed DataFrame as a temporary view so the SQL queries can
# refer to it as http_logs_orc_15gb; an existing view of that name is replaced.
orcPath_15gb.createOrReplaceTempView("http_logs_orc_15gb")

### Query 1: Count the number of records

In [None]:
%%time
# Total number of rows in the http_logs_orc_15gb view, reported as a single
# TOTAL_RECORDS value.
query1_15gb = spark.sql("select count(*) AS TOTAL_RECORDS from http_logs_orc_15gb")
query1_15gb.show()

### Query 2: Top 5 endpoints by number of requests

In [None]:
%%time
# Five endpoints with the most rows in the view, most frequent first.
# The trailing backslashes continue the string literal itself, so the leading
# spaces of each continuation line are part of the SQL text; keep the space
# before every backslash so keywords stay separated.
query2_15gb = spark.sql("SELECT endpoint, COUNT(*) AS page_view_count FROM http_logs_orc_15gb \
                        GROUP BY endpoint \
                        ORDER BY page_view_count DESC LIMIT 5")
query2_15gb.show()

### Query 3: Number of requests per status code for statuses >= 400

In [None]:
%%time
# Row count per status value for rows with status >= '400', most frequent first.
# NOTE(review): the alias distinct_status holds count(status) for each group,
# i.e. a per-status row count, not a number of distinct statuses.
# NOTE(review): status is compared with the string literal '400'; whether that
# is a numeric or a lexicographic comparison depends on the column type stored
# in the ORC file - confirm.
query3_15gb = spark.sql("SELECT status, count(status) AS distinct_status FROM http_logs_orc_15gb \
                        WHERE status >= '400' \
                        GROUP BY status \
                        ORDER BY distinct_status DESC")
query3_15gb.show()

### Query 4: Top 5 endpoints by number of requests with status >= 400

In [None]:
%%time
# Five endpoints with the most rows whose status is >= '400', most frequent
# first. count(endpoint) counts only rows where endpoint is not NULL.
# NOTE(review): status is compared with the string literal '400'; whether that
# is a numeric or a lexicographic comparison depends on the column type stored
# in the ORC file - confirm.
query4_15gb = spark.sql("SELECT endpoint, count(endpoint) AS count_of_requests \
                        FROM http_logs_orc_15gb WHERE status >= '400' \
                        GROUP BY endpoint \
                        ORDER BY count_of_requests DESC \
                        LIMIT 5")
query4_15gb.show()

### Query 5: Top 20 distinct (endpoint, timestamp, size in MB) rows by size

In [None]:
%%time
# Twenty distinct (endpoint, timestamp, SIZE_IN_MB) rows with the largest
# SIZE_IN_MB, where SIZE_IN_MB is object_size * 0.000001 rounded.
# NOTE(review): DISTINCT applies to the whole select list, not to endpoint
# alone; the parentheses around endpoint do not change that.
# NOTE(review): ROUND is called without a scale, so SIZE_IN_MB is a whole
# number and rows with equal rounded sizes come back in no defined order.
# assumes object_size is a byte count (hence the 0.000001 factor) - TODO confirm
query5_15gb = spark.sql("SELECT DISTINCT(endpoint), timestamp, ROUND((object_size * 0.000001)) AS SIZE_IN_MB \
                        FROM http_logs_orc_15gb \
                        ORDER BY SIZE_IN_MB DESC \
                        LIMIT 20")
query5_15gb.show()