<a href="https://colab.research.google.com/github/owgee/big-data/blob/main/NASA_Log_Analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### NASA Log Analysis


In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col

# Create (or reuse) the SparkSession used by every cell in this notebook
spark = SparkSession.builder.appName("NASA_Log_Analysis").getOrCreate()

# Location of the web-log CSV.
# NOTE(review): the path carries no hdfs:// scheme, so it is presumably resolved
# against the session's default filesystem -- confirm that this really is HDFS.
file_path = "/hdfs/data.csv"

# Column layout of the file, declared up front instead of using inferSchema=True
# so Spark does not need an extra pass over the data just to guess the types.
# The first header cell of the file is empty, so that column is declared under
# the name Spark gives it by default (_c0).
weblog_schema = (
    "_c0 INT, host STRING, time INT, method STRING, "
    "url STRING, response INT, bytes INT"
)
weblog_df = spark.read.csv(file_path, header=True, schema=weblog_schema)

24/04/02 00:59:44 WARN Utils: Your hostname, OGs-M2-Pro.local resolves to a loopback address: 127.0.0.1; using 192.168.4.6 instead (on interface en0)
24/04/02 00:59:44 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/04/02 00:59:45 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
                                                                                

In [None]:
# Print the column names/types, then preview the first five rows
weblog_df.printSchema()
weblog_df.show(n=5)

root
 |-- _c0: integer (nullable = true)
 |-- host: string (nullable = true)
 |-- time: integer (nullable = true)
 |-- method: string (nullable = true)
 |-- url: string (nullable = true)
 |-- response: integer (nullable = true)
 |-- bytes: integer (nullable = true)

+---+-----------+---------+------+--------------------+--------+-----+
|_c0|       host|     time|method|                 url|response|bytes|
+---+-----------+---------+------+--------------------+--------+-----+
|  0|***.novo.dk|805465029|   GET|           /ksc.html|     200| 7067|
|  1|***.novo.dk|805465031|   GET|/images/ksclogo-m...|     200| 5866|
|  2|***.novo.dk|805465051|   GET|/images/MOSAIC-lo...|     200|  363|
|  3|***.novo.dk|805465053|   GET|/images/USA-logos...|     200|  234|
|  4|***.novo.dk|805465054|   GET|/images/NASA-logo...|     200|  786|
+---+-----------+---------+------+--------------------+--------+-----+
only showing top 5 rows



24/04/02 01:02:19 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , host, time, method, url, response, bytes
 Schema: _c0, host, time, method, url, response, bytes
Expected: _c0 but found: 
CSV file: file:///Users/owdengodson/Documents/data.csv


#### Top-5 hosts accessing the most files

In [None]:
# Count the log rows recorded for each host and keep the five largest counts
top_hosts_files = (
    weblog_df
    .groupBy("host")
    .count()
    .sort("count", ascending=False)
    .limit(5)
)
top_hosts_files.show(truncate=False)

[Stage 5:>                                                          (0 + 3) / 3]

+--------------------+-----+
|host                |count|
+--------------------+-----+
|piweba3y.prodigy.com|19258|
|piweba4y.prodigy.com|14512|
|edams.ksc.nasa.gov  |11472|
|piweba1y.prodigy.com|10934|
|163.206.89.4        |7859 |
+--------------------+-----+



                                                                                

#### Top-5 hosts accessing the largest amount of data

In [None]:
# Total the bytes column per host; the aggregate column is named "sum(bytes)"
bytes_per_host = weblog_df.groupBy("host").sum("bytes")

# Rename the aggregate to something readable and keep the five largest totals
top_hosts_data = (
    bytes_per_host
    .withColumnRenamed("sum(bytes)", "total_bytes")
    .sort(col("total_bytes").desc())
    .limit(5)
)
top_hosts_data.show(truncate=False)

[Stage 7:>                                                        (0 + 10) / 10]

+--------------------+-----------+
|host                |total_bytes|
+--------------------+-----------+
|piweba3y.prodigy.com|402596858  |
|piweba4y.prodigy.com|247448379  |
|piweba1y.prodigy.com|236973233  |
|alyssa.prodigy.com  |177885291  |
|news.ti.com         |157004829  |
+--------------------+-----------+



                                                                                

#### Top-5 files that are most frequently accessed

In [None]:
# Count the log rows that reference each URL
hits_per_url = weblog_df.groupBy("url").count()

# Keep the five URLs with the highest counts
top_files_accessed = hits_per_url.sort(col("count").desc()).limit(5)
top_files_accessed.show(truncate=False)



+----------------------------+------+
|url                         |count |
+----------------------------+------+
|/images/NASA-logosmall.gif  |208798|
|/images/KSC-logosmall.gif   |164976|
|/images/MOSAIC-logosmall.gif|127916|
|/images/USA-logosmall.gif   |127082|
|/images/WORLD-logosmall.gif |125933|
+----------------------------+------+



                                                                                

#### Top-5 files contributing to the most internet traffic

In [None]:
# Total the bytes column for every URL
bytes_per_url = weblog_df.groupBy("url").agg({"bytes": "sum"})

# Rename the aggregate to something readable and keep the five largest totals
top_files_traffic = (
    bytes_per_url
    .withColumnRenamed("sum(bytes)", "total_bytes")
    .sort("total_bytes", ascending=False)
    .limit(5)
)
top_files_traffic.show(truncate=False)

# Shut down the SparkSession now that this result has been displayed
spark.stop()

                                                                                

+------------------------------------------------------------+-----------+
|url                                                         |total_bytes|
+------------------------------------------------------------+-----------+
|/shuttle/missions/sts-71/movies/sts-71-launch.mpg           |3196457118 |
|/shuttle/missions/sts-71/movies/sts-71-mir-dock.mpg         |1409035595 |
|/shuttle/missions/sts-71/movies/sts-71-tcdt-crew-walkout.mpg|1137114616 |
|/shuttle/missions/sts-70/movies/sts-70-launch.mpg           |1098853893 |
|/shuttle/technology/sts-newsref/stsref-toc.html             |1061408730 |
+------------------------------------------------------------+-----------+

