In [2]:
# Import relevant packages
import findspark
findspark.init()
from pyspark import SparkContext, SparkConf


In [3]:
# Configure a local Spark application named 'unionLogs'.
# The "local[*]" master runs Spark locally with as many worker threads as there are logical cores.
conf = SparkConf().setAppName('unionLogs').setMaster("local[*]")
# getOrCreate() reuses an already-running context, so re-running this cell in a live
# kernel does not fail with "Cannot run multiple SparkContexts at once".
sc = SparkContext.getOrCreate(conf=conf)

In [4]:
# Base URI of the folder holding the NASA access-log TSV files
filePath = 'file:///E:/Eskills-Academy-projects/python-spark-tutorial-master/in/'

# One log file per day of interest
julyLogFileName = "nasa_19950701.tsv"
augustLogFileName = "nasa_19950801.tsv"

# Full URI of each file (base folder + file name)
julyFirstLogFile = filePath + julyLogFileName
augustFirstLogFile = filePath + augustLogFileName

# Read each file as an RDD of text lines
julyFirstLogs = sc.textFile(julyFirstLogFile)
augustFirstLogs = sc.textFile(augustFirstLogFile)

In [4]:
# Merge the July and August log RDDs into a single RDD of lines
aggregatedLogLines = (
    julyFirstLogs
    .union(augustFirstLogs)
)

# Filter out header lines, the input files have header starting with "host" and ending with "bytes"
def isNotHeader(line: str) -> bool:
    """Return True for a log record and False for a file header line.

    A header line is one that starts with "host" and ends with "bytes"
    (trailing whitespace such as a stray carriage return is ignored).

    Checking the end of the line, rather than "bytes" anywhere in it, keeps
    real records whose host begins with "host" and whose URL merely contains
    the word "bytes".
    """
    stripped = line.rstrip()
    return not (stripped.startswith("host") and stripped.endswith("bytes"))

# Keep only the lines accepted by isNotHeader, i.e. drop the header rows
cleanLogLines = (
    aggregatedLogLines
    .filter(isNotHeader)
)

In [5]:
# Create and save a sample of roughly 10% of the data to an output folder.
# Sampling is done WITHOUT replacement so no log line can appear more than once
# in the sample, and with a fixed seed so re-running the notebook gives the same
# sample. `fraction` is a per-line selection probability, so the sample size is
# approximately (not exactly) 10% of the input.
sampleSeed = 42
sample = cleanLogLines.sample(withReplacement=False, fraction=0.1, seed=sampleSeed)
outSampleFolder = 'sampleNasaLogLines.csv'
# NOTE: saveAsTextFile writes a directory of part files and fails if the
# directory already exists; remove it before re-running this cell.
sample.saveAsTextFile(outSampleFolder)

In [5]:
# Hosts that appear in the logs of BOTH July 1st and August 1st
# The host is the first tab-separated field of each log line.
julyFirstHosts = julyFirstLogs.map(lambda line: line.split("\t")[0])
augustFirstHosts = augustFirstLogs.map(lambda line: line.split("\t")[0])

# intersection() keeps only hosts present in both RDDs (and removes duplicates);
# union() would instead return every host seen on either day.
# Both files carry a header row whose first field is 'host', so that value
# survives the intersection and has to be filtered out explicitly.
sameHosts = julyFirstHosts.intersection(augustFirstHosts).filter(lambda host: host != "host")

# Save to folder (saveAsTextFile fails if the folder already exists)
outFolder = "nasaSameHosts.csv"
sameHosts.saveAsTextFile(outFolder)