In [None]:
# Import relevant packages
import findspark
findspark.init()
from pyspark import SparkConf, SparkContext
from Utils import Utils


In [None]:
# Spark configuration: application name "StackOverflowSurvey", master "local[*]".
configuration = SparkConf().setAppName("StackOverflowSurvey").setMaster("local[*]")
# getOrCreate() returns the already-running SparkContext when one exists, so
# re-running this cell in a live kernel no longer fails with
# "Cannot run multiple SparkContexts at once". On a fresh kernel it creates
# the context from `configuration`, exactly as SparkContext(conf=...) did.
sc = SparkContext.getOrCreate(conf=configuration)


In [None]:
# Initialize the accumulators; each one starts at 0.

# Counts the total number of records.
total = sc.accumulator(0)

# Counts the records with a missing salary midpoint.
missingSalaryMidPoint = sc.accumulator(0)

# Sums the UTF-8 encoded size, in bytes, of the records.
processedBytes = sc.accumulator(0)

In [None]:
# Directory and file name of the survey responses CSV.
# NOTE(review): this is a hard-coded absolute local path; the notebook only
# runs on a machine that has this exact directory.
filePath = "e:\\Eskills-Academy-projects\\python-spark-tutorial-master\\in\\"
fileName = '2016-stack-overflow-survey-responses.csv'

# Load the CSV as an RDD of text lines.
responseRDD = sc.textFile(
    filePath + fileName
)

In [None]:
def filterResponseFromCanada(response):
    """Tell whether a survey line has "Canada" in the field at index 2.

    Side effects on every call:
      * adds the UTF-8 byte length of ``response`` to ``processedBytes``;
      * adds 1 to ``total``;
      * adds 1 to ``missingSalaryMidPoint`` when the field at index 14 is empty.

    NOTE: these accumulator updates happen inside a transformation, so they
    only take effect once an action runs on the filtered RDD, and they are
    applied again each time the RDD is recomputed.
    """
    encodedLength = len(response.encode('utf-8'))
    processedBytes.add(encodedLength)

    fields = Utils.COMMA_DELIMITER.split(response)
    total.add(1)

    salaryMidPoint = fields[14]
    if not salaryMidPoint:
        missingSalaryMidPoint.add(1)

    country = fields[2]
    return country == "Canada"

# Lazy transformation: nothing is read or counted until an action is called.
responseFromCanada = responseRDD.filter(filterResponseFromCanada)

In [None]:
# count() is the action that actually executes the filter, so it has to run
# before the accumulator values below are read.
canadaResponseCount = responseFromCanada.count()
print("Count of responses from Canada:", canadaResponseCount)

# Accumulator values as populated by the job that count() just ran.
print("Total count of responses:", total.value)
print("Count of responses with missing salary midpoint:", missingSalaryMidPoint.value)
print("Number of bytes processes:", processedBytes.value)


In [None]:
# Broadcast variables
# Question: how are the maker spaces distributed across the different regions of the UK?
#   1. Load the postcode dataset and broadcast it across the cluster.
#   2. Load the maker space dataset and call a map operation on the maker space RDD
#      to look up each maker space's region from its postcode.
# NOTE(review): hard-coded absolute local path, same directory as the survey data.
filePath = "e:\\Eskills-Academy-projects\\python-spark-tutorial-master\\in\\"
fileName = 'uk-makerspaces-identifiable-data.csv'

# Load the maker space CSV as an RDD of text lines.
makerSpaceRDD = sc.textFile(
    filePath + fileName
)


In [None]:
fileName2 = "uk-postcode.csv"

def loadPostCodeMap():
    """Build the postcode-prefix -> region lookup from the postcode CSV.

    Reads the file at ``filePath + fileName2``, drops empty lines, splits each
    remaining line with ``Utils.COMMA_DELIMITER`` and maps field 0 (postcode
    prefix) to field 7 (region). No row is skipped, so a header row, if the
    file has one, ends up in the dictionary like any other row.

    Returns:
        dict: postcode prefix as key, region as value.
    """
    # The context manager closes the file as soon as it has been read; the
    # previous bare open(...).read() never closed the handle explicitly.
    with open(filePath + fileName2, "r") as postCodeFile:
        lines = postCodeFile.read().split("\n")
    splitsForLines = [Utils.COMMA_DELIMITER.split(line) for line in lines if line != ""]
    return {splits[0]: splits[7] for splits in splitsForLines}

def getPostPrefix(line: str):
    """Extract the postcode prefix from a maker space CSV line.

    The postcode is read from the field at index 4. Returns None when that
    field is empty; otherwise returns the part of it before the first space.
    """
    fields = Utils.COMMA_DELIMITER.split(line)
    postCode = fields[4]
    if not postCode:
        return None
    prefix = postCode.split(" ")[0]
    return prefix



In [None]:
# Broadcast the postcode-prefix -> region dictionary as a read-only
# broadcast variable for the tasks to look up.
postCodeMap = sc.broadcast(loadPostCodeMap())

# Pipeline:
#   1. drop the header row (first field is "Timestamp");
#   2. compute the postcode prefix once per line (previously each line was
#      re-parsed by getPostPrefix up to three times);
#   3. drop lines without a postcode;
#   4. translate the prefix to its region, falling back to "Unknown" when
#      the prefix is not in the broadcast map.
regions = (
    makerSpaceRDD
    .filter(lambda line: Utils.COMMA_DELIMITER.split(line)[0] != "Timestamp")
    .map(getPostPrefix)
    .filter(lambda prefix: prefix is not None)
    .map(lambda prefix: postCodeMap.value.get(prefix, "Unknown"))
)

In [None]:
# countByValue() is an action: it returns to the driver a dictionary mapping
# each region to the number of maker spaces found in it.
regionCounts = regions.countByValue()
for region, count in regionCounts.items():
    summaryLine = "{}:{}".format(region, count)
    print(summaryLine)