In [1]:
# (1) Read and parse the initial dataset

# The raw data is currently stored in text file. 
# You need to store this raw data in an RDD, with each element of the RDD representing a data point as a comma-delimited string. 
# Each string starts with the label (a year) followed by numerical audio features. 
# Then check how many data points you have and print out a list of the first 5 data points.
# Choose the number of partitions is 2.

In [2]:
from pyspark import SparkContext, SparkConf
conf = SparkConf().setMaster('spark://master:7077').setAppName('Join')
sc = SparkContext(conf=conf)
sc.setLogLevel('ERROR')
sc.version
sc.getConf().getAll()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


23/01/07 13:59:29 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


[('spark.driver.extraJavaOptions',
  '-XX:+IgnoreUnrecognizedVMOptions --add-opens=java.base/java.lang=ALL-UNNAMED --add-opens=java.base/java.lang.invoke=ALL-UNNAMED --add-opens=java.base/java.lang.reflect=ALL-UNNAMED --add-opens=java.base/java.io=ALL-UNNAMED --add-opens=java.base/java.net=ALL-UNNAMED --add-opens=java.base/java.nio=ALL-UNNAMED --add-opens=java.base/java.util=ALL-UNNAMED --add-opens=java.base/java.util.concurrent=ALL-UNNAMED --add-opens=java.base/java.util.concurrent.atomic=ALL-UNNAMED --add-opens=java.base/sun.nio.ch=ALL-UNNAMED --add-opens=java.base/sun.nio.cs=ALL-UNNAMED --add-opens=java.base/sun.security.action=ALL-UNNAMED --add-opens=java.base/sun.util.calendar=ALL-UNNAMED --add-opens=java.security.jgss/sun.security.krb5=ALL-UNNAMED'),
 ('spark.app.name', 'Join'),
 ('spark.master', 'spark://master:7077'),
 ('spark.driver.host', 'master'),
 ('spark.app.startTime', '1673099968903'),
 ('spark.driver.port', '38083'),
 ('spark.executor.id', 'driver'),
 ('spark.app.id', 

In [3]:
# (1a) Firstly, copy the file YearPredictionMSD.txt from remote server to HDFS
# Then, load data and check
import os.path
from itertools import islice
baseDir = os.path.join('data')
inputPath = os.path.join('Windsor_Housing_large.csv')
fileName = os.path.join(baseDir, inputPath)

numPartitions = 8

rawData = sc.textFile(fileName, numPartitions)\
            .mapPartitionsWithIndex(
                lambda index, it: islice(it, 1, None) if index == 0 else it # remove header line
            )\
            .map(lambda x: x.split(','))

In [4]:
# TODO: Replace <FILL IN> with appropriate code
numPoints = rawData.count()
print(numPoints)
samplePoints = rawData.take(5)
print(samplePoints)

                                                                                

2546
[['0', '42000.0', '5850', '3', '1', '2', 'yes', 'no', 'yes', 'no', 'no', '1', 'no'], ['1', '38500.0', '4000', '2', '1', '1', 'yes', 'no', 'no', 'no', 'no', '0', 'no'], ['2', '49500.0', '3060', '3', '1', '1', 'yes', 'no', 'no', 'no', 'no', '0', 'no'], ['3', '60500.0', '6650', '3', '1', '2', 'yes', 'yes', 'no', 'no', 'no', '0', 'no'], ['4', '61000.0', '6360', '2', '1', '1', 'yes', 'no', 'no', 'no', 'no', '0', 'no']]


In [5]:
# load testing library
#from test_helper import assertEquals
# I cannot import test_helper library
def assertEquals(src, target, msg):
    assert src == target, msg

def assertTrue(src, msg):
    assert src, msg

In [6]:
# (1b) Using LabeledPoint
# LabeledPoint is an object of the library pyspark.mllib.regression.
# You need to use it to parse the features and label for the list of data point from the previous question.
# Write a function that takes an input as a raw data point, parses it by using Python's unicode.split method and returns a LabeledPoint. 
# Then print out the features and label for the point. Finally, calculate the number features for this dataset.

In [7]:
from pyspark.mllib.regression import LabeledPoint
import numpy as np

In [8]:
# TODO: Replace <FILL IN> with appropriate code
def parsePoint(line):
    """Converts a comma separated unicode string into a `LabeledPoint`.

    Args:
        line (unicode): Comma separated unicode string where the first element is the label and the remaining elements are features.

    Returns:
        LabeledPoint: The line is converted into a `LabeledPoint`, which consists of a label and features.
    """
    return LabeledPoint(line[1], [1] + [1 if x in ['"yes"', 'yes'] else 0 if x in ['"no"', 'no'] else x for x in line[2:]])

parsedSamplePoints = [parsePoint(point) for point in samplePoints]
firstPointFeatures = parsedSamplePoints[0].features
firstPointLabel = parsedSamplePoints[0].label
print(firstPointFeatures, firstPointLabel)

d = len(firstPointFeatures)
print(d)

[1.0,5850.0,3.0,1.0,2.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0] 42000.0
12


In [9]:
# (1c) Find the range
# Let's examine the labels to find the range of song years. 
# To do this, first parse each element of the rawData RDD, and then find the smallest and largest labels.

In [10]:
# TODO: Replace <FILL IN> with appropriate code
parsedDataInit = rawData.map(lambda x: parsePoint(x))
onlyLabels = parsedDataInit.map(lambda x: x.label)
minPrice = onlyLabels.takeOrdered(1)
maxPrice = onlyLabels.takeOrdered(1, lambda x: -x)
print(minPrice, maxPrice)

                                                                                

[15200.0] [1988100.0]


In [11]:
# (1d) Shift labels 
# The labels are years in the 1900s and 2000s.
# We should shift them such that they start from zero.

In [12]:
# TODO: Replace <FILL IN> with appropriate code
parsedData = parsedDataInit

# Should be a LabeledPoint
print(type(parsedData.take(1)[0]))
# View the first point
print('\n{0}'.format(parsedData.take(1))) 

<class 'pyspark.mllib.regression.LabeledPoint'>

[LabeledPoint(42000.0, [1.0,5850.0,3.0,1.0,2.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0])]


In [13]:
# (1e) Training, validation, and test sets 

In [14]:
# TODO: Replace <FILL IN> with appropriate code
# Firstly, we will split the dataset into training, validation and test sets.
# Use the randomSplit method with the specified weights and seed to create RDDs storing each of these datasets.
weights = [.8, .1, .1]
seed = 42
parsedTrainData, parsedValData, parsedTestData = parsedData.randomSplit(weights, seed)

# Then, we cache each of these RDDs, as we will be accessing them multiple times.
parsedTrainData.cache()
parsedValData.cache()
parsedTestData.cache()

# Finally, compute the size of each dataset and verify the sum of three sizes.
nTrain = parsedTrainData.count()
nVal = parsedValData.count()
nTest = parsedTestData.count()

print(nTrain, nVal, nTest, nTrain + nVal + nTest)
print(parsedData.count())

                                                                                

2013 274 259 2546
2546


In [15]:
# (2b) Root mean squared error 
# We want to know how well the naive baseline performs.
# Therefore, we will use root mean squared error (RMSE) for evaluation purposes.

In [16]:
import math
def squaredError(label, prediction):
    """Calculates the the squared error for a single prediction.

    Args:
        label (float): The correct value for this observation.
        prediction (float): The predicted value for this observation.

    Returns:
        float: The difference between the `label` and `prediction` squared.
    """
    return (label - prediction) ** 2

def calcRMSE(labelsAndPreds):
    """Calculates the root mean squared error for an `RDD` of (label, prediction) tuples.

    Args:
        labelsAndPred (RDD of (float, float)): An `RDD` consisting of (label, prediction) tuples.

    Returns:
        float: The square root of the mean of the squared errors.
    """
    squared_error = labelsAndPreds.map(lambda x: squaredError(x[0], x[1])).sum() / labelsAndPreds.count()
    return math.sqrt(squared_error)

In [17]:
# Example with a small RDD of (label, prediction) tuples
labelsAndPreds = sc.parallelize([(3., 1.), (1., 2.), (2., 2.)])
# RMSE = sqrt[((3-1)^2 + (1-2)^2 + (2-2)^2) / 3] = 1.291
exampleRMSE = calcRMSE(labelsAndPreds)
print(exampleRMSE)

1.2909944487358056


In [18]:
# TEST: Root mean squared error 
assertTrue(np.allclose(squaredError(3, 1), 4.), 'incorrect definition of squaredError')
assertTrue(np.allclose(exampleRMSE, 1.2909944487358056), 'incorrect value for exampleRMSE')

In [19]:
# (3) Train (via gradient descent) and evaluate a linear regression model

# We can do a better prediction via linear regression, by using gradient descent (omit the intercept)

In [20]:
# (3a) Gradient descent

# TODO: Replace <FILL IN> with appropriate code
# First, we implement a function that computes the summand for the update, i.e. (w⊤x−y)x.
def gradientSummand(weights, lp):
    """Calculates the gradient summand for a given weight and `LabeledPoint`.

    Note:
        `DenseVector` behaves similarly to a `numpy.ndarray` and they can be used interchangably
        within this function.  For example, they both implement the `dot` method.

    Args:
        weights (DenseVector): An array of model weights (betas).
        lp (LabeledPoint): The `LabeledPoint` for a single observation.

    Returns:
        DenseVector: An array of values the same length as `weights`.  The gradient summand.
    """
    return (weights.dot(lp.features.toArray()) - lp.label) * lp.features

In [21]:
from pyspark.ml.linalg import DenseVector
# And then, test out this function.
exampleW = DenseVector([1, 1, 1])
exampleLP = LabeledPoint(2.0, [3, 1, 4])
# gradientSummand = (dot([1 1 1], [3 1 4]) - 2) * [3 1 4] = (8 - 2) * [3 1 4] = [18 6 24]
summandOne = gradientSummand(exampleW, exampleLP)
print(summandOne)

exampleW = DenseVector([.24, 1.2, -1.4])
exampleLP = LabeledPoint(3.0, [-1.4, 4.2, 2.1])
summandTwo = gradientSummand(exampleW, exampleLP)
print(summandTwo)

[18.0,6.0,24.0]
[1.7304000000000002,-5.191200000000001,-2.5956000000000006]


In [22]:
# TEST: Gradient summand
assertTrue(np.allclose(summandOne, [18., 6., 24.]), 'incorrect value for summandOne')
assertTrue(np.allclose(summandTwo, [1.7304, -5.1912, -2.5956]), 'incorrect value for summandTwo')

In [23]:
# (3b) Use weights to make predictions

# TODO: Replace <FILL IN> with appropriate code
# We will implement a getLabeledPredictions function that computes the dot product between weights and observation's features.
# The function returns a (label, prediction) tuple.
def getLabeledPrediction(weights, observation):
    """Calculates predictions and returns a (label, prediction) tuple.

    Note:
        The labels should remain unchanged as we'll use this information to calculate prediction
        error later.

    Args:
        weights (np.ndarray): An array with one weight for each features in `trainData`.
        observation (LabeledPoint): A `LabeledPoint` that contain the correct label and the
            features for the data point.

    Returns:
        tuple: A (label, prediction) tuple.
    """
    return (observation.label, observation.features.dot(weights))

In [24]:
# Test out the function with a small example
weights = np.array([1.0, 1.5])
predictionExample = sc.parallelize([LabeledPoint(2, np.array([1.0, .5])),
                                    LabeledPoint(1.5, np.array([.5, .5]))])
labelsAndPredsExample = predictionExample.map(lambda lp: getLabeledPrediction(weights, lp))
print(labelsAndPredsExample.collect())

[(2.0, 1.75), (1.5, 1.25)]


In [25]:
# TEST: Use weights to make predictions 
assertEquals(labelsAndPredsExample.collect(), [(2.0, 1.75), (1.5, 1.25)], 'incorrect definition for getLabeledPredictions')

In [26]:
# (3c) Gradient descent 
# We will implement a gradient descent function for linear regression 
# And then test out this function on an example.

In [27]:
# TODO: Replace <FILL IN> with appropriate code
def linregGradientDescent(trainData, numIters):
    """Calculates the weights and error for a linear regression model trained with gradient descent.

    Note:
        `DenseVector` behaves similarly to a `numpy.ndarray` and they can be used interchangably
        within this function.  For example, they both implement the `dot` method.

    Args:
        trainData (RDD of LabeledPoint): The labeled data for use in training the model.
        numIters (int): The number of iterations of gradient descent to perform.

    Returns:
        (np.ndarray, np.ndarray): A tuple of (weights, training errors).  Weights will be the
            final weights (one weight per feature) for the model, and training errors will contain
            an error (RMSE) for each iteration of the algorithm.
    """
    # The length of the training data
    n = trainData.count()
    # The number of features in the training data
    d = len(trainData.take(1)[0].features)
    w = np.zeros(d)
    alpha = 0.00001
    # We will compute and store the training error after each iteration
    errorTrain = np.zeros(numIters)
    for i in range(numIters):
        # Use getLabeledPrediction from (3b) with trainData to obtain an RDD of (label, prediction) tuples. 
        # Note that the weights all equal 0 for the first iteration, so the predictions will have large errors to start.
        labelsAndPredsTrain = trainData.map(lambda x: getLabeledPrediction(w, x))
        errorTrain[i] = calcRMSE(labelsAndPredsTrain)

        # Calculate the `gradient`.  Make use of the `gradientSummand` function you wrote in (3a).
        # Note that `gradient` sould be a `DenseVector` of length `d`.
        gradient = trainData.map(lambda x: gradientSummand(w, x)).sum() / n
        # Update the weights
        alpha_i = alpha / (n * np.sqrt(i+1))
#         print(labelsAndPredsTrain.take(1))
#         print(gradient, alpha_i)
        w -= alpha_i * gradient
#         print(w)
    return w, errorTrain

In [28]:
# Create a toy dataset with n = 10, d = 3, and then run 5 iterations of gradient descent.
# The resulting model will not be useful.
# The goal here is to verify that linregGradientDescent is working properly.
exampleN = 10
exampleD = 3
exampleData = (sc
               .parallelize(parsedTrainData.take(exampleN))
               .map(lambda lp: LabeledPoint(lp.label, lp.features[0:exampleD])))
print(exampleData.take(2))
exampleNumIters = 5
exampleWeights, exampleErrorTrain = linregGradientDescent(exampleData, exampleNumIters)
print(exampleWeights)
print(exampleErrorTrain)

[LabeledPoint(42000.0, [1.0,5850.0,3.0]), LabeledPoint(38500.0, [1.0,4000.0,2.0])]


                                                                                

[2.06533317e+03 1.10767552e+07 5.76909680e+03]
[6.45547752e+04 1.60214545e+06 2.88484419e+07 4.18861965e+08
 5.21072415e+09]


In [29]:
# (3d) Train the model
# Let's train a linear regression model on all of our training data.
# Then evaluate its accuracy on the validation set.

In [30]:
# TODO: Replace <FILL IN> with appropriate code
numIters = 50
weightsLR0, errorTrainLR0 = linregGradientDescent(parsedData, numIters)

labelsAndPreds = parsedValData.map(lambda x: getLabeledPrediction(weightsLR0, x))
rmseValLR0 = calcRMSE(labelsAndPreds)

print('Validation RMSE:\n\tBaseline = {0:.3f}\n\tLR0 = {1:.3f}'.format(0, rmseValLR0)) 

                                                                                

Validation RMSE:
	Baseline = 0.000
	LR0 = 193716.707
