# DATASCI W261: Machine Learning at Scale
## Assignment Week 13
Miki Seltzer (miki.seltzer@berkeley.edu)<br>
W261-2, Spring 2016<br>
Submission: 

In [1]:
# If running locally, start PySpark
import os
import sys
spark_home = os.environ.get('SPARK_HOME', None)

if not spark_home:
    raise ValueError('SPARK_HOME enviroment variable is not set')
sys.path.insert(0,os.path.join(spark_home,'python'))
sys.path.insert(0,os.path.join(spark_home,'python/lib/py4j-0.8.2.1-src.zip'))
execfile(os.path.join(spark_home,'python/pyspark/shell.py'))

Welcome to
      ____              __
     / __/__  ___ _____/ /__
    _\ \/ _ \/ _ `/ __/  '_/
   /__ / .__/\_,_/_/ /_/\_\   version 1.5.0-cdh5.5.0
      /_/

Using Python version 2.7.11 (default, Dec  6 2015 18:08:32)
SparkContext available as sc, HiveContext available as sqlContext.


# HW 13.4: Criteo Phase 2 Baseline
Using the training dataset, validation dataset and testing dataset in the Criteo bucket perform the following experiment:

- Write spark code (borrow from Phase 1 of this project) to train a logistic regression model with the following hyperparamters:
  - Number of buckets for hashing: 1,000
  - Logistic Regression: no regularization term
  - Logistic Regression: step size = 10

In [14]:
rawTrainData = (sc
              .textFile('s3://criteo-dataset/rawdata/train')
              .map(lambda x: x.replace('\t', ','))
              .cache()
             )
rawValidationData = (sc
                     .textFile('s3://criteo-dataset/rawdata/validation')
                     .map(lambda x: x.replace('\t', ','))
                     .cache()
                    )
rawTestData = (sc
               .textFile('s3://criteo-dataset/rawdata/test')
               .map(lambda x: x.replace('\t', ','))
               .cache()
              )

print rawTrainData.take(1)

[u'0,1,1,5,0,1382,4,15,2,181,1,2,,2,68fd1e64,80e26c9b,fb936136,7b4723c4,25c83c98,7e0ccccf,de7995b8,1f89b562,a73ee510,a8cd5504,b2cb9c98,37c9c164,2824a5f6,1adce6ef,8ba8b39a,891b62e7,e5ba7672,f54016b9,21ddcdc9,b1252a9d,07b5194c,,3a171ecb,c5c50484,e8b83407,9727dd16']


### Hash function

In [2]:
from collections import defaultdict
import hashlib

def hashFunction(numBuckets, rawFeats, printMapping=False):
    """Calculate a feature dictionary for an observation's features based on hashing.

    Note:
        Use printMapping=True for debug purposes and to better understand how the hashing works.

    Args:
        numBuckets (int): Number of buckets to use as features.
        rawFeats (list of (int, str)): A list of features for an observation.  Represented as
            (featureID, value) tuples.
        printMapping (bool, optional): If true, the mappings of featureString to index will be
            printed.

    Returns:
        dict of int to float:  The keys will be integers which represent the buckets that the
            features have been hashed to.  The value for a given key will contain the count of the
            (featureID, value) tuples that have hashed to that key.
    """
    mapping = {}
    for ind, category in rawFeats:
        featureString = category + str(ind)
        mapping[featureString] = int(int(hashlib.md5(featureString).hexdigest(), 16) % numBuckets)
    if(printMapping): print mapping
    sparseFeatures = defaultdict(float)
    for bucket in mapping.values():
        sparseFeatures[bucket] += 1.0
    return dict(sparseFeatures)

### Parse data to convert the string into hashed features

In [15]:
import numpy as np
from pyspark.mllib.linalg import SparseVector
from pyspark.mllib.regression import LabeledPoint

def parseHashPoint(point, numBuckets):
    """Create a LabeledPoint for this observation using hashing.

    Args:
        point (str): A comma separated string where the first value is the label and the rest are
            features.
        numBuckets: The number of buckets to hash to.

    Returns:
        LabeledPoint: A LabeledPoint with a label (0.0 or 1.0) and a SparseVector of hashed
            features.
    """
    
    fields = point.split(',')
    label = fields[0]
    features = zip(range(len(fields[1:])), fields[1:])
    
    return LabeledPoint(label, SparseVector(numBuckets, hashFunction(numBuckets, features)))

numBucketsCTR = 1000
hashTrainData = rawTrainData.map(lambda x: parseHashPoint(x, numBucketsCTR))
hashTrainData.cache()

print hashTrainData.take(1)

[LabeledPoint(0.0, (1000,[64,101,117,147,178,215,223,268,304,313,321,328,384,385,442,532,601,613,619,621,628,644,650,655,659,680,681,697,721,738,742,824,846,882,903,924],[1.0,1.0,2.0,1.0,1.0,2.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0]))]


### Log-loss functions

In [6]:
from math import log

def computeLogLoss(p, y):
    """Calculates the value of log loss for a given probabilty and label.

    Note:
        log(0) is undefined, so when p is 0 we need to add a small value (epsilon) to it
        and when p is 1 we need to subtract a small value (epsilon) from it.

    Args:
        p (float): A probabilty between 0 and 1.
        y (int): A label.  Takes on the values 0 and 1.

    Returns:
        float: The log loss value.
    """
    epsilon = 10e-12
    if p == 0:
        prob = epsilon
    elif p == 1:
        prob = 1 - epsilon
    else:
        prob = p * 1.
    return -(y * log(prob) + (1 - y) * log(1 - prob))

In [7]:
from math import exp #  exp(-t) = e^-t

def getP(x, w, intercept):
    """Calculate the probability for an observation given a set of weights and intercept.

    Note:
        We'll bound our raw prediction between 20 and -20 for numerical purposes.

    Args:
        x (SparseVector): A vector with values of 1.0 for features that exist in this
            observation and 0.0 otherwise.
        w (DenseVector): A vector of weights (betas) for the model.
        intercept (float): The model's intercept.

    Returns:
        float: A probability between 0 and 1.
    """
    rawPrediction = x.dot(w) + intercept

    # Bound the raw prediction value
    rawPrediction = min(rawPrediction, 20)
    rawPrediction = max(rawPrediction, -20)
    return 1 / (1 + exp(-rawPrediction))

In [8]:
def evaluateResults(model, data):
    """Calculates the log loss for the data given the model.

    Args:
        model (LogisticRegressionModel): A trained logistic regression model.
        data (RDD of LabeledPoint): Labels and features for each observation.

    Returns:
        float: Log loss for the data.
    """
    return (data
            .map(lambda x: computeLogLoss(getP(x.features, model.weights, model.intercept), x.label))
            .reduce(lambda x, y: x + y)) / data.count()

### Logistic Regression with hashed features

In [18]:
from pyspark.mllib.classification import LogisticRegressionWithSGD
import time

# fixed hyperparameters
stepSize = 10.
regType = None
numIters = 5

startTime = time.time()
model = (LogisticRegressionWithSGD
         .train(hashTrainData, iterations=numIters, step=stepSize, regType=regType)
        )
endTime = time.time()

print 'Model training with', numIters, 'iterations took', int(endTime - startTime), 'seconds.'

Model training with 5 iterations took 578 seconds.


### Logloss

In [11]:
logLossTrain = evaluateResults(model, hashTrainData)

hashValidationData = rawValidationData.map(lambda x: parseHashPoint(x, numBucketsCTR)).cache()
logLossVal = evaluateResults(model, hashValidationData)

hashTestData = rawTestData.map(lambda x: parseHashPoint(x, numBucketsCTR)).cache()
logLossTest = evaluateResults(model, hashTestData)

print '{:12s}{:10s}'.format('Data set', 'Logloss')
print '-'*22

table1 = '{:12s}{:10.8f}'.format
print table1('Train', logLossTrain)
print table1('Validation', logLossVal)
print table1('Test', logLossTest)

Data set    Logloss   
----------------------
Train       0.48997141
Validation  0.49458504
Test        0.49517179


### AUC

In [13]:
from pyspark.mllib.evaluation import BinaryClassificationMetrics

def getAUC(data):
    metrics = BinaryClassificationMetrics(data.map(lambda lp: (float(model.predict(lp.features)), lp.label)))
    return metrics.areaUnderROC

trainAUC = getAUC(hashTrainData)
valAUC = getAUC(hashValidationData)
testAUC = getAUC(hashTestData)

print '{:12s}{:10s}'.format('Data set', 'AUC')
print '-'*22

table1 = '{:12s}{:10.8f}'.format
print table1('Train', trainAUC)
print table1('Validation', valAUC)
print table1('Test', testAUC)

Data set    AUC       
----------------------
Train       0.58609171
Validation  0.58040403
Test        0.58546408
