In [1]:
import pyspark
import numpy as np
from DataUtils import DataUtil
import time
import sys

sc = pyspark.SparkContext('local[4]')

In [2]:
sc.defaultParallelism

4

In [3]:
dataUtil = DataUtil(sc, 'data/spam.data.txt', 'data/mean_std.txt', True)
rdd = dataUtil.read(sc)

In [7]:
def stats(data):
    def f(y, pred):
        if y == pred:
            return 1,0,0,0 if y == 1 else 0,1,0,0
        elif pred == 1:
            return 0,0,1,0
        else:
            return 0,0,0,1
    tp, tn, fp, fn = data.map(lambda x: f(x[2], x[4])).reduce(lambda a, b: tuple(map(sum, zip(a, b))))
    precision = tp / (tp + fp)
    recall = tp / (tp + fn)
    f1 = 2 * precision * recall / (precision + recall)
    accuracy = (tp + tn) / (tp + tn + fp + fn)
    return precision, recall, f1, accuracy

class ParallelLogReg():
    
    def __init__(self, sc, dataUtils, iterations, learning_rate, lambda_reg, fit_intercept):
        self.dataUtils = dataUtils
        # do we need to broadcast these?
        self.iterations = iterations
        self.lr = learning_rate
        self.lambda_reg = lambda_reg
        self.fit_intercept = fit_intercept
        self.data = self.dataUtils.read(sc)
        self.numberObservations = self.data.count()
        self.numberFeatures = 56
        self.sc = sc
        
    def __add_intercept(self):
        self.data = self.data.map(lambda x: (1, x[0], x[1]))
    
    def __sigmoid(self, z):
        return 1 / (1 + np.exp(-z))
    
    def __predict_y(self, w, x):
        return self.__sigmoid(w[1].dot(x[1]) + w[0] * x[0])
    
    def calculateLoss(self):
        return 1
        
    def train(self, SGD=False, SGD_pct=0.5):
        self.__add_intercept()
        
        # initialize the weights
        # w[0]: bias weight
        # w[1]: rest of the weights
        w = (0, np.zeros(self.numberFeatures))
        loss = 0
        
        # initialize prediction to rdd
        # x[0]: bias/intercept
        # x[1]: rest features
        # x[2]: true y
        # adding x[3]: predicted y
        self.data = self.data.map(lambda x: (x[0], x[1], x[2], 1 / (1 + np.exp(-(w[1].dot(x[1]) + w[0] * x[0])))))
        
        data = self.data.cache()
        
        for i in range(self.iterations):
            start = time.time()
            
            # sample for SGD
            if SGD:
                data = self.data.sample(False, SGD_pct).repartition(4).cache()
                if (i==0):
                    self.numberObservations = data.count()
            
            # compute derivatives
            temp = data.map(lambda x: ((x[3] - x[2]) * x[0], (x[3] - x[2]) * x[1])) \
                       .reduce(lambda a,b: (a[0] + b[0], a[1] + b[1]))
            dw = (temp[0]/self.numberObservations, (temp[1]/self.numberObservations) + (self.lambda_reg/self.numberObservations) * w[1])
            
            # update weights
            w = (w[0] - self.lr * dw[0], w[1] - self.lr * dw[1])
            
            # update prediction
            data = data.map(lambda x: (x[0], x[1], x[2], 1 / (1 + np.exp(-(w[1].dot(x[1]) + w[0] * x[0]))))).cache()
            
            # calculate loss
            loss = data.map(lambda x: x[2] * np.log(x[3]+sys.float_info.epsilon) + (1 - x[2]) * np.log(1 - x[3]+sys.float_info.epsilon)) \
                                 .reduce(lambda a,b: a + b)
            loss = -(1/self.numberObservations) * loss + (self.lambda_reg/(2*self.numberObservations)) * np.sum(w[1]**2)
            
            if (i%10 == 0):
                print("Current loss: " + str(loss))
                end = time.time()
                print("Iteration: " + str(i) + ", Total Time: " + str(end - start))
                
        # calculate prediction for entire data set
        self.data = self.data.map(lambda x: (x[0], x[1], x[2], 1 / (1 + np.exp(-(w[1].dot(x[1]) + w[0] * x[0])))))
        
        # update loss for entire data set
        self.numberObservations = self.data.count()
        loss = self.data.map(lambda x: x[2] * np.log(x[3]+sys.float_info.epsilon) + (1 - x[2]) * np.log(1 - x[3]+sys.float_info.epsilon)) \
                   .reduce(lambda a,b: a + b)
        self.loss = -(1/self.numberObservations) * loss + (self.lambda_reg/(2*self.numberObservations)) * np.sum(w[1]**2)
        
        # add the predicted class        
        self.data = self.data.map(lambda x: (x[0], x[1], x[2], x[3], 1 if x[3] >= 0.5 else 0))
        
        # calculate accuracy
        self.acc = self.data.map(lambda x: 1 if x[2] == x[4] else 0) \
                            .reduce(lambda a,b: a+b) / self.numberObservations
        
        print("Final loss: " + str(self.loss) + ", Accuracy: " + str(self.acc))
            
        return w
            

In [13]:
logReg = ParallelLogReg(sc, dataUtil, 100, 0.1, 0.1, True)

In [14]:
logReg.train(True, 0.75)

Current loss: 0.6480574381644949
Iteration: 0, Total Time: 1.448368787765503
Current loss: 0.40873724508468856
Iteration: 10, Total Time: 0.5865907669067383
Current loss: 0.3399603086422459
Iteration: 20, Total Time: 1.0011682510375977
Current loss: 0.31873196836204354
Iteration: 30, Total Time: 0.7110292911529541
Current loss: 0.33717673144534666
Iteration: 40, Total Time: 0.9815123081207275
Current loss: 0.3814004671164082
Iteration: 50, Total Time: 0.9757053852081299
Current loss: 0.4270971917281332
Iteration: 60, Total Time: 0.8669543266296387
Current loss: 0.4527522162739007
Iteration: 70, Total Time: 1.4324626922607422
Current loss: 0.4419625116681218
Iteration: 80, Total Time: 1.1977989673614502
Current loss: 0.5277012771596729
Iteration: 90, Total Time: 0.871208906173706
Final loss: 0.5925931925809359, Accuracy: 0.8935014127363616


(-1.056916594013368,
 array([ 0.61584525, -0.14700108,  0.96454595,  0.28037466,  1.17827348,
         1.14019143,  1.62941519,  1.01852337,  1.13441437,  0.67132225,
         1.15083813,  0.03409795,  0.65077049,  0.29702202,  0.95767808,
         1.2916392 ,  1.2894909 ,  1.00308759,  1.33622548,  0.92630158,
         1.87860037,  0.45182034,  1.63868107,  1.06376794, -1.26175842,
        -1.14562466, -0.89812088, -0.78006939, -0.65711402, -0.83970763,
        -0.62536864, -0.56489974, -0.58483254, -0.55775391, -0.73277265,
        -0.67433107, -0.87350633, -0.15009507, -0.59918394, -0.32105646,
        -0.47252246, -0.6730957 , -0.66515936, -0.46167395, -0.68948029,
        -0.70849883, -0.22445532, -0.41522308, -0.29133626, -0.43790079,
        -0.31639737,  1.18226932,  1.58151289,  0.32425645,  0.54229684,
         1.06275954]))

In [10]:
logReg.acc

0.8930667246250815