In [1]:
import pyspark
import numpy as np
from DataUtils import DataUtil
import time

# Reuse the running context when this cell is executed again: constructing a
# second SparkContext in the same kernel raises an error, which makes the
# cell non-re-runnable. getOrCreate only builds a new 'local[4]' context when
# none is active.
sc = pyspark.SparkContext.getOrCreate(pyspark.SparkConf().setMaster('local[4]'))

In [2]:
# Display the context's default parallelism; the recorded output below is 4,
# matching the 'local[4]' master the context was created with.
sc.defaultParallelism

4

In [3]:
# Build the project's data helper from the spam data file and a second file
# named mean_std.txt.
# NOTE(review): presumably the second file holds per-feature mean/std and the
# trailing True switches on normalisation -- confirm against DataUtils.
dataUtil = DataUtil(sc, 'data/spam.data.txt', 'data/mean_std.txt', True)
# Read the data through the helper. `rdd` is not referenced again in the cells
# shown here; ParallelLogReg calls dataUtil.read(sc) itself.
rdd = dataUtil.read(sc)

In [7]:
class ParallelLogReg():
    """Binary logistic regression with L2 regularisation, trained by gradient
    descent over a Spark RDD.

    Records are handled as tuples whose layout grows along the pipeline:
        x[0]: bias/intercept input (1, or 0 when ``fit_intercept`` is false)
        x[1]: feature vector
        x[2]: true label y
        x[3]: predicted probability   (only in ``self.data`` after ``train``)
        x[4]: predicted class, 0 or 1 (only in ``self.data`` after ``train``)

    Functions shipped to the executors close over plain local values only,
    never over ``self``: the instance holds the SparkContext, which must not
    be referenced from inside a transformation or action.
    """

    # Probabilities are clipped to [_EPS, 1 - _EPS] before taking logarithms so
    # that a saturated prediction cannot turn the loss into inf/nan.
    _EPS = 1e-15

    def __init__(self, sc, dataUtils, iterations, learning_rate, lambda_reg, fit_intercept):
        """Read the data set and store the hyper-parameters.

        Args:
            sc: SparkContext handed to ``dataUtils.read``.
            dataUtils: object whose ``read(sc)`` returns an RDD of
                ``(features, label)`` records.
            iterations: number of gradient-descent steps run by ``train``.
            learning_rate: step size applied to every weight update.
            lambda_reg: L2 regularisation strength (bias is not regularised).
            fit_intercept: when false the bias input is 0, so the bias weight
                stays at 0 throughout training.
        """
        self.sc = sc
        self.dataUtils = dataUtils
        # Plain scalars: they travel inside the task closures, no broadcast needed.
        self.iterations = iterations
        self.lr = learning_rate
        self.lambda_reg = lambda_reg
        self.fit_intercept = fit_intercept
        self.data = self.dataUtils.read(sc)
        # Untouched input records. train() always restarts from these so that
        # it can be called more than once; self.data is overwritten by train().
        self._raw_data = self.data
        self._base = None
        self.numberObservations = self.data.count()
        # Width of the feature vector; the weight vector is created with this size.
        self.numberFeatures = 56
        # Filled in by train().
        self.loss = None
        self.acc = None

    def __add_intercept(self):
        """Return the cached ``(bias, features, y)`` RDD, building it on first use."""
        if self._base is None:
            bias_input = 1 if self.fit_intercept else 0
            self._base = self._raw_data.map(lambda x: (bias_input, x[0], x[1])).cache()
        return self._base

    def __sigmoid(self, z):
        """Logistic function. Driver-side helper; not used inside RDD closures."""
        return 1 / (1 + np.exp(-z))

    def __predict_y(self, w, x):
        """Predicted probability for one record ``x`` under weights ``w``.

        Driver-side helper only: it is bound to ``self`` and therefore must not
        be passed to an RDD transformation.
        """
        return self.__sigmoid(w[1].dot(x[1]) + w[0] * x[0])

    @staticmethod
    def _gradient_sums(rdd, w, n_features):
        """Sum the per-record gradient contributions of ``rdd`` under ``w``.

        Returns ``(bias_sum, feature_sum, record_count)``. ``fold`` is used
        instead of ``reduce`` so an empty RDD (possible for a small SGD sample)
        yields zeros rather than raising.
        """
        bias_w, feat_w = w

        def contribution(x):
            prob = 1 / (1 + np.exp(-(feat_w.dot(x[1]) + bias_w * x[0])))
            err = prob - x[2]
            return (err * x[0], err * x[1], 1)

        zero = (0.0, np.zeros(n_features), 0)
        return rdd.map(contribution) \
                  .fold(zero, lambda a, b: (a[0] + b[0], a[1] + b[1], a[2] + b[2]))

    @staticmethod
    def _regularised_loss(rdd, w, n, lambda_reg, eps):
        """Mean negative log-likelihood of ``rdd`` under ``w`` plus the L2 penalty.

        ``n`` is the number of records in ``rdd`` and must be positive.
        """
        bias_w, feat_w = w

        def log_likelihood(x):
            prob = 1 / (1 + np.exp(-(feat_w.dot(x[1]) + bias_w * x[0])))
            prob = np.clip(prob, eps, 1 - eps)
            return x[2] * np.log(prob) + (1 - x[2]) * np.log(1 - prob)

        total = rdd.map(log_likelihood).fold(0.0, lambda a, b: a + b)
        return -(1.0 / n) * total + (lambda_reg / (2.0 * n)) * np.sum(feat_w ** 2)

    def calculateLoss(self):
        """Return the regularised loss stored by the last ``train`` call.

        ``None`` is returned when the model has not been trained yet.
        """
        return self.loss

    def train(self, SGD=False, SGD_pct=0.5):
        """Fit the weights by gradient descent and score the full data set.

        Args:
            SGD: when true, every iteration works on a fresh random sample
                (without replacement) of the data instead of all of it.
            SGD_pct: sampling fraction used when ``SGD`` is true.

        Returns:
            ``(bias_weight, feature_weights)``.

        Side effects:
            Prints the loss of every tenth iteration and the final loss and
            accuracy; sets ``self.loss``, ``self.acc``, ``self.numberObservations``
            and replaces ``self.data`` with the scored 5-tuple RDD.

        Raises:
            ValueError: if the data set holds no observations.
        """
        if self.numberObservations == 0:
            raise ValueError("Cannot train on an empty data set")

        base = self.__add_intercept()

        # Local copies keep `self` out of every closure sent to the executors.
        lam = self.lambda_reg
        eps = self._EPS
        n_features = self.numberFeatures

        # w[0]: bias weight, w[1]: weights of the remaining features
        w = (0, np.zeros(n_features))

        for i in range(self.iterations):
            start = time.time()

            # Predictions are recomputed from the current weights inside the
            # gradient pass, so a fresh sample never carries stale predictions.
            batch = base.sample(False, SGD_pct) if SGD else base

            grad_b, grad_w, n = self._gradient_sums(batch, w, n_features)
            if n == 0:
                # The random sample came out empty; there is nothing to learn from.
                continue

            # Normalise by the size of the batch actually used in this step.
            dw = (grad_b / n, grad_w / n + (lam / n) * w[1])
            w = (w[0] - self.lr * dw[0], w[1] - self.lr * dw[1])

            # The loss needs a second pass over the batch, so it is computed
            # only on the iterations that report it.
            if (i % 10 == 0):
                loss = self._regularised_loss(batch, w, n, lam, eps)
                print("Current loss: " + str(loss))
                end = time.time()
                print("Iteration: " + str(i) + ", Total Time: " + str(end - start))

        # Loss of the final weights over the entire data set.
        self.numberObservations = base.count()
        self.loss = self._regularised_loss(base, w, self.numberObservations, lam, eps)

        # Attach predicted probability and predicted class to every record.
        bias_w, feat_w = w

        def score(x):
            prob = 1 / (1 + np.exp(-(feat_w.dot(x[1]) + bias_w * x[0])))
            return (x[0], x[1], x[2], prob, 1 if prob >= 0.5 else 0)

        self.data = base.map(score)

        # Accuracy: share of records whose predicted class equals the label.
        correct = self.data.map(lambda x: 1 if x[2] == x[4] else 0) \
                           .fold(0, lambda a, b: a + b)
        self.acc = correct / float(self.numberObservations)

        print("Final loss: " + str(self.loss) + ", Accuracy: " + str(self.acc))

        return w
            

In [8]:
# 100 gradient steps, learning rate 0.1, L2 strength 0.1, with an intercept term.
logReg = ParallelLogReg(
    sc,
    dataUtil,
    iterations=100,
    learning_rate=0.1,
    lambda_reg=0.1,
    fit_intercept=True,
)

In [9]:
# Full-batch run: with SGD disabled the sampling fraction is not used.
# The returned (bias weight, feature weights) tuple is the cell's output.
logReg.train(SGD=False, SGD_pct=0.1)

Current loss: 0.6503220242271298
Iteration: 0, Total Time: 1.2008984088897705
Current loss: 0.4581797205838394
Iteration: 10, Total Time: 0.7159209251403809
Current loss: 0.39332200478508095
Iteration: 20, Total Time: 0.46668338775634766
Current loss: 0.35989145975858144
Iteration: 30, Total Time: 0.5080475807189941
Current loss: 0.3391484146299772
Iteration: 40, Total Time: 0.4967775344848633
Current loss: 0.324848301239116
Iteration: 50, Total Time: 0.5305984020233154
Current loss: 0.3142912196692107
Iteration: 60, Total Time: 0.5444254875183105
Current loss: 0.30611242960061824
Iteration: 70, Total Time: 0.6343545913696289
Current loss: 0.2995456845167399
Iteration: 80, Total Time: 0.5325527191162109
Current loss: 0.29412675827114465
Iteration: 90, Total Time: 0.5699419975280762
Final loss: 0.2899825061314961, Accuracy: 0.9065420560747663


(-0.4283915364779883,
 array([-0.00622156, -0.06707946,  0.12603675,  0.10679152,  0.28608055,
         0.20375194,  0.49958148,  0.24671533,  0.22264238,  0.09935942,
         0.12813877, -0.10040145,  0.05902925,  0.04332017,  0.18387406,
         0.36372831,  0.28676633,  0.20048329,  0.15539207,  0.22677141,
         0.31132481,  0.19344826,  0.4381361 ,  0.26738814, -0.29162514,
        -0.22380624, -0.25198122, -0.05837027, -0.11482588, -0.13995595,
        -0.07090135, -0.03828431, -0.15286331, -0.03645856, -0.11456854,
        -0.02073074, -0.1365954 , -0.05547121, -0.12921777, -0.00276266,
        -0.11337995, -0.20190582, -0.11778833, -0.13805032, -0.23019683,
        -0.22506779, -0.08137009, -0.11471476, -0.1139385 , -0.04269744,
        -0.05166024,  0.33324663,  0.4674863 ,  0.11167972,  0.12264795,
         0.28245354]))

In [None]:
# Accuracy stored on the model by train(): the fraction of records whose
# predicted class equals the true label.
logReg.acc