In [1]:
import pyspark
import numpy as np
from DataUtils import DataUtil
import time

sc = pyspark.SparkContext('local[*]')

In [2]:
dataUtil = DataUtil(sc, 'data/spam.data.txt', 'data/mean_std.txt', True)
rdd = dataUtil.read(sc)

In [12]:
class ParallelLogReg():
    
    def __init__(self, sc, dataUtils, iterations, learning_rate, lambda_reg, fit_intercept):
        self.dataUtils = dataUtils
        # do we need to broadcast these?
        self.iterations = iterations
        self.lr = learning_rate
        self.lambda_reg = lambda_reg
        self.fit_intercept = fit_intercept
        self.data = self.dataUtils.read(sc)
        self.numberObservations = self.data.count()
        self.numberFeatures = 56#len(self.data.first()[0])
        
    def __add_intercept(self):
        self.data = self.data.map(lambda x: (1, x[0], x[1]))
    
    def __sigmoid(self, z):
        return 1 / (1 + np.exp(-z))
    
    def __predict_y(self, w, x):
        return self.__sigmoid(w[1].dot(x[1]) + w[0] * x[0])
    
    def calculateLoss(self):
        return 1
        
    def train(self):
        self.__add_intercept()
        
        # initialize the weights
        # w[0]: bias weight
        # w[1]: rest of the weights
        w = (0, np.zeros(self.numberFeatures))
        self.loss = 0
        
        # initialize prediction to rdd
        # x[0]: bias/intercept
        # x[1]: rest features
        # x[2]: true y
        # adding x[3]: predicted y
        self.data = self.data.map(lambda x: (x[0], x[1], x[2], 1 / (1 + np.exp(-(w[1].dot(x[1]) + w[0] * x[0])))))
        
        for i in range(self.iterations):
            start = time.time()
            # compute derivatives
            temp = self.data.map(lambda x: ((x[3] - x[2]) * x[0], (x[3] - x[2]) * x[1])) \
                         .reduce(lambda a,b: (a[0] + b[0], a[1] + b[1]))
            self.dw = (temp[0]/self.numberObservations, (temp[1]/self.numberObservations) + (self.lambda_reg/self.numberObservations) * w[1])
            
            # update weights
            w = (w[0] - self.lr * self.dw[0], w[1] - self.lr * self.dw[1])
            
            # update prediction
            self.data = self.data.map(lambda x: (x[0], x[1], x[2], 1 / (1 + np.exp(-(w[1].dot(x[1]) + w[0] * x[0])))))
            self.loss = self.data.map(lambda x: x[2] * np.log(x[3]) + (1 - x[2]) * np.log(1 - x[3])) \
                                 .reduce(lambda a,b: a + b)
            
            self.loss = -(1/self.numberObservations) * self.loss + (self.lambda_reg/(2*self.numberObservations)) * np.sum(w[1]**2)
            if (i%10 == 0):
                print(self.loss)
                end = time.time()
                print(end - start)
            
            
                
        self.data = self.data.map(lambda x: (x[0], x[1], x[2], x[3], 1 if x[3] >= 0.5 else 0))
        
        self.acc = self.data.map(lambda x: 1 if x[2] == x[4] else 0) \
                            .reduce(lambda a,b: a+b) / self.numberObservations
            
        return w
            

In [19]:
logReg = ParallelLogReg(sc, dataUtil, 100, 0.1, 0.1, True)

In [None]:
logReg.train()

0.6503220242271298
1.3604285717010498
0.4581797205838394
1.9133121967315674
0.39332200478508095
1.796952724456787
0.35989145975858144
3.269209861755371
0.3391484146299772
3.7513320446014404
0.324848301239116
5.427429437637329
0.3142912196692107
8.8939049243927


In [18]:
logReg.acc

0.9065420560747663