In [1]:
import pyspark
import numpy as np
from DataUtils import DataUtil

# Reuse an already-running SparkContext when there is one, so this cell can be
# re-executed on a live kernel without "Cannot run multiple SparkContexts at
# once"; otherwise start a local context that uses all available cores.
sc = pyspark.SparkContext.getOrCreate(pyspark.SparkConf().setMaster('local[*]'))

In [2]:
# Build the project's data helper for the spam dataset and load the data
# through it.
# NOTE(review): the meaning of the second path and of the boolean flag is
# defined in DataUtils.DataUtil, which is not shown here — presumably the
# per-feature mean/std file and a "standardise" switch; confirm in that module.
dataUtil = DataUtil(sc, 'data/spam.data.txt', 'data/mean_std.txt', True)
# `rdd` is not referenced by the cells below: ParallelLogReg calls
# dataUtil.read(sc) itself in its constructor.
rdd = dataUtil.read(sc)

In [70]:
class ParallelLogReg():
    """Binary logistic regression with L2 regularisation, trained by full-batch
    gradient descent over an RDD.

    The records returned by ``dataUtils.read(sc)`` are used as
    ``(features, label)`` pairs: ``features`` must support ``.dot`` with a
    NumPy weight vector and ``label`` is the 0/1 target.
    """

    def __init__(self, sc, dataUtils, iterations, learning_rate, lambda_reg, fit_intercept):
        """Load the data set and record the hyper-parameters.

        Args:
            sc: SparkContext handed to ``dataUtils.read``.
            dataUtils: object exposing ``read(sc)`` that returns the data RDD.
            iterations: number of gradient-descent steps run by ``train``.
            learning_rate: step size applied to every weight update.
            lambda_reg: L2 regularisation strength (not applied to the bias).
            fit_intercept: when False the bias input is 0, so the bias weight
                stays at 0 and the model has no intercept.
        """
        self.dataUtils = dataUtils
        # The hyper-parameters are plain scalars used only on the driver, so
        # they do not need to be broadcast.
        self.iterations = iterations
        self.lr = learning_rate
        self.lambda_reg = lambda_reg
        self.fit_intercept = fit_intercept
        # Every iteration scans the data twice (gradient + loss); caching
        # avoids re-reading and re-parsing the input on each action.
        self._raw_data = self.dataUtils.read(sc).cache()
        self.data = self._raw_data
        self.numberObservations = self.data.count()
        # Derive the feature count from the data instead of hard-coding it.
        self.numberFeatures = (
            len(self._raw_data.first()[0]) if self.numberObservations else 0
        )

    def __add_intercept(self):
        """Set ``self.data`` to ``(bias_input, features, label)`` records.

        Always rebuilt from the raw RDD so that ``train`` can be called more
        than once. The bias input is 1 when ``fit_intercept`` is true, else 0.
        """
        # Copy into a local so the closure shipped to the workers does not
        # capture ``self``.
        bias_input = 1 if self.fit_intercept else 0
        self.data = self._raw_data.map(lambda x: (bias_input, x[0], x[1]))

    def __sigmoid(self, z):
        """Logistic function 1 / (1 + e^-z)."""
        return 1 / (1 + np.exp(-z))

    def __predict_y(self, w, x):
        """Predicted probability for one ``(bias_input, features, ...)`` record."""
        return self.__sigmoid(w[1].dot(x[1]) + w[0] * x[0])

    def calculateLoss(self):
        """Placeholder: always returns 1. The training loss is in ``self.loss``."""
        return 1

    def train(self):
        """Run gradient descent and return the weights as ``(bias, weights)``.

        Side effects: prints the regularised loss on every 10th iteration,
        stores the last loss in ``self.loss`` and the last gradient in
        ``self.dw``, and leaves ``self.data`` as
        ``(bias_input, features, label, predicted_probability)`` records
        computed with the final weights.

        Raises:
            ValueError: if the data set contains no observations.
        """
        n_obs = self.numberObservations
        if n_obs == 0:
            raise ValueError("cannot train on an empty dataset")

        self.__add_intercept()
        records = self.data
        eps = 1e-15  # keeps log() finite when a prediction saturates at 0 or 1

        # The helpers below are local functions that close over plain values
        # only: a closure referencing ``self`` would make Spark serialise the
        # whole object, including ``dataUtils`` (which may hold the context).
        def predict(weights, x):
            return 1 / (1 + np.exp(-(weights[1].dot(x[1]) + weights[0] * x[0])))

        # w[0]: bias weight, w[1]: feature weights
        w = (0, np.zeros(self.numberFeatures))
        self.loss = 0

        for i in range(self.iterations):
            # Gradient of the cross-entropy under the current weights. The
            # prediction is computed inside the map, so the RDD lineage does
            # not grow from one iteration to the next.
            def error_terms(x, weights=w):
                err = predict(weights, x) - x[2]
                return (err * x[0], err * x[1])

            temp = records.map(error_terms) \
                          .reduce(lambda a, b: (a[0] + b[0], a[1] + b[1]))
            self.dw = (temp[0] / n_obs,
                       (temp[1] / n_obs) + (self.lambda_reg / n_obs) * w[1])

            # update weights
            w = (w[0] - self.lr * self.dw[0], w[1] - self.lr * self.dw[1])

            # Regularised loss under the updated weights.
            def log_likelihood(x, weights=w):
                p = np.clip(predict(weights, x), eps, 1 - eps)
                return x[2] * np.log(p) + (1 - x[2]) * np.log(1 - p)

            total = records.map(log_likelihood).reduce(lambda a, b: a + b)
            self.loss = -(1 / n_obs) * total \
                + (self.lambda_reg / (2 * n_obs)) * np.sum(w[1] ** 2)
            if i % 10 == 0:
                print(self.loss)

        # Expose the final predictions as a fourth field on each record.
        def with_prediction(x, weights=w):
            return (x[0], x[1], x[2], predict(weights, x))

        self.data = records.map(with_prediction)
        return w
            

In [73]:
# 100 gradient-descent iterations, step size 0.01, L2 strength 0.01.
logReg = ParallelLogReg(
    sc,
    dataUtil,
    iterations=100,
    learning_rate=0.01,
    lambda_reg=0.01,
    fit_intercept=True,
)

In [None]:
# Run gradient descent. train() prints the regularised loss on every 10th
# iteration and returns the learned weights as a (bias, feature_weights) tuple.
logReg.train()

0.6886640115564672
0.6483598250629653
0.6149351436759691
0.5868143057599317
0.5628412948708029
0.5421668988746755
0.5241557659059918
0.5083237589517262
0.4942960005649694
0.48177809148641654


In [50]:
# Peek at one record after training:
# (intercept input, feature vector, true label, predicted probability).
logReg.data.first()

(1, array([-0.34238276,  0.33084612,  0.7127782 , -0.04689292,  0.0115686 ,
        -0.35023373, -0.29176885, -0.26252275, -0.32326107, -0.37131646,
        -0.29683438,  0.11407682, -0.31201834, -0.174909  , -0.19009813,
         0.08615992, -0.32110526,  2.08099864,  0.15088707, -0.16787375,
         0.12511659, -0.11815169, -0.29018813, -0.21296991, -0.32878597,
        -0.29920177, -0.22786802, -0.23179472, -0.16671161, -0.2252124 ,
        -0.16052208, -0.14319455, -0.17490061, -0.14519654, -0.19804231,
        -0.2421067 , -0.32341481, -0.05982778, -0.18089355, -0.18528235,
        -0.12089424, -0.17258287, -0.20597382, -0.12732885, -0.29773648,
        -0.19736149, -0.07138081, -0.11153537, -0.15843841, -0.51424027,
        -0.15518786,  0.62394105, -0.3083252 , -0.10303722, -0.04524252,
         0.04529222]), 1.0, 0.5005093777015289)

In [None]:
class LogisticRegression:
    """Single-machine NumPy logistic regression fitted by full-batch gradient
    descent, without regularisation.
    """

    def __init__(self, lr=0.01, num_iter=100000, fit_intercept=True, verbose=False):
        """Store the hyper-parameters; no work is done until ``fit``.

        Args:
            lr: gradient-descent step size.
            num_iter: number of gradient steps taken by ``fit``.
            fit_intercept: prepend a column of ones to the inputs when true.
            verbose: print the loss every 10000 iterations when ``True``.
        """
        self.lr, self.num_iter = lr, num_iter
        self.fit_intercept, self.verbose = fit_intercept, verbose

    def __add_intercept(self, X):
        """Return ``X`` with a leading column of ones."""
        ones_column = np.ones((X.shape[0], 1))
        return np.concatenate([ones_column, X], axis=1)

    def __sigmoid(self, z):
        """Element-wise logistic function."""
        denominator = 1 + np.exp(-z)
        return 1 / denominator

    def __loss(self, h, y):
        """Mean binary cross-entropy of probabilities ``h`` against labels ``y``."""
        positive_part = -y * np.log(h)
        negative_part = (1 - y) * np.log(1 - h)
        return (positive_part - negative_part).mean()

    def fit(self, X, y):
        """Learn ``self.theta`` from the design matrix ``X`` and labels ``y``."""
        design = self.__add_intercept(X) if self.fit_intercept else X

        # start from all-zero weights
        self.theta = np.zeros(design.shape[1])

        for step in range(self.num_iter):
            probabilities = self.__sigmoid(np.dot(design, self.theta))
            gradient = np.dot(design.T, (probabilities - y)) / y.size
            self.theta -= self.lr * gradient

            if self.verbose == True and step % 10000 == 0:
                # report the loss under the weights that were just updated
                refreshed = self.__sigmoid(np.dot(design, self.theta))
                print('loss: {' + str(self.__loss(refreshed, y)) + '} \t')

    def predict_prob(self, X):
        """Return the predicted probability of the positive class per row of ``X``."""
        design = self.__add_intercept(X) if self.fit_intercept else X
        scores = np.dot(design, self.theta)
        return self.__sigmoid(scores)

    def predict(self, X, threshold=0.5):
        """Return a boolean array: True where the probability reaches ``threshold``."""
        probabilities = self.predict_prob(X)
        return probabilities >= threshold