In [None]:
import pyspark
import numpy as np
from DataUtils import DataUtil
import time
import sys

# Reuse the already-running local SparkContext if this cell is executed again;
# constructing a second SparkContext in the same kernel raises an error.
sc = pyspark.SparkContext.getOrCreate(pyspark.SparkConf().setMaster('local[*]'))

In [None]:
# Display the default parallelism of this context (reused below as the
# partition count passed to repartition()).
sc.defaultParallelism

In [None]:
# Build the project-local DataUtil helper for the spam data file and the
# mean/std file, then read the data set through it.
# NOTE(review): the meaning of the trailing `True` flag is defined in
# DataUtils (not shown here) -- confirm before changing it.
dataUtil = DataUtil(sc, 'data/spam.data.txt', 'data/mean_std.txt', True)
# Presumably an RDD: the cells below call count() and zipWithIndex() on it.
rdd = dataUtil.read(sc)

In [None]:
# Number of rows that were loaded.
rdd.count()

In [None]:
# Pair every row with its position so elements become (row, index);
# the index is used below to assign rows to cross-validation folds.
dataWithIndex = rdd.zipWithIndex()

In [None]:
# Peek at one (row, index) element to check its layout.
dataWithIndex.first()

In [None]:
# Count the rows whose index is NOT a multiple of 3, i.e. the size of the
# training part if every third row were held out.
dataWithIndex.filter(lambda x: x[1] % 3 != 0).count()

In [None]:
# Split the indexed rows into k folds: row j belongs to fold (j % k).
k = 5
folds = []
for i in range(k):
    # RDD transformations are lazy, so `i` and `k` are bound as lambda
    # defaults; a plain closure would read whatever values the notebook
    # globals hold when the fold is eventually evaluated.
    fold = dataWithIndex.filter(lambda x, i=i, k=k: x[1] % k == i).repartition(sc.defaultParallelism)
    # Keep every fold instead of overwriting the previous one.
    folds.append(fold)
    
    

In [None]:
def stats(data):
    """Compute precision, recall, F1 and accuracy for binary predictions.

    Parameters
    ----------
    data : RDD-like
        Object exposing ``map`` and ``reduce``. Each element must be indexable
        with the true label at position 2 and the predicted class at
        position 4 (labels are compared against 1 for the positive class).

    Returns
    -------
    tuple
        ``(precision, recall, f1, accuracy)``. A ratio whose denominator is
        zero (e.g. precision when nothing was predicted positive) is reported
        as ``0.0`` instead of raising ``ZeroDivisionError``.
    """
    def f(y, pred):
        # One-hot confusion-matrix contribution: (tp, tn, fp, fn).
        if y == pred:
            # Parentheses are required: without them the conditional
            # expression binds tighter than the commas and a 7-tuple results.
            return (1, 0, 0, 0) if y == 1 else (0, 1, 0, 0)
        elif pred == 1:
            return 0, 0, 1, 0
        else:
            return 0, 0, 0, 1

    # Element-wise sum of the per-row contributions.
    tp, tn, fp, fn = data.map(lambda x: f(x[2], x[4])) \
                         .reduce(lambda a, b: tuple(map(sum, zip(a, b))))

    precision = tp / (tp + fp) if (tp + fp) else 0.0
    recall = tp / (tp + fn) if (tp + fn) else 0.0
    f1 = 2 * precision * recall / (precision + recall) if (precision + recall) else 0.0
    total = tp + tn + fp + fn
    accuracy = (tp + tn) / total if total else 0.0
    return precision, recall, f1, accuracy

class ParallelLogReg():
    """L2-regularised logistic regression trained by gradient descent on RDDs.

    Input rows are indexable pairs: position 0 holds the feature vector
    (anything accepted by ``numpy.ndarray.dot``) and position 1 the 0/1 label.
    Internally every row is expanded to ``(1, features, label)`` so that the
    bias can be treated like a weight on a constant feature.

    Weights are represented as a tuple ``w = (bias, feature_weights)``.

    Functions shipped to Spark executors are built by small factory helpers
    and never reference ``self``: the instance holds the SparkContext and an
    RDD, neither of which can be serialised.
    """

    def __init__(self, sc, dataUtils, iterations, learning_rate, lambda_reg):
        """Load the data set and store the hyper-parameters.

        Parameters
        ----------
        sc : SparkContext
            Context used to read the data and to size repartitioning.
        dataUtils : object
            Loader exposing ``read(sc)`` that returns the data RDD.
        iterations : int
            Number of gradient-descent steps performed by ``train``.
        learning_rate : float
            Step size of each weight update.
        lambda_reg : float
            L2 regularisation strength (applied to the feature weights only).
        """
        # Keep the context so cross_validate does not depend on a notebook global.
        self.sc = sc
        self.dataUtils = dataUtils
        self.iterations = iterations
        self.lr = learning_rate
        self.lambda_reg = lambda_reg
        self.data = self.dataUtils.read(sc)
        self.numberObservations = self.data.count()
        # NOTE(review): hard-coded feature count; it must match the length of
        # the feature vectors produced by dataUtils.read -- confirm.
        self.numberFeatures = 56

    def __add_intercept(self, rdd):
        """Map ``(features, label)`` rows to ``(1, features, label)``."""
        return rdd.map(lambda x: (1, x[0], x[1]))

    @staticmethod
    def _make_predictor(w):
        """Return a function mapping an intercept row to its sigmoid probability.

        The weights are captured by value when the predictor is created, so a
        lazily evaluated RDD can never observe a later weight update.
        """
        bias, weights = w

        def predict(x):
            return 1 / (1 + np.exp(-(weights.dot(x[1]) + bias * x[0])))

        return predict

    def _gradient(self, rdd, w, numObs):
        """Return ``(d_bias, d_weights)`` of the regularised loss on ``rdd``."""
        predict = self._make_predictor(w)

        def residual_terms(x):
            err = predict(x) - x[2]
            return (err * x[0], err * x[1])

        temp = rdd.map(residual_terms) \
                  .reduce(lambda a, b: (a[0] + b[0], a[1] + b[1]))
        return (temp[0] / numObs, (temp[1] / numObs) + (self.lambda_reg / numObs) * w[1])

    def _regularized_loss(self, rdd, w, numObs, eps):
        """Return mean cross-entropy of ``w`` on ``rdd`` plus the L2 penalty.

        ``eps`` is added inside the logarithms to avoid ``log(0)``.
        """
        predict = self._make_predictor(w)

        def log_likelihood(x):
            p = predict(x)
            return x[2] * np.log(p + eps) + (1 - x[2]) * np.log(1 - p + eps)

        total = rdd.map(log_likelihood).reduce(lambda a, b: a + b)
        return -(1 / numObs) * total + (self.lambda_reg / (2 * numObs)) * np.sum(w[1] ** 2)

    def train(self, train_rdd=None, SGD=False, SGD_pct=0.5):
        """Fit the model and report training loss and accuracy.

        Parameters
        ----------
        train_rdd : RDD, optional
            Rows of ``(features, label)`` to train on. Defaults to the full
            data set loaded in ``__init__``.
        SGD : bool
            When true, every iteration uses a fresh random sample of the
            training rows instead of all of them.
        SGD_pct : float
            Sampling fraction used when ``SGD`` is true.

        Returns
        -------
        dict
            ``{'loss': final training loss, 'w': (bias, weights),
            'acc': training accuracy at threshold 0.5}``.
        """
        eps = sys.float_info.epsilon

        # initialize the weights
        # w[0]: bias weight
        # w[1]: rest of the weights
        w = (0, np.zeros(self.numberFeatures))

        source = self.data if train_rdd is None else train_rdd

        # Row layout from here on:
        # x[0]: bias/intercept
        # x[1]: rest features
        # x[2]: true y
        # Only this base RDD is cached; predictions are recomputed from the
        # current weights whenever they are needed, so no stale predictions
        # and no per-iteration cached RDDs pile up.
        data = self.__add_intercept(source).cache()
        numObs = data.count()
        print(numObs)

        for i in range(self.iterations):
            start = time.time()

            # sample for SGD
            if SGD:
                batch = data.sample(False, SGD_pct).repartition(4).cache()
                # Sample sizes vary between iterations; normalise by this one.
                batchObs = batch.count()
                if batchObs == 0:
                    # Nothing was sampled: no gradient to compute this round.
                    batch.unpersist()
                    continue
            else:
                batch, batchObs = data, numObs

            # compute derivatives with the current weights
            dw = self._gradient(batch, w, batchObs)

            # update weights
            w = (w[0] - self.lr * dw[0], w[1] - self.lr * dw[1])

            # calculate loss of the updated weights on the same batch
            loss = self._regularized_loss(batch, w, batchObs, eps)

            if SGD:
                batch.unpersist()

            if (i%10 == 0):
                print("Current loss: " + str(loss))
                end = time.time()
                print("Iteration: " + str(i) + ", Total Time: " + str(end - start))

        # loss for the entire training set
        loss = self._regularized_loss(data, w, numObs, eps)

        # accuracy for the entire training set (predicted class = prob >= 0.5)
        predict = self._make_predictor(w)
        acc = data.map(lambda x: 1 if x[2] == (1 if predict(x) >= 0.5 else 0) else 0) \
                  .reduce(lambda a, b: a + b) / numObs

        data.unpersist()

        print("Final training loss: " + str(loss) + ", Training Accuracy: " + str(acc))

        return {'loss': loss, 'w': w, 'acc': acc}

    def validate(self, val_rdd, w):
        """Return the regularised loss of weights ``w`` on ``val_rdd``.

        ``val_rdd`` holds ``(features, label)`` rows; the loss includes the
        same L2 penalty term that is used during training.
        """
        eps = sys.float_info.epsilon

        val_rdd = self.__add_intercept(val_rdd)
        numObs = val_rdd.count()

        return self._regularized_loss(val_rdd, w, numObs, eps)

    def cross_validate(self, k, SGD=False, SGD_pct = 0.5):
        """Run k-fold cross-validation on the full data set.

        Row ``j`` (by ``zipWithIndex`` order) is held out in fold ``j % k``.

        Parameters
        ----------
        k : int
            Number of folds; must be greater than 1.
        SGD, SGD_pct
            Forwarded to ``train``.

        Returns
        -------
        dict or None
            ``None`` (after printing a hint) when ``k < 2``; otherwise
            ``{'losses': per-fold validation losses (numpy array),
            'mean_loss': their mean, 'models': list of ``train`` results}``.
        """
        if k < 2:
            print("Please choose a k > 1. Or use regular train function")
            return

        parallelism = self.sc.defaultParallelism
        dataWithIndex = self.data.zipWithIndex()

        losses = np.array([])
        models = []

        for i in range(k):
            print("Fold: " + str(i))

            # `i` is bound as a default so the lazy filter keeps this fold's index.
            fold = dataWithIndex.filter(lambda x, i=i: x[1] % k != i) \
                                .repartition(parallelism) \
                                .map(lambda x: x[0])

            model = self.train(fold, SGD, SGD_pct)
            models.append(model)

            val = dataWithIndex.filter(lambda x, i=i: x[1] % k == i) \
                               .repartition(parallelism) \
                               .map(lambda x: x[0])

            losses = np.append(losses, self.validate(val, model['w']))
            print(losses)

        mean_loss = losses.mean()
        print(mean_loss)

        return {'losses': losses, 'mean_loss': mean_loss, 'models': models}
            
            
            
        
            

In [None]:
logReg = ParallelLogReg(sc, dataUtil, 100, 0.1, 0.1)

In [None]:
logReg.train()

In [None]:
logReg.cross_validate(2)