In [1]:
import pyspark
import numpy as np
import pandas as pd
import time
import sys

sc = pyspark.SparkContext('local[*]')

In [2]:
sc.defaultParallelism

2

In [3]:
class DataUtil:
    def __init__(self, sc, inputPath='data/spam.data.txt', statsInputPath='data/mean_std.txt', standardize=True):
        self.inputPath = inputPath
        self.standardize = standardize
        self.mean_std = pd.read_csv(statsInputPath, delimiter=',', header=None)
        self.mean = sc.broadcast(self.mean_std.iloc[:,0].values.astype(float))
        self.std = sc.broadcast(self.mean_std.iloc[:,1].values.astype(float))
        
        
    def read(self, sc):
        if self.standardize:
            return sc.textFile(self.inputPath).map(lambda x: np.asarray(x.split(' ')).astype(float)) \
                    .map(lambda x: (x[:56], x[57])) \
                    .map(lambda x: ((x[0] - self.mean.value[:56])/self.std.value[:56], x[1]))
        else:
            return sc.textFile(self.inputPath).map(lambda x: np.asarray(x.split(' ')).astype(float)) \
                    .map(lambda x: (x[:56], x[57]))

In [4]:
spam_dataset = DataUtil(sc, 'data/spam.data.txt', 'data/mean_std.txt', True).read(sc)
train_set, test_set = spam_dataset.randomSplit(weights=[0.8, 0.2], seed=1)

print("Whole dataset: {}, {}".format(spam_dataset.count(), len(spam_dataset.first()[0])))
print("Train dataset: {}, {}".format(train_set.count(), len(train_set.first()[0])))
print("Test dataset: {}, {}".format(test_set.count(), len(test_set.first()[0])))

Whole dataset: 4601, 56
Train dataset: 3675, 56
Test dataset: 926, 56


In [5]:
def sigmoid(z):
    return 1 / (1 + np.exp(-z))

def cost_function(y, h):
    eps = sys.float_info.epsilon
    return y * np.log(h + eps) + (1 - y) * np.log(1 - h + eps)

def stats(data):
    def f(y, pred):
        if y == pred:
            return 1,0,0,0 if y == 1 else 0,1,0,0
        elif pred == 1:
            return 0,0,1,0
        else:
            return 0,0,0,1
    tp, tn, fp, fn = data.map(lambda x: f(x[2], x[4])).reduce(lambda a, b: tuple(map(sum, zip(a, b))))
    precision = tp / (tp + fp)
    recall = tp / (tp + fn)
    f1 = 2 * precision * recall / (precision + recall)
    accuracy = (tp + tn) / (tp + tn + fp + fn)
    return precision, recall, f1, accuracy

In [6]:
class ParallelLogReg():
    
    def __init__(self, data, iterations, learning_rate, lambda_reg):
        self.data = data
        self.numberObservations = self.data.count()
        self.numberFeatures = len(self.data.first()[0])
        self.iterations = iterations
        self.lr = learning_rate
        self.lambda_reg = lambda_reg
        
    def __add_intercept(self, rdd):
        return rdd.map(lambda x: (1, x[0], x[1]))
        
    def train(self, train_rdd=None, SGD=False, SGD_pct=0.5, threshold=0.5):
        eps = sys.float_info.epsilon
        
        # initialize the weights
        # w[0]: bias weight
        # w[1]: rest of the weights
        w = (0, np.zeros(self.numberFeatures))
        
        if train_rdd == None:
            data = self.__add_intercept(self.data)
        else:
            data = self.__add_intercept(train_rdd)

        # initialize prediction to rdd
        # x[0]: bias/intercept
        # x[1]: rest features
        # x[2]: true y
        # adding x[3]: predicted y
        data = data.map(lambda x: (x[0], x[1], x[2], sigmoid(w[1].dot(x[1]) + w[0] * x[0])))
        numObs = data.count()
        
        train = data.cache()
        
        for i in range(self.iterations):
            start = time.time()
            
            # sample for SGD
            if SGD:
                train = data.sample(False, SGD_pct).repartition(sc.defaultParallelism).cache()
                if i == 0:
                    numObs = train.count()
            
            # compute derivatives
            temp = train.map(lambda x: ((x[3] - x[2]) * x[0], (x[3] - x[2]) * x[1])) \
                       .reduce(lambda a,b: (a[0] + b[0], a[1] + b[1]))
            dw = (temp[0]/numObs, (temp[1]/numObs) + (self.lambda_reg/numObs) * w[1])
            
            # update weights
            w = (w[0] - self.lr * dw[0], w[1] - self.lr * dw[1])
            
            # update prediction
            train = train.map(lambda x: (x[0], x[1], x[2], sigmoid(w[1].dot(x[1]) + w[0] * x[0]))).cache()
            
            if (i%10 == 0):
                end = time.time()
                
                model_stats = self.validate(train, w, add_intercept=False)

                print("Iteration: " + str(i) + ", Total Time: " + str(end - start))
                print('Precision: {}, Recall: {}, F1: {}, Accuracy: {}'.format(model_stats['precision'], model_stats['recall'], model_stats['f1'], model_stats['accuracy']))
                print('Current loss: {}\n'.format(model_stats['loss']))
                
        model_stats = self.validate(data, w, add_intercept=False)
        
        print('Training stats:')
        print('Precision: {}, Recall: {}, F1: {}, Accuracy: {}'.format(model_stats['precision'], model_stats['recall'], model_stats['f1'], model_stats['accuracy']))
        print('Final training loss: {}\n'.format(model_stats['loss']))
            
        return {**model_stats, 'w': w}
    
    def validate(self, val_rdd, w, threshold=0.5, add_intercept=True):
        if add_intercept:
            val_rdd = self.__add_intercept(val_rdd)
        
        numObs = val_rdd.count()
        
        # update prediction
        val_rdd = val_rdd.map(lambda x: (x[0], x[1], x[2], sigmoid(w[1].dot(x[1]) + w[0] * x[0])))
        
        loss = val_rdd.map(lambda x: cost_function(x[2], x[3])) \
                    .reduce(lambda a,b: a + b)
        loss = -(1/numObs) * loss + (self.lambda_reg/(2*numObs)) * np.sum(w[1]**2)
        
        # calculate stats
        val_rdd = val_rdd.map(lambda x: (x[0], x[1], x[2], x[3], 1 if x[3] >= threshold else 0)).cache()
        precision, recall, f1, accuracy = stats(val_rdd)
        return {'loss': loss, 'precision': precision, 'recall': recall, 'f1': f1, 'accuracy': accuracy}
    
    def cross_validate(self, k, SGD=False, SGD_pct = 0.5):
        
        if k == 1:
            print("Please choose a k > 1. Or use regular train function")
            return
        
        dataWithIndex = self.data.zipWithIndex();
        
        model_stats = []
        models = []
        
        for i in range(k):
            print("Fold: " + str(i))
            
            fold = dataWithIndex.filter(lambda x: x[1] % k != i).repartition(sc.defaultParallelism) \
                                .map(lambda x: x[0])
            
            model = self.train(fold, SGD, SGD_pct)
            models.append(model)
            
            val = dataWithIndex.filter(lambda x: x[1] % k == i).repartition(sc.defaultParallelism) \
                                .map(lambda x: x[0])
            
            val_stats = self.validate(val, model['w'])
            model_stats.append(val_stats)
            print('Validation stats:')
            print('Precision: {}, Recall: {}, F1: {}, Accuracy: {}'.format(val_stats['precision'], val_stats['recall'], val_stats['f1'], val_stats['accuracy']))
            print('Loss: {}\n'.format(val_stats['loss']))

        avg_model_stats = {}
        for key in model_stats[0].keys():
            avg_model_stats['avg_' + key] = sum(stat[key] for stat in model_stats) / len(model_stats)

        print('Averaged validation stats:')
        print('Precision: {}, Recall: {}, F1: {}, Accuracy: {}'.format(avg_model_stats['avg_precision'], avg_model_stats['avg_recall'], avg_model_stats['avg_f1'], avg_model_stats['avg_accuracy']))
        print('Loss: {}'.format(avg_model_stats['avg_loss']))

In [31]:
logReg = ParallelLogReg(data=train_set, iterations=50, learning_rate=0.1, lambda_reg=300)

In [32]:
logReg.cross_validate(2)

Fold: 0
Iteration: 0, Total Time: 0.15577244758605957
Precision: 0.941609977324263, Recall: 0.9579008073817762, F1: 0.949685534591195, Accuracy: 0.9041916167664671
Current loss: 0.6480631987559956

Iteration: 10, Total Time: 0.17256736755371094
Precision: 0.953062392673154, Recall: 0.9487179487179487, F1: 0.950885208452313, Accuracy: 0.9063690800217746
Current loss: 0.49338804081386234

Iteration: 20, Total Time: 0.17585110664367676
Precision: 0.9585730724971231, Recall: 0.9439093484419263, F1: 0.9511846988295746, Accuracy: 0.9069134458356015
Current loss: 0.464560669047611

Iteration: 30, Total Time: 0.16466712951660156
Precision: 0.9611594202898551, Recall: 0.9367231638418079, F1: 0.9487839771101574, Accuracy: 0.9025585193249864
Current loss: 0.45536512999974565

Iteration: 40, Total Time: 0.17987751960754395
Precision: 0.9621432731508445, Recall: 0.9322799097065463, F1: 0.946976210948696, Accuracy: 0.899292324442025
Current loss: 0.45139675760077885

Training stats:
Precision: 0.965

In [33]:
model = logReg.train()

Iteration: 0, Total Time: 0.677819013595581
Precision: 0.9399373754625676, Recall: 0.9532332563510393, F1: 0.9465386269170131, Accuracy: 0.8985034013605442
Current loss: 0.6494327684151125

Iteration: 10, Total Time: 0.18222594261169434
Precision: 0.9535150645624103, Recall: 0.9459151722174779, F1: 0.9496999142612174, Accuracy: 0.90421768707483
Current loss: 0.4760169922926443

Iteration: 20, Total Time: 0.19081568717956543
Precision: 0.9582132564841499, Recall: 0.9419263456090652, F1: 0.9500000000000001, Accuracy: 0.9047619047619048
Current loss: 0.4327974887610755

Iteration: 30, Total Time: 0.23614120483398438
Precision: 0.9621168305378832, Recall: 0.9387697516930023, F1: 0.950299914310197, Accuracy: 0.9053061224489796
Current loss: 0.41616010249959595

Iteration: 40, Total Time: 0.18167829513549805
Precision: 0.9655072463768116, Recall: 0.936726659167604, F1: 0.9508992292320867, Accuracy: 0.9063945578231293
Current loss: 0.4083520026880451

Training stats:
Precision: 0.966637655932

In [34]:
logReg.validate(test_set, model['w'])

{'loss': 0.6036606155393682,
 'precision': 0.9436619718309859,
 'recall': 0.9157175398633257,
 'f1': 0.9294797687861273,
 'accuracy': 0.8682505399568035}