In [53]:
import math
import random
import pyspark
from itertools import chain
from pprint import pprint
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

global k_fold
global n_row
global y_h_m_y
global n_columns
global w
global THRESHOLD

sc = pyspark.SparkContext.getOrCreate()
TRAIN_TEST = 0.8
k_fold=2
b = 0
THRESHOLD = 0.5

n_columns=56 #total number of columns (58) - the one deleted (57) - label (56)
col_sums=[]
averages=[]
sigmas=[]
w=[]

def initializeAccumulators():
    global averages
    global col_sums
    global sigmas
    i=0
    averages=[]
    col_sums=[]
    sigmas=[]
    while(i<n_columns):
        averages.append(sc.accumulator(0))
        col_sums.append(sc.accumulator(0))
        sigmas.append(sc.accumulator(0)) 
        i+=1

def addToAccumulators(row):
    if(len(row)!=len(col_sums)):
        raise Exception("Number of columns in the row doesn't mach the number of accumulators initiated. Len row: "+str(len(row))+" n_accomulators: "+str(len(col_sums)))
    i=0
    while(i<n_columns):
        col_sums[i].add(row[i])
        i+=1
    print("Col sum len: " + str(len(col_sums)))
        
def preprocessing(row):
    split=row.split(" ")       #splitting
    label = int(split[57])
    del split[57] #label
    del split[56] #col 57th
    split=[float(col) for col in split]
    addToAccumulators(split)
    #assign random key to the datapoint
    key=random.getrandbits(64)
    #assign train/test 
    if(random.random()<TRAIN_TEST):
        train=1
    else:
        train=0    
    return (key,train,split,label)
def calcAvg(n_row):
    i=0
    while(i<n_columns):
        averages[i]=col_sums[i].value/n_row
        i+=1
def calcResiduals(row):
    global sigmas
    i=0
    while(i<n_columns):       
        sigmas[i].add(math.pow(row[2][i]-averages[i],2))
        i+=1
    return row
def calcSigmas(n_row):
    global sigmas
    i=0
    while(i<n_columns):
        sigmas[i]=math.sqrt(sigmas[i].value/float(n_row-1))
        i+=1

def normalize(row):
    i=0
    while(i<n_columns):
        row[2][i]=(row[2][i]-averages[i])/sigmas[i]
        i+=1
    return row

def initializeWeights(random_init=False):
    if(random_init):
        #return sc.parallelize([(i, random.random()) for i in range(0,n_columns)])
        return [random.random() for i in range(0,n_columns)]
    else:
        #return sc.parallelize([(i, 0.0) for i in range(0,n_columns)])
        return [0.0 for i in range(0,n_columns)]

def initializeBias():
    return sc.parallelize(0.0)

def sigmoid(z):
    return 1/(1+math.exp(-z))

def predict(w,b,X):
    return sigmoid(sum([X[i]*w[i] for i in range(len(w))])+b)

def classify_prediction(pred_probability):
    if (pred_probability >= THRESHOLD):
        return 1
    return 0

def predict_parallel(w,b,X):
    #Change X representation
    X = sc.parallelize(X).zipWithIndex().map(lambda x: (x[1],x[0]))
    wX_plus_b=X.join(w).map(lambda x: (x[1][0]*x[1][1])+b).sum()
    return sigmoid(wX_plus_b)

def compute_cost(dataset,w,b,lambda_reg, print_stats=False):
    cost = (-1/dataset.count())*dataset.map(lambda x: x[3]*math.log(predict(w,b,x[2])) \
                            +(1-x[3])*math.log(1-predict(w,b,x[2]))).sum() \
                            + lambda_reg/(2*dataset.count())*sum([i*i for i in w]);
    if (print_stats):
        stats = {"TP":0, "TN":0, "FP":0, "FN":0}
        statsMapping = dataset.map(lambda x: 2*(classify_prediction(predict(w, b, x[2])) - x[3]) + x[3])  #this function maps all 4 possible variations to an integer, from -1 to 3
        stats["TP"] = statsMapping.filter(lambda x: x == 1).count()
        stats["TN"] = statsMapping.filter(lambda x: x == 2).count()
        stats["FP"] = statsMapping.filter(lambda x: x == -1).count()
        stats["FN"] = statsMapping.filter(lambda x: x == 0).count()
        precision = stats["TP"]/(stats["TP"] + stats["FP"])
        recall = stats["TP"]/(stats["TP"] + stats["FN"])
        print("Precision: " + str(precision))
        print("Recall: " + str(recall))
        print("F1 score: " + str(2*(precision*recall)/(precision + recall)))
    return cost

def make_folds(x):
    return (x[0],int(random.random()*10-1),x[1],x[2])

def gradientDescent(iterations,train,w,number_samples,lambda_reg,learning_rate,b):
    costs=[]
    for iteration in range(iterations):
        w,b = gradientDescentIteration(train,w,number_samples,lambda_reg,learning_rate,b)
        cost=compute_cost(train,w,b,lambda_reg)
        costs.append(cost)
        print("-> Iteration done: "+str(iteration+1)+" of "+str(iterations)+". Cost: "+str(cost))
    return w,b,costs
    

def gradientDescentIteration(train,w,number_samples,lambda_reg,learning_rate,b):
    dw=[i for i in range(0,n_columns)]
    j = 0
    for j in range(n_columns):
        X_j=train.map(lambda x: (predict(w,b,x[2])-x[3])*x[2][j]).sum()
        dw[j]=(1/number_samples)*X_j+(lambda_reg/number_samples)+w[j]
        w[j]-=learning_rate*dw[j]
    b-=learning_rate*(1/number_samples)*train.map(lambda x: predict(w,b,x[2])-x[3]).sum()
    return w,b


def kFoldsCV(k_fold,iterations,train,lambda_reg,learning_rate):
    fold_length=train.count()/k_fold
    train_errors_fold=[]
    test_errors_fold = []
    for i_fold in range(k_fold):
        w=initializeWeights()
        starting_fold=fold_length*i_fold
        end_fold=starting_fold+fold_length
        test_fold=train.zipWithIndex().filter(lambda t: (t[1]>=starting_fold and t[1]<end_fold)).map(lambda t: t[0]) #the map get rid of the index again
        train_fold=train.zipWithIndex().filter(lambda t: t[1]<starting_fold or t[1]>=end_fold).map(lambda t: t[0])
        train_fold_size=train_fold.count()
        b=0
        #Gradient descent
        w, b, train_errors = gradientDescent(iterations,train_fold,w,train_fold_size,lambda_reg,learning_rate,b)
        train_errors_fold.append(train_errors)
        train_errors_flattened =list(chain.from_iterable(train_errors_fold))
        test_error = compute_cost(test_fold, w, b, lambda_reg)
        test_errors_fold.append(test_error)
        print("--> Fold #"+str(i_fold+1)+" of "+str(k_fold)+" is done. Train error: " \
              +str(sum(train_errors_flattened)/((i_fold + 1) * iterations)) \
              + " Test error: " + str(sum(test_errors_fold)/((i_fold+ 1) * iterations)))
    return w,b,train_errors_fold, test_errors_fold


def trainTestSplit(dataset):
    dataset=dataset.map(normalize)
    train=dataset.filter(lambda x: x[1]==1)
    train_size=train.count()
    test=dataset.filter(lambda x: x[1]==0)
    return train, test

def testParameters(test, w, b):
    test_error = test.map(lambda x: ())

def train(filename, iterations, learning_rate, lambda_reg, cv=False):
    global w
    global b
    w = []
    initializeAccumulators()
    w = initializeWeights()
    dataset=sc.textFile(filename).map(preprocessing).sortBy(lambda x: x[0])
    n_row=dataset.count()
    calcAvg(n_row)
    gg = dataset.map(calcResiduals).collect()
    calcSigmas(n_row)
    train, test = trainTestSplit(dataset)
    print("Split train/test done. Train contains "+str(train.count())+" elements, Test contains "+str(test.count())+" elements")
    if (cv):
        w, b, train_errors_fold, test_errors_fold = kFoldsCV(k_fold,iterations,train,lambda_reg,learning_rate)
        train_errors_fold =list(chain.from_iterable(train_errors_fold))
        average_train_error = sum(train_errors_fold)/(iterations*k_fold)
        average_test_error = sum(test_errors_fold)/(iterations*k_fold)
        print("---> "+str(k_fold)+"-fold validation done.  Train error: " \
              +str(average_train_error) \
              + " Test error: " + str((average_test_error)))
        return average_test_error
    else:                                                  #Testing the parameters and weights on the actual test set
        w,b, train_errors = gradientDescent(iterations, train, w, train.count(), lambda_reg, learning_rate, b)
        average_train_error = sum(train_errors)/iterations
        average_test_error = compute_cost(test, w, b, lambda_reg, True)
        print("---> Model done. Train error: " \
              +str(average_train_error) \
              + " Test error: " + str((average_test_error)))
        return average_test_error
            

grid=[]
grid+=[[(i*0.4,i*0.01),(i*0.002,i*0.001)] for i in range(1,2)]
grid=list(chain.from_iterable(grid))
grid_results = []

for par in grid:
    print("|---- Starting training for parameters (learning_rate,lambda_reg) = "+str(par)+" ----|")
    grid_results.append((par, train("dataset/spam.data",8,par[0],par[1], True)))
    print("--------------------------------------------------\n")
    print(grid_results)
    
best_par = sorted(grid_results, key=lambda x: x[1])[0]
print("Best parameters: " + str(best_par))
smallest_error = train("dataset/spam.data",1,best_par[0][0],best_par[0][1])
print("Best performance: " + str(smallest_error))
    

|---- Starting training for parameters (learning_rate,lambda_reg) = (0.4, 0.01) ----|
Split train/test done. Train contains 3700 elements, Test contains 901 elements
-> Iteration done: 1 of 8. Cost: 0.5693881269216057
-> Iteration done: 2 of 8. Cost: 0.5410520101690167
-> Iteration done: 3 of 8. Cost: 0.5299364276860912
-> Iteration done: 4 of 8. Cost: 0.525229185806913


KeyboardInterrupt: 

In [None]:

grid