In [None]:
import warnings
warnings.filterwarnings("ignore")
import findspark
findspark.init()
from pyspark import SparkContext
import numpy as np
sc = SparkContext("local[*]", "Name of the Program")


In [203]:
def readFile(filename):
    rdd = sc.textFile(filename)
    rdd_numpy = rdd.map(lambda x : np.array(x.split(","), dtype = float))
    rdd_return = rdd_numpy.map(lambda x : (x[:-1], int(x[-1])))
    return rdd_return

def normalize(RDD_Xy):
    
    suma = RDD_Xy.reduce(lambda x,y: (np.array(x[0])+np.array(y[0]),x[1]))
    total_filas = float(RDD_Xy.count())
    media = suma[0] / total_filas

    rdd_2 = RDD_Xy.map(lambda x: (x[0],x[1], media)) 

    restas = rdd_2.map(lambda x: (x[0]-x[2])**2)
    sumas = restas.reduce(lambda x,y: x+y)
    desviacion = (sumas / total_filas)**0.5

    def f(x):
        a = (x[0] - media) / desviacion
        a= np.nan_to_num(a, copy=False,nan=0.0)
        return (a,x[1])

    normalizado = RDD_Xy.map(f)
    return normalizado

def predict(w,b,X,threshold=0.5):
    rdd=X.map(lambda x:np.dot(w,np.array(x[0])+b))
    logistic= rdd.map(lambda x: 1/(1+np.exp(-x)))
    results=logistic.map(lambda x: 1 if x>threshold else 0)
    return results

def get_predictions_targets_inputs(w,b,X,threshold=0.5):
    """
    Get the predictions, targets and inputs
    :param w: weights
    :param b: bias
    :param X: RDD with the data
    :param threshold: threshold to classify the data
    :return: RDD with the tuple predictions, targets and inputs
    """
    rdd=X.map(lambda x:(np.dot(w,np.array(x[0])+b),x[1],x[0]))
    logistic= rdd.map(lambda x: (1/(1+np.exp(-x[0])),x[1],x[2]))
    results=logistic.map(lambda x: ((0.999,x[1],x[2]) if x[0]>threshold else (0.001,x[1],x[2]))) #para evitar log(0)=-inf y log(1)=0
    return results

# TODO preguntar si la b se incluye en la perdida, porque en la formula del enunciado no aparece

def calculate_loss_function(predictions_and_targets,lambda_reg,w):   
    """
    Calculate the loss function for logistic regression
    :param predictions_and_targets: RDD with the predictions and the targets
    :param lambda_reg: regularization parameter
    :param w: weights
    :return: loss function
    """


    def f(x):
        return (-x[1]*np.log(x[0])-(1-x[1])*np.log(1-x[0]))

    loss=predictions_and_targets.map(f)
    suma=loss.reduce(lambda x,y: x+y)/predictions_and_targets.count()
    squared_w=np.dot(w,w)
    squared_w=squared_w*lambda_reg/(2*len(w))
    return suma+squared_w



# TODO pregunta si el for de los gradientes se puede hacer así

def train(RDD_Xy, iterations, learning_rate, lambda_reg):
    w = np.random.rand(11)
    b=np.random.rand(1)
    for iteration in range(iterations):
        predictions_targets_inputs=get_predictions_targets_inputs(w,b,RDD_Xy)
        for i in range(len(w)):
            gradient=predictions_targets_inputs.map(lambda x: (x[0]-x[1])*x[2][i]).reduce(lambda x,y: x+y)/predictions_targets_inputs.count()
            gradient=gradient+(lambda_reg*w[i])/len(w)
            w[i]=w[i]-learning_rate*gradient
        b_gradient=predictions_targets_inputs.map(lambda x: x[0]-x[1]).reduce(lambda x,y: x+y)/predictions_targets_inputs.count()
        b=b-learning_rate*b_gradient
        loss=calculate_loss_function(predictions_targets_inputs,lambda_reg,w)
        print("Iteration: ",iteration," Loss: ",loss)

    return w,b

def accuracy(w,b,RDD_Xy):
    predictions_and_targets=get_predictions_targets_inputs(w,b,RDD_Xy)
    correct=predictions_and_targets.map(lambda x: 1 if round(x[0])==x[1] else 0)
    correct=correct.reduce(lambda x,y: x+y)
    return correct/RDD_Xy.count()

In [193]:
n = readFile('data/botnet_reduced_10k_l.csv')

In [194]:
normalizado = normalize(n)

In [207]:
w,b=train(normalizado,10,1.5,0)

                                                                                

Iteration:  0  Loss:  2.399025759480246


                                                                                

Iteration:  1  Loss:  1.3360761990462704


                                                                                

Iteration:  2  Loss:  0.8305017492493122


                                                                                

Iteration:  3  Loss:  0.6909853027205992


                                                                                

Iteration:  4  Loss:  0.5991254641645511
Iteration:  5  Loss:  0.5452527768910823


                                                                                

Iteration:  6  Loss:  0.5100283275199671


                                                                                

Iteration:  7  Loss:  0.495524142484798
Iteration:  8  Loss:  0.4761852291045759
Iteration:  9  Loss:  0.46099036859154585


In [208]:
acc=accuracy(w,b,normalizado)

In [209]:
print(f"accuracy: {acc}")

accuracy: 0.9334
