In [2]:
import os

from pyspark import SparkContext


def readFile(filename):
    """
    Return an RDD containing the data of filename.
    Each example (row) of the file corresponds to one RDD record.
    Each record of the RDD is a tuple (X,y). “X” is an array containing the 11
    features (float number) of an example
    “y” is the 12th column of an example (integer 0/1)
    """

    current_directory = os.getcwd()
    parent_directory = os.path.dirname(current_directory)
    sc = SparkContext.getOrCreate()
    data = sc.textFile(parent_directory + "/" + filename)
    processed_data = data.map(lambda line: line.split(",")).map(
        lambda cols: ([float(x) for x in cols[:11]], float(cols[11]))
    )
    return processed_data


In [3]:
import numpy as np
from pyspark import SparkContext


def normalize(RDD_Xy):
    sc = SparkContext.getOrCreate()

    number_of_samples = RDD_Xy.count()

    # Function to compute sum and sum of squares for each feature
    def compute_sum_and_squares(record):
        X, _ = record
        return (np.array(X), np.array(X) ** 2)

    # Aggregate the sum and sum of squares for each feature, and count the examples
    sum_squares_count = RDD_Xy.map(compute_sum_and_squares).reduce(
        lambda a, b: (a[0] + b[0], a[1] + b[1])
    )

    # Calculate the mean and variance for each feature
    mean = sum_squares_count[0] / number_of_samples
    variance = (sum_squares_count[1] / number_of_samples) - mean**2
    std_dev = np.sqrt(variance)

    # Replace zeros in standard deviation with ones to avoid division by zero
    std_dev[std_dev == 0] = 1

    # Broadcast the mean and std_dev to all the nodes
    broadcast_mean = sc.broadcast(mean)
    broadcast_std_dev = sc.broadcast(std_dev)

    # Function to normalize features
    def normalize_features(record):
        X, y = record
        X_normalized = (X - broadcast_mean.value) / broadcast_std_dev.value
        return (X_normalized, y)

    # Normalize each feature and return the new RDD
    return RDD_Xy.map(normalize_features)


In [4]:
import random


def transform(RDD_Xy, num_blocks):
    # Function to assign a random index to each record
    def add_index(record):
        # Randomly select an index between 0 and num_blocks - 1
        index = random.randint(0, num_blocks - 1)
        # Return a new record with the index added
        return (record, index)

    # Map each record in the RDD to include a random fold index
    RDD_tranformed = RDD_Xy.map(add_index)
    return RDD_tranformed


In [5]:
def get_block_data(data_cv, block_numb):
    """
    Splits the input RDD into two RDDs based on the index value.

    :param data_cv: An RDD where each row is a list of size 11, y, and index.
    :param block_numb: The block number to split the RDD.
    :return: A tuple of two RDDs (tr_data, test_data).
    """

    # Split the data into two RDDs based on the index value

    def filter_block_train(record):
        if record[1] != block_numb:
            return [record[0]]
        else:
            return []

    def filter_block_test(record):
        if record[1] == block_numb:
            return [record[0]]
        else:
            return []

    tr_data = data_cv.flatMap(filter_block_train)
    test_data = data_cv.flatMap(filter_block_test)

    return tr_data, test_data


In [6]:
import numpy as np
from pyspark import SparkContext


def train(RDD_Xy, iterations, learning_rate, lambda_reg):
    sc = SparkContext.getOrCreate()

    # Number of features (assuming all records have the same number of features)
    k = len(RDD_Xy.first()[0])
    m = RDD_Xy.count()  # Total number of examples

    np.random.seed(0)  # For reproducibility
    w = np.random.rand(k)  # Weight vector
    b = np.random.rand()  # Bias term

    for i in range(iterations):
        # Broadcast weights and bias
        broadcast_w = sc.broadcast(w)
        broadcast_b = sc.broadcast(b)

        # Compute gradients
        gradients = RDD_Xy.map(
            lambda x: compute_gradients(x, broadcast_w.value, broadcast_b.value, k)
        ).reduce(lambda a, b: (a[0] + b[0], a[1] + b[1]))

        # Update weights and bias
        w -= learning_rate * ((1 / m) * gradients[0] + (lambda_reg / k) * w)
        b -= learning_rate * (
            (1 / m) * gradients[1] + ((lambda_reg / (2 * k)) * np.sum(w**2))
        )

        # Optional: Print cost for monitoring (not recommended for large datasets)
        cost = RDD_Xy.map(
            lambda x: compute_cost(x, broadcast_w.value, broadcast_b.value, k)
        ).reduce(lambda x, y: x + y)
        cost = (-1 / m) * cost
        cost += (lambda_reg / (2 * k)) * np.sum(w**2)
        print(f"Iteration {i}, Cost: {cost}")

    return w, b


def compute_gradients(record, w, b, k):
    X, y = record
    z = 0
    for i in range(k):
        z += X[i] * w[i]
    z += b
    y_hat = 1 / (1 + np.exp(-z))
    dw = (y_hat - y) * X
    db = np.sum(y_hat - y)
    return dw, db


def compute_cost(record, w, b, k):
    X, y = record
    z = 0
    for i in range(k):
        z += X[i] * w[i]
    z += b
    y_hat = 1 / (1 + np.exp(-z))
    cost = y * np.log(y_hat) + (1 - y) * np.log(1 - y_hat)
    return cost


In [7]:
def accuracy(w, b, RDD_xy):
    prediction_results = RDD_xy.map(
        lambda record: 1 if predict(w, b, record[0]) == record[1] else 0
    )

    # Step 2: Use reduce to sum up the correct predictions
    correctly_classified = prediction_results.reduce(lambda a, b: a + b)

    # Step 4: Calculate accuracy
    accuracy = correctly_classified / RDD_xy.count()
    return accuracy


In [8]:
import numpy as np


def predict(w, b, X):
    # Initialize the sum
    z = 0
    # Iterate over the weights and corresponding features
    for i in range(len(w)):
        z += w[i] * X[i]
        # Add the bias term
    z += b

    # compact way to calculate it z = np.dot(w, X) + b

    # Apply the sigmoid function to get the probability
    p = 1 / (1 + np.exp(-z))
    # Predict the class label (0 or 1) based on the probability
    if p >= 0.5:
        return 1
    else:
        return 0


In [9]:
import os
import sys


import pyspark
from pyspark import SparkContext


if __name__ == "__main__":
    os.environ["PYSPARK_PYTHON"] = sys.executable
    os.environ["PYSPARK_DRIVER_PYTHON"] = sys.executable



    sc = SparkContext.getOrCreate()

    # read data
    data = readFile("/data/botnet_tot_syn_l.csv")
    # standardize
    data = normalize(data)

    num_blocks_cv = 10
    # Shuffle rows and transfrom data
    data_cv = transform(data, num_blocks_cv)
    # optimize performance
    data_cv_cached = data_cv.cache()

    accuracies = []
    for i in range(num_blocks_cv):
        tr_data, test_data = get_block_data(data_cv_cached, i)
        # optimize performance
        tr_data_cached = tr_data.cache()
        test_data_cached = test_data.cache()
        weights, bias = train(tr_data_cached, 10, 1.5, 0.05)
        acc = accuracy(weights, bias, test_data_cached)
        accuracies.append(acc)
        print("accuracy:", acc)
        print("------------------------------------------------------")
    avg_acc = 0
    for a in accuracies:
        avg_acc += a
    print("average accuracy:", avg_acc / num_blocks_cv)


23/12/16 18:55:23 WARN BlockManager: Task 18 already completed, not releasing lock for rdd_5_0
                                                                                

Iteration 0, Cost: 1.505884341907097


                                                                                

Iteration 1, Cost: 0.745648693748846


                                                                                

Iteration 2, Cost: 0.44937269010142605


                                                                                

Iteration 3, Cost: 0.3420726405130148


                                                                                

Iteration 4, Cost: 0.29422503838917535


                                                                                

Iteration 5, Cost: 0.26839041019626075


                                                                                

Iteration 6, Cost: 0.25249610313120413


                                                                                

Iteration 7, Cost: 0.24182954662881495


                                                                                

Iteration 8, Cost: 0.23422777914829374


                                                                                

Iteration 9, Cost: 0.22856942375404316


                                                                                

accuracy: 0.9324629174901679
------------------------------------------------------


23/12/16 18:56:12 WARN BlockManager: Task 226 already completed, not releasing lock for rdd_31_0
                                                                                

Iteration 0, Cost: 1.5069533337175396


                                                                                

Iteration 1, Cost: 0.7455060464017758


                                                                                

Iteration 2, Cost: 0.44905505356062836


                                                                                

Iteration 3, Cost: 0.3417313548778674


                                                                                

Iteration 4, Cost: 0.2938725021842276


                                                                                

Iteration 5, Cost: 0.268033497737019


                                                                                

Iteration 6, Cost: 0.2521381434873756


                                                                                

Iteration 7, Cost: 0.24147187177869636


                                                                                

Iteration 8, Cost: 0.2338707935672121


                                                                                

Iteration 9, Cost: 0.22821312538975086


                                                                                

accuracy: 0.9299281851051326
------------------------------------------------------


23/12/16 18:57:00 WARN BlockManager: Task 434 already completed, not releasing lock for rdd_57_0
                                                                                

Iteration 0, Cost: 1.506683526659097


                                                                                

Iteration 1, Cost: 0.7455656507235368


                                                                                

Iteration 2, Cost: 0.44915211224199436


                                                                                

Iteration 3, Cost: 0.3418517175825034


                                                                                

Iteration 4, Cost: 0.294002177571909


                                                                                

Iteration 5, Cost: 0.26816513773921763


                                                                                

Iteration 6, Cost: 0.2522686012726715


                                                                                

Iteration 7, Cost: 0.24160010453579386


                                                                                

Iteration 8, Cost: 0.23399671295891655


                                                                                

Iteration 9, Cost: 0.22833702537308287


                                                                                

accuracy: 0.9303591509349468
------------------------------------------------------


23/12/16 18:57:52 WARN BlockManager: Task 642 already completed, not releasing lock for rdd_83_0
                                                                                

Iteration 0, Cost: 1.5065565223769908


                                                                                

Iteration 1, Cost: 0.7456045417654896


                                                                                

Iteration 2, Cost: 0.44908825803604696


                                                                                

Iteration 3, Cost: 0.34178207645366493


                                                                                

Iteration 4, Cost: 0.29396095237974684


                                                                                

Iteration 5, Cost: 0.2681423314607115


                                                                                

Iteration 6, Cost: 0.25225543304506864


                                                                                

Iteration 7, Cost: 0.24159168785480872


                                                                                

Iteration 8, Cost: 0.2339904371281086


                                                                                

Iteration 9, Cost: 0.22833148296952752


                                                                                

accuracy: 0.9314784637978555
------------------------------------------------------


23/12/16 18:58:41 WARN BlockManager: Task 850 already completed, not releasing lock for rdd_109_0
                                                                                

Iteration 0, Cost: 1.5062687519127558


                                                                                

Iteration 1, Cost: 0.7457370613386319


                                                                                

Iteration 2, Cost: 0.44924321503374887


                                                                                

Iteration 3, Cost: 0.3419460147522397


                                                                                

Iteration 4, Cost: 0.2941127626782691


                                                                                

Iteration 5, Cost: 0.2682802173846976


                                                                                

Iteration 6, Cost: 0.2523830068463666


                                                                                

Iteration 7, Cost: 0.24171235554930073


                                                                                

Iteration 8, Cost: 0.2341066036564616


                                                                                

Iteration 9, Cost: 0.22844473918427527


                                                                                

accuracy: 0.9317105815138137
------------------------------------------------------


23/12/16 18:59:29 WARN BlockManager: Task 1058 already completed, not releasing lock for rdd_135_0
                                                                                

Iteration 0, Cost: 1.5061716357401513


                                                                                

Iteration 1, Cost: 0.7455478711686634


                                                                                

Iteration 2, Cost: 0.44937357300553826


                                                                                

Iteration 3, Cost: 0.3421139873826824


                                                                                

Iteration 4, Cost: 0.2942567236597653


                                                                                

Iteration 5, Cost: 0.26840021434940886


                                                                                

Iteration 6, Cost: 0.25248399985304826


                                                                                

Iteration 7, Cost: 0.24179854950317378


                                                                                

Iteration 8, Cost: 0.23418123673894117


                                                                                

Iteration 9, Cost: 0.22851027371324636


                                                                                

accuracy: 0.9314245529435403
------------------------------------------------------


23/12/16 19:00:16 WARN BlockManager: Task 1266 already completed, not releasing lock for rdd_161_0
                                                                                

Iteration 0, Cost: 1.5057671085297977


                                                                                

Iteration 1, Cost: 0.7454007905240057


                                                                                

Iteration 2, Cost: 0.4490735089292751


                                                                                

Iteration 3, Cost: 0.34180523757127723


                                                                                

Iteration 4, Cost: 0.2939880657807989


                                                                                

Iteration 5, Cost: 0.2681703116194938


                                                                                

Iteration 6, Cost: 0.25228494043062666


                                                                                

Iteration 7, Cost: 0.24162295992522936


                                                                                

Iteration 8, Cost: 0.23402330790769008


                                                                                

Iteration 9, Cost: 0.22836564360816358


                                                                                

accuracy: 0.9310915270550726
------------------------------------------------------


23/12/16 19:01:04 WARN BlockManager: Task 1474 already completed, not releasing lock for rdd_187_0
                                                                                

Iteration 0, Cost: 1.5072514719350498


                                                                                

Iteration 1, Cost: 0.7454814737561628


                                                                                

Iteration 2, Cost: 0.4487435274935326


                                                                                

Iteration 3, Cost: 0.3415009346134133


                                                                                

Iteration 4, Cost: 0.2937353792150449


                                                                                

Iteration 5, Cost: 0.2679463433904322


                                                                                

Iteration 6, Cost: 0.25207575411955613


                                                                                

Iteration 7, Cost: 0.24142178759606267


                                                                                

Iteration 8, Cost: 0.23382681160017885


                                                                                

Iteration 9, Cost: 0.22817206333668863


                                                                                

accuracy: 0.9307894051932971
------------------------------------------------------


23/12/16 19:01:56 WARN BlockManager: Task 1682 already completed, not releasing lock for rdd_213_0
                                                                                

Iteration 0, Cost: 1.5063459821119392


                                                                                

Iteration 1, Cost: 0.745595715111863


                                                                                

Iteration 2, Cost: 0.449046454929582


                                                                                

Iteration 3, Cost: 0.34172315931580294


                                                                                

Iteration 4, Cost: 0.29387851954899363


                                                                                

Iteration 5, Cost: 0.26803995714493134


                                                                                

Iteration 6, Cost: 0.25213789458056196


                                                                                

Iteration 7, Cost: 0.24146276998868965


                                                                                

Iteration 8, Cost: 0.23385289264833115


                                                                                

Iteration 9, Cost: 0.22818730553718167


                                                                                

accuracy: 0.9303166242765305
------------------------------------------------------


23/12/16 19:02:46 WARN BlockManager: Task 1890 already completed, not releasing lock for rdd_239_0
                                                                                

Iteration 0, Cost: 1.506462323682711


                                                                                

Iteration 1, Cost: 0.7452683349949288


                                                                                

Iteration 2, Cost: 0.4489062491760523


                                                                                

Iteration 3, Cost: 0.34166411656006807


                                                                                

Iteration 4, Cost: 0.29383723439019677


                                                                                

Iteration 5, Cost: 0.26800553272012784


                                                                                

Iteration 6, Cost: 0.2521083972401415


                                                                                

Iteration 7, Cost: 0.2414374456760118


                                                                                

Iteration 8, Cost: 0.23383110691820616


                                                                                

Iteration 9, Cost: 0.2281684437300812


                                                                                

accuracy: 0.9304781928515201
------------------------------------------------------
average accuracy: 0.9310039601161879
