# PROYECTO

In [1]:
import gzip
import math
import os
import random
import re
import sys
from enum import Enum
from io import StringIO
from time import time

import matplotlib.pyplot as pyplot
import numpy
import pandas
import requests
# silence pandas' chained-assignment warning for the whole notebook
pandas.options.mode.chained_assignment = None

PRNG_SEED = 42

# Seed every generator the notebook draws from. numpy drives the sampling
# masks and the initial velocities; the stdlib `random` module picks the start
# particles and the mutation values. Seeding numpy alone left those parts
# different on every run.
numpy.random.seed(PRNG_SEED)
random.seed(PRNG_SEED)


# create the 'data' cache directory; `exist_ok` makes this a no-op on re-runs
# (the previous `os.path.makedirs` does not exist and raised AttributeError)
os.makedirs('data', exist_ok=True)

DATASET_COLUMNS_FILE = os.path.join("data", "kddcup1999_columns.txt")

# download the column description once and cache it locally
if not os.path.exists(DATASET_COLUMNS_FILE):
    # the timeout keeps an unresponsive server from hanging the notebook
    with requests.get("http://kdd.ics.uci.edu/databases/kddcup99/kddcup.names", timeout=60) as request:
        # refuse to cache an error page: a bad cached file would be reused on
        # every later run because the download is skipped once the file exists
        if request.status_code != 200:
            raise RuntimeError(f"failed to download column names: HTTP {request.status_code}")
        with open(DATASET_COLUMNS_FILE, 'wb') as file:
            file.write(request.content)


# the two kinds of column declared in the column description file
ColumnType = Enum('ColumnType', 'SYMBOLIC CONTINUOUS')
# maps column name -> ColumnType, in file order
column_types = {}

with open(DATASET_COLUMNS_FILE, 'r') as file:
    column_labels: str = file.read()

# each description line looks like "<column_name>: <data_type>."
column_regex: re.Pattern = re.compile(r"^(?P<column_name>\w+): (?P<data_type>\w+)\.$")
# the first line is skipped: it does not describe a column
# (presumably it lists the outcome labels; verify against the file)
for column_type in column_labels.splitlines()[1:]:
    if not column_type.strip():
        # tolerate blank lines instead of failing on them
        continue
    match = column_regex.match(column_type)
    if match is None:
        # fail with the offending line rather than an AttributeError on None
        raise ValueError(f"unrecognised column description in {DATASET_COLUMNS_FILE}: {column_type!r}")
    column_types[match.group("column_name")] = ColumnType[match.group("data_type").upper()]


DATASET_URL = "http://kdd.ics.uci.edu/databases/kddcup99/kddcup.data_10_percent.gz"
DATASET_FILE = os.path.join("data", "kddcup1999.data_10_percent.csv")

# download dataset if not already cached
if not os.path.exists(DATASET_FILE):
    # the timeout keeps an unresponsive server from hanging the notebook
    with requests.get(DATASET_URL, timeout=60) as response:
        if response.status_code != 200:
            raise RuntimeError(f"failed to download dataset: {DATASET_URL}")
        # decompress BEFORE creating the cache file: if decompression fails,
        # no empty file is left behind to be mistaken for a valid cache
        dataset_bytes = gzip.decompress(response.content)
        with open(DATASET_FILE, 'wb') as file:
            file.write(dataset_bytes)

# The CSV has no header row: `header=None` stops pandas from consuming the
# first record as one, and `names` labels the columns while reading
# (every described column, followed by the outcome label).
dataframe = pandas.read_csv(
    DATASET_FILE,
    header=None,
    names=[*column_types.keys(), "outcome"],
)

# show at most 10 rows when a frame is displayed
pandas.set_option('display.max_rows', 10)

# drop exact duplicate records; later cells sample from this frame
unique_dataframe = dataframe.drop_duplicates()

# one-hot encode the symbolic columns
# NOTE(review): encoded_dataframe is only displayed below; no later cell in
# the visible notebook reads it.
encoded_dataframe = pandas.get_dummies(
    unique_dataframe,
    # all categorical columns
    columns=[column_name for column_name, column_type in column_types.items() if column_type == ColumnType.SYMBOLIC],
    # drop_first=True omits the first level of every encoded column (k-1
    # indicator columns per k-level feature); get_dummies already replaces the
    # original columns regardless of this flag
    drop_first=True,
)

# NOTE(review): repeats the option already set at the top of this block
pandas.set_option('display.max_rows', 10)
encoded_dataframe



Unnamed: 0,duration,src_bytes,dst_bytes,wrong_fragment,urgent,hot,num_failed_logins,num_compromised,root_shell,su_attempted,...,flag_RSTR,flag_S0,flag_S1,flag_S2,flag_S3,flag_SF,flag_SH,land_1,logged_in_1,is_guest_login_1
0,0,181,5450,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,1,0
1,0,239,486,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,1,0
2,0,235,1337,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,1,0
3,0,219,1337,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,1,0
4,0,217,2032,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
494016,0,310,1881,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,1,0
494017,0,282,2286,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,1,0
494018,0,203,1200,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,1,0
494019,0,291,1200,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,1,0


In [2]:
# Keep each de-duplicated row with probability 0.1.
# NOTE(review): `dff` is not read again in the visible notebook (get_df draws
# its own sample), but this call still advances numpy's global generator, so
# removing the cell would change every later random draw.
msk = numpy.random.rand(len(unique_dataframe)) < 0.1
dff = unique_dataframe[msk]

In [3]:
# Feature columns used by the normalisation, the fitness function and both
# optimisers. Earlier candidate selections are kept below for reference.
#cols_to_use = ["service", "src_bytes", "dst_bytes", "rerror_rate", "dst_host_srv_count", "dst_host_diff_srv_rate"]
#col2u=[column_name for column_name, column_type in column_types.items() if column_type == ColumnType.CONTINUOUS]
#print(col2u)
col2u = [
    "src_bytes",
    "dst_bytes",
    "rerror_rate",
    "dst_host_srv_count",
    "dst_host_diff_srv_rate",
]

In [4]:
def get_df(source=None, columns=None, sample_fraction=0.1):
    """Draw a random sample of the data and min-max scale its feature columns.

    Every row is kept independently with probability ``sample_fraction``. The
    draw uses numpy's global generator, so it is governed by
    ``numpy.random.seed``. The selected columns are converted to float and
    rescaled to ``[0, 1]`` using the minimum and maximum found in the sample.
    A column that is constant within the sample becomes all zeros.

    The scaling is done with whole-column arithmetic on an explicit copy. The
    previous cell-by-cell chained assignment (``df[col][label] = ...``) is a
    silent no-op when pandas copy-on-write is active, and it also read a loop
    variable before the loop that defines it.

    Args:
        source: frame to sample from; defaults to the notebook-level
            ``unique_dataframe``.
        columns: names of the columns to rescale; defaults to the
            notebook-level ``col2u``.
        sample_fraction: probability of keeping each row.

    Returns:
        A new DataFrame holding the sampled rows with the feature columns
        rescaled; ``source`` itself is left untouched.
    """
    if source is None:
        source = unique_dataframe
    if columns is None:
        columns = col2u
    columns = list(columns)

    keep = numpy.random.rand(len(source)) < sample_fraction
    # explicit copy: the result must not be a view onto the source frame
    sample = source[keep].copy()

    features = sample[columns].astype(float)
    lowest = features.min()
    spread = features.max() - lowest
    # a zero range would divide by zero; divide by 1 so the column becomes 0
    spread = spread.where(spread != 0, 1.0)
    sample[columns] = (features - lowest) / spread

    return sample

In [5]:
# swarm / population size
# NOTE(review): the optimiser cell further down assigns num_particles = 16 again
num_particles = 16
# NOTE(review): num_classes is not read anywhere in the visible notebook;
# presumably the number of prototypes per particle (normal vs. attack)
num_classes = 2

In [6]:
#fitness_func([unique_dataframe.iloc[0], unique_dataframe.iloc[7448]])

In [7]:
#start_particle = [unique_dataframe.iloc[0], unique_dataframe.iloc[7448]]

In [8]:
# Disabled cell: the code below is wrapped in a string literal and never runs.
# The same normalisation logic appears inside get_df().
"""
pandas.options.mode.chained_assignment = None
min_col = {}
max_col = {}
for index, row in df.iterrows():
    for col in col2u:
        if col not in min_col or min_col[col] > row[col]:
            min_col[col] = row[col]
        if col not in max_col or max_col[col] < row[col]:
            max_col[col] = row[col]

for i in range(len(df)):
    row = df[col][df.iloc[i].name]
    for col in col2u:
        #print(row[col])
        #tmp = float(abs(float(df[col][df.iloc[i].name]-min_col[col])) / (abs(float(max_col[col] - min_col[col])) if abs(max_col[col] - min_col[col]) else 1))
        #print(tmp)
        df[col][df.iloc[i].name] = float(abs(float(df[col][df.iloc[i].name]-min_col[col])) / (abs(float(max_col[col] - min_col[col])) if abs(max_col[col] - min_col[col]) else 1))
        #print(row[col])
"""

'\npandas.options.mode.chained_assignment = None\nmin_col = {}\nmax_col = {}\nfor index, row in df.iterrows():\n    for col in col2u:\n        if col not in min_col or min_col[col] > row[col]:\n            min_col[col] = row[col]\n        if col not in max_col or max_col[col] < row[col]:\n            max_col[col] = row[col]\n\nfor i in range(len(df)):\n    row = df[col][df.iloc[i].name]\n    for col in col2u:\n        #print(row[col])\n        #tmp = float(abs(float(df[col][df.iloc[i].name]-min_col[col])) / (abs(float(max_col[col] - min_col[col])) if abs(max_col[col] - min_col[col]) else 1))\n        #print(tmp)\n        df[col][df.iloc[i].name] = float(abs(float(df[col][df.iloc[i].name]-min_col[col])) / (abs(float(max_col[col] - min_col[col])) if abs(max_col[col] - min_col[col]) else 1))\n        #print(row[col])\n'

In [9]:
# normalised random sample; read as a global by fitness_func and get_start_particles
df = get_df()

In [10]:
# Disabled cell: the code below is wrapped in a string literal and never runs.
# The same split into normal / not-normal rows appears in get_start_particles().
"""
not_normal_out = []
normal_out = []
for index, row in df.iterrows():
    if row["outcome"] != "normal.":
        not_normal_out.append(row[col2u+['outcome']])
    else:
        normal_out.append(row[col2u+['outcome']])
"""

'\nnot_normal_out = []\nnormal_out = []\nfor index, row in df.iterrows():\n    if row["outcome"] != "normal.":\n        not_normal_out.append(row[col2u+[\'outcome\']])\n    else:\n        normal_out.append(row[col2u+[\'outcome\']])\n'

In [11]:
# Disabled cell: the code below is wrapped in a string literal and never runs.
# The same particle construction appears in get_start_particles().
"""
num_particles = 15
start_particles = []
import random
for _ in range(num_particles):
    p = []
    normal_index = normal_out[random.randint(0,len(normal_out))]
    not_normal_index = not_normal_out[random.randint(0,len(not_normal_out))]
    p.append(normal_index)
    p.append(not_normal_index)
    start_particles.append(p)
"""

'\nnum_particles = 15\nstart_particles = []\nimport random\nfor _ in range(num_particles):\n    p = []\n    normal_index = normal_out[random.randint(0,len(normal_out))]\n    not_normal_index = not_normal_out[random.randint(0,len(not_normal_out))]\n    p.append(normal_index)\n    p.append(not_normal_index)\n    start_particles.append(p)\n'

In [12]:
# caches of the sampled rows split by outcome; filled by get_start_particles
not_normal_out = None
normal_out = None
def get_start_particles(not_normal_out = None, normal_out = None, num_particles = 16):
    """Build the initial particles: one normal and one attack row each.

    Each particle is a two-element list ``[normal_row, attack_row]`` whose
    elements are independent copies of rows picked uniformly at random with
    the stdlib ``random`` module. Copies are required because both optimisers
    modify particle values in place; without them a particle would share
    storage with the cached rows and with any other particle that picked the
    same row.

    Args:
        not_normal_out: cached list of rows whose outcome is not "normal.".
        normal_out: cached list of rows whose outcome is "normal.".
            When either cache is missing or empty, both are rebuilt from the
            notebook-level ``df`` using the ``col2u`` columns plus 'outcome'.
        num_particles: how many particles to create.

    Returns:
        ``(start_particles, not_normal_out, normal_out)``: the particles and
        the two caches, so callers can pass them back in on the next call.

    Raises:
        ValueError: if one of the two classes has no rows to choose from.
    """
    # this cell runs before the notebook's later `import random`
    import random

    if not not_normal_out or not normal_out:
        wanted = col2u + ['outcome']
        not_normal_out = []
        normal_out = []
        for _, row in df.iterrows():
            bucket = normal_out if row["outcome"] == "normal." else not_normal_out
            bucket.append(row[wanted])

    if not not_normal_out or not normal_out:
        raise ValueError("need at least one normal and one not-normal row to build particles")

    start_particles = []
    for _ in range(num_particles):
        # random.choice never indexes past the end, unlike the previous
        # randint(0, len(seq)) whose upper bound is inclusive
        start_particles.append([
            random.choice(normal_out).copy(),
            random.choice(not_normal_out).copy(),
        ])

    return start_particles, not_normal_out, normal_out

In [13]:
# first call: no caches are passed, so the normal / not-normal row lists are built from df
start_particles, not_normal_out, normal_out = get_start_particles()

In [14]:
# number of sampled rows whose outcome is "normal."
len(normal_out)

8837

In [15]:

def fitness_func(particle_classes, data=None, columns=None):
    """Error rate of a nearest-prototype classifier defined by one particle.

    Every row of ``data`` is assigned the 'outcome' label of the prototype
    that is closest in Euclidean distance over ``columns``; on a tie the
    earliest prototype wins. The result is the fraction of rows whose assigned
    label differs from the row's own 'outcome' (labels compared as strings).

    Distances are computed with one numpy broadcast instead of nested Python
    loops, because this function is evaluated for every particle on every
    iteration of both optimisers.

    Args:
        particle_classes: sequence of prototypes, each indexable by every name
            in ``columns`` and by 'outcome'.
        data: labelled frame to classify; defaults to the notebook-level ``df``.
        columns: feature column names; defaults to the notebook-level ``col2u``.

    Returns:
        Misclassification rate as a float in ``[0, 1]``.

    Raises:
        ValueError: if ``particle_classes`` is empty.
        ZeroDivisionError: if ``data`` has no rows.
    """
    if data is None:
        data = df
    if columns is None:
        columns = col2u
    columns = list(columns)

    if len(particle_classes) == 0:
        raise ValueError("particle_classes must contain at least one prototype")
    total = len(data)
    if total == 0:
        raise ZeroDivisionError("cannot compute an error rate over an empty frame")

    features = data[columns].to_numpy(dtype=float)
    actual = numpy.asarray(data["outcome"].astype(str), dtype=object)
    prototypes = numpy.array(
        [[float(particle[c]) for c in columns] for particle in particle_classes],
        dtype=float,
    )
    labels = numpy.array([str(particle["outcome"]) for particle in particle_classes], dtype=object)

    # (rows, prototypes, features) differences -> (rows, prototypes) distances
    offsets = features[:, None, :] - prototypes[None, :, :]
    distances = numpy.sqrt((offsets ** 2).sum(axis=2))
    # argmin returns the first minimum, matching the strict "<" of the loop form
    nearest = distances.argmin(axis=1)

    num_err = int((labels[nearest] != actual).sum())
    return num_err / total

In [16]:
import sys
import numpy as np
import math
import datetime

class PSO:
    """Particle swarm optimiser over particles made of labelled prototypes.

    A particle is a list of prototypes; each prototype is a mapping-like
    object (for example a pandas Series) indexable by the feature names in
    ``columns``. ``velocities`` holds one row per particle and one column per
    feature, and that row is shared by all prototypes of the particle.
    ``fitness_function(particle)`` returns the value to minimise.

    Two CSV logs named after the construction timestamp are written to the
    working directory: ``PSO_<ts>_particles.csv`` (fitness of every particle
    per iteration) and ``PSO_<ts>_bests.csv`` (global best per iteration).
    Both are closed automatically once ``next()`` reports the end of the run,
    or explicitly through ``close_files()``.

    Args:
        particles: list of particles; they are moved in place.
        velocities: numpy array of shape (number of particles, number of features).
        fitness_function: callable scoring one particle, lower is better.
        w, c_1, c_2: inertia, cognitive and social coefficients. They are
            overwritten on every iteration when ``auto_coef`` is true.
        max_iter: maximum number of ``next()`` steps.
        auto_coef: schedule ``w``, ``c_1`` and ``c_2`` from the iteration count.
        columns: feature names to optimise; defaults to the notebook-level
            ``col2u``.

    Raises:
        ValueError: if ``particles`` is empty.
    """

    def __init__(self, particles, velocities, fitness_function,
                 w=0.8, c_1=1, c_2=1, max_iter=100, auto_coef=True, columns=None):
        if len(particles) == 0:
            raise ValueError("PSO needs at least one particle")

        self.particles = particles
        self.velocities = velocities
        self.fitness_function = fitness_function
        self.columns = list(col2u if columns is None else columns)

        self.iter = 0
        self.N = len(self.particles)
        self.w = w
        self.c_1 = c_1
        self.c_2 = c_2
        self.auto_coef = auto_coef
        self.max_iter = max_iter

        time_int = int(round(datetime.datetime.now().timestamp()))
        file_name_part = f"PSO_{time_int}_particles.csv"
        file_name_bests = f"PSO_{time_int}_bests.csv"
        self.iterations_csv = open(file_name_part, "w+")
        try:
            self.bests_csv = open(file_name_bests, "w+")
            self.iterations_csv.write("iteration,particleNo,error\n")
            self.bests_csv.write("iteration,error\n")

            # Bests are stored as snapshots. Particles are moved in place, so
            # keeping references would make every personal best equal to the
            # current position and cancel the cognitive term.
            self.p_bests = [self._snapshot(particle) for particle in self.particles]
            self.p_bests_values = [self.fitness_function(particle) for particle in self.particles]
            for i, value in enumerate(self.p_bests_values):
                self.iterations_csv.write(f"{self.iter},{i},{value}\n")

            # start from the best particle of the swarm, not simply the first
            best_index = min(range(self.N), key=self.p_bests_values.__getitem__)
            self.g_best = self._snapshot(self.particles[best_index])
            self.g_best_value = self.p_bests_values[best_index]
        except BaseException:
            # do not leak the log handles when the initial evaluation fails
            self.close_files()
            raise

        self.is_running = True
        self.update_coef()

    @staticmethod
    def _snapshot(particle):
        """Return independent copies of every prototype of ``particle``."""
        return [prototype.copy() for prototype in particle]

    def close_files(self):
        """Close both CSV logs; safe to call more than once."""
        for handle in (getattr(self, "iterations_csv", None), getattr(self, "bests_csv", None)):
            if handle is not None:
                handle.close()

    def __str__(self):
        return f'[{self.iter}/{self.max_iter}] $w$:{self.w:.3f} - $c_1$:{self.c_1:.3f} - $c_2$:{self.c_2:.3f}'

    def next(self):
        """Advance the swarm by one iteration.

        Logs the current global best, then (from the second call on) moves the
        particles and refreshes bests and coefficients.

        Returns:
            True while the optimisation should continue. Once it returns
            False the logs are closed and further calls do nothing.
        """
        if not self.is_running:
            return False

        self.bests_csv.write(f"{self.iter},{self.g_best_value}\n")
        if self.iter > 0:
            self.move_particles()
            self.update_bests()
            self.update_coef()

        self.iter += 1
        self.is_running = self.is_running and self.iter < self.max_iter
        if not self.is_running:
            self.close_files()
        return self.is_running

    def update_coef(self):
        """Schedule the coefficients from the iteration count.

        With ``auto_coef`` enabled, ``w`` decays quadratically from 0.8 to 0.4,
        ``c_1`` falls linearly from 3.5 to 0.5 and ``c_2`` rises linearly from
        0.5 to 3.5 as ``iter`` goes from 0 to ``max_iter``.
        """
        if not self.auto_coef or self.max_iter <= 0:
            # nothing to schedule (a zero max_iter would divide by zero)
            return
        t = self.iter
        n = self.max_iter
        self.w = (0.4/n**2) * (t - n) ** 2 + 0.4
        self.c_1 = -3 * t / n + 3.5
        self.c_2 =  3 * t / n + 0.5

    def _attract(self, velocities, coefficient, randoms, targets):
        """Add to ``velocities`` the pull of each particle towards ``targets[i]``."""
        for index, particle in enumerate(self.particles):
            for subi, prototype in enumerate(particle):
                for ci, c in enumerate(self.columns):
                    velocities[index][ci] += coefficient * randoms[index] * (targets[index][subi][c] - prototype[c])

    def move_particles(self):
        """Update velocities (inertia + cognitive + social) and move the particles."""
        # add inertia
        new_velocities = self.w * self.velocities

        # add cognitive component: one random factor per particle
        r_1 = np.random.random(self.N)
        self._attract(new_velocities, self.c_1, r_1, self.p_bests)

        # add social component: one random factor per particle
        r_2 = np.random.random(self.N)
        self._attract(new_velocities, self.c_2, r_2, [self.g_best] * self.N)

        # Stop when no velocity changed at all. Compared element-wise: a sum
        # of signed differences can cancel to zero while the swarm still moves.
        self.is_running = bool(np.any(self.velocities != new_velocities))

        # update positions and velocities
        self.velocities = new_velocities
        for index, particle in enumerate(self.particles):
            for prototype in particle:
                for ci, c in enumerate(self.columns):
                    prototype[c] = prototype[c] + new_velocities[index][ci]

    def update_bests(self):
        """Re-evaluate every particle, log it, and refresh personal/global bests."""
        fits = [self.fitness_function(particle) for particle in self.particles]

        for i, fit in enumerate(fits):
            self.iterations_csv.write(f"{self.iter},{i},{fit}\n")
            # update best personal value (cognitive)
            if fit < self.p_bests_values[i]:
                self.p_bests_values[i] = fit
                self.p_bests[i] = self._snapshot(self.particles[i])
            # update best global value (social)
            if fit < self.g_best_value:
                self.g_best_value = fit
                self.g_best = self._snapshot(self.particles[i])


def opt_funct(x):
    """Objective handed to PSO: forwards the particle ``x`` to fitness_func."""
    return fitness_func(x)

# swarm size and number of optimised features (one velocity column per feature)
num_particles = 16
num_features = len(col2u)

# initial velocities: uniform draws in [0, 1) shifted and scaled into [-0.05, 0.05)
velocities = (np.random.random((num_particles, num_features)) - 0.5) / 10


In [17]:
# Disabled cell: the code below is wrapped in a string literal and never runs.
# call_pso() contains the same PSO construction.
"""
import datetime
print(datetime.datetime.now())
pso = PSO(start_particles, velocities, opt_funct, max_iter=15, w = .8, c_1=1, c_2=1)
print(datetime.datetime.now())
"""

'\nimport datetime\nprint(datetime.datetime.now())\npso = PSO(start_particles, velocities, opt_funct, max_iter=15, w = .8, c_1=1, c_2=1)\nprint(datetime.datetime.now())\n'

In [18]:
# Disabled cell: the code below is wrapped in a string literal and never runs.
# call_pso() contains the same iteration loop.
"""
i = 0
while pso.next():
    print("ITER: ", i, " pso: ", pso.p_bests_values[0])
    i += 1
"""

'\ni = 0\nwhile pso.next():\n    print("ITER: ", i, " pso: ", pso.p_bests_values[0])\n    i += 1\n'

In [19]:
    # Draw a fresh set of particles, reusing the cached normal / not-normal row lists.
    # NOTE(review): this cell's single statement is indented; it should sit at
    # column 0 like the other cells — confirm and dedent.
    start_particles, not_normal_out, normal_out = get_start_particles(num_particles = 16, not_normal_out = not_normal_out, normal_out = normal_out)

In [20]:
import time

def call_pso(not_normal_out, normal_out):
    """Run one PSO optimisation and return its duration in seconds.

    Fresh velocities and start particles are drawn for every call. The timed
    section covers building the PSO object (which evaluates every particle
    once) and iterating it until ``next()`` returns False.

    Note: ``w``, ``c_1`` and ``c_2`` are passed explicitly, but PSO is created
    with its default ``auto_coef=True``, which reschedules them every iteration.

    Args:
        not_normal_out: cached not-normal rows, forwarded to get_start_particles.
        normal_out: cached normal rows, forwarded to get_start_particles.

    Returns:
        Elapsed wall-clock time as a non-negative float (end minus start).
    """
    num_particles = 16
    num_features = len(col2u)

    velocities = (np.random.random((num_particles, num_features)) - 0.5) / 10
    start_particles, not_normal_out, normal_out = get_start_particles(num_particles = num_particles, not_normal_out = not_normal_out, normal_out = normal_out)
    start_time = time.time()
    pso = PSO(start_particles, velocities, opt_funct, max_iter=15, w = .8, c_1=1, c_2=1)
    try:
        while pso.next():
            pass
    finally:
        # release the two CSV logs the optimiser opened; closing the handles
        # directly is safe even if they were already closed
        pso.iterations_csv.close()
        pso.bests_csv.close()
    end_time = time.time()
    # end minus start: the previous operand order produced negative durations
    return end_time - start_time

In [21]:
# number of timed repetitions used when averaging optimiser run times
ITERATIONS = 10
# Disabled: the PSO timing loop below is wrapped in a string literal and never runs.
# NOTE(review): if re-enabled, the final print concatenates str and float and
# would raise TypeError.
"""avg_exec_pso = 0
for _ in range(ITERATIONS):
    avg_exec_pso += call_pso(not_normal_out, normal_out)
avg_exec_pso = avg_exec_pso / ITERATIONS
print("AVERAGE TIME PSO: "+avg_exec_pso)"""

'avg_exec_pso = 0\nfor _ in range(ITERATIONS):\n    avg_exec_pso += call_pso(not_normal_out, normal_out)\navg_exec_pso = avg_exec_pso / ITERATIONS\nprint("AVERAGE TIME PSO: "+avg_exec_pso)'

In [22]:

from numpy.random import randint
from numpy.random import rand
import random
import numpy as np
import matplotlib.pyplot as plot
from matplotlib.ticker import LinearLocator, FormatStrFormatter
from matplotlib import cm


def selection(pop, scores, k=3):
    """Tournament selection: return the best of ``k`` randomly drawn individuals.

    One index is drawn first, then ``k - 1`` challengers (with replacement).
    The contestant with the lowest score wins; on equal scores the one drawn
    earliest is kept.
    """
    first_pick = randint(len(pop))
    challengers = randint(0, len(pop), k-1)
    # min() keeps the earliest contestant among equal scores
    winner = min([first_pick, *challengers], key=lambda ix: scores[ix])
    return pop[winner]

def crossover(p1, p2, columns=None):
    """Single-point crossover between two particles.

    For every pair of prototypes at the same position, the features whose
    index is below half the number of feature columns are exchanged between
    the parents; the remaining features and the 'outcome' label stay with
    their parent. Works for any number of prototypes per particle (the
    previous version returned exactly two and failed otherwise).

    Args:
        p1, p2: parent particles, equally long sequences of prototypes
            indexable by every feature name and by 'outcome'.
        columns: feature names; defaults to the notebook-level ``col2u``.

    Returns:
        Two children ``(child_1, child_2)``, each a list of new pandas Series
        with one Series per prototype.

    Raises:
        ValueError: if the parents have a different number of prototypes.
    """
    if columns is None:
        columns = col2u
    columns = list(columns)
    if len(p1) != len(p2):
        raise ValueError("both parents must have the same number of prototypes")

    fields = columns + ['outcome']
    mid = len(columns) / 2
    rows_1 = []
    rows_2 = []
    for proto_1, proto_2 in zip(p1, p2):
        row_1 = []
        row_2 = []
        for ci, c in enumerate(fields):
            # below the cut the children take the other parent's value
            if ci < mid:
                row_1.append(proto_2[c])
                row_2.append(proto_1[c])
            else:
                row_1.append(proto_1[c])
                row_2.append(proto_2[c])
        rows_1.append(row_1)
        rows_2.append(row_2)

    df_tmp1 = pandas.DataFrame(rows_1, columns=fields)
    df_tmp2 = pandas.DataFrame(rows_2, columns=fields)
    child_1 = [df_tmp1.iloc[i] for i in range(len(rows_1))]
    child_2 = [df_tmp2.iloc[i] for i in range(len(rows_2))]
    return child_1, child_2


def mutation(a, r_mut):
    """Randomly reset feature values of particle ``a`` in place.

    For each prototype and each column in ``col2u``, a numpy draw below
    ``r_mut`` replaces the value with a stdlib uniform draw from [0, 1].
    Nothing is returned.
    """
    for position in range(len(a)):
        prototype = a[position]
        for c in col2u:
            if not rand() < r_mut:
                continue
            prototype[c] = random.uniform(0,1)

def genetic_algorithm(fitess_function, particles, n_iter, r_mut):
    """Minimise ``fitess_function`` over a population with a simple GA.

    Each generation scores the population, logs the scores, records the best
    individual seen so far, then builds the next population by tournament
    selection, pairwise crossover and mutation. Two CSV logs named after the
    start timestamp are written to the working directory
    (``GA_<ts>_particles.csv`` and ``GA_<ts>_bests.csv``).

    Args:
        fitess_function: callable scoring one particle, lower is better.
        particles: initial population.
        n_iter: number of generations.
        r_mut: per-value mutation probability passed to ``mutation``.

    Returns:
        ``[best, best_eval]``: the best particle encountered and its score.
        When nothing beats the first particle, that particle is returned
        (previously the placeholder value 1 was returned instead).

    Raises:
        ValueError: if the population is empty, or has an odd size while
            generations are requested (parents are consumed in pairs).
    """
    if len(particles) == 0:
        raise ValueError("the population must contain at least one particle")
    if n_iter > 0 and len(particles) % 2:
        raise ValueError("the population size must be even so parents can be paired")

    time_int = int(round(datetime.datetime.now().timestamp()))
    file_name_part = f"GA_{time_int}_particles.csv"
    file_name_bests = f"GA_{time_int}_bests.csv"
    # `with` closes both logs even when a fitness evaluation raises
    with open(file_name_part, "w+") as iterations_csv, open(file_name_bests, "w+") as bests_csv:
        iterations_csv.write("iteration,particleNo,error\n")
        bests_csv.write("iteration,error\n")
        best, best_eval = particles[0], fitess_function(particles[0])
        for gen in range(n_iter):
            scores = [fitess_function(c) for c in particles]
            for i, score in enumerate(scores):
                iterations_csv.write(f"{gen},{i},{score}\n")
                if score < best_eval:
                    best, best_eval = particles[i], score
            bests_csv.write(f"{gen},{best_eval}\n")

            # one tournament per slot, then consecutive winners are mated
            selected = [selection(particles, scores) for _ in range(len(particles))]
            children = []
            for i in range(0, len(selected), 2):
                p1, p2 = selected[i], selected[i+1]
                for c in crossover(p1, p2):
                    mutation(c, r_mut)
                    children.append(c)
            particles = children
    return [best, best_eval]


In [23]:
#start_particles, not_normal_out, normal_out = get_start_particles(num_particles = 16, not_normal_out = not_normal_out, normal_out = normal_out)

In [24]:
#genetic_algorithm(fitness_func, start_particles, n_iter, r_mut)

In [31]:
def call_ga(not_normal_out, normal_out):
    """Run one genetic-algorithm optimisation and return its duration in seconds.

    A fresh set of start particles is drawn for every call; only the call to
    ``genetic_algorithm`` is timed.

    Args:
        not_normal_out: cached not-normal rows, forwarded to get_start_particles.
        normal_out: cached normal rows, forwarded to get_start_particles.

    Returns:
        Elapsed wall-clock time as a non-negative float (end minus start).
    """
    num_particles = 16

    n_iter = 15
    n_bits = 10
    # mutation probability per value
    r_mut = 1.0 / float(n_bits)

    start_particles, not_normal_out, normal_out = get_start_particles(num_particles = num_particles, not_normal_out = not_normal_out, normal_out = normal_out)
    start_time = time.time()
    genetic_algorithm(fitness_func, start_particles, n_iter, r_mut)
    end_time = time.time()
    # end minus start: the previous operand order produced negative durations
    return end_time - start_time

In [None]:
# average wall-clock time of ITERATIONS independent GA runs
avg_exec_ga = 0
for _ in range(ITERATIONS):
    avg_exec_ga += call_ga(not_normal_out, normal_out)
avg_exec_ga = avg_exec_ga / ITERATIONS
# f-string formatting: concatenating str and float raised TypeError
print(f"AVERAGE TIME GA: {avg_exec_ga}")