In [None]:
import numpy as np
from sklearn.svm import SVC
import pickle
from sklearn.metrics import f1_score

def _get_int_feature(dictionary, key, counter):
    if key in dictionary:
        return dictionary[key], counter
    else:           # key not in dictionary
        dictionary[key] = counter
    return dictionary[key], counter+1

In [None]:
def calculate_macro_f1_score(predictions, true_labels):
    """Return the macro-averaged F1 score as a percentage (0-100).

    A per-class F1 is computed for every label that occurs in either
    ``predictions`` or ``true_labels`` and the unweighted mean is returned.
    A class that occurs but is never predicted correctly contributes an F1
    of 0 to the mean instead of being dropped, so missed classes lower the
    score as the macro average requires.

    Labels may be any hashable values (ints, strings, numpy scalars); they
    are not restricted to a fixed number of integer classes.

    Args:
        predictions: sequence of predicted labels.
        true_labels: sequence of ground-truth labels, same length.

    Returns:
        The macro F1 in percent, ``0.0`` for empty input, or ``None`` (after
        printing a diagnostic) when the two sequences differ in length.
    """
    if len(predictions) != len(true_labels):
        print("bug in code, length of predictions should match length of true_labels")
        return None

    # Per-class counters keyed by label, so any label value is accepted.
    true_positives = {}
    false_positives = {}
    false_negatives = {}
    for predicted, actual in zip(predictions, true_labels):
        if predicted == actual:
            true_positives[predicted] = true_positives.get(predicted, 0) + 1
        else:
            false_positives[predicted] = false_positives.get(predicted, 0) + 1
            false_negatives[actual] = false_negatives.get(actual, 0) + 1

    classes = set(true_positives) | set(false_positives) | set(false_negatives)
    if not classes:
        # Nothing to score (empty input): avoid dividing by zero.
        return 0.0

    total_f1 = 0.0
    for label in classes:
        tp = true_positives.get(label, 0)
        fp = false_positives.get(label, 0)
        fn = false_negatives.get(label, 0)
        # F1 = 2*TP / (2*TP + FP + FN) equals 2*P*R/(P+R) where that is
        # defined and is 0 when TP == 0. The denominator is never 0 because
        # every label in ``classes`` has at least one count.
        total_f1 += 2.0 * tp / (2 * tp + fp + fn)
    return total_f1 * 100.0 / len(classes)

def calculate_micro_f1_score(predictions, true_labels):
    """Return the micro-averaged F1 score as a percentage (0-100).

    Counts are pooled over all classes: a correct prediction is one true
    positive, and every wrong prediction is counted both as a false positive
    (for the predicted class) and as a false negative (for the true class).

    Args:
        predictions: sequence of predicted labels.
        true_labels: sequence of ground-truth labels, same length.

    Returns:
        The micro F1 in percent, ``0.0`` when there are no correct predictions
        (including empty input), or ``None`` (after printing a diagnostic)
        when the two sequences differ in length.
    """
    if len(predictions) != len(true_labels):
        print("bug in code, length of predictions should match length of true_labels")
        return None

    true_positives = sum(
        1 for predicted, actual in zip(predictions, true_labels) if predicted == actual
    )
    # Each misclassification is one false positive and one false negative.
    false_positives = false_negatives = len(predictions) - true_positives

    if true_positives == 0:
        # Precision and recall are 0 or undefined here; report 0 instead of
        # dividing by zero.
        return 0.0

    prec = true_positives*1.0/(true_positives + false_positives)
    recall = true_positives*1.0/(true_positives + false_negatives)
    return 2*prec*recall*100.0/(prec+recall)


In [None]:
# Attack names grouped into the coarse categories used as class labels below.
dos = ['back','land','neptune','pod','smurf','teardrop']
u2r = ['buffer_overflow','loadmodule','perl','rootkit']
r2l = ['ftp_write','guess_passwd','imap','multihop','phf','spy','warezclient','warezmaster']
probing = ['ipsweep','nmap','portsweep','satan']
normal = ['normal']

# loading data -- the context manager closes the file even if reading fails
with open('../kddcup.data','r') as ifile:
    raw_data = ifile.readlines()

## cleaning ##
cleanedData = []
# value -> integer code tables for the categorical columns 1, 2 and 3
# (presumably protocol, service and connection flag -- confirm against the
# dataset description)
dict_tcp,tcpCount = {},0
dict_http,httpCount = {},0
dict_sf,sfCount = {},0

nDOS,nU2R,nR2L,nProb,nNormal,nOthers = 0,0,0,0,0,0
for info in raw_data:
    info = info.strip()
    if not info:
        # Blank lines (e.g. a trailing newline at end of file) carry no record.
        continue
    # Drop only the trailing '.' that terminates the record's label. Removing
    # every '.' would also delete decimal points inside numeric fields
    # (e.g. '0.05' would be read as '005').
    info = info.rstrip('.').split(',')
    info[1], tcpCount = _get_int_feature(dict_tcp, info[1], tcpCount)
    info[2], httpCount = _get_int_feature(dict_http, info[2], httpCount)
    info[3], sfCount = _get_int_feature(dict_sf, info[3], sfCount)
    # Replace the attack name in the last column by its category id:
    # 0 = NORMAL, 1 = DOS, 2 = U2R, 3 = R2L, 4 = PROBING.
    if info[-1] in dos:
        info[-1] = 1 #'DOS' label
        nDOS += 1
    elif info[-1] in u2r:
        info[-1] = 2 #'U2R'
        nU2R += 1
    elif info[-1] in r2l:
        info[-1] = 3 #'R2L'
        nR2L += 1
    elif info[-1] in probing:
        info[-1] = 4 #'PROBING'
        nProb += 1
    elif info[-1] in normal:           # label is normal
        nNormal += 1
        info[-1] = 0 #'NORMAL' label
    else:                               # unspecified label: count it and drop the record
        nOthers += 1
        continue
    cleanedData.append(info)
# with open('cleaned_data', 'wb') as fp:
#     pickle.dump(cleanedData, fp)


# with open ('cleaned_data', 'rb') as fp:
#     cleanedData = pickle.load(fp)
examples_matrix = np.array(cleanedData)
# NOTE(review): the shuffle is unseeded, so the train/test split differs on
# every run; seed numpy's RNG beforehand if reproducible results are needed.
np.random.shuffle(examples_matrix)

In [None]:
# Records per category, in label-id order after NORMAL is moved last:
# DOS, U2R, R2L, PROBING, NORMAL, then the number of discarded records.
# nProb is included so the printed counts add up to the lines processed.
print(nDOS,nU2R,nR2L,nProb,nNormal,nOthers)

In [None]:
# print("example is ", examples_matrix[1])
# The first `train_size` shuffled rows train the models and the next
# `test_size` rows evaluate them. The last column is the class label.
train_size = 70000
test_size = 30000
train_feature_matrix = examples_matrix[:train_size,:-1]
train_label_matrix = examples_matrix[:train_size,-1]
# The test slice starts exactly at `train_size`: slicing is end-exclusive, so
# row `train_size` is not part of the training set, and starting one later
# would drop it and leave only test_size - 1 test rows.
test_feature_matrix = examples_matrix[train_size:train_size+test_size,:-1]
test_label_matrix = examples_matrix[train_size:train_size+test_size,-1]

In [None]:
import collections

# Show how many examples of each class ended up in the training split and
# then in the test split.
for split_labels in (train_label_matrix, test_label_matrix):
    print(collections.Counter(split_labels))

In [None]:
# Train a support-vector classifier on the training split.
# gamma='auto' is passed explicitly rather than relying on the library default.
clf = SVC(gamma='auto')
clf.fit(X=train_feature_matrix, y=train_label_matrix)

In [None]:
# Score the SVM on the held-out split with scikit-learn's macro-averaged F1.
predicted_labels = clf.predict(test_feature_matrix)
macro_f1_score = f1_score(y_true=test_label_matrix, y_pred=predicted_labels, average='macro')
print(macro_f1_score)
# macro_f1_score = calculate_macro_f1_score(predicted_labels, test_label_matrix)

In [None]:
# for i, predicted_label in enumerate(predicted_labels):
#     if predicted_label != test_label_matrix[i]:
#         print(predicted_label, " ", test_label_matrix[i], " ", i)
# print("macro f1 score is ", macro_f1_score)
# print("predicted_label ", predicted_label, " true label ", examples_matrix[51,-1])

In [None]:
from sklearn import tree

# Fit a decision tree with default settings on the training split, then
# report its macro-averaged F1 on the test split.
dt_clf = tree.DecisionTreeClassifier().fit(train_feature_matrix, train_label_matrix)
dt_predictions = dt_clf.predict(test_feature_matrix)
dt_macro_f1_score = f1_score(y_true=test_label_matrix, y_pred=dt_predictions, average='macro')
print(dt_macro_f1_score)

In [None]:
from sklearn.neural_network import MLPClassifier

# The network is given explicit float64 copies of the feature matrices.
nn_train_feature_matrix = train_feature_matrix.astype(np.float64)
nn_test_feature_matrix = test_feature_matrix.astype(np.float64)

# Two hidden layers (50 and 30 units), Adam solver; random_state is fixed so
# the weight initialisation is repeatable.
nn_clf = MLPClassifier(
    hidden_layer_sizes=(50, 30),
    solver='adam',
    alpha=1e-5,
    random_state=1,
)
nn_clf.fit(nn_train_feature_matrix, train_label_matrix)

nn_predictions = nn_clf.predict(nn_test_feature_matrix)
nn_macro_f1_score = f1_score(y_true=test_label_matrix, y_pred=nn_predictions, average='macro')
print(nn_macro_f1_score)

In [2]:
import re

# One list per column of allResults.txt. Column 0 is the dataset size; per the
# plot calls below, columns 1/4/7 are F1 scores, 2/5/8 accuracies and 3/6/9
# timings of the three classifiers.
lists = [[] for _ in range(10)]

# Read as text: calling str() on raw bytes yields their repr, whose escape
# sequences (e.g. '\x00') would leak stray digits through the filter below.
with open("allResults.txt", "r", errors="replace") as fp:
    results = fp.readlines()

# The first line is skipped (presumably a header row -- confirm).
for result in results[1:]:
    # Keep only digits, dots and commas, i.e. the numeric fields themselves.
    result = re.sub('[^0-9.,]*', '', result)
    if not result.replace(',', ''):
        # Blank line: nothing to parse (int('') would raise).
        continue
    for i, score in enumerate(result.split(',')):
        if i == 0:
            # Dataset size is reported in thousands on the x axis.
            lists[i].append(int(score)/1000)
        else:
            lists[i].append(float(score))

print (len(lists[0]))
# print (lists[1])
# print (lists[4])
# print (lists[7])
# NOTE: plot_learning_curves is defined in a cell that appears further down in
# this notebook; that cell must be run before this one.
# plot_learning_curves('Dataset Size', 'F1 Score', lists[0], lists[1], lists[4], lists[7], 'F1Scores.png')
# plot_learning_curves('Dataset Size', 'Accuracy', lists[0], lists[2], lists[5], lists[8], 'Accuracy.png')
plot_learning_curves('Dataset Size', 'Time', lists[0], lists[3], lists[6], lists[9], 'Time.png')



# plot_learning_curves('Dataset Size', 'F1 Score', lists[0], lists[3], lists[4], lists[7], '')

30


In [1]:
import matplotlib as mpl
mpl.use('Agg')
import matplotlib.pyplot as plt
# mpl.rcParams['figure.figsize'] = (20,20)

def plot_learning_curves(x_axis_label, y_axis_label, x_axis, y_axis_1, y_axis_2, y_axis_3, image_name, y_axis_unit=None):
    """Plot one metric of the three classifiers against dataset size and save it.

    Each call draws on a fresh figure and closes it after saving, so curves
    from an earlier call never leak into a later image.

    Args:
        x_axis_label: x-axis name; " (in thousands)" is appended to it.
        y_axis_label: y-axis name, also used in the title.
        x_axis: x values, also used as the tick positions.
        y_axis_1: y values of the curve labelled 'SVM'.
        y_axis_2: y values of the curve labelled 'DecisionTree'.
        y_axis_3: y values of the curve labelled 'Deep Neural Network'.
        image_name: path of the image file to write.
        y_axis_unit: optional unit shown in parentheses after the y-axis
            label. When omitted, 'in seconds' is used for the 'Time' metric
            and no unit is shown for any other metric.
    """
    if y_axis_unit is None:
        # Only timings are measured in seconds; scores such as F1 or accuracy
        # must not be labelled with that unit.
        y_axis_unit = 'in seconds' if y_axis_label == 'Time' else ''

    # New figure per call: the implicit current figure is otherwise reused.
    fig = plt.figure()
    plt.plot(x_axis, y_axis_1, marker='o')
    plt.plot(x_axis, y_axis_2, marker='^')
    plt.plot(x_axis, y_axis_3, marker='v')
    plt.legend(['SVM', 'DecisionTree', 'Deep Neural Network'], loc='best')
    plt.xlabel(x_axis_label + " (in thousands)")
    if y_axis_unit:
        plt.ylabel(y_axis_label + " (" + y_axis_unit + ")")
    else:
        plt.ylabel(y_axis_label)
    plt.xticks(x_axis,rotation=90,size=8)
    plt.title(y_axis_label + ' versus ' + x_axis_label)
    plt.savefig(image_name,dpi=300)
    # Release the figure so repeated calls do not accumulate open figures.
    plt.close(fig)

In [None]:
# Plot columns 1, 4 and 7 of `lists` (drawn as the SVM, DecisionTree and
# Deep Neural Network curves) against column 0 and save to F1Scores.png.
plot_learning_curves('Dataset Size', 'F1 Score', lists[0], lists[1], lists[4], lists[7], 'F1Scores.png')