In [1]:
import numpy as np
from sklearn.svm import SVC
import pickle
from sklearn.metrics import f1_score

def _get_int_feature(dictionary, key, counter):
    if key in dictionary:
        return dictionary[key], counter
    else:           # key not in dictionary
        dictionary[key] = counter
    return dictionary[key], counter+1

In [2]:
def calculate_macro_f1_score(predictions, true_labels):
    true_positives = [0 for i in range(11)]
    false_positives = [0 for i in range(11)]
    false_negatives = [0 for i in range(11)]

    if len(predictions) != len(true_labels):
        print("bug in code, length of predictions should match length of true_labels")
        return None
    for i in range(len(predictions)):
        if predictions[i] == true_labels[i]:
            true_positives[predictions[i]] += 1
        else:
            false_positives[predictions[i]] += 1
            false_negatives[true_labels[i]] += 1

    total_classes = 0
    total_f1 = 0
    for i in range(11):
        if true_positives[i]==0 and false_positives[i]==0:
            continue
        elif true_positives[i]==0 and false_negatives[i]==0:
            continue
        prec = true_positives[i]*1.0/(true_positives[i] + false_positives[i])
        recall = true_positives[i]*1.0/(true_positives[i]+false_negatives[i])
        f1=0
        if prec+recall != 0:
            f1 = 2*prec*recall/(prec+recall)
            total_classes += 1
            total_f1 += f1
    return total_f1*100.0/total_classes

def calculate_micro_f1_score(predictions, true_labels):
    true_positives = 0
    false_positives = 0
    false_negatives = 0

    if len(predictions) != len(true_labels):
        print("bug in code, length of predictions should match length of true_labels")
        return None
    for i in range(len(predictions)):
        if predictions[i] == true_labels[i]:
            true_positives += 1
        else:
            false_positives += 1
            false_negatives += 1
    prec = true_positives*1.0/(true_positives + false_positives)
    recall = true_positives*1.0/(true_positives+false_negatives)
    return 2*prec*recall*100.0/(prec+recall)


In [9]:
dos = ['back','land','neptune','pod','smurf','teardrop']
u2r = ['buffer_overflow','loadmodule','perl','rootkit']
r2l = ['ftp_write','guess_passwd','imap','multihop','phf','spy','warezclient','warezmaster']
probing = ['ipsweep','nmap','portsweep','satan']
normal = ['normal']

ifile = open('../kddcup.data','r')             # loading data
raw_data = ifile.readlines()
ifile.close()

## cleaning ##
cleanedData = []
dict_tcp,tcpCount = {},0
dict_http,httpCount = {},0
dict_sf,sfCount = {},0

nDOS,nU2R,nR2L,nProb,nNormal,nOthers = 0,0,0,0,0,0
for info in raw_data:
    info = info.replace('\n','').replace('.','').split(',')
    info[1], tcpCount = _get_int_feature(dict_tcp, info[1], tcpCount)
    info[2], httpCount = _get_int_feature(dict_http, info[2], httpCount)
    info[3], sfCount = _get_int_feature(dict_sf, info[3], sfCount)
    # print("info is ", info)
    if info[-1] in dos:
        info[-1] = 1 #'DOS' label
        nDOS += 1
#         cleanedData.append(info)
    elif info[-1] in u2r:
        info[-1] = 2 #'U2R'
        nU2R += 1
    elif info[-1] in r2l:
        info[-1] = 3 #'R2L'
        nR2L += 1
    elif info[-1] in probing:
        info[-1] = 4 #'PROBING'
        nProb += 1
    elif info[-1] in normal:           # label is normal
        nNormal += 1
        info[-1] = 0 #'NORMAL' label
        
    else:                               # unspecified label
        nOthers += 1
        continue
    cleanedData.append(info)
# with open('cleaned_data', 'wb') as fp:
#     pickle.dump(cleanedData, fp)


# with open ('cleaned_data', 'rb') as fp:
#     cleanedData = pickle.load(fp)
examples_matrix = np.array(cleanedData)
np.random.shuffle(examples_matrix)

In [10]:
print(nDOS,nU2R,nR2L,nNormal,nOthers)

3883370 52 1126 972781 0


In [11]:
# print("example is ", examples_matrix[1])
train_size = 70000
test_size = 30000
train_feature_matrix = examples_matrix[:train_size,:-1]
train_label_matrix = examples_matrix[:train_size,-1]
test_feature_matrix = examples_matrix[train_size+1:train_size+test_size,:-1]
test_label_matrix = examples_matrix[train_size+1:train_size+test_size,-1]

In [12]:
import collections
print(collections.Counter(train_label_matrix))
print(collections.Counter(test_label_matrix))

Counter({'1': 55612, '0': 13768, '4': 603, '3': 16, '2': 1})
Counter({'1': 23805, '0': 5919, '4': 267, '3': 8})


In [13]:
# print(feature_matrix[:10])
# print('labels are ', train_label_matrix[:10])
clf = SVC(gamma='auto')
clf.fit(train_feature_matrix, train_label_matrix)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [18]:
predicted_labels = clf.predict(test_feature_matrix)
macro_f1_score = f1_score(test_label_matrix, predicted_labels, average='macro') 
print(macro_f1_score)
# macro_f1_score = calculate_macro_f1_score(predicted_labels, test_label_matrix)

0.7429527248759489


In [16]:
# for i, predicted_label in enumerate(predicted_labels):
#     if predicted_label != test_label_matrix[i]:
#         print(predicted_label, " ", test_label_matrix[i], " ", i)
# print("macro f1 score is ", macro_f1_score)
# print("predicted_label ", predicted_label, " true label ", examples_matrix[51,-1])

0   4   94
0   1   314
0   4   323
0   1   395
0   1   398
0   1   415
0   1   433
0   1   714
0   4   826
0   4   851
0   1   999
0   4   1124
0   1   1287
0   1   1374
0   1   1507
0   1   1815
0   4   1840
0   1   1952
0   1   2212
0   1   2312
0   4   2658
0   1   2703
0   1   2805
0   1   2998
0   4   3106
0   1   3225
0   4   3437
0   4   3465
0   1   3612
0   4   3674
0   1   3695
0   1   3733
0   1   3768
0   4   3825
0   4   3964
0   1   4136
0   4   4239
0   4   4468
0   1   4477
0   1   4648
0   1   4877
0   4   5121
0   4   5144
0   1   5204
0   1   5407
0   4   5435
0   4   5458
0   3   5469
0   4   5532
0   1   5696
0   4   5796
0   1   5833
0   1   5854
0   4   5896
0   4   6009
0   1   6197
0   4   6527
0   1   6553
0   1   6625
0   1   6637
0   4   6760
0   1   6829
0   4   6965
0   4   7062
0   4   7302
0   1   7481
0   1   7619
0   4   7632
0   4   7683
0   1   7739
0   1   7942
0   1   8018
0   1   8036
0   4   8056
0   1   8071
0   1   8178
0   1   8234
0   4   826

In [19]:
from sklearn import tree
dt_clf = tree.DecisionTreeClassifier()
dt_clf = dt_clf.fit(train_feature_matrix, train_label_matrix)
dt_predictions = dt_clf.predict(test_feature_matrix)
dt_macro_f1_score = f1_score(test_label_matrix, dt_predictions, average='macro') 
print(dt_macro_f1_score)

0.9647091703507833


In [40]:
from sklearn.neural_network import MLPClassifier
nn_clf = MLPClassifier(solver='adam', alpha=1e-5, hidden_layer_sizes=(50, 30), random_state=1)
nn_train_feature_matrix = train_feature_matrix.astype(np.float64)
nn_test_feature_matrix = test_feature_matrix.astype(np.float64)
nn_clf.fit(nn_train_feature_matrix, train_label_matrix)
nn_predictions = nn_clf.predict(nn_test_feature_matrix)
nn_macro_f1_score = f1_score(test_label_matrix, nn_predictions, average='macro') 
print(nn_macro_f1_score)

0.9593788421791793
