In [1]:
import os
features="duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,count,srv_count,serror_rate,srv_serror_rate,rerror_rate,srv_rerror_rate,same_srv_rate,diff_srv_rate,srv_diff_host_rate,dst_host_count,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,label"
protocol_types = {"icmp":0,
"tcp":1,
"udp":2}
aggregated_labels = {"back":"dos",
"land":"dos",
"neptune":"dos",
"pod":"dos",
"smurf":"dos",
"teardrop":"dos",
"normal":"normal"}
deleted_labels = ["buffer_overflow",
"ftp_write",
"guess_passwd",
"imap",
"ipsweep",
"loadmodule",
"multihop",
"nmap",
"perl",
"phf",
"portsweep",
"rootkit",
"satan",
"spy",
"warezclient",
"warezmaster"]
service = {"auth":0,
"bgp":1,
"courier":2,
"csnet_ns":3,
"ctf":4,
"daytime":5,
"discard":6,
"domain":7,
"domain_u":8,
"echo":9,
"eco_i":10,
"ecr_i":11,
"efs":12,
"exec":13,
"finger":14,
"ftp":15,
"ftp_data":16,
"gopher":17,
"hostnames":18,
"http":19,
"http_443":20,
"imap4":21,
"IRC":22,
"iso_tsap":23,
"klogin":24,
"kshell":25,
"ldap":26,
"link":27,
"login":28,
"mtp":29,
"name":30,
"netbios_dgm":31,
"netbios_ns":32,
"netbios_ssn":33,
"netstat":34,
"nnsp":35,
"nntp":36,
"ntp_u":37,
"other":38,
"pm_dump":39,
"pop_2":40,
"pop_3":41,
"printer":42,
"private":43,
"red_i":44,
"remote_job":45,
"rje":46,
"shell":47,
"smtp":48,
"sql_net":49,
"ssh":50,
"sunrpc":51,
"supdup":52,
"systat":53,
"telnet":54,
"tftp_u":55,
"time":56,
"tim_i":57,
"urh_i":58,
"urp_i":59,
"uucp":60,
"uucp_path":61,
"vmnet":62,
"whois":63,
"X11":64,
"Z39_50":65}
flags = {"OTH":0,
"REJ":1,
"RSTO":2,
"RSTOS0":3,
"RSTR":4,
"S0":5,
"S1":6,
"S2":7,
"S3":8,
"SF":9,
"SH":10}

deleted_features = [42]
for i in range(10,23):
    deleted_features.append(i)
deleted_features.sort()
deleted_features.reverse()

file = open("D:\\kddcup.data_10_percent\\kddcup_data","r")

new_filename = "preprocessed_kddcup_data"
new_file = open(new_filename,"w")
new_file.write(features+"\n")

for line in file:
    line = line.replace("\n","")
    tokens = line.split(",")
    
    label = tokens[41]
    label = label.replace(".","")
    
    if label not in deleted_labels:
        label = aggregated_labels[label]
        
        tokens[1] = str(protocol_types[tokens[1]])
        tokens[2] = str(service[tokens[2]])
        tokens[3] = str(flags[tokens[3]])
        
        for i in deleted_features:
            tokens.pop(i-1)
          
        tokens.append(label)
        new_file.write(",".join(tokens) + "\n")

file.close()
new_file.close()

print("File created: " + new_filename)

File created: preprocessed_kddcup_data


In [4]:
import random
import math
import time
from multiprocessing.dummy import Pool as ThreadPool


def minkowski(arr1,arr2,p=2):
	if len(arr1) != len(arr2):
		return None
	distance = 0
	for i,j in zip(arr1,arr2):
		distance += math.pow(abs(float(i)-float(j)),p)
	return math.pow(distance,(float(1)/float(p)))

def generate_series(size,percentage):
	train_size = int(size * percentage / 100)
	train_index = []
	for i in range(train_size):
		index = random.randint(0,size)
		while index in train_index:
			index = random.randint(0,size)
		train_index.append(index)
	train_index.sort()
	return train_index

def train_test_split(data,label,percentage=66):
	size = len(data)
	train_index = generate_series(size,percentage)
	train_data = []
	test_data = []
	train_label = []
	test_label = []

	for i in range(size):
		if i in train_index:
			train_data.append(data[i])
			train_label.append(label[i])
		else:
			test_data.append(data[i])
			test_label.append(label[i])

	return train_data,train_label,test_data,test_label

def get_max_vote(d):
	v = list(d.values())
	k = list(d.keys())
	vote = k[v.index(max(v))]
	return vote

def get_voted_label(arr,neighbors):
	labels = {}
	for i in range(neighbors):
		try:
			labels[arr[i][1]]+=1
		except:
			labels[arr[i][1]]=1
	return get_max_vote(labels)

def read_data(filename,delimitter=","):
	data = []
	labels = []
	
	with open("C:\\Users\\adity\\preprocessed_kddcup_data") as f: lines = [line.rstrip('\n') for line in f]
	for line in lines:
		tokens = line.split(delimitter)
		labels.append(tokens[len(tokens)-1])
		data.append(tokens[0:len(tokens)-1])
	f.close()

	return data,labels


class knn:


	def __init__(self,threads=8,neighbors=5):
		self.threads = threads
		self.neighbors = neighbors
		self.ratio = 0


	def fit(self,train_data,train_label):
		self.train_data = train_data
		self.train_label = train_label


	def predict(self,test_sample):
		distances = []
		for train_d,train_l in zip(self.train_data,self.train_label):
			distances.append((minkowski(train_d,test_sample),train_l))
		distances.sort()
		voted_label = get_voted_label(distances,self.neighbors)
		return voted_label


	def test_one(self,test_sample,test_label):
		if self.predict(test_sample) == test_label:
			self.ratio += 1
			return True
		else:
			return False


	def test_some(self,test_data,test_label,test_size):
		ratio = 0
		for i in range(test_size):
			if self.test_one(test_data[i],test_label[i]):
				ratio += 1
		return float(ratio) / float(test_size)


	def test_all(self,test_data,test_label):
		size = len(test_data)
		return self.test_some(test_data,test_label,size)

data,label = read_data("preoprocessed_kdd_data")
print("File read successfully")
train_x, train_y, test_x, test_y = train_test_split(data,label)
print("Train and test sets created")
knn = knn()
knn.fit(train_x,train_y)
print("Model trained")
start = time.time()
print(knn.test_some(test_x,test_y,500))
print("Model tested in " + str(time.time() - start))
print(knn.ratio)

File read successfully
Train and test sets created
Model trained


ValueError: could not convert string to float: 'duration'

In [9]:
import time
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier

class bcolors:
    HEADER = '\033[95m'
    OKBLUE = '\033[94m'
    OKGREEN = '\033[92m'
    WARNING = '\033[93m'
    FAIL = '\033[91m'
    ENDC = '\033[0m'
    BOLD = '\033[1m'
    UNDERLINE = '\033[4m'

def get_features(data):
	features = []
	for key in data.keys():
		features.append(key)
	features.remove("label")
	return features

def feature_selection(data):	
	from sklearn.feature_selection import SelectKBest
	from sklearn.feature_selection import chi2
	from sklearn.feature_selection import f_classif

	features = get_features(data)
	X = data[features]
	y = data["label"]

	selector = SelectKBest(score_func=chi2,k=5)
	selector.fit(X,y)
	indexes_selected = selector.get_support(indices=True)
	
	selected_features = []
	for i in indexes_selected:
		selected_features.append(features[i])

	print("[+] Selected features -> " + bcolors.OKBLUE + str(selected_features) + bcolors.ENDC)
	return data[selected_features]


def with_feature_selection(data):
	print(bcolors.HEADER + bcolors.UNDERLINE + "Testing with selected features" + bcolors.ENDC)
	
	features = get_features(data)
	X = feature_selection(data)
	y = data["label"]
	
	X_train, X_test, y_train, y_test = train_test_split(X,y,random_state=0)
	
	knn = KNeighborsClassifier(n_neighbors = 5)
	
	start = time.time()
	knn.fit(X_train,y_train)

	print("[+] Classifier trained in " + bcolors.OKGREEN +  str(time.time() - start)   + bcolors.ENDC)
	
	start = time.time()
	score = knn.score(X_test,y_test)
	
	print("[+] Model Evaluated in " + bcolors.OKGREEN + str(time.time()-start)   + bcolors.ENDC)
	print("[!] Test score is " + bcolors.OKBLUE + str(score)   + bcolors.ENDC)
	print(bcolors.BOLD + bcolors.WARNING + "-------------------------------------------------" + bcolors.ENDC)

def with_full_features(data):
	print(bcolors.HEADER + bcolors.UNDERLINE + "Testing with full data" + bcolors.ENDC)
	
	features = get_features(data)
	X = data[features]
	y = data["label"]
	
	X_train, X_test, y_train, y_test = train_test_split(X,y,random_state=0)
	
	knn = KNeighborsClassifier(n_neighbors = 5)
	
	start = time.time()
	knn.fit(X_train,y_train)

	print("[+] Classifier trained in " + bcolors.OKGREEN + str(time.time() - start)   + bcolors.ENDC)
	
	start = time.time()
	score = knn.score(X_test,y_test)
	
	print("[+] Model Evaluated in " + bcolors.OKGREEN + str(time.time()-start)   + bcolors.ENDC)
	print("[!] Test score is " + bcolors.OKBLUE + str(score)   + bcolors.ENDC)
	print(bcolors.BOLD + bcolors.WARNING + "-------------------------------------------------" + bcolors.ENDC)

def test_with_real(data,test_filename):
	
	#X = feature_selection(data)
	X = data[get_features(data)]
	y = data["label"]
	knn = KNeighborsClassifier(n_neighbors = 5)
	knn.fit(X,y)

	test_file = open(test_filename,"r")
	lines = []
	for line in test_file:
		line = line.replace("\n","")
		tokens = line.split(",")
		lines.append(tokens)
	test_file.close()

	print(len(lines))
	test_results = []
	for i in range(len(lines)):
		print(i)
		print(lines[i])
		test_results.append(knn.predict(lines[i]))

	count_dict = {}
	for i in test_results:
		try:
			count_dict[i]+=1
		except:
			count_dict[i]=1
	
	print(count_dict)

def main():
	filename="preprocessed_kddcup_data"
	data = pd.read_csv(filename)

	with_full_features(data)
	with_feature_selection(data)
	#test_with_real(data,"preprocessed_dos")


main()

[95m[4mTesting with full data[0m
[+] Classifier trained in [92m713.4444539546967[0m
[+] Model Evaluated in [92m683.712141752243[0m
[!] Test score is [94m0.9997626530478623[0m
[1m[93m-------------------------------------------------[0m
[95m[4mTesting with selected features[0m
[+] Selected features -> [94m['duration', 'dst_bytes', 'count', 'srv_count', 'dst_host_count'][0m
[+] Classifier trained in [92m243.20500445365906[0m
[+] Model Evaluated in [92m234.2224133014679[0m
[!] Test score is [94m0.9980193806062987[0m
[1m[93m-------------------------------------------------[0m
