In [30]:
import pandas as pd
import numpy as np
from sklearn import datasets
from sklearn.neighbors import KNeighborsClassifier

In [31]:
# Load the training set. Later cells access the columns protocol_type,
# service, flag and classification by name, so the file needs a header row
# that provides those names.
kdd = pd.read_csv("data/kddcup_data_10_percent")

Here we have to clean the data for the K Nearest Neighbors algorithm. The algorithm cannot deal with strings, so we give each distinct string its own integer value.
Create a dictionary for each string-valued column.

In [32]:
# Integer codes for the three string-valued feature columns. Each distinct
# string gets the position at which it first appears in the training data.
# Series.unique() is called directly: Series.ravel() is deprecated in recent
# pandas releases, and a column is already one-dimensional.
protocol_types = kdd["protocol_type"].unique()
protocol_dict = dict(zip(protocol_types, range(len(protocol_types))))

service_types = kdd["service"].unique()
service_dict = dict(zip(service_types, range(len(service_types))))

flag_types = kdd["flag"].unique()
flag_dict = dict(zip(flag_types, range(len(flag_types))))

# Attack label -> attack category for the training data. Labels that are not
# keys here (e.g. "normal.") are left unchanged by replace().
classification_dict = {
    # dos
    "back.": "dos", "land.": "dos", "neptune.": "dos",
    "pod.": "dos", "smurf.": "dos", "teardrop.": "dos",
    # u2r
    "loadmodule.": "u2r", "buffer_overflow.": "u2r",
    "perl.": "u2r", "rootkit.": "u2r",
    # r2l
    "ftp_write.": "r2l", "guess_passwd.": "r2l", "imap.": "r2l",
    "multihop.": "r2l", "phf.": "r2l", "spy.": "r2l",
    "warezclient.": "r2l", "warezmaster.": "r2l",
    # probe
    "ipsweep.": "probe", "nmap.": "probe",
    "portsweep.": "probe", "satan.": "probe",
}

# The test data uses every training label plus the additional labels below,
# so start from the training mapping instead of repeating it by hand.
# NOTE(review): the per-class row totals of this notebook's confusion matrices
# (DOS 237594, probe 6572, U2R 70, R2L 6200) differ from those of the contest
# winners' table at the end (229853, 4166, 228, 16189). Presumably some of the
# test-only labels below are grouped differently from the contest's categories;
# confirm before comparing the tables row by row.
classification_dict_test = dict(classification_dict)
classification_dict_test.update({
    # dos
    "snmpgetattack.": "dos", "apache2.": "dos", "udpstorm.": "dos",
    "processtable.": "dos", "mailbomb.": "dos",
    # u2r
    "xterm.": "u2r", "ps.": "u2r", "sqlattack.": "u2r",
    # r2l
    "named.": "r2l", "xlock.": "r2l", "xsnoop.": "r2l",
    "sendmail.": "r2l", "httptunnel.": "r2l", "worm.": "r2l",
    # probe
    "saint.": "probe", "mscan.": "probe", "snmpguess.": "probe",
})



A quick way to replace values in a DataFrame column is to use the `replace` method with a dictionary. It looks up each value of the column among the dictionary's keys and replaces it with the corresponding dictionary value; values that are not keys are left unchanged.

In [33]:
# Encode the string columns of the training frame with the lookup
# dictionaries. The result is assigned back to the column: calling
# replace(..., inplace=True) on kdd["col"] is chained assignment, which
# pandas warns about and which leaves kdd unmodified once Copy-on-Write is
# active.
for column, mapping in (
    ("protocol_type", protocol_dict),
    ("service", service_dict),
    ("flag", flag_dict),
    ("classification", classification_dict),
):
    kdd[column] = kdd[column].replace(mapping)

Read in the test data and apply the same cleanup to it.

In [34]:
# Load the test set. The next cell accesses the same named columns as for the
# training data (protocol_type, service, flag, classification).
kdd_test = pd.read_csv("data/kddcup_test_data")

In [35]:

# Encode the string columns of the test frame with the dictionaries built
# from the training data (and the extended label mapping for the test set).
# The result is assigned back to the column: calling
# replace(..., inplace=True) on kdd_test["col"] is chained assignment, which
# pandas warns about and which leaves kdd_test unmodified once Copy-on-Write
# is active.
for column, mapping in (
    ("protocol_type", protocol_dict),
    ("service", service_dict),
    ("flag", flag_dict),
    ("classification", classification_dict_test),
):
    kdd_test[column] = kdd_test[column].replace(mapping)

# Any "icmp" service value still present after the step above gets the fixed
# code 65.
# NOTE(review): 65 is hard-coded. service_dict uses the codes
# 0..len(service_dict)-1, so if it holds 66 or more entries this code is
# shared with a training service - confirm, or use len(service_dict) instead.
kdd_test["service"] = kdd_test["service"].replace({"icmp": 65})

Create the training and testing data and their associated categories.

In [55]:
# Split each frame's underlying array into a feature matrix (columns 0-40)
# and a target vector (column 41); the target is what the classifier is
# fitted on and evaluated against in the cells below.
values, values_test = kdd.values, kdd_test.values

kdd_X_train, kdd_y_train = values[:, :41], values[:, 41]
kdd_X_test, kdd_y_test = values_test[:, :41], values_test[:, 41]





Using the sklearn package is very simple. Create an instance of `KNeighborsClassifier` and fit it on the training set and its classifications. Here I am using the default parameters that come with the `KNeighborsClassifier` class: the search algorithm is chosen automatically (`algorithm='auto'`), the leaf size is 30, distance is measured with the Minkowski metric (`p=2`, i.e. Euclidean distance), and the 5 nearest neighbors vote with uniform weights.

Then predict the test data to see how well it did.

In [42]:
# K-nearest-neighbours classifier with every parameter left at its default.
knn = KNeighborsClassifier()

In [43]:
# Fit on the training features and their class labels; as the last expression
# of the cell, the fitted estimator is displayed as the cell output.
knn.fit(kdd_X_train, kdd_y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform')

In [56]:
# Predict a class label for every row of the test feature matrix.
kdd_predict = knn.predict(kdd_X_test)

In [59]:
#normal,probe,dos,u2r,r2l
normal = [0,0,0,0,0]
probe = [0,0,0,0,0]
dos = [0,0,0,0,0]
u2r = [0,0,0,0,0]
r2l = [0,0,0,0,0]
results_table = [normal,probe,dos,u2r,r2l]
results_table

[[0, 0, 0, 0, 0],
 [0, 0, 0, 0, 0],
 [0, 0, 0, 0, 0],
 [0, 0, 0, 0, 0],
 [0, 0, 0, 0, 0]]

In [66]:
# Tally the confusion matrix: the row index is the actual class of a test
# record and the column index is the class predicted for it.
result_hash = {"normal.":0,"probe":1,"dos":2,"u2r":3,"r2l":4}

# Zero the counts in place first so that re-running this cell does not add a
# second pass on top of the tallies from a previous run.
for row in results_table:
    row[:] = [0] * len(row)

for actual, predicted in zip(kdd_y_test, kdd_predict):
    results_table[result_hash[actual]][result_hash[predicted]] += 1

In [75]:
# Convert the table of counts to a NumPy array and display it.
results_table = np.array(results_table)
results_table

array([[ 60260,    133,    187,      1,     12],
       [  3157,   2867,    544,      0,      4],
       [ 14290,    111, 223192,      0,      1],
       [    57,      0,      0,     13,      0],
       [  5559,    534,     98,      0,      9]])

<html>
<head>
<title>K Nearest Neighbors Confusion Matrix</title>
</head>
<body>
<p>K Nearest Neighbors Confusion Matrix</p>
<table border="1">
<tr>
<td><p>Predicted</p><p> Actual</p></td>
<td>Normal</td>
<td>Probe</td>
<td>DOS</td>
<td>U2R</td>
<td>R2L</td>
<td>Correct %</td>
</tr>
<tr>
<td>normal</td>
<td>60260</td>
<td>133</td>
<td>187</td>
<td>1</td>
<td>12</td>
<td>99.4%</td>
</tr>
<tr>
<td>probe</td>
<td>3157</td>
<td>2867</td>
<td>544</td>
<td>0</td>
<td>4</td>
<td>43.6%</td>

</tr>
<tr>
<td>DOS</td>
<td>14290</td>
<td>111</td>
<td>223192</td>
<td>0</td>
<td>1</td>
<td>93.9%</td>
</tr>
<tr>
<td>U2r</td>
<td>57</td>
<td>0</td>
<td>0</td>
<td>13</td>
<td>0</td>
<td>18.5%</td>
</tr>
<tr>
<td>R2L</td>
<td>5559</td>
<td>534</td>
<td>98</td>
<td>0</td>
<td>9</td>
<td>0.1%</td>
</tr>
<tr>
<td>Correct %</td>
<td>72.3%</td>
<td>78.6%</td>
<td>99.6%</td>
<td>92.8%</td>
<td>34.6%</td>
<td></td>
</tr>
</table>
</body>
</html>


<html>
<head>
<title>HTML Tables</title>
</head>
<body>
<p>Association Confusion Matrix </p>
<table border="1">
<tr>
<td><p>Predicted</p><p> Actual</p></td>
<td>Normal</td>
<td>Probe</td>
<td>DOS</td>
<td>U2R</td>
<td>R2L</td>
<td>Correct %</td>
</tr>
<tr>
<td>normal</td>
<td>52681</td>
<td>6089</td>
<td>660</td>
<td>593</td>
<td>570</td>
<td>86.9%</td>
</tr>
<tr>
<td>probe</td>
<td>1495</td>
<td>4667</td>
<td>4</td>
<td>2</td>
<td>404</td>
<td>71.0%</td>

</tr>
<tr>
<td>DOS</td>
<td>14330</td>
<td>59026</td>
<td>163968</td>
<td>171</td>
<td>99</td>
<td>69.0%</td>
</tr>
<tr>
<td>U2r</td>
<td>1</td>
<td>11</td>
<td>0</td>
<td>52</td>
<td>6</td>
<td>74.2%</td>
</tr>
<tr>
<td>R2L</td>
<td>3809</td>
<td>622</td>
<td>0</td>
<td>803</td>
<td>966</td>
<td>15.5%</td>
</tr>
<tr>
<td>Correct %</td>
<td>72.8%</td>
<td>6.6%</td>
<td>99.5%</td>
<td>3.2%</td>
<td>47.2%</td>
<td></td>
</tr>
</table>
</body>
</html>

  <html>
<head>
<title>HTML Tables</title>
</head>
<body>
<p>Winners of the KDD-99 Cup Confusion Matrix </p>
<table border="1">
<tr>
<td><p>Predicted</p><p> Actual</p></td>
<td>Normal</td>
<td>Probe</td>
<td>DOS</td>
<td>U2R</td>
<td>R2L</td>
<td>Correct %</td>
</tr>
<tr>
<td>normal</td>
<td>60262</td>
<td>243</td>
<td>78</td>
<td>4</td>
<td>6</td>
<td>99.5%</td>
</tr>
<tr>
<td>probe</td>
<td>511</td>
<td>3471</td>
<td>184</td>
<td>0</td>
<td>0</td>
<td>83.3%</td>

</tr>
<tr>
<td>DOS</td>
<td>5299</td>
<td>1328</td>
<td>223226</td>
<td>0</td>
<td>0</td>
<td>97.1%</td>
</tr>
<tr>
<td>U2r</td>
<td>168</td>
<td>20</td>
<td>0</td>
<td>30</td>
<td>10</td>
<td>13.2%</td>
</tr>
<tr>
<td>R2L</td>
<td>14527</td>
<td>294</td>
<td>0</td>
<td>8</td>
<td>1360</td>
<td>8.4%</td>
</tr>
<tr>
<td>Correct %</td>
<td>74.6%</td>
<td>64.8%</td>
<td>99.9%</td>
<td>71.4%</td>
<td>98.8%</td>
<td></td>
</tr>
</table>
</body>
</html>    
