KDD cup 1999 network intrusion dataset #2 (modified)

Makoto YUI edited this page Nov 30, 2013 · 11 revisions
Clone this wiki locally

Get detail about the dataset modification of this example in the following paper [1].

[1] Kenji Yamanishi, Jun-Ichi Takeuchi, Graham Williams, and Peter Milne. "On-line unsupervised outlier detection using finite mixtures with discounting learning algorithms", In Proc. KDD, pp.320-324, 2000.


Stats of the dataset

select count(1) from training_raw;
> 4898431

select count(1) from training_raw where label != "normal.";
> 3925650
-- The original data set has 3,925,651 attacks (80.1%) out of 4,898,431 records!!

select count(1) from training_raw where logged_in = 1;
> 703067

-- #instrusions
select count(1) from training_raw where logged_in = 1 and label != "normal.";
> 3377

select count(1) from testing_raw;
> 311029

select count(1) from testing_raw where label != "normal."
> 250436

-- #instrusions 11031 / 311029 = 0.03546..
-- 3.5466% is intrusion in the test dataset.
select count(1) from testing_raw where logged_in = 1 and label != "normal."
> 11031

Modification to the dataset

create or replace view training_raw_norm
as 
select * from training_raw WHERE label == "normal.";

-- 1,000,000 - 3,377 = 972,781
SELECT count(1) 
FROM training_raw_norm TABLESAMPLE(996623 ROWS)
> 972781

create or replace view training_raw2
as
select * from (
select * from training_raw where logged_in = 1 and label != "normal."
union all
select * from training_raw_norm TABLESAMPLE(996623 ROWS)
) t;

select count(1) from training_raw2;
> 976158
-- 972781 + 3377 = 976158
-- 3377 / 976158 * 100 = 0.346% is intrusion in the training dataset
drop table training2;
create table training2
as
select
  rowid,
  label,
  array(duration, protocol_type, service, flag, src_bytes, dst_bytes, land, wrong_fragment, urgent, hot, num_failed_logins, logged_in, num_compromised, root_shell, su_attempted, num_root, num_file_creations, num_shells, num_access_files, num_outbound_cmds, is_host_login, is_guest_login, count, srv_count, serror_rate, srv_serror_rate, rerror_rate, srv_rerror_rate, same_srv_rate, diff_srv_rate, srv_diff_host_rate, dst_host_count, dst_host_srv_count, dst_host_same_srv_rate, dst_host_diff_srv_rate, dst_host_same_src_port_rate, dst_host_srv_diff_host_rate, dst_host_serror_rate, dst_host_srv_serror_rate, dst_host_rerror_rate, dst_host_srv_rerror_rate, bias) as features
from (
select 
  rowid,
  concat("1:", if(duration = 0, -1.0, log10(duration+0.1))) as duration,
  concat("2_", protocol_type) as protocol_type,
  concat("3_", service) as service,
  concat("4_", flag) as flag,
  concat("5:", if(src_bytes = 0, -1.0, log10(src_bytes+0.1))) as src_bytes,
  concat("6:", if(dst_bytes = 0, -1.0, log10(dst_bytes+0.1))) as dst_bytes,
  concat("7:", land) as land,
  concat("8:", if(wrong_fragment = 0, -1.0, log10(wrong_fragment+0.1))) as wrong_fragment,
  concat("9:", if(urgent = 0, -1.0, log10(urgent+0.1))) as urgent,
  concat("10:", if(hot = 0, -1.0, log10(hot+0.1))) as hot,
  concat("11:", if(num_failed_logins = 0, -1.0, log10(num_failed_logins+0.1))) as num_failed_logins,
  concat("12:", logged_in) as logged_in,
  concat("13:", if(num_compromised = 0, -1.0, log10(num_compromised+0.1))) as num_compromised,
  concat("14:", if(root_shell = 0, -1.0, log10(root_shell+0.1))) as root_shell,
  concat("15:", if(su_attempted = 0, -1.0, log10(su_attempted+0.1))) as su_attempted,
  concat("16:", if(num_root = 0, -1.0, log10(num_root+0.1))) as num_root,
  concat("17:", if(num_file_creations = 0, -1.0, log10(num_file_creations+0.1))) as num_file_creations,
  concat("18:", if(num_shells = 0, -1.0, log10(num_shells+0.1))) as num_shells,
  concat("19:", if(num_access_files = 0, -1.0, log10(num_access_files+0.1))) as num_access_files,
  concat("20:", if(num_outbound_cmds = 0, -1.0, log10(num_outbound_cmds+0.1))) as num_outbound_cmds,
  concat("21:", is_host_login) as is_host_login,
  concat("22:", is_guest_login) as is_guest_login,
  concat("23:", if(count = 0, -1.0, log10(count+0.1))) as count,
  concat("24:", if(srv_count = 0, -1.0, log10(srv_count+0.1))) as srv_count,
  concat("25:", if(serror_rate = 0, -1.0, log10(serror_rate+0.1))) as serror_rate,
  concat("26:", if(srv_serror_rate = 0, -1.0, log10(srv_serror_rate+0.1))) as srv_serror_rate,
  concat("27:", if(rerror_rate = 0, -1.0, log10(rerror_rate+0.1))) as rerror_rate,
  concat("28:", if(srv_rerror_rate = 0, -1.0, log10(srv_rerror_rate+0.1))) as srv_rerror_rate,
  concat("29:", if(same_srv_rate = 0, -1.0, log10(same_srv_rate+0.1))) as same_srv_rate,
  concat("30:", if(diff_srv_rate = 0, -1.0, log10(diff_srv_rate+0.1))) as diff_srv_rate,
  concat("31:", if(srv_diff_host_rate = 0, -1.0, log10(srv_diff_host_rate+0.1))) as srv_diff_host_rate,
  concat("32:", if(dst_host_count = 0, -1.0, log10(dst_host_count+0.1))) as dst_host_count,
  concat("33:", if(dst_host_srv_count = 0, -1.0, log10(dst_host_srv_count+0.1))) as dst_host_srv_count,
  concat("34:", if(dst_host_same_srv_rate = 0, -1.0, log10(dst_host_same_srv_rate+0.1))) as dst_host_same_srv_rate,
  concat("35:", if(dst_host_diff_srv_rate = 0, -1.0, log10(dst_host_diff_srv_rate+0.1))) as dst_host_diff_srv_rate,
  concat("36:", if(dst_host_same_src_port_rate = 0, -1.0, log10(dst_host_same_src_port_rate+0.1))) as dst_host_same_src_port_rate,
  concat("37:", if(dst_host_srv_diff_host_rate = 0, -1.0, log10(dst_host_srv_diff_host_rate+0.1))) as dst_host_srv_diff_host_rate,
  concat("38:", if(dst_host_serror_rate = 0, -1.0, log10(dst_host_serror_rate+0.1))) as dst_host_serror_rate,
  concat("39:", if(dst_host_srv_serror_rate = 0, -1.0, log10(dst_host_srv_serror_rate+0.1))) as dst_host_srv_serror_rate,
  concat("40:", if(dst_host_rerror_rate = 0, -1.0, log10(dst_host_rerror_rate+0.1))) as dst_host_rerror_rate,
  concat("41:", if(dst_host_srv_rerror_rate = 0, -1.0, log10(dst_host_srv_rerror_rate+0.1))) as dst_host_srv_rerror_rate,
  -1 as bias,
  substr(label,0,length(label)-1) as label
from training_raw2
) t
CLUSTER BY CAST(rand(47) * 100 as INT), CAST(rand(49) * 100 as INT), CAST(rand(50) * 100 as INT);

create or replace view testing2
as
select
  rowid,
  label,
  array(duration, protocol_type, service, flag, src_bytes, dst_bytes, land, wrong_fragment, urgent, hot, num_failed_logins, logged_in, num_compromised, root_shell, su_attempted, num_root, num_file_creations, num_shells, num_access_files, num_outbound_cmds, is_host_login, is_guest_login, count, srv_count, serror_rate, srv_serror_rate, rerror_rate, srv_rerror_rate, same_srv_rate, diff_srv_rate, srv_diff_host_rate, dst_host_count, dst_host_srv_count, dst_host_same_srv_rate, dst_host_diff_srv_rate, dst_host_same_src_port_rate, dst_host_srv_diff_host_rate, dst_host_serror_rate, dst_host_srv_serror_rate, dst_host_rerror_rate, dst_host_srv_rerror_rate, bias) as features
from (
select 
  rowid,
  concat("1:", if(duration = 0, -1.0, log10(duration+0.1))) as duration,
  concat("2_", protocol_type) as protocol_type,
  concat("3_", service) as service,
  concat("4_", flag) as flag,
  concat("5:", if(src_bytes = 0, -1.0, log10(src_bytes+0.1))) as src_bytes,
  concat("6:", if(dst_bytes = 0, -1.0, log10(dst_bytes+0.1))) as dst_bytes,
  concat("7:", land) as land,
  concat("8:", if(wrong_fragment = 0, -1.0, log10(wrong_fragment+0.1))) as wrong_fragment,
  concat("9:", if(urgent = 0, -1.0, log10(urgent+0.1))) as urgent,
  concat("10:", if(hot = 0, -1.0, log10(hot+0.1))) as hot,
  concat("11:", if(num_failed_logins = 0, -1.0, log10(num_failed_logins+0.1))) as num_failed_logins,
  concat("12:", logged_in) as logged_in,
  concat("13:", if(num_compromised = 0, -1.0, log10(num_compromised+0.1))) as num_compromised,
  concat("14:", if(root_shell = 0, -1.0, log10(root_shell+0.1))) as root_shell,
  concat("15:", if(su_attempted = 0, -1.0, log10(su_attempted+0.1))) as su_attempted,
  concat("16:", if(num_root = 0, -1.0, log10(num_root+0.1))) as num_root,
  concat("17:", if(num_file_creations = 0, -1.0, log10(num_file_creations+0.1))) as num_file_creations,
  concat("18:", if(num_shells = 0, -1.0, log10(num_shells+0.1))) as num_shells,
  concat("19:", if(num_access_files = 0, -1.0, log10(num_access_files+0.1))) as num_access_files,
  concat("20:", if(num_outbound_cmds = 0, -1.0, log10(num_outbound_cmds+0.1))) as num_outbound_cmds,
  concat("21:", is_host_login) as is_host_login,
  concat("22:", is_guest_login) as is_guest_login,
  concat("23:", if(count = 0, -1.0, log10(count+0.1))) as count,
  concat("24:", if(srv_count = 0, -1.0, log10(srv_count+0.1))) as srv_count,
  concat("25:", if(serror_rate = 0, -1.0, log10(serror_rate+0.1))) as serror_rate,
  concat("26:", if(srv_serror_rate = 0, -1.0, log10(srv_serror_rate+0.1))) as srv_serror_rate,
  concat("27:", if(rerror_rate = 0, -1.0, log10(rerror_rate+0.1))) as rerror_rate,
  concat("28:", if(srv_rerror_rate = 0, -1.0, log10(srv_rerror_rate+0.1))) as srv_rerror_rate,
  concat("29:", if(same_srv_rate = 0, -1.0, log10(same_srv_rate+0.1))) as same_srv_rate,
  concat("30:", if(diff_srv_rate = 0, -1.0, log10(diff_srv_rate+0.1))) as diff_srv_rate,
  concat("31:", if(srv_diff_host_rate = 0, -1.0, log10(srv_diff_host_rate+0.1))) as srv_diff_host_rate,
  concat("32:", if(dst_host_count = 0, -1.0, log10(dst_host_count+0.1))) as dst_host_count,
  concat("33:", if(dst_host_srv_count = 0, -1.0, log10(dst_host_srv_count+0.1))) as dst_host_srv_count,
  concat("34:", if(dst_host_same_srv_rate = 0, -1.0, log10(dst_host_same_srv_rate+0.1))) as dst_host_same_srv_rate,
  concat("35:", if(dst_host_diff_srv_rate = 0, -1.0, log10(dst_host_diff_srv_rate+0.1))) as dst_host_diff_srv_rate,
  concat("36:", if(dst_host_same_src_port_rate = 0, -1.0, log10(dst_host_same_src_port_rate+0.1))) as dst_host_same_src_port_rate,
  concat("37:", if(dst_host_srv_diff_host_rate = 0, -1.0, log10(dst_host_srv_diff_host_rate+0.1))) as dst_host_srv_diff_host_rate,
  concat("38:", if(dst_host_serror_rate = 0, -1.0, log10(dst_host_serror_rate+0.1))) as dst_host_serror_rate,
  concat("39:", if(dst_host_srv_serror_rate = 0, -1.0, log10(dst_host_srv_serror_rate+0.1))) as dst_host_srv_serror_rate,
  concat("40:", if(dst_host_rerror_rate = 0, -1.0, log10(dst_host_rerror_rate+0.1))) as dst_host_rerror_rate,
  concat("41:", if(dst_host_srv_rerror_rate = 0, -1.0, log10(dst_host_srv_rerror_rate+0.1))) as dst_host_srv_rerror_rate,
  -1 as bias,
  substr(label,0,length(label)-1) as label
from testing_raw
) t;

drop table testing_exploded2;
create table testing_exploded2 as
select 
  rowid,
  label,
  split(feature,":")[0] as feature,
  cast(COALESCE(split(feature,":")[1],1.0) as float) as value
from 
  testing2 
  LATERAL VIEW explode(features) t AS feature;