KDD cup 1999 network intrusion dataset #1

Makoto YUI edited this page Oct 16, 2013 · 1 revision
Clone this wiki locally

Download kddcup.data.gz, kddcup.data_10_percent.gz, and corrected.gz from KDD Cup 1999 Data.


create database kddcup99;
use kddcup99;

drop table training_raw;
create external table training_raw (
rowid BIGINT,
duration FLOAT,
protocol_type STRING,
service STRING,
flag STRING,
src_bytes FLOAT,
dst_bytes FLOAT,
land TINYINT,
wrong_fragment FLOAT,
urgent FLOAT,
hot FLOAT,
num_failed_logins FLOAT,
logged_in TINYINT,
num_compromised FLOAT,
root_shell FLOAT,
su_attempted FLOAT,
num_root FLOAT,
num_file_creations FLOAT,
num_shells FLOAT,
num_access_files FLOAT,
num_outbound_cmds FLOAT,
is_host_login TINYINT,
is_guest_login TINYINT,
count FLOAT,
srv_count FLOAT,
serror_rate FLOAT,
srv_serror_rate FLOAT,
rerror_rate FLOAT,
srv_rerror_rate FLOAT,
same_srv_rate FLOAT,
diff_srv_rate FLOAT,
srv_diff_host_rate FLOAT,
dst_host_count FLOAT,
dst_host_srv_count FLOAT,
dst_host_same_srv_rate FLOAT,
dst_host_diff_srv_rate FLOAT,
dst_host_same_src_port_rate FLOAT,
dst_host_srv_diff_host_rate FLOAT,
dst_host_serror_rate FLOAT,
dst_host_srv_serror_rate FLOAT,
dst_host_rerror_rate FLOAT,
dst_host_srv_rerror_rate FLOAT,
label STRING
) ROW FORMAT DELIMITED FIELDS TERMINATED BY ',' STORED AS TEXTFILE LOCATION '/dataset/kddcup99/training';

drop table training10p_raw;
create external table training10p_raw (
rowid BIGINT,
duration FLOAT,
protocol_type STRING,
service STRING,
flag STRING,
src_bytes FLOAT,
dst_bytes FLOAT,
land TINYINT,
wrong_fragment FLOAT,
urgent FLOAT,
hot FLOAT,
num_failed_logins FLOAT,
logged_in TINYINT,
num_compromised FLOAT,
root_shell FLOAT,
su_attempted FLOAT,
num_root FLOAT,
num_file_creations FLOAT,
num_shells FLOAT,
num_access_files FLOAT,
num_outbound_cmds FLOAT,
is_host_login TINYINT,
is_guest_login TINYINT,
count FLOAT,
srv_count FLOAT,
serror_rate FLOAT,
srv_serror_rate FLOAT,
rerror_rate FLOAT,
srv_rerror_rate FLOAT,
same_srv_rate FLOAT,
diff_srv_rate FLOAT,
srv_diff_host_rate FLOAT,
dst_host_count FLOAT,
dst_host_srv_count FLOAT,
dst_host_same_srv_rate FLOAT,
dst_host_diff_srv_rate FLOAT,
dst_host_same_src_port_rate FLOAT,
dst_host_srv_diff_host_rate FLOAT,
dst_host_serror_rate FLOAT,
dst_host_srv_serror_rate FLOAT,
dst_host_rerror_rate FLOAT,
dst_host_srv_rerror_rate FLOAT,
label STRING
) ROW FORMAT DELIMITED FIELDS TERMINATED BY ',' STORED AS TEXTFILE LOCATION '/dataset/kddcup99/training10p';

drop table testing_raw;
create external table testing_raw (
rowid BIGINT,
duration FLOAT,
protocol_type STRING,
service STRING,
flag STRING,
src_bytes FLOAT,
dst_bytes FLOAT,
land TINYINT,
wrong_fragment FLOAT,
urgent FLOAT,
hot FLOAT,
num_failed_logins FLOAT,
logged_in TINYINT,
num_compromised FLOAT,
root_shell FLOAT,
su_attempted FLOAT,
num_root FLOAT,
num_file_creations FLOAT,
num_shells FLOAT,
num_access_files FLOAT,
num_outbound_cmds FLOAT,
is_host_login TINYINT,
is_guest_login TINYINT,
count FLOAT,
srv_count FLOAT,
serror_rate FLOAT,
srv_serror_rate FLOAT,
rerror_rate FLOAT,
srv_rerror_rate FLOAT,
same_srv_rate FLOAT,
diff_srv_rate FLOAT,
srv_diff_host_rate FLOAT,
dst_host_count FLOAT,
dst_host_srv_count FLOAT,
dst_host_same_srv_rate FLOAT,
dst_host_diff_srv_rate FLOAT,
dst_host_same_src_port_rate FLOAT,
dst_host_srv_diff_host_rate FLOAT,
dst_host_serror_rate FLOAT,
dst_host_srv_serror_rate FLOAT,
dst_host_rerror_rate FLOAT,
dst_host_srv_rerror_rate FLOAT,
label STRING
) ROW FORMAT DELIMITED FIELDS TERMINATED BY ',' STORED AS TEXTFILE LOCATION '/dataset/kddcup99/testing';
awk 'BEGIN{ FS=","; OFS=","; } { print NR, $0; }' kddcup.data |
hadoop fs -put - /dataset/kddcup99/training/kddcup.data

awk 'BEGIN{ FS=","; OFS=","; } { print NR, $0; }' kddcup.data_10_percent |
hadoop fs -put - /dataset/kddcup99/training10p/kddcup.data_10_percent

awk 'BEGIN{ FS=","; OFS=","; } { print NR, $0; }' corrected |
hadoop fs -put - /dataset/kddcup99/testing/corrected.csv
create or replace view training
as
select
  rowid,
  label,
  array(duration, protocol_type, service, flag, src_bytes, dst_bytes, land, wrong_fragment, urgent, hot, num_failed_logins, logged_in, num_compromised, root_shell, su_attempted, num_root, num_file_creations, num_shells, num_access_files, num_outbound_cmds, is_host_login, is_guest_login, count, srv_count, serror_rate, srv_serror_rate, rerror_rate, srv_rerror_rate, same_srv_rate, diff_srv_rate, srv_diff_host_rate, dst_host_count, dst_host_srv_count, dst_host_same_srv_rate, dst_host_diff_srv_rate, dst_host_same_src_port_rate, dst_host_srv_diff_host_rate, dst_host_serror_rate, dst_host_srv_serror_rate, dst_host_rerror_rate, dst_host_srv_rerror_rate, bias) as features
from (
select 
  rowid,
  concat("1:", duration) as duration,
  concat("2_", protocol_type) as protocol_type,
  concat("3_", service) as service,
  concat("4_", flag) as flag,
  concat("5:", src_bytes) as src_bytes,
  concat("6:", dst_bytes) as dst_bytes,
  concat("7:", land) as land,
  concat("8:", wrong_fragment) as wrong_fragment,
  concat("9:", urgent) as urgent,
  concat("10:", hot) as hot,
  concat("11:", num_failed_logins) as num_failed_logins,
  concat("12:", logged_in) as logged_in,
  concat("13:", num_compromised) as num_compromised,
  concat("14:", root_shell) as root_shell,
  concat("15:", su_attempted) as su_attempted,
  concat("16:", num_root) as num_root,
  concat("17:", num_file_creations) as num_file_creations,
  concat("18:", num_shells) as num_shells,
  concat("19:", num_access_files) as num_access_files,
  concat("20:", num_outbound_cmds) as num_outbound_cmds,
  concat("21:", is_host_login) as is_host_login,
  concat("22:", is_guest_login) as is_guest_login,
  concat("23:", count) as count,
  concat("24:", srv_count) as srv_count,
  concat("25:", serror_rate) as serror_rate,
  concat("26:", srv_serror_rate) as srv_serror_rate,
  concat("27:", rerror_rate) as rerror_rate,
  concat("28:", srv_rerror_rate) as srv_rerror_rate,
  concat("29:", same_srv_rate) as same_srv_rate,
  concat("30:", diff_srv_rate) as diff_srv_rate,
  concat("31:", srv_diff_host_rate) as srv_diff_host_rate,
  concat("32:", dst_host_count) as dst_host_count,
  concat("33:", dst_host_srv_count) as dst_host_srv_count,
  concat("34:", dst_host_same_srv_rate) as dst_host_same_srv_rate,
  concat("35:", dst_host_diff_srv_rate) as dst_host_diff_srv_rate,
  concat("36:", dst_host_same_src_port_rate) as dst_host_same_src_port_rate,
  concat("37:", dst_host_srv_diff_host_rate) as dst_host_srv_diff_host_rate,
  concat("38:", dst_host_serror_rate) as dst_host_serror_rate,
  concat("39:", dst_host_srv_serror_rate) as dst_host_srv_serror_rate,
  concat("40:", dst_host_rerror_rate) as dst_host_rerror_rate,
  concat("41:", dst_host_srv_rerror_rate) as dst_host_srv_rerror_rate,
  -1 as bias,
  substr(label,0,length(label)-1) label
from training_raw
) t;

create or replace view training10p
as
select
  rowid,
  label,
  array(duration, protocol_type, service, flag, src_bytes, dst_bytes, land, wrong_fragment, urgent, hot, num_failed_logins, logged_in, num_compromised, root_shell, su_attempted, num_root, num_file_creations, num_shells, num_access_files, num_outbound_cmds, is_host_login, is_guest_login, count, srv_count, serror_rate, srv_serror_rate, rerror_rate, srv_rerror_rate, same_srv_rate, diff_srv_rate, srv_diff_host_rate, dst_host_count, dst_host_srv_count, dst_host_same_srv_rate, dst_host_diff_srv_rate, dst_host_same_src_port_rate, dst_host_srv_diff_host_rate, dst_host_serror_rate, dst_host_srv_serror_rate, dst_host_rerror_rate, dst_host_srv_rerror_rate, bias) as features
from (
select 
  rowid,
  concat("1:", duration) as duration,
  concat("2_", protocol_type) as protocol_type,
  concat("3_", service) as service,
  concat("4_", flag) as flag,
  concat("5:", src_bytes) as src_bytes,
  concat("6:", dst_bytes) as dst_bytes,
  concat("7:", land) as land,
  concat("8:", wrong_fragment) as wrong_fragment,
  concat("9:", urgent) as urgent,
  concat("10:", hot) as hot,
  concat("11:", num_failed_logins) as num_failed_logins,
  concat("12:", logged_in) as logged_in,
  concat("13:", num_compromised) as num_compromised,
  concat("14:", root_shell) as root_shell,
  concat("15:", su_attempted) as su_attempted,
  concat("16:", num_root) as num_root,
  concat("17:", num_file_creations) as num_file_creations,
  concat("18:", num_shells) as num_shells,
  concat("19:", num_access_files) as num_access_files,
  concat("20:", num_outbound_cmds) as num_outbound_cmds,
  concat("21:", is_host_login) as is_host_login,
  concat("22:", is_guest_login) as is_guest_login,
  concat("23:", count) as count,
  concat("24:", srv_count) as srv_count,
  concat("25:", serror_rate) as serror_rate,
  concat("26:", srv_serror_rate) as srv_serror_rate,
  concat("27:", rerror_rate) as rerror_rate,
  concat("28:", srv_rerror_rate) as srv_rerror_rate,
  concat("29:", same_srv_rate) as same_srv_rate,
  concat("30:", diff_srv_rate) as diff_srv_rate,
  concat("31:", srv_diff_host_rate) as srv_diff_host_rate,
  concat("32:", dst_host_count) as dst_host_count,
  concat("33:", dst_host_srv_count) as dst_host_srv_count,
  concat("34:", dst_host_same_srv_rate) as dst_host_same_srv_rate,
  concat("35:", dst_host_diff_srv_rate) as dst_host_diff_srv_rate,
  concat("36:", dst_host_same_src_port_rate) as dst_host_same_src_port_rate,
  concat("37:", dst_host_srv_diff_host_rate) as dst_host_srv_diff_host_rate,
  concat("38:", dst_host_serror_rate) as dst_host_serror_rate,
  concat("39:", dst_host_srv_serror_rate) as dst_host_srv_serror_rate,
  concat("40:", dst_host_rerror_rate) as dst_host_rerror_rate,
  concat("41:", dst_host_srv_rerror_rate) as dst_host_srv_rerror_rate,
  -1 as bias,
  substr(label,0,length(label)-1) label
from training10p_raw
) t;

create or replace view testing
as
select
  rowid,
  label,
  array(duration, protocol_type, service, flag, src_bytes, dst_bytes, land, wrong_fragment, urgent, hot, num_failed_logins, logged_in, num_compromised, root_shell, su_attempted, num_root, num_file_creations, num_shells, num_access_files, num_outbound_cmds, is_host_login, is_guest_login, count, srv_count, serror_rate, srv_serror_rate, rerror_rate, srv_rerror_rate, same_srv_rate, diff_srv_rate, srv_diff_host_rate, dst_host_count, dst_host_srv_count, dst_host_same_srv_rate, dst_host_diff_srv_rate, dst_host_same_src_port_rate, dst_host_srv_diff_host_rate, dst_host_serror_rate, dst_host_srv_serror_rate, dst_host_rerror_rate, dst_host_srv_rerror_rate, bias) as features
from (
select 
  rowid,
  concat("1:", duration) as duration,
  concat("2_", protocol_type) as protocol_type,
  concat("3_", service) as service,
  concat("4_", flag) as flag,
  concat("5:", src_bytes) as src_bytes,
  concat("6:", dst_bytes) as dst_bytes,
  concat("7:", land) as land,
  concat("8:", wrong_fragment) as wrong_fragment,
  concat("9:", urgent) as urgent,
  concat("10:", hot) as hot,
  concat("11:", num_failed_logins) as num_failed_logins,
  concat("12:", logged_in) as logged_in,
  concat("13:", num_compromised) as num_compromised,
  concat("14:", root_shell) as root_shell,
  concat("15:", su_attempted) as su_attempted,
  concat("16:", num_root) as num_root,
  concat("17:", num_file_creations) as num_file_creations,
  concat("18:", num_shells) as num_shells,
  concat("19:", num_access_files) as num_access_files,
  concat("20:", num_outbound_cmds) as num_outbound_cmds,
  concat("21:", is_host_login) as is_host_login,
  concat("22:", is_guest_login) as is_guest_login,
  concat("23:", count) as count,
  concat("24:", srv_count) as srv_count,
  concat("25:", serror_rate) as serror_rate,
  concat("26:", srv_serror_rate) as srv_serror_rate,
  concat("27:", rerror_rate) as rerror_rate,
  concat("28:", srv_rerror_rate) as srv_rerror_rate,
  concat("29:", same_srv_rate) as same_srv_rate,
  concat("30:", diff_srv_rate) as diff_srv_rate,
  concat("31:", srv_diff_host_rate) as srv_diff_host_rate,
  concat("32:", dst_host_count) as dst_host_count,
  concat("33:", dst_host_srv_count) as dst_host_srv_count,
  concat("34:", dst_host_same_srv_rate) as dst_host_same_srv_rate,
  concat("35:", dst_host_diff_srv_rate) as dst_host_diff_srv_rate,
  concat("36:", dst_host_same_src_port_rate) as dst_host_same_src_port_rate,
  concat("37:", dst_host_srv_diff_host_rate) as dst_host_srv_diff_host_rate,
  concat("38:", dst_host_serror_rate) as dst_host_serror_rate,
  concat("39:", dst_host_srv_serror_rate) as dst_host_srv_serror_rate,
  concat("40:", dst_host_rerror_rate) as dst_host_rerror_rate,
  concat("41:", dst_host_srv_rerror_rate) as dst_host_srv_rerror_rate,
  -1 as bias,
  substr(label,0,length(label)-1) label
from testing_raw
) t;

create table testing_exploded as
select 
  rowid,
  label,
  split(feature,":")[0] as feature,
  cast(COALESCE(split(feature,":")[1],1.0) as float) as value
from 
  testing 
  LATERAL VIEW explode(features) t AS feature;