In [1]:
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.preprocessing import normalize, minmax_scale
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve

In [2]:
train = pd.read_csv('data/KDDTrain+.txt')
train = train.iloc[:,0:42]
train.columns=['duration','protocol_type','service','flag','src_bytes','dst_bytes','land'
                                        ,'wrong_fragment','urgent','hot','num_failed_logins','logged_in','num_compromised',
                                      'root_shell','su_attempted','num_root','num_file_creations','num_shells','num_access_files',
                                     'num_outbound_cmds','is_host_login','is_guest_login','count','srv_count','serror_rate',
                                       'srv_serror_rate','rerror_rate','srv_rerror_rate','same_srv_rate','diff_srv_rate',
                                      'srv_diff_host_rate','dst_host_count','dst_host_srv_count','dst_host_same_srv_rate',
                                     'dst_host_diff_srv_rate','dst_host_same_src_port_rate','dst_host_srv_diff_host_rate',
                                     'dst_host_serror_rate','dst_host_srv_serror_rate','dst_host_rerror_rate',
                                      'dst_host_srv_rerror_rate','label']


In [3]:
test = pd.read_csv('data/KDDtest+.txt')
test = test.iloc[:,0:42]
test.columns=['duration','protocol_type','service','flag','src_bytes','dst_bytes','land'
                                    ,'wrong_fragment','urgent','hot','num_failed_logins','logged_in','num_compromised',
                                'root_shell','su_attempted','num_root','num_file_creations','num_shells','num_access_files',
                                     'num_outbound_cmds','is_host_login','is_guest_login','count','srv_count','serror_rate',
                                     'srv_serror_rate','rerror_rate','srv_rerror_rate','same_srv_rate','diff_srv_rate',
                                      'srv_diff_host_rate','dst_host_count','dst_host_srv_count','dst_host_same_srv_rate',
                                     'dst_host_diff_srv_rate','dst_host_same_src_port_rate','dst_host_srv_diff_host_rate',
                                  'dst_host_serror_rate','dst_host_srv_serror_rate','dst_host_rerror_rate',
                                      'dst_host_srv_rerror_rate','label']

In [4]:
train['label'].replace({ 'normal' : 0, 'neptune' : 1 ,'back': 1, 'land': 1, 'pod': 1, 'smurf': 1, 'teardrop': 1,'mailbomb': 1,
                        'apache2': 1, 'processtable': 1, 'udpstorm': 1, 'worm': 1,'ipsweep' : 0,'nmap' : 0,'portsweep' : 0,
                        'satan' : 0,'mscan' : 0,'saint' : 0,'ftp_write': 0,'guess_passwd': 0,'imap': 0,'multihop': 0,'phf': 0,
                        'spy': 0,'warezclient': 0,'warezmaster': 0,'sendmail': 0,'named': 0,'snmpgetattack': 0,'snmpguess': 0,
                         'xlock': 0,'xsnoop': 0,'httptunnel': 0,'buffer_overflow': 0,'loadmodule': 0,'perl': 0,'rootkit': 0,
                         'ps': 0,'sqlattack': 0,'xterm': 0},inplace = True)
train['protocol_type'].replace({ 'udp' : 0, 'tcp' : 1 ,'icmp': 2 },inplace = True)
train['flag'].replace({ 'OTH' : 0, 'REJ' : 1 ,'RSTO': 2 ,'RSTOS0':3 ,'RSTR': 4,'S0':5,
                           'S1':6,'S2':7, 'S3':8, 'SF':9,'SH':10 },inplace = True)
train['service'].replace({ 'aol':0,'auth':1,'bgp':2,'courier':3,'csnet_ns':4,'ctf':5,'daytime':6,'discard':7,'domain':8,
    'domain_u':9,'echo':10,'eco_i':11,'ecr_i':12, 'efs':13,'exec':14,'finger':15,'ftp':16,'ftp_data':17,'gopher':18,'harvest':19,
    'hostnames':20,'http':21,'http_2784':22,'http_443':23,'http_8001':24,'imap4':25,'IRC':26, 'iso_tsap':27,'klogin':28,'kshell':29,
    'ldap':30,'link':31,'login':32,'mtp':33,'name':34,'netbios_dgm':35,'netbios_ns':36,'netbios_ssn':37,'netstat':38,'nnsp':39,
    'nntp':40,'ntp_u':41,'other':42,'pm_dump':43,'pop_2':44,'pop_3':45,'printer':46,'private':47,'red_i':48,'remote_job':49,
    'rje':50,'shell':51,'smtp':52,'sql_net':53,'ssh':54,'sunrpc':55,'supdup':56,'systat':57,'telnet':58,'tftp_u':59,'tim_i':60,
    'time':61,'urh_i':62,'urp_i':63,'uucp':64,'uucp_path':65,'vmnet':66,'whois':67,'X11':68,'Z39_50':69},inplace = True)
train.head(20)

Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,...,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,label
0,0,0,42,9,146,0,0,0,0,0,...,1,0.0,0.6,0.88,0.0,0.0,0.0,0.0,0.0,0
1,0,1,47,5,0,0,0,0,0,0,...,26,0.1,0.05,0.0,0.0,1.0,1.0,0.0,0.0,1
2,0,1,21,9,232,8153,0,0,0,0,...,255,1.0,0.0,0.03,0.04,0.03,0.01,0.0,0.01,0
3,0,1,21,9,199,420,0,0,0,0,...,255,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
4,0,1,47,1,0,0,0,0,0,0,...,19,0.07,0.07,0.0,0.0,0.0,0.0,1.0,1.0,1
5,0,1,47,5,0,0,0,0,0,0,...,9,0.04,0.05,0.0,0.0,1.0,1.0,0.0,0.0,1
6,0,1,47,5,0,0,0,0,0,0,...,15,0.06,0.07,0.0,0.0,1.0,1.0,0.0,0.0,1
7,0,1,49,5,0,0,0,0,0,0,...,23,0.09,0.05,0.0,0.0,1.0,1.0,0.0,0.0,1
8,0,1,47,5,0,0,0,0,0,0,...,13,0.05,0.06,0.0,0.0,1.0,1.0,0.0,0.0,1
9,0,1,47,1,0,0,0,0,0,0,...,12,0.05,0.07,0.0,0.0,0.0,0.0,1.0,1.0,1


In [5]:
test['label'].replace({ 'normal' : 0, 'neptune' : 1 ,'back': 1, 'land': 1, 'pod': 1, 'smurf': 1, 'teardrop': 1,'mailbomb': 1,
                        'apache2': 1, 'processtable': 1, 'udpstorm': 1, 'worm': 1,'ipsweep' : 0,'nmap' : 0,'portsweep' : 0,
                        'satan' : 0,'mscan' : 0,'saint' : 0,'ftp_write': 0,'guess_passwd': 0,'imap': 0,'multihop': 0,'phf': 0,
                        'spy': 0,'warezclient': 0,'warezmaster': 0,'sendmail': 0,'named': 0,'snmpgetattack': 0,'snmpguess': 0,
                         'xlock': 0,'xsnoop': 0,'httptunnel': 0,'buffer_overflow': 0,'loadmodule': 0,'perl': 0,'rootkit': 0,
                         'ps': 0,'sqlattack': 0,'xterm': 0},inplace = True)
test['protocol_type'].replace({ 'udp' : 0, 'tcp' : 1 ,'icmp': 2 },inplace = True)
test['flag'].replace({ 'OTH' : 0, 'REJ' : 1 ,'RSTO': 2 ,'RSTOS0':3 ,'RSTR': 4,'S0':5,
                           'S1':6,'S2':7, 'S3':8, 'SF':9,'SH':10 },inplace = True)
test['service'].replace({ 'aol':0,'auth':1,'bgp':2,'courier':3,'csnet_ns':4,'ctf':5,'daytime':6,'discard':7,'domain':8,
    'domain_u':9,'echo':10,'eco_i':11,'ecr_i':12, 'efs':13,'exec':14,'finger':15,'ftp':16,'ftp_data':17,'gopher':18,'harvest':19,
    'hostnames':20,'http':21,'http_2784':22,'http_443':23,'http_8001':24,'imap4':25,'IRC':26, 'iso_tsap':27,'klogin':28,'kshell':29,
    'ldap':30,'link':31,'login':32,'mtp':33,'name':34,'netbios_dgm':35,'netbios_ns':36,'netbios_ssn':37,'netstat':38,'nnsp':39,
    'nntp':40,'ntp_u':41,'other':42,'pm_dump':43,'pop_2':44,'pop_3':45,'printer':46,'private':47,'red_i':48,'remote_job':49,
    'rje':50,'shell':51,'smtp':52,'sql_net':53,'ssh':54,'sunrpc':55,'supdup':56,'systat':57,'telnet':58,'tftp_u':59,'tim_i':60,
    'time':61,'urh_i':62,'urp_i':63,'uucp':64,'uucp_path':65,'vmnet':66,'whois':67,'X11':68,'Z39_50':69},inplace = True)
test.head(20)

Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,...,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,label
0,0,1,47,1,0,0,0,0,0,0,...,1,0.0,0.06,0.0,0.0,0.0,0.0,1.0,1.0,1
1,2,1,17,9,12983,0,0,0,0,0,...,86,0.61,0.04,0.61,0.02,0.0,0.0,0.0,0.0,0
2,0,2,11,9,20,0,0,0,0,0,...,57,1.0,0.0,1.0,0.28,0.0,0.0,0.0,0.0,0
3,1,1,58,2,0,15,0,0,0,0,...,86,0.31,0.17,0.03,0.02,0.0,0.0,0.83,0.71,0
4,0,1,21,9,267,14515,0,0,0,0,...,255,1.0,0.0,0.01,0.03,0.01,0.0,0.0,0.0,0
5,0,1,52,9,1022,387,0,0,0,0,...,28,0.11,0.72,0.0,0.0,0.0,0.0,0.72,0.04,0
6,0,1,58,9,129,174,0,0,0,0,...,255,1.0,0.0,0.0,0.0,0.01,0.01,0.02,0.02,0
7,0,1,21,9,327,467,0,0,0,0,...,255,1.0,0.0,0.01,0.03,0.0,0.0,0.0,0.0,0
8,0,1,16,9,26,157,0,0,0,0,...,26,0.5,0.08,0.02,0.0,0.0,0.0,0.0,0.0,0
9,0,1,58,9,0,0,0,0,0,0,...,128,0.5,0.01,0.0,0.0,0.0,0.0,0.66,0.32,0


In [6]:

 
train=pd.DataFrame(minmax_scale(train, feature_range=(0,1)),
            columns=train.columns, index=train.index) 
test=pd.DataFrame(minmax_scale(test, feature_range=(0,1)),
            columns=test.columns, index=test.index) 

x_train = train.iloc[:,0:40]
y_train = train.iloc[:,41]
x_test = test.iloc[:,0:40]
y_test = test.iloc[:,41]

test.head(20)

Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,...,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,label
0,0.0,0.5,0.676471,0.1,0.0,0.0,0.0,0.0,0.0,0.0,...,0.003922,0.0,0.06,0.0,0.0,0.0,0.0,1.0,1.0,1.0
1,3.5e-05,0.5,0.235294,0.9,0.0002066513,0.0,0.0,0.0,0.0,0.0,...,0.337255,0.61,0.04,0.61,0.02,0.0,0.0,0.0,0.0,0.0
2,0.0,1.0,0.147059,0.9,3.183413e-07,0.0,0.0,0.0,0.0,0.0,...,0.223529,1.0,0.0,1.0,0.28,0.0,0.0,0.0,0.0,0.0
3,1.7e-05,0.5,0.838235,0.2,0.0,1.1e-05,0.0,0.0,0.0,0.0,...,0.337255,0.31,0.17,0.03,0.02,0.0,0.0,0.83,0.71,0.0
4,0.0,0.5,0.294118,0.9,4.249857e-06,0.010784,0.0,0.0,0.0,0.0,...,1.0,1.0,0.0,0.01,0.03,0.01,0.0,0.0,0.0,0.0
5,0.0,0.5,0.75,0.9,1.626724e-05,0.000288,0.0,0.0,0.0,0.0,...,0.109804,0.11,0.72,0.0,0.0,0.0,0.0,0.72,0.04,0.0
6,0.0,0.5,0.838235,0.9,2.053302e-06,0.000129,0.0,0.0,0.0,0.0,...,1.0,1.0,0.0,0.0,0.0,0.01,0.01,0.02,0.02,0.0
7,0.0,0.5,0.294118,0.9,5.204881e-06,0.000347,0.0,0.0,0.0,0.0,...,1.0,1.0,0.0,0.01,0.03,0.0,0.0,0.0,0.0,0.0
8,0.0,0.5,0.220588,0.9,4.138437e-07,0.000117,0.0,0.0,0.0,0.0,...,0.101961,0.5,0.08,0.02,0.0,0.0,0.0,0.0,0.0,0.0
9,0.0,0.5,0.838235,0.9,0.0,0.0,0.0,0.0,0.0,0.0,...,0.501961,0.5,0.01,0.0,0.0,0.0,0.0,0.66,0.32,0.0


In [7]:
#decision tree

dt = DecisionTreeClassifier()
dt.fit(x_train,y_train)



# add data

In [8]:
from joblib import dump, load
dump(dt, 'sample.joblib') 



['sample.joblib']