## 도메인 지식(컬럼 설명)

- su_attempted : discrete
    - 1 if ``su root'' command attempted; 0 otherwise
        - su root : (리눅스) root 권한을 주는 명령어

- num_root : continuous
    - number of ``root'' accesses

- num_file_creations : continuous
    - number of file creation operations

- num_shells : continuous
    - number of shell prompts
        - shell: (리눅스) 운영체제의 커널과 사용자 사이의 다리 역할
        - shell prompt : 명령어 입력창

- num_access_files : continuous
    - number of operations on access control files

- num_outbound_cmds : continuous
    - number of outbound commands in an ftp session
        - FTP - 파일을 교환하기 위한 표준 프로토콜(원하는 파일을 전송)

- is_host_login : discrete
    - 1 if the login belongs to the "host" list; 0 otherwise

- is_guest_login : discrete
    - 1 if the login is a "guest" login; 0 otherwise



- count : continuous
    - number of connections to the same host as the current connection in the past two seconds

- srv_count :  - continuous
    - number of connections to the same service as the current connection in the past two seconds

- serror_rate : continuous
    - % of connections that have "SYN" errors
        - SYN error - syntax error(구문 오류)

- srv_serror_rate : continuous
    - % of connections that have "SYN" errors

- rerror_rate : continuous
    - % of connections that have ``REJ'' errors
        - REJ error - Rejection error

- srv_rerror_rate : continuous
    - % of connections that have ``REJ'' errors

- same_srv_rate : continuous
    - % of connections to the same service


## 자료 살펴보기

In [2]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [34]:
train = pd.read_csv('Train_data.csv')
test = pd.read_csv('Test_data.csv')

In [4]:
train

Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,...,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,class
0,0,tcp,ftp_data,SF,491,0,0,0,0,0,...,25,0.17,0.03,0.17,0.00,0.00,0.00,0.05,0.00,normal
1,0,udp,other,SF,146,0,0,0,0,0,...,1,0.00,0.60,0.88,0.00,0.00,0.00,0.00,0.00,normal
2,0,tcp,private,S0,0,0,0,0,0,0,...,26,0.10,0.05,0.00,0.00,1.00,1.00,0.00,0.00,anomaly
3,0,tcp,http,SF,232,8153,0,0,0,0,...,255,1.00,0.00,0.03,0.04,0.03,0.01,0.00,0.01,normal
4,0,tcp,http,SF,199,420,0,0,0,0,...,255,1.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,normal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25187,0,tcp,exec,RSTO,0,0,0,0,0,0,...,7,0.03,0.06,0.00,0.00,0.00,0.00,1.00,1.00,anomaly
25188,0,tcp,ftp_data,SF,334,0,0,0,0,0,...,39,1.00,0.00,1.00,0.18,0.00,0.00,0.00,0.00,anomaly
25189,0,tcp,private,REJ,0,0,0,0,0,0,...,13,0.05,0.07,0.00,0.00,0.00,0.00,1.00,1.00,anomaly
25190,0,tcp,nnsp,S0,0,0,0,0,0,0,...,20,0.08,0.06,0.00,0.00,1.00,1.00,0.00,0.00,anomaly


In [5]:
train.columns

Index(['duration', 'protocol_type', 'service', 'flag', 'src_bytes',
       'dst_bytes', 'land', 'wrong_fragment', 'urgent', 'hot',
       'num_failed_logins', 'logged_in', 'num_compromised', 'root_shell',
       'su_attempted', 'num_root', 'num_file_creations', 'num_shells',
       'num_access_files', 'num_outbound_cmds', 'is_host_login',
       'is_guest_login', 'count', 'srv_count', 'serror_rate',
       'srv_serror_rate', 'rerror_rate', 'srv_rerror_rate', 'same_srv_rate',
       'diff_srv_rate', 'srv_diff_host_rate', 'dst_host_count',
       'dst_host_srv_count', 'dst_host_same_srv_rate',
       'dst_host_diff_srv_rate', 'dst_host_same_src_port_rate',
       'dst_host_srv_diff_host_rate', 'dst_host_serror_rate',
       'dst_host_srv_serror_rate', 'dst_host_rerror_rate',
       'dst_host_srv_rerror_rate', 'class'],
      dtype='object')

In [6]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25192 entries, 0 to 25191
Data columns (total 42 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   duration                     25192 non-null  int64  
 1   protocol_type                25192 non-null  object 
 2   service                      25192 non-null  object 
 3   flag                         25192 non-null  object 
 4   src_bytes                    25192 non-null  int64  
 5   dst_bytes                    25192 non-null  int64  
 6   land                         25192 non-null  int64  
 7   wrong_fragment               25192 non-null  int64  
 8   urgent                       25192 non-null  int64  
 9   hot                          25192 non-null  int64  
 10  num_failed_logins            25192 non-null  int64  
 11  logged_in                    25192 non-null  int64  
 12  num_compromised              25192 non-null  int64  
 13  root_shell      

In [8]:
train.shape

(25192, 42)

In [10]:
train['duration'].value_counts()

0        23168
1          374
2          165
3          102
4           75
         ...  
1721         1
7850         1
13967        1
1609         1
6141         1
Name: duration, Length: 758, dtype: int64

In [13]:
train['protocol_type'].value_counts()

tcp     20526
udp      3011
icmp     1655
Name: protocol_type, dtype: int64

In [14]:
train['service'].value_counts()

http         8003
private      4351
domain_u     1820
smtp         1449
ftp_data     1396
             ... 
urh_i           4
red_i           3
pm_dump         3
tim_i           2
http_8001       1
Name: service, Length: 66, dtype: int64

In [15]:
train['flag'].value_counts()

SF        14973
S0         7009
REJ        2216
RSTR        497
RSTO        304
S1           88
SH           43
S2           21
RSTOS0       21
S3           15
OTH           5
Name: flag, dtype: int64

In [16]:
train['src_bytes'].value_counts()

0       9866
8        738
1        480
44       467
45       416
        ... 
3414       1
1303       1
1287       1
3334       1
1983       1
Name: src_bytes, Length: 1665, dtype: int64

In [17]:
train['class'].value_counts()

normal     13449
anomaly    11743
Name: class, dtype: int64

In [None]:
train['']

## 데이터 전처리

### Encoding
- LabelEncoding
- OneHotEncoding

- 이번 자료에는 컬럼이 많기 때문에 차원을 늘리는 OneHotEncoding보단 LabelEncoding을 사용

In [21]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

enc_dict = {}

def encoding_label(x):
    le = LabelEncoder()
    r = le.fit_transform(x)  #1차원
    enc_dict[x.name] = le.classes_  #x.name: 컬럼명
    return r

In [46]:
train_enc = train[train.columns].apply(encoding_label)
train_enc

Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,...,dst_host_count,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate
0,0,1,19,9,414,0,0,0,0,0,...,150,25,17,3,17,0,0,0,5,0
1,0,2,41,9,126,0,0,0,0,0,...,255,1,0,60,88,0,0,0,0,0
2,0,1,46,5,0,0,0,0,0,0,...,255,26,10,5,0,0,99,87,0,0
3,0,1,22,9,212,2892,0,0,0,0,...,30,255,100,0,3,4,3,1,0,1
4,0,1,22,9,179,362,0,0,0,0,...,255,255,100,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25187,0,1,16,2,0,0,0,0,0,0,...,255,7,3,6,0,0,0,0,100,99
25188,0,1,19,9,314,0,0,0,0,0,...,1,39,100,0,100,18,0,0,0,0
25189,0,1,46,1,0,0,0,0,0,0,...,255,13,5,7,0,0,0,0,100,99
25190,0,1,38,5,0,0,0,0,0,0,...,255,20,8,6,0,0,99,87,0,0


In [42]:
train.corr()

Unnamed: 0,duration,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,num_failed_logins,logged_in,num_compromised,...,dst_host_count,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate
duration,1.0,0.084864,0.013258,-0.001012,-0.010358,-0.000486,0.004202,0.011108,-0.063703,0.095215,...,0.055174,-0.11253,-0.119321,0.263489,0.24097,-0.025485,-0.066513,-0.06624,0.18707,0.208435
src_bytes,0.084864,1.0,0.003611,-9e-05,-0.000916,-6.2e-05,0.000995,-0.00026,-0.00204,-0.000196,...,-0.009764,-0.00852,-0.006776,0.001026,0.002316,-0.001238,-0.006346,-0.006227,-0.00213,0.00619
dst_bytes,0.013258,0.003611,1.0,-0.00035,-0.003586,0.000345,0.002539,0.005197,0.012704,0.035852,...,-0.03093,-0.00098,0.022392,-0.012971,0.024078,-0.006006,-0.015584,-0.014543,-0.014094,-0.012803
land,-0.001012,-9e-05,-0.00035,1.0,-0.000813,-5.6e-05,-0.000819,-0.000234,-0.007196,-0.000195,...,-0.01634,-0.008743,0.009531,-0.003929,0.024635,0.053037,0.014291,0.005596,-0.003432,-0.003335
wrong_fragment,-0.010358,-0.000916,-0.003586,-0.000813,1.0,-0.000575,-0.008386,-0.002392,-0.073674,-0.001995,...,0.04002,-0.047256,-0.051845,0.053177,0.03467,-0.020174,-0.053786,-0.05723,0.027718,-0.034143
urgent,-0.000486,-6.2e-05,0.000345,-5.6e-05,-0.000575,1.0,0.002346,-0.000165,0.007801,0.002886,...,0.004612,-0.006324,-0.007014,-0.002105,-0.003013,-0.001815,-0.004044,-0.003953,-0.002427,-0.002358
hot,0.004202,0.000995,0.002539,-0.000819,-0.008386,0.002346,1.0,0.004893,0.113115,0.002005,...,-0.009083,-0.048495,-0.033442,-0.007962,-0.03064,-0.02117,-0.055628,-0.056255,-0.027628,-0.029719
num_failed_logins,0.011108,-0.00026,0.005197,-0.000234,-0.002392,-0.000165,0.004893,1.0,-0.006872,0.015787,...,-0.02646,-0.022315,-0.003336,0.000298,-0.00891,-0.007551,-0.009527,-0.010238,0.016791,0.018693
logged_in,-0.063703,-0.00204,0.012704,-0.007196,-0.073674,0.007801,0.113115,-0.006872,1.0,0.027083,...,-0.395905,0.624839,0.607379,-0.25582,-0.159278,-0.05739,-0.49045,-0.492446,-0.27443,-0.270507
num_compromised,0.095215,-0.000196,0.035852,-0.000195,-0.001995,0.002886,0.002005,0.015787,0.027083,1.0,...,-0.015637,-0.01822,-0.011794,0.001757,-0.004146,0.013608,-0.007911,-0.007716,-0.007199,-0.004361


In [35]:
# 타겟 컬럼 분리
train_1 = train['class']
train_1

0         normal
1         normal
2        anomaly
3         normal
4         normal
          ...   
25187    anomaly
25188    anomaly
25189    anomaly
25190    anomaly
25191    anomaly
Name: class, Length: 25192, dtype: object

In [44]:
train_1 = le.fit_transform(train_1)
train_1

array([1, 1, 0, ..., 0, 0, 0])

In [47]:
train_enc = train_enc.drop('class', axis=1)
train_enc

KeyError: "['class'] not found in axis"

In [48]:
train_enc

Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,...,dst_host_count,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate
0,0,1,19,9,414,0,0,0,0,0,...,150,25,17,3,17,0,0,0,5,0
1,0,2,41,9,126,0,0,0,0,0,...,255,1,0,60,88,0,0,0,0,0
2,0,1,46,5,0,0,0,0,0,0,...,255,26,10,5,0,0,99,87,0,0
3,0,1,22,9,212,2892,0,0,0,0,...,30,255,100,0,3,4,3,1,0,1
4,0,1,22,9,179,362,0,0,0,0,...,255,255,100,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25187,0,1,16,2,0,0,0,0,0,0,...,255,7,3,6,0,0,0,0,100,99
25188,0,1,19,9,314,0,0,0,0,0,...,1,39,100,0,100,18,0,0,0,0
25189,0,1,46,1,0,0,0,0,0,0,...,255,13,5,7,0,0,0,0,100,99
25190,0,1,38,5,0,0,0,0,0,0,...,255,20,8,6,0,0,99,87,0,0


### Scaling
- Min/Max Scaler
- Standard Scaler

In [38]:
train['dst_bytes'].value_counts()

0        13574
105        309
8314       175
44         115
42         105
         ...  
1324         1
16225        1
16193        1
3895         1
56           1
Name: dst_bytes, Length: 3922, dtype: int64

In [39]:
train['src_bytes'].value_counts()

0       9866
8        738
1        480
44       467
45       416
        ... 
3414       1
1303       1
1287       1
3334       1
1983       1
Name: src_bytes, Length: 1665, dtype: int64

In [49]:
# 최대최솟값만을 변경하는 MinMaxScaler보다는 값의 격차를 줄이기 위한 비례 스케일링이 알맞은 것 같기에 StandardScaler 선택
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()

In [50]:
scaled_1 = scaler.fit_transform(train_enc)
scaled_1

array([[-0.16030767, -0.12606067, -0.64538418, ..., -0.62797373,
        -0.22166806, -0.37423402],
       [-0.16030767,  2.21591606,  0.76892549, ..., -0.62797373,
        -0.38513984, -0.37423402],
       [-0.16030767, -0.12606067,  1.09035951, ...,  1.61428581,
        -0.38513984, -0.37423402],
       ...,
       [-0.16030767, -0.12606067,  1.09035951, ..., -0.62797373,
         2.88429591,  2.7778005 ],
       [-0.16030767, -0.12606067,  0.57606508, ...,  1.61428581,
        -0.38513984, -0.37423402],
       [-0.16030767, -0.12606067, -0.77395779, ...,  1.61428581,
        -0.38513984, -0.37423402]])

In [51]:
pd.DataFrame(scaled_1)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,31,32,33,34,35,36,37,38,39,40
0,-0.160308,-0.126061,-0.645384,0.753021,0.612944,-0.543381,-0.00891,-0.092819,-0.006301,-0.099885,...,-0.328634,-0.813985,-0.779157,-0.280673,0.073120,-0.316468,-0.641870,-0.627974,-0.221668,-0.374234
1,-0.160308,2.215916,0.768925,0.753021,-0.220338,-0.543381,-0.00891,-0.092819,-0.006301,-0.099885,...,0.732059,-1.030895,-1.157831,2.764403,2.375620,-0.316468,-0.641870,-0.627974,-0.385140,-0.374234
2,-0.160308,-0.126061,1.090360,-0.739924,-0.584899,-0.543381,-0.00891,-0.092819,-0.006301,-0.099885,...,0.732059,-0.804947,-0.935081,-0.173828,-0.478183,-0.316468,1.603971,1.614286,-0.385140,-0.374234
3,-0.160308,-0.126061,-0.452524,0.753021,0.028489,2.517447,-0.00891,-0.092819,-0.006301,-0.099885,...,-1.540854,1.264742,1.069663,-0.440940,-0.380894,0.115363,-0.573814,-0.602201,-0.385140,-0.342395
4,-0.160308,-0.126061,-0.452524,0.753021,-0.066991,-0.160249,-0.00891,-0.092819,-0.006301,-0.099885,...,0.732059,1.264742,1.069663,-0.440940,-0.478183,-0.316468,-0.641870,-0.627974,-0.385140,-0.374234
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25187,-0.160308,-0.126061,-0.838245,-1.859633,-0.584899,-0.543381,-0.00891,-0.092819,-0.006301,-0.099885,...,0.732059,-0.976667,-1.091006,-0.120406,-0.478183,-0.316468,-0.641870,-0.627974,2.884296,2.777801
25188,-0.160308,-0.126061,-0.645384,0.753021,0.323610,-0.543381,-0.00891,-0.092819,-0.006301,-0.099885,...,-1.833807,-0.687453,1.069663,-0.440940,2.764775,1.626770,-0.641870,-0.627974,-0.385140,-0.374234
25189,-0.160308,-0.126061,1.090360,-2.232869,-0.584899,-0.543381,-0.00891,-0.092819,-0.006301,-0.099885,...,0.732059,-0.922440,-1.046456,-0.066984,-0.478183,-0.316468,-0.641870,-0.627974,2.884296,2.777801
25190,-0.160308,-0.126061,0.576065,-0.739924,-0.584899,-0.543381,-0.00891,-0.092819,-0.006301,-0.099885,...,0.732059,-0.859174,-0.979631,-0.120406,-0.478183,-0.316468,1.603971,1.614286,-0.385140,-0.374234
