# 컬럼 설명

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
df = pd.read_csv('../Train_data.csv')

# 데이터 info

# X_cat, X_num, target분리

In [3]:
# 레이블 인코딩
from sklearn.preprocessing import LabelEncoder
enc_dict={}
def encoding_label(x):
    le = LabelEncoder()
    r = le.fit_transform(x)
    enc_dict[x.name] = le.classes_
    return r

In [4]:
# 원핫 인코딩
X_cat = df[['protocol_type', 'service', 'flag']]
# X_cat = df.select_dtypes(include=['object'])
X_cat = pd.get_dummies(X_cat)

# 스케일링 + 데이터프레임 합치기

In [5]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()

X_num = df.drop(['protocol_type', 'service', 'flag', 'class'], axis=1)
scaler.fit(X_num)
X_scaled = scaler.transform(X_num)
X_scaled = pd.DataFrame(X_scaled, index=X_num.index, columns=X_num.columns)

X = pd.concat([X_scaled, X_cat], axis=1)

y = df[['class']]#.apply(encoding_label)

In [7]:
X, y

(       duration  src_bytes  dst_bytes  ...  flag_S3  flag_SF  flag_SH
 0     -0.113551  -0.009889  -0.039310  ...        0        1        0
 1     -0.113551  -0.010032  -0.039310  ...        0        1        0
 2     -0.113551  -0.010093  -0.039310  ...        0        0        0
 3     -0.113551  -0.009996   0.052473  ...        0        1        0
 4     -0.113551  -0.010010  -0.034582  ...        0        1        0
 ...         ...        ...        ...  ...      ...      ...      ...
 25187 -0.113551  -0.010093  -0.039310  ...        0        0        0
 25188 -0.113551  -0.009954  -0.039310  ...        0        1        0
 25189 -0.113551  -0.010093  -0.039310  ...        0        0        0
 25190 -0.113551  -0.010093  -0.039310  ...        0        0        0
 25191 -0.113551  -0.010093  -0.039310  ...        0        0        0
 
 [25192 rows x 118 columns],        class
 0          1
 1          1
 2          0
 3          1
 4          1
 ...      ...
 25187      0
 25188

# coef_

In [None]:
x1 = np.argwhere(abs(model_lr.coef_) > 5)
x1

# 로지스틱 리그레션

## 로지스틱 리그레션 주요 파라미터
- penalty와 C가 있다.
- penalty는 규제의 유형을 설정하며 'l2'로 설정 시 L2 규제를, 'l1'으로 설정 시 L1 규제를 뜻한다. 기본은 'l2'이다.
- C는 규제 강도를 조절하는 alpha 값의 역수이다. C 값이 작을 수록 규제 강도가 크다.

In [6]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1)

In [8]:
model_lr = LogisticRegression(max_iter=500)
model_lr.fit(X_train, y_train.values.ravel())

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=500,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [89]:
from sklearn.metrics import accuracy_score,classification_report, confusion_matrix, plot_confusion_matrix

In [91]:
pred = model_lr.predict(X_test)
print(accuracy_score(y_test, pred))
print(classification_report(y_test, pred))

0.9715533209843874
              precision    recall  f1-score   support

           0       0.98      0.96      0.97      3507
           1       0.97      0.98      0.97      4051

    accuracy                           0.97      7558
   macro avg       0.97      0.97      0.97      7558
weighted avg       0.97      0.97      0.97      7558



# GridSearchCV

In [77]:
from sklearn import metrics
from sklearn.model_selection import GridSearchCV

model_lr = LogisticRegression(max_iter=1000)

param_grid = {'penalty' : ['l2', 'l1'], 'C' : [0.01, 0.1, 1, 5, 10]}

grid_search = GridSearchCV(model_lr, param_grid=param_grid, scoring=['accuracy', 'recall', 'precision'], refit='accuracy', cv=5, n_jobs=-1)
# grid_search = GridSearchCV(model_lr, #학습시킬 모델
#                            param_grid=param_grid, #하이퍼파라미터 후보
# #                            scoring="accuracy", #평가 지표
#                            scoring=['accuracy', 'recall', 'precision'], #평가지표를 여러개 지정시 리스트로 묶어준다.
#                            refit="accuracy", #평가지표가 여러개일때 어떤 지표를 기준으로 best_estimator를 만들것인지 지정.
#                            cv=5, #교차검증(Cross Validation)의 folder 개수(몇개로 나눌 것인지.)
#                            n_jobs=-1)

In [78]:
# 학습(train) - 최적의 하이퍼파라미터 조합
grid_search.fit(X_train, y_train.values.ravel())

GridSearchCV(cv=5, error_score=nan,
             estimator=LogisticRegression(C=1.0, class_weight=None, dual=False,
                                          fit_intercept=True,
                                          intercept_scaling=1, l1_ratio=None,
                                          max_iter=1000, multi_class='auto',
                                          n_jobs=None, penalty='l2',
                                          random_state=None, solver='lbfgs',
                                          tol=0.0001, verbose=0,
                                          warm_start=False),
             iid='deprecated', n_jobs=-1,
             param_grid={'C': [0.01, 0.1, 1, 5, 10], 'penalty': ['l2', 'l1']},
             pre_dispatch='2*n_jobs', refit='accuracy',
             return_train_score=False,
             scoring=['accuracy', 'recall', 'precision'], verbose=0)

In [79]:
from sklearn.metrics import accuracy_score
pred = grid_search.predict(X_test) 
accuracy_score(y_test, pred)

0.9744641439534268

In [81]:
# 하이퍼파라미터 조합별 결과
results = pd.DataFrame(grid_search.cv_results_)#.sort_values('')
results

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_C,param_penalty,params,split0_test_accuracy,split1_test_accuracy,split2_test_accuracy,split3_test_accuracy,split4_test_accuracy,mean_test_accuracy,std_test_accuracy,rank_test_accuracy,split0_test_recall,split1_test_recall,split2_test_recall,split3_test_recall,split4_test_recall,mean_test_recall,std_test_recall,rank_test_recall,split0_test_precision,split1_test_precision,split2_test_precision,split3_test_precision,split4_test_precision,mean_test_precision,std_test_precision,rank_test_precision
0,0.340194,0.013823,0.011078,0.000928,0.01,l2,"{'C': 0.01, 'penalty': 'l2'}",0.961724,0.959739,0.968528,0.965693,0.970788,0.965295,0.004109,5,0.975519,0.976064,0.981915,0.978191,0.984034,0.979145,0.00332,5,0.953694,0.949793,0.959958,0.958312,0.962019,0.956755,0.004432,5
1,0.018517,0.000579,0.0,0.0,0.01,l1,"{'C': 0.01, 'penalty': 'l1'}",,,,,,,,6,,,,,,,,6,,,,,,,,6
2,0.743033,0.045632,0.012173,0.001918,0.1,l2,"{'C': 0.1, 'penalty': 'l2'}",0.967111,0.96541,0.973348,0.968528,0.973057,0.969491,0.003189,4,0.978712,0.980851,0.983511,0.97766,0.983502,0.980847,0.002403,4,0.960313,0.95544,0.96705,0.963818,0.966527,0.96263,0.004317,4
3,0.018247,0.000456,0.0,0.0,0.1,l1,"{'C': 0.1, 'penalty': 'l1'}",,,,,,,,7,,,,,,,,7,,,,,,,,7
4,1.607542,0.137153,0.010925,0.000333,1.0,l2,"{'C': 1, 'penalty': 'l2'}",0.970797,0.967961,0.974766,0.973632,0.973625,0.972156,0.002473,3,0.980841,0.980851,0.984043,0.981915,0.984566,0.982443,0.001578,2,0.964921,0.959917,0.969094,0.969029,0.966562,0.965905,0.003382,3
5,0.019704,0.001121,0.0,0.0,1.0,l1,"{'C': 1, 'penalty': 'l1'}",,,,,,,,8,,,,,,,,8,,,,,,,,8
6,2.730086,0.12704,0.011067,0.00035,5.0,l2,"{'C': 5, 'penalty': 'l2'}",0.971931,0.968245,0.976467,0.97505,0.976461,0.973631,0.003161,2,0.981373,0.978723,0.985638,0.981383,0.985098,0.982443,0.002583,1,0.966457,0.962343,0.970665,0.972076,0.971144,0.968537,0.003648,2
7,0.017792,0.000377,0.0,0.0,5.0,l1,"{'C': 5, 'penalty': 'l1'}",,,,,,,,9,,,,,,,,9,,,,,,,,9
8,3.517583,0.290246,0.01031,0.001804,10.0,l2,"{'C': 10, 'penalty': 'l2'}",0.971931,0.968812,0.976751,0.976184,0.976177,0.973971,0.003107,1,0.980309,0.978723,0.985106,0.982447,0.985098,0.982337,0.002549,3,0.967437,0.963351,0.971668,0.97313,0.970635,0.969244,0.00349,1
9,0.01993,0.002406,0.0,0.0,10.0,l1,"{'C': 10, 'penalty': 'l1'}",,,,,,,,10,,,,,,,,10,,,,,,,,10


In [None]:
best_param = grid_search.best_params_
best_param

In [18]:
best_estimator = grid_search.best_estimator_
best_estimator

LogisticRegression(C=10, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=1000,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [None]:
best_score = grid_search.best_score_
best_score

In [19]:
pred = best_estimator.predict(X_test)
print(accuracy_score(y_test, pred))
print(classification_report(y_test, pred))

0.9744641439534268
              precision    recall  f1-score   support

           0       0.98      0.96      0.97      3507
           1       0.97      0.98      0.98      4051

    accuracy                           0.97      7558
   macro avg       0.98      0.97      0.97      7558
weighted avg       0.97      0.97      0.97      7558



# SVM

## 주요 파라미터 
- ``C``는 과적합이면 훈련셋에 타이트하게 맞춘 것이므로 오차허용을 좀 늘려서 공간을 확보해야 하므로 값을 줄인다. (작은 값일 수록 많이 허용)
과소적합이면 너무 오차허용을크게 잡은 것이므로 오차허용을 줄여야 하므로 값을 늘린다. (큰값은 적게 허용)
- ``gamma`` 방사 기저함수 공식상 감마가 크면 반환값은 작아지고 감마가 작으면 반환값은 커진다. ( −γ  를 곱하므로)
- 감마가 작을 수록 값들의 거리가 멀어지고(큰값이 결과로 나오므로) 클 수록 거리가 가까워진다. 그래서 ``gamma`` 가 크면 거리가 타이트해져 과적합이 일어날 수있다. (공간의 여유가 없므으로)

In [9]:
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

In [10]:
svc = SVC(kernel='linear', C=0.01, random_state=1)
svc.fit(X_train, y_train.values.ravel())

SVC(C=0.01, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='linear',
    max_iter=-1, probability=False, random_state=1, shrinking=True, tol=0.001,
    verbose=False)

In [22]:
pred = svc.predict(X_test)
print(accuracy_score(y_test, pred))
print(classification_report(y_test, pred))

0.9711563905795184
              precision    recall  f1-score   support

           0       0.98      0.96      0.97      3507
           1       0.96      0.98      0.97      4051

    accuracy                           0.97      7558
   macro avg       0.97      0.97      0.97      7558
weighted avg       0.97      0.97      0.97      7558



In [23]:
param_grid = {'C': [0.001, 0.01, 0.1, 1, 10, 100],
             'gamma': [0.001, 0.01, 0.1, 1, 10, 100]}

In [24]:
grid_search = GridSearchCV(svc, param_grid=param_grid, scoring=['accuracy', 'recall', 'precision'], refit="accuracy", cv=5, return_train_score=True)

In [25]:
grid_search.fit(X_train, y_train.values.ravel())

GridSearchCV(cv=5, error_score=nan,
             estimator=SVC(C=0.01, break_ties=False, cache_size=200,
                           class_weight=None, coef0=0.0,
                           decision_function_shape='ovr', degree=3,
                           gamma='scale', kernel='linear', max_iter=-1,
                           probability=False, random_state=1, shrinking=True,
                           tol=0.001, verbose=False),
             iid='deprecated', n_jobs=None,
             param_grid={'C': [0.001, 0.01, 0.1, 1, 10, 100],
                         'gamma': [0.001, 0.01, 0.1, 1, 10, 100]},
             pre_dispatch='2*n_jobs', refit='accuracy', return_train_score=True,
             scoring=['accuracy', 'recall', 'precision'], verbose=0)

In [26]:
best_param = grid_search.best_params_
best_param

{'C': 100, 'gamma': 0.001}

In [27]:
best_estimator = grid_search.best_estimator_
best_estimator

SVC(C=100, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma=0.001, kernel='linear',
    max_iter=-1, probability=False, random_state=1, shrinking=True, tol=0.001,
    verbose=False)

In [28]:
best_score = grid_search.best_score_
best_score

0.9756719776664934

In [29]:
pred = grid_search.predict(X_test) 
print(accuracy_score(y_test, pred))
print(classification_report(y_test, pred))

0.9760518655729029
              precision    recall  f1-score   support

           0       0.98      0.97      0.97      3507
           1       0.97      0.98      0.98      4051

    accuracy                           0.98      7558
   macro avg       0.98      0.98      0.98      7558
weighted avg       0.98      0.98      0.98      7558



# 랜덤 포레스트
## 주요 파라미터
### n_estimators
- 결정트리의 갯수를 지정
- Default = 10
- 무작정 트리 갯수를 늘리면 성능 좋아지는 것 대비 시간이 걸릴 수 있음
### min_samples_split
- 노드를 분할하기 위한 최소한의 샘플 데이터수 → 과적합을 제어하는데 사용
- Default = 2 → 작게 설정할 수록 분할 노드가 많아져 과적합 가능성 증가

### min_samples_leaf
- 리프노드가 되기 위해 필요한 최소한의 샘플 데이터수
- min_samples_split과 함께 과적합 제어 용도
- 불균형 데이터의 경우 특정 클래스의 데이터가 극도로 작을 수 있으므로 작게 설정 필요

### max_features
- 최적의 분할을 위해 고려할 최대 feature 개수
- Default = 'auto' (결정트리에서는 default가 none이었음)
- int형으로 지정 →피처 갯수 / float형으로 지정 →비중
- sqrt 또는 auto : 전체 피처 중 √(피처개수) 만큼 선정
- log : 전체 피처 중 log2(전체 피처 개수) 만큼 선정

### max_depth
- 트리의 최대 깊이
- default = None → 완벽하게 클래스 값이 결정될 때 까지 분할
또는 데이터 개수가 min_samples_split보다 작아질 때까지 분할
- 깊이가 깊어지면 과적합될 수 있으므로 적절히 제어 필요

### max_leaf_nodes
- 리프노드의 최대 개수

In [11]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

model_rf = RandomForestClassifier(max_depth=5)
model_rf.fit(X_train, y_train.values.ravel())

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=5, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [42]:
pred = model_rf.predict(X_test)
print(accuracy_score(y_test, pred))
print(classification_report(y_test, pred))

0.9794919290817676
              precision    recall  f1-score   support

           0       1.00      0.96      0.98      3507
           1       0.97      1.00      0.98      4051

    accuracy                           0.98      7558
   macro avg       0.98      0.98      0.98      7558
weighted avg       0.98      0.98      0.98      7558



In [49]:
from sklearn.model_selection import GridSearchCV

param_grid = { 'n_estimators' : [10, 100],
              'max_depth' : [5, 7, 9],
              'min_samples_leaf' : [8, 12, 18],
              'min_samples_split' : [8, 16, 20]
              }

# RandomForestClassifier 객체 생성 후 GridSearchCV 수행
# model_rf = RandomForestClassifier(random_state = 0, n_jobs = -1)
grid_search = GridSearchCV(model_rf, param_grid=param_grid, cv=3, n_jobs=-1)

In [50]:
grid_search.fit(X_train, y_train.values.ravel())

GridSearchCV(cv=3, error_score=nan,
             estimator=RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                              class_weight=None,
                                              criterion='gini', max_depth=5,
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              max_samples=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators=100, n_jobs=None,
                                              oob_score=False,
                                              random_s

In [51]:
best_param = grid_search.best_params_
best_param

{'max_depth': 9,
 'min_samples_leaf': 8,
 'min_samples_split': 8,
 'n_estimators': 10}

In [52]:
best_estimator = grid_search.best_estimator_
best_estimator

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=9, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=8, min_samples_split=8,
                       min_weight_fraction_leaf=0.0, n_estimators=10,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [53]:
best_score = grid_search.best_score_
best_score

0.9902461154587728

In [54]:
pred = grid_search.predict(X_test) 
print(accuracy_score(y_test, pred))
print(classification_report(y_test, pred))

0.9850489547499338
              precision    recall  f1-score   support

           0       1.00      0.97      0.98      3507
           1       0.98      1.00      0.99      4051

    accuracy                           0.99      7558
   macro avg       0.99      0.98      0.98      7558
weighted avg       0.99      0.99      0.99      7558



In [13]:
from sklearn import metrics
from sklearn.model_selection import cross_val_score

models = []
models.append(('LogisticRegression', model_lr))
models.append(('RandomForestClassifier', model_rf))
models.append(('SVC', svc))

for i, v in models:
    scores = cross_val_score(v, X_train, y_train.values.ravel(), cv=10)
    accuracy = metrics.accuracy_score(y_train.values.ravel(), v.predict(X_train))
    confusion_matrix = metrics.confusion_matrix(y_train.values.ravel(), v.predict(X_train))
    classification = metrics.classification_report(y_train.values.ravel(), v.predict(X_train))
    print()
    print('============================== {} Model Evaluation =============================='.format(i))
    print()
    print ("Cross Validation Mean Score:" "\n", scores.mean())
    print()
    print ("Model Accuracy:" "\n", accuracy)
    print()
    print("Confusion matrix:" "\n", confusion_matrix)
    print()
    print("Classification report:" "\n", classification) 
    print()



Cross Validation Mean Score:
 0.9723267904249997

Model Accuracy:
 0.9739140297153227

Confusion matrix:
 [[7924  312]
 [ 148 9250]]

Classification report:
               precision    recall  f1-score   support

     anomaly       0.98      0.96      0.97      8236
      normal       0.97      0.98      0.98      9398

    accuracy                           0.97     17634
   macro avg       0.97      0.97      0.97     17634
weighted avg       0.97      0.97      0.97     17634




Cross Validation Mean Score:
 0.979415337698702

Model Accuracy:
 0.9784507201996144

Confusion matrix:
 [[7914  322]
 [  58 9340]]

Classification report:
               precision    recall  f1-score   support

     anomaly       0.99      0.96      0.98      8236
      normal       0.97      0.99      0.98      9398

    accuracy                           0.98     17634
   macro avg       0.98      0.98      0.98     17634
weighted avg       0.98      0.98      0.98     17634




Cross Validation Mean S

In [14]:
for i, v in models:
    accuracy = metrics.accuracy_score(y_test, v.predict(X_test))
    confusion_matrix = metrics.confusion_matrix(y_test, v.predict(X_test))
    classification = metrics.classification_report(y_test, v.predict(X_test))
    print()
    print('============================== {} Model Test Results =============================='.format(i))
    print()
    print ("Model Accuracy:" "\n", accuracy)
    print()
    print("Confusion matrix:" "\n", confusion_matrix)
    print()
    print("Classification report:" "\n", classification) 
    print()        



Model Accuracy:
 0.9715533209843874

Confusion matrix:
 [[3363  144]
 [  71 3980]]

Classification report:
               precision    recall  f1-score   support

     anomaly       0.98      0.96      0.97      3507
      normal       0.97      0.98      0.97      4051

    accuracy                           0.97      7558
   macro avg       0.97      0.97      0.97      7558
weighted avg       0.97      0.97      0.97      7558




Model Accuracy:
 0.978036517597248

Confusion matrix:
 [[3364  143]
 [  23 4028]]

Classification report:
               precision    recall  f1-score   support

     anomaly       0.99      0.96      0.98      3507
      normal       0.97      0.99      0.98      4051

    accuracy                           0.98      7558
   macro avg       0.98      0.98      0.98      7558
weighted avg       0.98      0.98      0.98      7558




Model Accuracy:
 0.9711563905795184

Confusion matrix:
 [[3351  156]
 [  62 3989]]

Classification report:
               p