# 컬럼 설명

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
df = pd.read_csv('../Train_data.csv')

# 데이터 info

# X_cat, X_num, target분리

In [3]:
# 레이블 인코딩
from sklearn.preprocessing import LabelEncoder
enc_dict={}
def encoding_label(x):
    le = LabelEncoder()
    r = le.fit_transform(x)
    enc_dict[x.name] = le.classes_
    return r

In [4]:
# 원핫 인코딩
X_cat = df[['protocol_type', 'service', 'flag']]
# X_cat = df.select_dtypes(include=['object'])
X_cat = pd.get_dummies(X_cat)

# 스케일링 + 데이터프레임 합치기

In [5]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()

X_num = df.drop(['protocol_type', 'service', 'flag', 'class'], axis=1)
scaler.fit(X_num)
X_scaled = scaler.transform(X_num)
X_scaled = pd.DataFrame(X_scaled, index=X_num.index, columns=X_num.columns)

X = pd.concat([X_scaled, X_cat], axis=1)

y = df[['class']].apply(encoding_label)

In [6]:
X, y

(       duration  src_bytes  dst_bytes     land  wrong_fragment    urgent  \
 0     -0.113551  -0.009889  -0.039310 -0.00891       -0.091223 -0.006301   
 1     -0.113551  -0.010032  -0.039310 -0.00891       -0.091223 -0.006301   
 2     -0.113551  -0.010093  -0.039310 -0.00891       -0.091223 -0.006301   
 3     -0.113551  -0.009996   0.052473 -0.00891       -0.091223 -0.006301   
 4     -0.113551  -0.010010  -0.034582 -0.00891       -0.091223 -0.006301   
 ...         ...        ...        ...      ...             ...       ...   
 25187 -0.113551  -0.010093  -0.039310 -0.00891       -0.091223 -0.006301   
 25188 -0.113551  -0.009954  -0.039310 -0.00891       -0.091223 -0.006301   
 25189 -0.113551  -0.010093  -0.039310 -0.00891       -0.091223 -0.006301   
 25190 -0.113551  -0.010093  -0.039310 -0.00891       -0.091223 -0.006301   
 25191 -0.113551  -0.010093  -0.039310 -0.00891       -0.091223 -0.006301   
 
             hot  num_failed_logins  logged_in  num_compromised  ...  flag

# 로지스틱 리그레션

## 로지스틱 리그레션 주요 파라미터
- penalty와 C가 있다.
- penalty는 규제의 유형을 설정하며 'l2'로 설정 시 L2 규제를, 'l1'으로 설정 시 L1 규제를 뜻한다. 기본은 'l2'이다.
- C는 규제 강도를 조절하는 alpha 값의 역수이다. C 값이 작을 수록 규제 강도가 크다.

In [7]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1)

model_lr = LogisticRegression(max_iter=500)
model_lr.fit(X_train, y_train)


  return f(**kwargs)


LogisticRegression(max_iter=500)

In [9]:
from sklearn.metrics import accuracy_score,classification_report, confusion_matrix, plot_confusion_matrix

In [10]:
pred = model_lr.predict(X_test)
print(accuracy_score(y_test, pred))
print(classification_report(y_test, pred))

0.9715533209843874
              precision    recall  f1-score   support

           0       0.98      0.96      0.97      3507
           1       0.97      0.98      0.97      4051

    accuracy                           0.97      7558
   macro avg       0.97      0.97      0.97      7558
weighted avg       0.97      0.97      0.97      7558



# coef_

In [None]:
x1 = np.argwhere(abs(model_lr.coef_) > 5)
x1

# GridSearchCV

In [11]:
from sklearn import metrics
from sklearn.model_selection import GridSearchCV

model_lr = LogisticRegression(max_iter=1000)

param_grid = {'penalty' : ['l2', 'l1'], 'C' : [0.01, 0.1, 1, 5, 10]}

grid_search = GridSearchCV(model_lr, #학습시킬 모델
                           param_grid=param_grid, #하이퍼파라미터 후보
#                            scoring="accuracy", #평가 지표
                           scoring=['accuracy', 'recall', 'precision'], #평가지표를 여러개 지정시 리스트로 묶어준다.
                           refit="accuracy", #평가지표가 여러개일때 어떤 지표를 기준으로 best_estimator를 만들것인지 지정.
                           cv=5, #교차검증(Cross Validation)의 folder 개수(몇개로 나눌 것인지.)
                           n_jobs=-1)

In [12]:
# 학습(train) - 최적의 하이퍼파라미터 조합
grid_search.fit(X_train, y_train)

  return f(**kwargs)


GridSearchCV(cv=5, estimator=LogisticRegression(max_iter=1000), n_jobs=-1,
             param_grid={'C': [0.01, 0.1, 1, 5, 10], 'penalty': ['l2', 'l1']},
             refit='accuracy', scoring=['accuracy', 'recall', 'precision'])

In [14]:
from sklearn.metrics import accuracy_score
pred = grid_search.predict(X_test) #가장 성능이 잘나온 하이퍼 파라미터를 가진 모델(tree)로 예측.
accuracy_score(y_test, pred)

0.9743318338184705

In [15]:
# 하이퍼파라미터 조합별 결과
df = pd.DataFrame(grid_search.cv_results_)#.sort_values('rank_test_score')
df

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_C,param_penalty,params,split0_test_accuracy,split1_test_accuracy,split2_test_accuracy,...,std_test_recall,rank_test_recall,split0_test_precision,split1_test_precision,split2_test_precision,split3_test_precision,split4_test_precision,mean_test_precision,std_test_precision,rank_test_precision
0,0.303225,0.011756,0.011001,2e-06,0.01,l2,"{'C': 0.01, 'penalty': 'l2'}",0.961724,0.959739,0.968528,...,0.00332,5,0.953694,0.949793,0.959958,0.958312,0.962019,0.956755,0.004432,5
1,0.015601,0.001356,0.0,0.0,0.01,l1,"{'C': 0.01, 'penalty': 'l1'}",,,,...,,6,,,,,,,,6
2,0.664251,0.039249,0.010601,0.000489,0.1,l2,"{'C': 0.1, 'penalty': 'l2'}",0.967111,0.96541,0.973348,...,0.002403,4,0.960313,0.95544,0.96705,0.963818,0.966527,0.96263,0.004317,4
3,0.014401,0.0008,0.0,0.0,0.1,l1,"{'C': 0.1, 'penalty': 'l1'}",,,,...,,7,,,,,,,,7
4,1.419915,0.089815,0.0108,0.0004,1.0,l2,"{'C': 1, 'penalty': 'l2'}",0.970797,0.967961,0.974766,...,0.001578,2,0.964921,0.959917,0.969094,0.969029,0.966562,0.965905,0.003382,3
5,0.017402,0.004317,0.0,0.0,1.0,l1,"{'C': 1, 'penalty': 'l1'}",,,,...,,8,,,,,,,,8
6,2.510402,0.154419,0.010603,0.000488,5.0,l2,"{'C': 5, 'penalty': 'l2'}",0.971931,0.968245,0.976467,...,0.002583,1,0.966457,0.962343,0.970665,0.972076,0.971144,0.968537,0.003648,2
7,0.014401,0.000489,0.0,0.0,5.0,l1,"{'C': 5, 'penalty': 'l1'}",,,,...,,9,,,,,,,,9
8,2.996638,0.313229,0.013601,0.008238,10.0,l2,"{'C': 10, 'penalty': 'l2'}",0.971931,0.968812,0.976751,...,0.002549,3,0.967437,0.963351,0.971668,0.97313,0.970635,0.969244,0.00349,1
9,0.015,0.000896,0.0,0.0,10.0,l1,"{'C': 10, 'penalty': 'l1'}",,,,...,,10,,,,,,,,10


In [16]:
# 최적의 하이퍼파라미터
best_param = grid_search.best_params_
best_param

{'C': 10, 'penalty': 'l2'}

In [17]:
best_estimator = grid_search.best_estimator_
print(best_estimator)

LogisticRegression(C=10, max_iter=1000)


In [18]:
pred_test = best_estimator.predict(X_test)
accuracy_score(y_test, pred_test)

0.9743318338184705

# SVM

## 주요 파라미터 
- C는 과적합이면 훈련셋에 타이트하게 맞춘 것이므로 오차허용을 좀 늘려서 공간을 확보해야 하므로 값을 줄인다. (작은 값일 수록 많이 허용)
과소적합이면 너무 오차허용을크게 잡은 것이므로 오차허용을 줄여야 하므로 값을 늘린다. (큰값은 적게 허용)
- gamma 방사 기저함수 공식상 감마가 크면 반환값은 작아지고 감마가 작으면 반환값은 커진다. ( −γ  를 곱하므로)
- 감마가 작을 수록 값들의 거리가 멀어지고(큰값이 결과로 나오므로) 클 수록 거리가 가까워진다. 그래서 gamma 가 크면 거리가 타이트해져 과적합이 일어날 수있다. (공간의 여유가 없므으로)

In [19]:
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

In [20]:
svc = SVC(kernel='linear', C=0.01, random_state=1)
svc.fit(X_train, y_train)

  return f(**kwargs)


SVC(C=0.01, kernel='linear', random_state=1)

In [21]:
pred = svc.predict(X_test)
print(accuracy_score(y_test, pred))
print(classification_report(y_test, pred))

0.9711563905795184
              precision    recall  f1-score   support

           0       0.98      0.96      0.97      3507
           1       0.96      0.98      0.97      4051

    accuracy                           0.97      7558
   macro avg       0.97      0.97      0.97      7558
weighted avg       0.97      0.97      0.97      7558



In [22]:
param_grid = {'C': [0.001, 0.01, 0.1, 1, 10, 100],
             'gamma': [0.001, 0.01, 0.1, 1, 10, 100]}

In [23]:
grid_search = GridSearchCV(svc, param_grid=param_grid, scoring=['accuracy', 'recall', 'precision'], refit="accuracy", cv=5, return_train_score=True)

In [None]:
grid_search.fit(X_train, y_train)

  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)


  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)


  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)


  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)


  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)


In [None]:
# 최적의 하이퍼파라미터
best_param = grid_search.best_params_
best_param

In [None]:
pred = grid_search.predict(X_test) #가장 성능이 잘나온 하이퍼 파라미터를 가진 모델(tree)로 예측.
accuracy_score(y_test, pred)

# 랜덤 포레스트

# 주요 파라미터
## n_estimators
- 결정트리의 갯수를 지정
- Default = 10
- 무작정 트리 갯수를 늘리면 성능 좋아지는 것 대비 시간이 걸릴 수 있음
## min_samples_split
- 노드를 분할하기 위한 최소한의 샘플 데이터수 → 과적합을 제어하는데 사용
- Default = 2 → 작게 설정할 수록 분할 노드가 많아져 과적합 가능성 증가

## min_samples_leaf
- 리프노드가 되기 위해 필요한 최소한의 샘플 데이터수
- min_samples_split과 함께 과적합 제어 용도
- 불균형 데이터의 경우 특정 클래스의 데이터가 극도로 작을 수 있으므로 작게 설정 필요

## max_features
- 최적의 분할을 위해 고려할 최대 feature 개수
- Default = 'auto' (결정트리에서는 default가 none이었음)
- int형으로 지정 →피처 갯수 / float형으로 지정 →비중
- sqrt 또는 auto : 전체 피처 중 √(피처개수) 만큼 선정
- log : 전체 피처 중 log2(전체 피처 개수) 만큼 선정

## max_depth
- 트리의 최대 깊이
- default = None → 완벽하게 클래스 값이 결정될 때 까지 분할
또는 데이터 개수가 min_samples_split보다 작아질 때까지 분할
- 깊이가 깊어지면 과적합될 수 있으므로 적절히 제어 필요

## max_leaf_nodes
- 리프노드의 최대 개수

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

rf_clf = RandomForestClassifier(random_state=0)
rf_clf.fit(X_train, y_train)

pred = rf_clf.predict(X_test)
print(accuracy_score(y_test, pred))
print(classification_report(y_test, pred))

In [None]:
from sklearn.model_selection import GridSearchCV

params = { 'n_estimators' : [10, 100],
           'max_depth' : [6, 8, 10, 12],
           'min_samples_leaf' : [8, 12, 18],
           'min_samples_split' : [8, 16, 20]
            }

# RandomForestClassifier 객체 생성 후 GridSearchCV 수행
rf_clf = RandomForestClassifier(random_state = 0, n_jobs = -1)
grid_cv = GridSearchCV(rf_clf, param_grid = params, cv = 3, n_jobs = -1)
grid_cv.fit(X_train, y_train)

print('최적 하이퍼 파라미터: ', grid_cv.best_params_)
print('최고 예측 정확도: {:.4f}'.format(grid_cv.best_score_))