In [1]:
from sklearn.ensemble import GradientBoostingClassifier
import warnings
import pandas as pd
from sklearn.metrics import accuracy_score

warnings.filterwarnings('ignore')

In [2]:
# from 04_02
def get_new_df(old_df):
    """Return a copy of the feature-name table with duplicate names made unique.

    The first occurrence of each ``column_name`` is kept as is; every later
    occurrence gets ``_<n>`` appended, where ``n`` counts how many times that
    name has already appeared (``'a', 'a', 'a'`` -> ``'a', 'a_1', 'a_2'``).

    Parameters
    ----------
    old_df : pandas.DataFrame
        Table containing a string ``column_name`` column. It is not modified.

    Returns
    -------
    pandas.DataFrame
        The rows of ``old_df`` in their original order on a fresh 0..n-1 index,
        with ``column_name`` de-duplicated and an additional integer
        ``dup_cnt`` column holding the per-name occurrence counter.
    """
    # reset_index returns a new frame, so the caller's data stays untouched.
    new_df = old_df.reset_index(drop=True)
    dup_cnt = new_df.groupby('column_name').cumcount()
    new_df['dup_cnt'] = dup_cnt
    # Label-based and vectorised: avoids positional `row[0]` lookups on a
    # label-indexed Series (deprecated in pandas) and a Python-level row loop.
    suffixed = new_df['column_name'] + '_' + dup_cnt.astype(str)
    new_df['column_name'] = new_df['column_name'].where(dup_cnt == 0, suffixed)
    return new_df

def get_human_dataset(data_dir='human_activity'):
    """Load the human-activity train/test split from whitespace-separated files.

    Parameters
    ----------
    data_dir : str, optional
        Directory containing ``features.txt`` plus the ``train/`` and ``test/``
        sub-directories. Defaults to ``'human_activity'`` (relative to the
        current working directory), which is the location used originally.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(X_train, X_test, y_train, y_test)``. The feature frames use the
        de-duplicated feature names as column labels; the label frames have a
        single ``'action'`` column.
    """
    # Raw string: '\s' inside a normal literal is an invalid escape sequence
    # and triggers a DeprecationWarning/SyntaxWarning on recent Pythons.
    whitespace = r'\s+'

    feature_name_df = pd.read_csv(f'{data_dir}/features.txt',
                                  sep=whitespace,
                                  header=None,
                                  names=['column_index', 'column_name'])

    # Make the names unique before using them as column labels; read_csv does
    # not accept duplicate entries in `names`.
    feature_name = get_new_df(feature_name_df)['column_name'].tolist()

    def read_table(relative_path, columns):
        # Every data file shares the same header-less, whitespace-separated layout.
        return pd.read_csv(f'{data_dir}/{relative_path}',
                           sep=whitespace,
                           names=columns)

    X_train = read_table('train/X_train.txt', feature_name)
    X_test = read_table('test/X_test.txt', feature_name)
    y_train = read_table('train/y_train.txt', ['action'])
    y_test = read_table('test/y_test.txt', ['action'])

    return X_train, X_test, y_train, y_test

In [3]:
# Load the train/test feature frames and the activity labels from disk.
X_train, X_test, y_train, y_test = get_human_dataset()

In [4]:
%%time
# Gradient boosting with default hyperparameters; the seed is fixed so that the
# reported accuracy can be reproduced on a re-run.
gb_clf = GradientBoostingClassifier(random_state=0)
# y_train is a one-column DataFrame; scikit-learn expects a 1-D label array
# (passing the frame only works with a DataConversionWarning).
gb_clf.fit(X_train, y_train.values.ravel())
pred = gb_clf.predict(X_test)
accuracy_score(y_test,pred)

Wall time: 9min 14s


0.9382422802850356

In [5]:
# High accuracy is reached even without any hyperparameter tuning (default settings).

In [6]:
import xgboost as xgb
from xgboost import XGBClassifier
from xgboost import plot_importance
import pandas as pd
import numpy as np


from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split

ModuleNotFoundError: No module named 'xgboost'

In [None]:
# Show the installed xgboost version.
xgb.__version__

In [None]:
# Load the breast-cancer dataset; as_frame=True makes data/target pandas objects.
dataset = load_breast_cancer(as_frame = True)

In [None]:
# Display the feature table.
dataset.data

In [None]:
# Display the target labels.
dataset.target

In [None]:
dataset.target_names
# malignant tumor, benign tumor

In [None]:
# Number of samples per class.
dataset.target.value_counts()

In [None]:
# Hold out 20% of the samples as the final test set.
X_train, X_test, y_train, y_test = train_test_split(
    dataset.data,
    dataset.target,
    test_size=0.2,
    random_state=156,
)
# Split the training portion once more: 10% of it becomes the validation set.
X_tr, X_val, y_tr, y_val = train_test_split(
    X_train,
    y_train,
    test_size=0.1,
    random_state=156,
)

In [None]:
# Sizes of the training and test feature tables.
X_train.shape,X_test.shape

In [None]:
# Sizes after splitting the training data into train/validation parts.
X_tr.shape, X_val.shape

In [None]:
# Class balance within the training labels.
y_train.value_counts()

In [None]:
# Wrap each split in a DMatrix for the native xgboost API:
# training, validation and test data respectively.
dtr = xgb.DMatrix(X_tr, label=y_tr)
dval = xgb.DMatrix(X_val, label=y_val)
dtest = xgb.DMatrix(X_test, label=y_test)

In [None]:
XGBClassifier()
# Note: settings are passed in as key=value (keyword) arguments.
# NOTE(review): presumably this refers to unpacking a params dict, e.g. XGBClassifier(**params) — confirm intent.

In [None]:
# 모델 설정용
# Model configuration for the native xgboost training API.
params = {
    'max_depth' : 3,  # maximum depth of each tree
    'eta' : 0.05,  # learning rate (shrinkage applied per boosting round)
    'objective' : 'binary:logistic',  # binary classification, outputs probabilities
    'eval_metric' : 'logloss'  # metric reported on the evaluation sets
}
# Upper bound on the number of boosting rounds.
num_rounds = 500
eval_list= [(dtr,'train'),(dval,'eval')]
# (training data), (validation data)

In [None]:
# 모델 학습
# Train the model.
model = xgb.train(params,dtr, num_rounds,evals = eval_list,early_stopping_rounds= 50)
# num_rounds: run at most num_rounds boosting rounds (500, as set above).
# early_stopping_rounds: stop early if the metric on the last entry of `evals`
# (the validation set) has not improved for 50 consecutive rounds.

In [None]:
# With the binary:logistic objective, predict returns probabilities rather than labels.
pred_probs = model.predict(dtest)
pred_probs 
# Probability of belonging to class 1.

In [None]:
# Same probabilities rounded to three decimals for readability.
np.round(pred_probs,3)

In [None]:
# Turn probabilities into hard 0/1 labels: strictly above 0.5 counts as class 1.
pred = [int(prob > 0.5) for prob in pred_probs]

In [None]:
def get_clf_eval(y_test,pred,pred_proba_1):
    """Print the confusion matrix and summary metrics of a binary classifier.

    Parameters
    ----------
    y_test : array-like
        True labels.
    pred : array-like
        Predicted labels, compared against ``y_test`` for every metric except AUC.
    pred_proba_1 : array-like
        Scores handed to ``roc_auc_score`` (callers in this notebook pass the
        predicted probability of class 1).

    Returns
    -------
    None
        The results are printed, not returned.
    """
    from sklearn.metrics import (
        accuracy_score,
        confusion_matrix,
        f1_score,
        precision_score,
        recall_score,
        roc_auc_score,
    )

    confusion = confusion_matrix(y_test, pred)
    # Accuracy, precision, recall and F1 all share the (y_true, y_pred) signature.
    accuracy, precision, recall, f1 = (
        metric(y_test, pred)
        for metric in (accuracy_score, precision_score, recall_score, f1_score)
    )
    # AUC is computed from the scores, not from the thresholded labels.
    auc = roc_auc_score(y_test, pred_proba_1)

    print('오차행렬')
    print(confusion)
    print(f'정확도 : {accuracy:.4f}, 정밀도 : {precision:.4f}, 재현율 : {recall:.4f}, F1 : {f1:.4f}, AUC : {auc:.4f}')
    

In [None]:
# Evaluate the natively trained booster on the test set.
get_clf_eval(y_test,pred,pred_probs)

In [None]:
# Plot the feature importances of the trained booster.
plot_importance(model)

In [None]:
from xgboost import XGBClassifier

In [None]:
# Open question: is this the same model as the one trained with xgb.train above?
# NOTE(review): the hyperparameters mirror the earlier params dict (500 rounds,
# learning rate 0.05, depth 3, logloss) — confirm whether the results match.
model = XGBClassifier(n_estimators = 500, learning_rate = 0.05, max_depth=3,eval_metric='logloss')

In [None]:
# Fit on the full training split; no evaluation set is passed here.
model.fit(X_train,y_train,verbose=True)

In [None]:
# Predicted class labels for the test set.
pred = model.predict(X_test)

In [None]:
# Display the predicted labels.
pred

In [None]:
# predict_proba returns one column per class; column 1 is the probability of class 1.
pred_proba = model.predict_proba(X_test)
pred_proba[:,1]

In [None]:
# Evaluate the scikit-learn-style model using the class-1 probabilities for AUC.
get_clf_eval(y_test,pred,pred_proba[:,1])

In [None]:
# Early stopping and the evaluation metric are configured on the estimator:
# passing `early_stopping_rounds` / `eval_metric` to fit() was deprecated in
# xgboost 1.6 and removed in 2.0.
model = XGBClassifier(n_estimators=500,
                      learning_rate=0.05,
                      max_depth=3,
                      eval_metric='logloss',
                      early_stopping_rounds=50)
# The last entry (the validation split) is the one monitored for early stopping.
evals = [(X_tr, y_tr), (X_val, y_val)]
model.fit(X_tr,
          y_tr,
          verbose=True,
          eval_set=evals)
pred = model.predict(X_test)
pred_proba = model.predict_proba(X_test)
get_clf_eval(y_test, pred, pred_proba[:, 1])
# Author's observation: the best round was 126; the following 50 rounds brought
# no notable improvement, so training stopped early.
# The early-stopping patience should not be set too small either.

In [None]:
# Render a tree of the fitted model as a Graphviz graph.
from xgboost import to_graphviz
to_graphviz(model)