# 새로운 버섯이 독성인지 식용인지 예측하는 프로젝트

## 1. 필요한 라이브러리 불러오기

### 열 설명
id: 각 버섯의 고유 식별자.  
class: 독성 여부, 'e' (식용 가능), 'p' (독성).   
cap-diameter: 버섯의 갓 지름 (숫자형, cm).  
cap-shape: 갓 모양 (예: f, x, p 등).  
cap-surface: 갓 표면의 질감 (예: s, y, h 등).  
cap-color: 갓의 색상 (예: u, o, n 등).  
does-bruise-or-bleed: 멍이 들거나 진물이 나오는지 (예: f, t).  
gill-attachment: 주름이 줄기에 붙는 방식 (예: a, s).  
gill-spacing: 주름 간격 (예: c, w 등).  
gill-color: 주름의 색상 (예: w, g 등).  
stem-height: 줄기의 높이 (숫자형, cm).  
stem-width: 줄기의 너비 (숫자형, cm).  
stem-root: 줄기의 뿌리 형태.  
stem-surface: 줄기의 표면 질감.  
stem-color: 줄기의 색상.  
veil-type: 줄기를 감싸는 얇은 막의 유형.  
veil-color: 막의 색상.  
has-ring: 줄기 주변에 고리가 있는지 여부.  
ring-type: 고리의 종류.  
spore-print-color: 포자 색상.  
habitat: 서식지 유형 (예: d, l, g 등).  
season: 계절 (예: a, w 등).  

In [3]:
import warnings
warnings.filterwarnings("ignore")

import numpy as np 
import pandas as pd
from xgboost import XGBClassifier
from sklearn.metrics import matthews_corrcoef
from sklearn.model_selection import StratifiedKFold 

## 2. Train과 Test 데이터셋 읽기

In [5]:
# Read the competition CSVs. The train and test frames use the 'id' column as
# their index; the sample submission is read with the default index.
train, test = (
    pd.read_csv(csv_path, index_col='id')
    for csv_path in ('./data/train.csv', './data/test.csv')
)
sample_submission = pd.read_csv('./data/sample_submission.csv')

## 3. 전처리

In [6]:
# Use the test frame's columns as the list of model features.
initial_features = list(test.columns)

# The three numeric measurements; every other feature is treated as categorical.
float_features = ['cap-diameter', 'stem-height', 'stem-width']
cat_features = [f for f in initial_features if f not in float_features]

for feature in initial_features:
    if feature in float_features:
        dtype = np.float32
    else:
        # Build a single category set from the non-null values of BOTH frames
        # so that train and test are cast to the identical categorical dtype.
        levels = set(train[feature].dropna()) | set(test[feature].dropna())
        dtype = pd.CategoricalDtype(categories=sorted(levels), ordered=False)
        # Report how many distinct categories each categorical feature has.
        print(f"{feature:30} {len(dtype.categories)}")
    train[feature] = train[feature].astype(dtype)
    test[feature] = test[feature].astype(dtype)

cap-shape                      108
cap-surface                    114
cap-color                      109
does-bruise-or-bleed           29
gill-attachment                117
gill-spacing                   66
gill-color                     86
stem-root                      45
stem-surface                   87
stem-color                     88
veil-type                      24
veil-color                     27
has-ring                       26
ring-type                      47
spore-print-color              43
habitat                        65
season                         4


## 4. Cross-Validation(교차 검증)

In [7]:
# Target: map the 'class' labels to integers ('p' -> 0, 'e' -> 1).
y = train['class'].map({'p': 0, 'e': 1})
# Predictors: every remaining column of the training frame.
X = train.drop(columns=['class'])

In [8]:
%%time

# Hyper-parameters forwarded unchanged to XGBClassifier.
params_xgb = {
    # The feature frames carry pandas categorical columns (set during
    # preprocessing), hence categorical support is switched on.
    'enable_categorical': True,
    'tree_method': 'hist',
    # NOTE(review): presumably needs a CUDA-capable GPU at runtime — confirm.
    'device': 'cuda',
    'n_estimators': 360,
    'learning_rate': 0.1,
    'max_depth': 17,
    'colsample_bytree': 0.4,
    'min_child_weight': 2,
    'reg_lambda': 67,
    'subsample': 0.98,
    'num_parallel_tree': 4,
}

NUM_FOLDS = 5
val_scores = []       # validation MCC of each fold
test_preds_xgb = []   # test-set class probabilities predicted by each fold's model

skf = StratifiedKFold(n_splits=NUM_FOLDS, shuffle=True, random_state=1)

for fold, (train_index, val_index) in enumerate(skf.split(X, y)):
    # split() yields positional row numbers, so X and y must both be sliced
    # with .iloc. Plain y[...] would look those numbers up as labels in the
    # 'id' index, which is only correct when the ids are exactly 0..n-1.
    X_train, X_val = X.iloc[train_index], X.iloc[val_index]
    y_train, y_val = y.iloc[train_index], y.iloc[val_index]

    # A fresh model per fold so no state leaks between folds.
    xgb = XGBClassifier(**params_xgb)
    xgb.fit(X_train, y_train)

    # Score the held-out part of this fold with the Matthews correlation coefficient.
    val_pred = xgb.predict(X_val)
    mcc = matthews_corrcoef(y_val, val_pred)
    print(f'Fold {fold}: MCC = {mcc:.5f}')
    val_scores.append(mcc)

    # Keep this fold's probabilities for the test set; they are averaged below.
    test_preds_xgb.append(xgb.predict_proba(test))

# Average the per-fold probability arrays into one ensemble prediction.
test_preds_xgb = sum(test_preds_xgb) / len(test_preds_xgb)

print(f'Mean Validation MCC= {np.mean(val_scores):.5f}')
print(f'Standard Deviation Validation MCC= {np.std(val_scores):.5f}')

Fold 0: MCC = 0.98504
Fold 1: MCC = 0.98479


## 5. Submission

In [None]:
# Pick the class with the highest fold-averaged probability for every test
# row, then translate the integer codes back to the original class letters.
pred = test_preds_xgb.argmax(axis=1)
code_to_label = {0: 'p', 1: 'e'}
sample_submission['class'] = pd.Series(pred).map(code_to_label)
sample_submission.head()

In [None]:
# Write the filled-in submission frame to 'submission.csv', omitting the row index.
sample_submission.to_csv('submission.csv', index=False) 