## 1. Importing Required Libraries

In [1]:
import warnings
warnings.filterwarnings("ignore")

import numpy as np 
import pandas as pd
from xgboost import XGBClassifier
from sklearn.metrics import matthews_corrcoef
from sklearn.model_selection import StratifiedKFold 

## 2. Reading Train and Test Datasets

In [None]:
# Load the competition files. Train and test use their 'id' column as the
# row index; the sample submission keeps 'id' as an ordinary column.
train, test = (
    pd.read_csv(csv_path, index_col='id')
    for csv_path in ('./data/train.csv', './data/test.csv')
)
sample_submission = pd.read_csv('./data/sample_submission.csv')

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Age'].fillna(df['Age'].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Embarked'].fillna(df['Embarked'].mode()[0], inplace=True)


## 3. Preprocessing

In [None]:
# Every column of the test frame is a model feature.
initial_features = list(test.columns)

# The three numeric measurements; all remaining features are treated as categorical.
float_features = ['cap-diameter', 'stem-height', 'stem-width']
cat_features = [f for f in initial_features if f not in float_features]

for feature in initial_features:
    if feature not in cat_features:
        dtype = np.float32
    else:
        # Build the category list from the union of non-null values seen in
        # train and test so both frames share one identical categorical dtype.
        observed = set(train[feature].dropna()) | set(test[feature].dropna())
        dtype = pd.CategoricalDtype(categories=sorted(observed), ordered=False)
        # Report the cardinality of each categorical feature.
        print(f"{feature:30} {len(dtype.categories)}")
    # Apply the same dtype to both frames.
    train[feature] = train[feature].astype(dtype)
    test[feature] = test[feature].astype(dtype)

## 4. Cross-Validation

In [None]:
# Feature matrix: everything except the label column.
X = train.drop(columns=['class'])

# Binary target: label 'p' is encoded as 0 and label 'e' as 1.
target_codes = {'p': 0, 'e': 1}
y = train['class'].map(target_codes)

In [None]:
%%time

# Hyperparameters passed unchanged to every per-fold XGBClassifier.
params_xgb = {
    'enable_categorical': True,   # consume pandas categorical columns directly
    'tree_method': 'hist',
    'device': 'cuda',             # NOTE(review): requires a CUDA-capable GPU; switch to 'cpu' otherwise
    'n_estimators': 360,
    'learning_rate': 0.1,
    'max_depth': 17,
    'colsample_bytree': 0.4,
    'min_child_weight': 2,
    'reg_lambda': 67,
    'subsample': 0.98,
    'num_parallel_tree': 4,
}

NUM_FOLDS = 5
val_scores = []        # one validation MCC per fold
fold_test_preds = []   # one (n_test, n_classes) probability array per fold

skf = StratifiedKFold(n_splits=NUM_FOLDS, shuffle=True, random_state=1)

for fold, (train_index, val_index) in enumerate(skf.split(X, y)):

    # skf.split yields *positional* row indices, while X and y are indexed by
    # the 'id' column. Both must therefore be sliced with .iloc; plain
    # y[train_index] would do a label lookup and only works by coincidence
    # when the ids happen to be 0..n-1.
    X_train, X_val = X.iloc[train_index], X.iloc[val_index]
    y_train, y_val = y.iloc[train_index], y.iloc[val_index]

    # A fresh model per fold so no state leaks between folds.
    xgb = XGBClassifier(**params_xgb)
    xgb.fit(X_train, y_train)

    # Score the held-out fold with the Matthews correlation coefficient.
    val_pred = xgb.predict(X_val)
    mcc = matthews_corrcoef(y_val, val_pred)
    print(f'Fold {fold}: MCC = {mcc:.5f}')
    val_scores.append(mcc)

    # Keep this fold's class probabilities on the test set for averaging.
    fold_test_preds.append(xgb.predict_proba(test))

# Average the per-fold probabilities into the final test prediction.
test_preds_xgb = sum(fold_test_preds) / len(fold_test_preds)

print(f'Mean Validation MCC= {np.mean(val_scores):.5f}')
print(f'Standard Deviation Validation MCC= {np.std(val_scores):.5f}')

## 5. Submission

In [None]:
# Reverse of the target encoding: class index 0 -> 'p', 1 -> 'e'.
label_by_code = {0: 'p', 1: 'e'}

# Pick the most probable class per test row from the averaged probabilities.
pred = np.argmax(test_preds_xgb, axis=1)
sample_submission['class'] = pd.Series(pred).map(label_by_code)

# Preview the first rows of the submission frame.
sample_submission.head()

In [None]:
# Write the submission file; index=False keeps the DataFrame's row index out of the CSV.
submission_path = 'submission.csv'
sample_submission.to_csv(submission_path, index=False)