In [156]:
import pandas as pd
import numpy as np
from category_encoders.one_hot import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from category_encoders.leave_one_out import LeaveOneOutEncoder
import re
import warnings
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import matthews_corrcoef
import xgboost as xgb
import matplotlib as plt
# from xgboost.sklearn import XGBClassifier


warnings.simplefilter("ignore")
%matplotlib inline

In [157]:
dataset = pd.read_csv("train.csv")

In [158]:
def cleaning(df):

    threshold = 100

    cat_feats = ["cap-shape","cap-surface","cap-color","does-bruise-or-bleed","gill-attachment",
                 "gill-spacing","gill-color","stem-root","stem-surface","stem-color","veil-type",
                 "veil-color","has-ring","ring-type","spore-print-color","spore-print-color",
                 "habitat","season"]

    for feat in cat_feats:
        df[feat] = df[feat].fillna('missing')
        #df.loc[df[feat].value_counts(dropna=False)[df[feat]].values < threshold, feat] = "noise"
        df[feat] = df[feat].astype('category')

    return df

dataset  = cleaning(dataset)

In [159]:
X = dataset
X = X.drop(["class"], axis="columns")
y = dataset["class"]
y = y.map({"p":0, "e":1})

In [107]:
X

0          1
1          0
2          1
3          1
4          1
          ..
3116940    1
3116941    1
3116942    0
3116943    1
3116944    0
Name: class, Length: 3116945, dtype: int64

In [163]:
clf = xgb.XGBClassifier(learning_rate=0.02,
                        n_estimators=10,
                        enable_categorical=True,
                        objective="binary:logistic",
                        nthread=3,
                        tree_method="gpu_hist",
                        device="cuda")

In [111]:
clf = xgb.XGBClassifier(tree_method="auto", 
                        objective ='binary:logistic' , 
                        enable_categorical=True, 
                        min_sample = 5,
                        max_sample = 15,
                        #max_bin=128,
                        learning_rate= 0.1,
                        colsample_bytree=0.6,
                        #n_estimator = 500,
                        #gamma = 0.1,
                        device="cuda")

In [165]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.3)

In [166]:
clf.fit(X_train, y_train)

In [167]:
y_test_pred = clf.predict(X_test)

In [168]:
print(f"Decision tree accuracy train: {accuracy_score(y_test_pred, y_test):.8f}")

Decision tree accuracy train: 0.92634245


In [169]:
print(round(matthews_corrcoef(y_test_pred, y_test),5))

0.8541


In [171]:
test = pd.read_csv("test.csv")
test  = cleaning(test)

X_test = test

In [172]:
pre_kaggle = pd.read_csv('predict_kaggle.csv')

In [173]:
pre_kaggle_0 = pre_kaggle['0']

In [174]:
pre_kaggle_0 = pre_kaggle_0.replace('p', 0)

In [175]:
pre_kaggle_0 = pre_kaggle_0.replace('e', 1)

In [176]:
y_pred = clf.predict(X_test)

In [177]:
print(f"Decision tree accuracy train: {accuracy_score(y_pred,pre_kaggle_0):.8f}")

Decision tree accuracy train: 0.53858103


In [178]:
print(round(matthews_corrcoef(y_pred,pre_kaggle_0),5))

-0.04151


In [191]:
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from datetime import datetime

In [189]:
def timer(start_time=None):
    if not start_time:
        start_time = datetime.now()
        return start_time
    elif start_time:
        thour, temp_sec = divmod((datetime.now() - start_time).total_seconds(), 3600)
        tmin, tsec = divmod(temp_sec, 60)
        print('\n Time taken: %i hours %i minutes and %s seconds.' % (thour, tmin, round(tsec, 2)))

In [179]:
params = {
    'min_child_weight': [1, 5, 10],
    'gamma': [0.5, 1, 1.5, 2, 5],
    'subsample': [0.6, 0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0],
    'max_depth': [3, 4, 5]
}

In [193]:
xgb = XGBClassifier(learning_rate=0.02, 
                    n_estimators=600, 
                    objective='binary:logistic',
                    enable_categorical=True,
                    silent=True, nthread=1)

In [None]:
folds = 3
param_comb = 5

skf = StratifiedKFold(n_splits=folds, shuffle = True, random_state = 1001)

random_search = RandomizedSearchCV(xgb, param_distributions=params, n_iter=param_comb, scoring='roc_auc', n_jobs=4, cv=skf.split(X,y), verbose=3, random_state=1001 )

# Here we go
start_time = timer(None) # timing starts from this point for "start_time" variable
random_search.fit(X, y)
timer(start_time) # timing ends here for "start_time" variable

Fitting 3 folds for each of 5 candidates, totalling 15 fits


Parameters: { "silent" } are not used.

Parameters: { "silent" } are not used.

Parameters: { "silent" } are not used.

Parameters: { "silent" } are not used.

