In [2]:
import pandas as pd
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score



In [3]:
dataset = pd.read_csv("wbc.csv")
import numpy as np
dataset = dataset.replace('?',np.nan)
dataset['bare_nuclei'] = pd.to_numeric(dataset.bare_nuclei.fillna
                                  (dataset.bare_nuclei.median()))

In [4]:
X = dataset.drop([dataset.columns[-1]], axis=1)
y = dataset.jenis

In [5]:
seed = 7
test_size = 0.33
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=seed)

In [6]:
model = XGBClassifier()
model.fit(X_train, y_train)

XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=1,
       gamma=0, learning_rate=0.1, max_delta_step=0, max_depth=3,
       min_child_weight=1, missing=None, n_estimators=100, nthread=-1,
       objective='binary:logistic', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=True, subsample=1)

In [7]:
print(model)

XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=1,
       gamma=0, learning_rate=0.1, max_delta_step=0, max_depth=3,
       min_child_weight=1, missing=None, n_estimators=100, nthread=-1,
       objective='binary:logistic', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=True, subsample=1)


In [8]:
# make predictions for test data
y_pred = model.predict(X_test)
predictions = [round(value) for value in y_pred]

In [9]:
# evaluate predictions
accuracy = accuracy_score(y_test, predictions)
print("Accuracy: %.2f%%" % (accuracy * 100.0))

Accuracy: 94.81%


In [10]:
from sklearn import metrics
metrics.confusion_matrix(y_test, y_pred)

array([[144,   8],
       [  4,  75]])

In [11]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

In [12]:
precision, recall, f1, _ = precision_recall_fscore_support(y_test, y_pred)
print (('Akurasi: %.4f') % (accuracy))
print (('Precision: %.4f') % (precision[1]))
print (('Recall: %.4f') % (recall[1]))
print (('F1: %.4f') % (f1[1]))

Akurasi: 0.9481
Precision: 0.9036
Recall: 0.9494
F1: 0.9259


In [13]:
from sklearn.cross_validation import StratifiedKFold
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

In [14]:
cv = StratifiedKFold(y, n_folds=10, shuffle=True, random_state=seed)

In [15]:
from scipy.stats import randint, uniform

In [16]:
params_dist_grid = {
    'max_depth': [1, 2, 3, 4],
    'gamma': [0,0,1, 0.5, 1],
    # uniform discrete random distribution
    'learning_rate': uniform(), # gaussian distribution
    'subsample': uniform(), # gaussian distribution
    # gaussian distribution
}

In [17]:
params_fixed = {
    'objective': 'binary:logistic',
    'silent': 1
}

In [18]:
rs_grid = RandomizedSearchCV(
    estimator=XGBClassifier(**params_fixed, seed=seed),
    param_distributions=params_dist_grid,
    n_iter=10,
    cv=cv,
    scoring='accuracy',
    random_state=seed
)

In [19]:
rs_grid.fit(X, y)

RandomizedSearchCV(cv=sklearn.cross_validation.StratifiedKFold(labels=[2 2 ..., 4 4], n_folds=10, shuffle=True, random_state=7),
          error_score='raise',
          estimator=XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=1,
       gamma=0, learning_rate=0.1, max_delta_step=0, max_depth=3,
       min_child_weight=1, missing=None, n_estimators=100, nthread=-1,
       objective='binary:logistic', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=7, silent=1, subsample=1),
          fit_params={}, iid=True, n_iter=10, n_jobs=1,
          param_distributions={'max_depth': [1, 2, 3, 4], 'gamma': [0, 0, 1, 0.5, 1], 'learning_rate': <scipy.stats._distn_infrastructure.rv_frozen object at 0x0000000009FF39E8>, 'subsample': <scipy.stats._distn_infrastructure.rv_frozen object at 0x0000000009FF3C18>},
          pre_dispatch='2*n_jobs', random_state=7, refit=True,
          return_train_score=True, scoring='accuracy', verbose=0)

In [20]:
rs_grid.grid_scores_



[mean: 0.95422, std: 0.02437, params: {'gamma': 1, 'learning_rate': 0.77991879224011462, 'max_depth': 4, 'subsample': 0.97822289707858245},
 mean: 0.96137, std: 0.02298, params: {'gamma': 1, 'learning_rate': 0.30801276518773935, 'max_depth': 4, 'subsample': 0.5011204636599379},
 mean: 0.95994, std: 0.01771, params: {'gamma': 1, 'learning_rate': 0.26843898010187117, 'max_depth': 4, 'subsample': 0.52776479482077054},
 mean: 0.94850, std: 0.02728, params: {'gamma': 0, 'learning_rate': 0.80373903610437547, 'max_depth': 1, 'subsample': 0.39294230987084167},
 mean: 0.95851, std: 0.02558, params: {'gamma': 1, 'learning_rate': 0.67231727324123325, 'max_depth': 1, 'subsample': 0.2133853535799155},
 mean: 0.95279, std: 0.03615, params: {'gamma': 0, 'learning_rate': 0.93120601968902172, 'max_depth': 2, 'subsample': 0.75076272786072573},
 mean: 0.95279, std: 0.03060, params: {'gamma': 0, 'learning_rate': 0.85129458628820531, 'max_depth': 2, 'subsample': 0.54848991923603041},
 mean: 0.95565, std: 0

In [21]:
rs_grid.best_estimator_

XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=1,
       gamma=1, learning_rate=0.37238468938505898, max_delta_step=0,
       max_depth=4, min_child_weight=1, missing=None, n_estimators=100,
       nthread=-1, objective='binary:logistic', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=7, silent=1, subsample=0.34875637565471518)

In [22]:
rs_grid.best_params_

{'gamma': 1,
 'learning_rate': 0.37238468938505898,
 'max_depth': 4,
 'subsample': 0.34875637565471518}

In [23]:
rs_grid.best_score_

0.96280400572246061

In [24]:
model4=XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=1,
       gamma=1, learning_rate=0.37238468938505898, max_delta_step=0,
       max_depth=4, min_child_weight=1, missing=None, n_estimators=100,
       nthread=-1, objective='binary:logistic', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=7, silent=1, subsample=0.34875637565471518)

In [25]:
model4.fit(X_train,y_train)

XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=1,
       gamma=1, learning_rate=0.372384689385059, max_delta_step=0,
       max_depth=4, min_child_weight=1, missing=None, n_estimators=100,
       nthread=-1, objective='binary:logistic', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=7, silent=1, subsample=0.3487563756547152)

In [26]:
y_pred = model4.predict(X_test)
predictions = [round(value) for value in y_pred]

In [27]:
accuracy = accuracy_score(y_test, predictions)

In [28]:
accuracy = accuracy_score(y_test, predictions)
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
precision, recall, f1, _ = precision_recall_fscore_support(y_test, y_pred)
print (('Akurasi: %.4f') % (accuracy))
print (('Precision: %.4f') % (precision[1]))
print (('Recall: %.4f') % (recall[1]))
print (('F1: %.4f') % (f1[1]))

Akurasi: 0.9437
Precision: 0.9125
Recall: 0.9241
F1: 0.9182


# model3 lebih baik dari model4

In [52]:
model3=XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=1,
       gamma=0, learning_rate=0.30801276518773935, max_delta_step=0,
       max_depth=4, min_child_weight=1, missing=None, n_estimators=100,
       nthread=-1, objective='binary:logistic', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=7, silent=1, subsample=0.5011204636599379)

In [53]:
model3.fit(X_train, y_train)

XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=1,
       gamma=0, learning_rate=0.30801276518773935, max_delta_step=0,
       max_depth=4, min_child_weight=1, missing=None, n_estimators=100,
       nthread=-1, objective='binary:logistic', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=7, silent=1, subsample=0.5011204636599379)

In [54]:
y_pred = model3.predict(X_test)
predictions = [round(value) for value in y_pred]

In [55]:
accuracy = accuracy_score(y_test, predictions)

In [56]:
accuracy = accuracy_score(y_test, predictions)
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
precision, recall, f1, _ = precision_recall_fscore_support(y_test, y_pred)
print (('Akurasi: %.4f') % (accuracy))
print (('Precision: %.4f') % (precision[1]))
print (('Recall: %.4f') % (recall[1]))
print (('F1: %.4f') % (f1[1]))

Akurasi: 0.9481
Precision: 0.9136
Recall: 0.9367
F1: 0.9250
