In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix

In [2]:
df = pd.read_csv('./wineQualityReds.csv') #載入資料
df.shape #顯示(rows,columns)數

(1599, 13)

In [3]:
df.head()

Unnamed: 0.1,Unnamed: 0,fixed.acidity,volatile.acidity,citric.acid,residual.sugar,chlorides,free.sulfur.dioxide,total.sulfur.dioxide,density,pH,sulphates,alcohol,quality
0,1,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
1,2,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5
2,3,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5
3,4,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6
4,5,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5


In [4]:
numerics = ['int16', 'int32','int64', 'float16', 'float32', 'float64']
numerical_features = list(df.select_dtypes(include=numerics).columns)

In [5]:
data = df[numerical_features]

In [6]:
X = data.drop(['Unnamed: 0','quality'], axis=1) #刪除不需要的欄位
X.shape

(1599, 11)

In [7]:
y = data['quality']
y.shape

(1599,)

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=1729)
print(X_train.shape, X_test.shape)

(1071, 11) (528, 11)


In [9]:
import xgboost as xgb

In [10]:
xlf = xgb.XGBClassifier(
    max_depth=10, 
    learning_rate=0.1, 
    n_estimators=100, 
    silent=True,  
    nthread=-1, 
    gamma=0,
    min_child_weight=1, 
    max_delta_step=0, 
    subsample=0.5, 
    colsample_bytree=0.8, 
    colsample_bylevel=1, 
    reg_alpha=0, 
    reg_lambda=1, 
    scale_pos_weight=1, 
    seed=1440, 
    missing=None
)
xlf.fit(X_train, y_train, eval_set=[(X_train, y_train), (X_test, y_test)], early_stopping_rounds=50)

[0]	validation_0-merror:0.29972	validation_1-merror:0.448864
Multiple eval metrics have been passed: 'validation_1-merror' will be used for early stopping.

Will train until validation_1-merror hasn't improved in 50 rounds.
[1]	validation_0-merror:0.235294	validation_1-merror:0.409091
[2]	validation_0-merror:0.222222	validation_1-merror:0.409091
[3]	validation_0-merror:0.194211	validation_1-merror:0.405303
[4]	validation_0-merror:0.19141	validation_1-merror:0.412879
[5]	validation_0-merror:0.180205	validation_1-merror:0.401515
[6]	validation_0-merror:0.168067	validation_1-merror:0.414773
[7]	validation_0-merror:0.15873	validation_1-merror:0.390152
[8]	validation_0-merror:0.148459	validation_1-merror:0.38447
[9]	validation_0-merror:0.142857	validation_1-merror:0.378788
[10]	validation_0-merror:0.13352	validation_1-merror:0.393939
[11]	validation_0-merror:0.13352	validation_1-merror:0.386364
[12]	validation_0-merror:0.130719	validation_1-merror:0.367424
[13]	validation_0-merror:0.126984	

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bynode=1, colsample_bytree=0.8, gamma=0,
       learning_rate=0.1, max_delta_step=0, max_depth=10,
       min_child_weight=1, missing=None, n_estimators=100, n_jobs=1,
       nthread=-1, objective='multi:softprob', random_state=0, reg_alpha=0,
       reg_lambda=1, scale_pos_weight=1, seed=1440, silent=True,
       subsample=0.5, verbosity=1)

In [11]:
# 計算 auc 分數、預測
y_pred = xlf.predict(X_test)

In [12]:
y_pred

array([5, 6, 5, 6, 5, 5, 5, 6, 6, 7, 6, 6, 6, 7, 6, 5, 5, 6, 6, 5, 5, 6,
       6, 6, 6, 5, 6, 5, 5, 5, 6, 5, 5, 5, 6, 7, 5, 6, 5, 5, 7, 6, 5, 5,
       6, 6, 5, 5, 5, 5, 6, 7, 5, 6, 5, 7, 6, 5, 5, 7, 6, 6, 6, 5, 6, 6,
       5, 7, 6, 7, 6, 5, 6, 5, 6, 6, 5, 5, 5, 5, 5, 6, 7, 6, 6, 5, 6, 6,
       5, 5, 5, 5, 6, 5, 6, 6, 6, 6, 6, 7, 5, 5, 6, 5, 7, 5, 5, 6, 6, 6,
       6, 6, 6, 6, 6, 6, 5, 5, 6, 5, 6, 5, 6, 6, 6, 6, 5, 6, 6, 5, 5, 6,
       5, 5, 6, 6, 5, 5, 7, 6, 5, 6, 5, 6, 5, 5, 6, 5, 5, 6, 7, 5, 7, 6,
       5, 7, 5, 5, 7, 5, 6, 7, 6, 6, 6, 6, 5, 6, 6, 5, 6, 5, 5, 5, 5, 5,
       6, 5, 5, 5, 4, 5, 5, 5, 5, 5, 5, 7, 5, 5, 6, 6, 5, 6, 6, 5, 5, 6,
       5, 6, 6, 7, 5, 5, 5, 5, 5, 6, 5, 5, 6, 4, 6, 5, 5, 6, 5, 5, 7, 5,
       6, 7, 6, 5, 6, 5, 5, 5, 6, 6, 6, 6, 5, 5, 6, 6, 5, 6, 5, 6, 5, 6,
       6, 6, 5, 6, 6, 6, 7, 6, 6, 6, 5, 5, 6, 5, 5, 7, 5, 5, 5, 6, 5, 5,
       5, 5, 5, 5, 5, 5, 7, 5, 6, 7, 5, 6, 6, 6, 6, 5, 6, 6, 5, 6, 5, 6,
       5, 5, 5, 5, 6, 6, 6, 5, 6, 6, 6, 5, 5, 5, 6,

In [13]:
predictions = [round(value) for value in y_pred]

In [14]:
accuracy = accuracy_score(y_test, predictions)
print("Accuracy: %.2f%%" % (accuracy * 100.0))

Accuracy: 66.86%


In [15]:
from sklearn.model_selection import GridSearchCV

In [16]:
estimator = xgb.XGBClassifier(
    max_depth=10,  
    n_estimators=10, 
    silent=True,  
    nthread=-1, 
    gamma=0,
    min_child_weight=1, 
    max_delta_step=0, 
    subsample=0.85,  
    colsample_bylevel=1, 
    reg_alpha=0, 
    reg_lambda=1, 
    scale_pos_weight=1, 
    seed=1440, 
    missing=None
) 
param_grid = {
    'learning_rate': [0.01, 0.05, 0.1, 0.5, 1], 
    'n_estimators': [20, 40, 100],
    'subsample':[0.5, 0.8],
    'colsample_bytree':[0.5, 0.8],
    'colsample_bylevel':[0.5, 0.8, 1]
    } 
xgb = GridSearchCV(estimator, param_grid) 
xgb.fit(X_train, y_train) 
print('Best parameters found by grid search are:', xgb.best_params_)



Best parameters found by grid search are: {'colsample_bylevel': 1, 'colsample_bytree': 0.8, 'learning_rate': 0.1, 'n_estimators': 100, 'subsample': 0.5}
