In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score, confusion_matrix

In [2]:
df = pd.read_csv('./wineQualityReds.csv') #載入資料
df.shape #顯示(rows,columns)數

(1599, 13)

In [3]:
df.head()

Unnamed: 0.1,Unnamed: 0,fixed.acidity,volatile.acidity,citric.acid,residual.sugar,chlorides,free.sulfur.dioxide,total.sulfur.dioxide,density,pH,sulphates,alcohol,quality
0,1,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
1,2,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5
2,3,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5
3,4,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6
4,5,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5


In [4]:
numerics = ['int16', 'int32','int64', 'float16', 'float32', 'float64']
numerical_features = list(df.select_dtypes(include=numerics).columns)

In [5]:
data = df[numerical_features]

In [6]:
X = data.drop(['Unnamed: 0','quality'], axis=1) #刪除不需要的欄位
X.shape

(1599, 11)

In [7]:
y = data['quality']
y.shape

(1599,)

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=1729)
print(X_train.shape, X_test.shape)

(1071, 11) (528, 11)


In [38]:
gt = GradientBoostingClassifier(learning_rate = 0.05,
                                n_estimators = 200,
                                max_depth = 8
                               )

In [39]:
gt.fit(X_train, y_train)

GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.05, loss='deviance', max_depth=8,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=200,
              n_iter_no_change=None, presort='auto', random_state=None,
              subsample=1.0, tol=0.0001, validation_fraction=0.1,
              verbose=0, warm_start=False)

In [40]:
y_pred = gt.predict(X_test)

In [41]:
y_pred

array([5, 6, 5, 6, 5, 5, 5, 6, 7, 6, 6, 6, 6, 7, 6, 5, 5, 6, 6, 5, 5, 7,
       6, 6, 6, 5, 6, 5, 5, 5, 6, 5, 5, 5, 6, 7, 5, 6, 5, 5, 6, 6, 5, 5,
       5, 6, 3, 6, 5, 5, 6, 7, 5, 6, 5, 7, 6, 6, 5, 7, 6, 6, 6, 5, 6, 6,
       5, 7, 6, 6, 6, 5, 6, 5, 6, 6, 6, 5, 5, 5, 5, 6, 6, 6, 6, 5, 6, 6,
       5, 5, 5, 5, 7, 5, 6, 6, 6, 6, 6, 6, 5, 5, 6, 5, 7, 5, 5, 6, 6, 6,
       7, 6, 6, 6, 7, 6, 5, 5, 6, 5, 4, 5, 6, 6, 6, 6, 5, 6, 6, 5, 6, 6,
       5, 5, 6, 6, 5, 5, 7, 6, 5, 6, 4, 6, 5, 5, 6, 5, 5, 7, 7, 5, 7, 6,
       5, 6, 5, 5, 7, 5, 6, 6, 6, 6, 6, 6, 5, 6, 6, 5, 6, 5, 5, 5, 6, 4,
       6, 5, 6, 5, 4, 5, 5, 5, 5, 5, 4, 7, 5, 5, 6, 6, 5, 6, 5, 5, 5, 6,
       5, 6, 6, 7, 5, 5, 5, 5, 5, 6, 5, 5, 6, 5, 7, 5, 5, 6, 6, 5, 7, 5,
       7, 7, 6, 5, 7, 5, 5, 5, 5, 6, 6, 5, 5, 5, 5, 6, 5, 6, 5, 6, 5, 6,
       7, 6, 5, 7, 6, 6, 7, 6, 6, 7, 5, 5, 6, 5, 5, 7, 5, 5, 5, 7, 5, 5,
       5, 5, 5, 5, 5, 5, 7, 5, 6, 6, 5, 6, 6, 6, 6, 5, 6, 6, 5, 6, 6, 6,
       5, 5, 5, 5, 6, 5, 6, 5, 6, 7, 6, 6, 5, 5, 6,

In [42]:
gt.score(X_test, y_test)

0.6325757575757576

In [36]:
from sklearn.model_selection import GridSearchCV

In [37]:
estimator = GradientBoostingClassifier() 
param_grid = {
    'learning_rate': [0.01, 0.05, 0.1, 0.5, 1], 
    'n_estimators': [20, 40, 100, 200],
    'max_depth':[3,8,10]
} 
gbrt = GridSearchCV(estimator, param_grid) 
gbrt.fit(X_train, y_train) 
print('Best parameters found by grid search are:', gbrt.best_params_)



Best parameters found by grid search are: {'learning_rate': 0.05, 'max_depth': 8, 'n_estimators': 200}
