# Chapter 15: Improve Performance with Ensembles

## 15.2. Bagging Algorithms

### 15.2.1. Bagged Decision Trees

In [None]:
# Bagged Decision Trees for Classification
from pandas import read_csv
from sklearn.model_selection import KFold, cross_val_score
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
# Evaluation settings: one seed drives both the fold shuffling and the ensemble.
seed = 7
num_trees = 100

# Load the dataset; the explicit column names are used in place of the
# file's first row (header=0).
filename = 'pima-indians-diabetes-dataset.csv'
names = [
    'Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin',
    'BMI', 'DiabetesPedigreeFunction', 'Age', 'Outcome',
]
dataframe = read_csv(filename, header=0, names=names)

# The first eight columns are the inputs; the ninth column is the class label.
X = dataframe.iloc[:, :8].to_numpy()
Y = dataframe.iloc[:, 8].to_numpy()

# 10-fold cross-validation with shuffled, seeded splits.
kfold = KFold(n_splits=10, shuffle=True, random_state=seed)

# Bagging ensemble built on top of a decision tree base estimator.
cart = DecisionTreeClassifier()
model = BaggingClassifier(
    estimator=cart,
    n_estimators=num_trees,
    random_state=seed,
)

# Score the ensemble on every fold and report the mean accuracy.
results = cross_val_score(model, X, Y, cv=kfold)
print(f"Accuracy: {results.mean():.4f}")

Accuracy: 0.7578


### 15.2.2. Random Forest

In [3]:
# Random Forest Classification
from pandas import read_csv
from sklearn.model_selection import KFold, cross_val_score
from sklearn.ensemble import RandomForestClassifier
# Dataset file name
filename = 'pima-indians-diabetes-dataset.csv'
# Column names applied to the data (the file's own header row is skipped via header=0)
names = ['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI', 'DiabetesPedigreeFunction', 'Age', 'Outcome']
# Read the data
dataframe = read_csv(filename, names=names, header=0)
# First eight columns are the inputs, the ninth is the class label
X = dataframe.iloc[:, 0:8].values
Y = dataframe.iloc[:, 8].values
# Model and cross-validation configuration
seed = 7
num_trees = 100
max_features = 3
kfold = KFold(n_splits=10, shuffle=True, random_state=seed)
# Seed the forest too: seeding only the folds still leaves the trees' bootstrap
# sampling and feature selection random, so the score would change between runs.
model = RandomForestClassifier(n_estimators=num_trees, max_features=max_features, random_state=seed)
# Evaluate with cross-validation and print the mean accuracy
results = cross_val_score(model, X, Y, cv=kfold)
print(results.mean())

0.7590909090909091


### 15.2.3. Extra Trees

In [4]:
# Extra Trees Classification
from pandas import read_csv
from sklearn.model_selection import KFold, cross_val_score
from sklearn.ensemble import ExtraTreesClassifier
# Dataset file name
filename = 'pima-indians-diabetes-dataset.csv'
# Column names applied to the data (the file's own header row is skipped via header=0)
names = ['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI', 'DiabetesPedigreeFunction', 'Age', 'Outcome']
# Read the data
dataframe = read_csv(filename, names=names, header=0)
# First eight columns are the inputs, the ninth is the class label
X = dataframe.iloc[:, 0:8].values
Y = dataframe.iloc[:, 8].values
# Model and cross-validation configuration
seed = 7
num_trees = 100
max_features = 7
kfold = KFold(n_splits=10, shuffle=True, random_state=seed)
# Seed the ensemble too: seeding only the folds still leaves the randomized
# split selection of the trees unseeded, so the score would change between runs.
model = ExtraTreesClassifier(n_estimators=num_trees, max_features=max_features, random_state=seed)
# Evaluate with cross-validation and print the mean accuracy
results = cross_val_score(model, X, Y, cv=kfold)
print(results.mean())

0.753896103896104


## 15.3. Boosting Algorithms

### 15.3.1. AdaBoost

In [5]:
# AdaBoost Classification
from pandas import read_csv
from sklearn.model_selection import KFold, cross_val_score
from sklearn.ensemble import AdaBoostClassifier
# Evaluation settings: the same seed is used for the folds and the booster.
seed = 7
num_trees = 30

# Load the dataset; the explicit column names are used in place of the
# file's first row (header=0).
filename = 'pima-indians-diabetes-dataset.csv'
names = [
    'Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin',
    'BMI', 'DiabetesPedigreeFunction', 'Age', 'Outcome',
]
dataframe = read_csv(filename, header=0, names=names)

# The first eight columns are the inputs; the ninth column is the class label.
X = dataframe.iloc[:, :8].to_numpy()
Y = dataframe.iloc[:, 8].to_numpy()

# 10-fold cross-validation with shuffled, seeded splits.
kfold = KFold(n_splits=10, shuffle=True, random_state=seed)

# AdaBoost with its default base estimator.
model = AdaBoostClassifier(
    n_estimators=num_trees,
    random_state=seed,
)

# Score on every fold and print the mean accuracy.
results = cross_val_score(model, X, Y, cv=kfold)
print(results.mean())

0.7552460697197538


### 15.3.2. Stochastic Gradient Boosting

In [6]:
# Stochastic Gradient Boosting Classification
from pandas import read_csv
from sklearn.model_selection import KFold, cross_val_score
from sklearn.ensemble import GradientBoostingClassifier
# Evaluation settings: the same seed is used for the folds and the booster.
seed = 7
num_trees = 100

# Load the dataset; the explicit column names are used in place of the
# file's first row (header=0).
filename = 'pima-indians-diabetes-dataset.csv'
names = [
    'Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin',
    'BMI', 'DiabetesPedigreeFunction', 'Age', 'Outcome',
]
dataframe = read_csv(filename, header=0, names=names)

# The first eight columns are the inputs; the ninth column is the class label.
X = dataframe.iloc[:, :8].to_numpy()
Y = dataframe.iloc[:, 8].to_numpy()

# 10-fold cross-validation with shuffled, seeded splits.
kfold = KFold(n_splits=10, shuffle=True, random_state=seed)

# NOTE(review): `subsample` is left at its default here; confirm whether row
# subsampling was intended for the "stochastic" variant named in the heading.
model = GradientBoostingClassifier(
    n_estimators=num_trees,
    random_state=seed,
)

# Score on every fold and print the mean accuracy.
results = cross_val_score(model, X, Y, cv=kfold)
print(results.mean())

0.7578947368421053


## 15.4. Voting Ensemble

In [8]:
# Voting Ensemble for Classification
from pandas import read_csv
from sklearn.model_selection import KFold, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import VotingClassifier
# Dataset file name
filename = 'pima-indians-diabetes-dataset.csv'
# Column names applied to the data (the file's own header row is skipped via header=0)
names = ['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin','BMI', 'DiabetesPedigreeFunction', 'Age', 'Outcome']
# Read the data
dataframe = read_csv(filename, names=names, header=0)
# First eight columns are the inputs, the ninth is the class label
X = dataframe.iloc[:, 0:8].values
Y = dataframe.iloc[:, 8].values
# One seed for the folds and for every sub-model that uses randomness,
# so the reported accuracy is reproducible across runs.
seed = 7
# Shuffled, seeded 10-fold cross-validation
kfold = KFold(n_splits=10, shuffle=True, random_state=seed)
# Build the sub-models
estimators = []
model1 = LogisticRegression(max_iter=1000)  # Higher max_iter to avoid the convergence warning
estimators.append(('logistic', model1))
model2 = DecisionTreeClassifier(random_state=seed)
estimators.append(('cart', model2))
# probability=True is required so the SVC can supply class probabilities for
# soft voting; the probability calibration is randomized, hence the seed.
model3 = SVC(probability=True, random_state=seed)
estimators.append(('svm', model3))
# Build the ensemble with soft voting (averages the predicted class
# probabilities); use voting='hard' for majority voting on predicted labels.
ensemble = VotingClassifier(estimators=estimators, voting='soft')
# Evaluate the ensemble with cross-validation
results = cross_val_score(ensemble, X, Y, cv=kfold)
# Print the mean accuracy
print(f"Accuracy: {results.mean():.4f}")

Accuracy: 0.7357
