# Chapter 14: Automate Machine Learning Workflows with Pipelines

## 14.2. Data Preparation and Modeling Pipeline

In [None]:
# Create a pipeline that standardizes the data then creates a model
from pandas import read_csv
from sklearn.model_selection import KFold, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
# Dataset file name
filename = 'pima-indians-diabetes-dataset.csv'
# Descriptive column names: eight input features followed by the class label
names = [
    'Pregnancies',
    'Glucose',
    'BloodPressure',
    'SkinThickness',
    'Insulin',
    'BMI',
    'DiabetesPedigreeFunction',
    'Age',
    'Outcome',
]
# Load the data; header=0 together with `names` makes pandas discard the
# file's own header row and use the names above instead
dataframe = read_csv(filename, names=names, header=0)
# First eight columns are the inputs, the ninth column is the target
X = dataframe.iloc[:, :8].to_numpy()
Y = dataframe.iloc[:, 8].to_numpy()
# Build the pipeline steps: standardize the inputs, then fit an LDA classifier.
# Keeping the scaler inside the pipeline means it is re-fit on each training
# fold by cross_val_score rather than on the full dataset.
estimators = []
estimators.append(('standardize', StandardScaler()))
estimators.append(('lda', LinearDiscriminantAnalysis()))
model = Pipeline(steps=estimators)
# Evaluate the pipeline with shuffled 10-fold cross-validation (fixed seed)
kfold = KFold(n_splits=10, shuffle=True, random_state=7)
results = cross_val_score(estimator=model, X=X, y=Y, cv=kfold)
# Report the mean score across folds
print(f"Accuracy: {results.mean():.4f}")

Accuracy: 0.7670


## 14.3. Feature Extraction and Modeling Pipeline

In [None]:
# Create a pipeline that extracts features from the data then creates a model
from pandas import read_csv
from sklearn.model_selection import KFold, cross_val_score
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.linear_model import LogisticRegression
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif  # Cần thiết cho SelectKBest
# Dataset file name
filename = 'pima-indians-diabetes-dataset.csv'
# Descriptive column names: eight input features followed by the class label
names = [
    'Pregnancies',
    'Glucose',
    'BloodPressure',
    'SkinThickness',
    'Insulin',
    'BMI',
    'DiabetesPedigreeFunction',
    'Age',
    'Outcome',
]
# Load the data; header=0 together with `names` makes pandas discard the
# file's own header row and use the names above instead
dataframe = read_csv(filename, names=names, header=0)
# First eight columns are the inputs, the ninth column is the target
X = dataframe.iloc[:, :8].to_numpy()
Y = dataframe.iloc[:, 8].to_numpy()
# Feature union: 3 PCA components side by side with the 6 features that
# SelectKBest ranks highest (score_func is passed explicitly as f_classif)
features = [
    ('pca', PCA(n_components=3)),
    ('select_best', SelectKBest(score_func=f_classif, k=6)),
]
feature_union = FeatureUnion(features)
# Pipeline: extract the combined features, then fit logistic regression
estimators = [
    ('features', feature_union),
    ('logistic', LogisticRegression(solver='liblinear')),
]
model = Pipeline(steps=estimators)
# Evaluate the pipeline with shuffled 10-fold cross-validation (fixed seed)
kfold = KFold(n_splits=10, shuffle=True, random_state=7)
results = cross_val_score(estimator=model, X=X, y=Y, cv=kfold)
print(f"Accuracy: {results.mean():.4f}")

Accuracy: 0.7722
